|
import pandas as pd |
|
from typing import List |
|
from utils import TMP_DIR, INC_TOOLS, DATA_DIR |
|
|
|
|
|
def get_error_data_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Count failed and successful requests per tool, week and market creator.

    Rows of ``tools_df`` whose ``tool`` is not listed in ``inc_tools`` are
    ignored. The remaining rows are grouped by ``tool``,
    ``request_month_year_week`` and ``market_creator``, and the occurrences of
    each ``error`` flag value are counted.

    Args:
        tools_df: Requests table with at least the columns ``tool``,
            ``request_month_year_week``, ``market_creator`` and ``error``
            (``1`` is counted as a failed request, ``0`` as a successful one).
        inc_tools: Names of the tools to keep.

    Returns:
        One row per (tool, week, market creator) group holding the grouping
        columns, the count columns ``0`` and ``1`` (integer labels),
        ``error_perc`` (share of failed requests, in percent) and
        ``total_requests``.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    counts = (
        tools_inc.groupby(
            ["tool", "request_month_year_week", "market_creator", "error"], sort=False
        )
        .size()
        .unstack()
        .fillna(0)
    )

    # unstack() only creates a column for flag values that actually occur, so
    # data without any failure (or without any success) would otherwise raise
    # a KeyError below. A missing flag simply means "zero requests of that kind".
    for flag in (0, 1):
        if flag not in counts.columns:
            counts[flag] = 0

    error = counts.reset_index()
    total_requests = error[0] + error[1]
    error["error_perc"] = (error[1] / total_requests) * 100
    error["total_requests"] = total_requests
    return error
|
|
|
|
|
def get_tool_winning_rate_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Compute the winning rate per tool, week and market creator.

    Only rows whose ``tool`` is in ``inc_tools`` and whose ``error`` flag is
    not ``1`` are considered. Lower-case ``"yes"``/``"no"`` answers are
    normalised to ``"Yes"``/``"No"``, rows whose ``currentAnswer`` or ``vote``
    is anything other than ``"Yes"``/``"No"`` are dropped, and a request counts
    as a win when ``currentAnswer`` equals ``vote``.

    Args:
        tools_df: Requests table with at least the columns ``tool``,
            ``request_month_year_week``, ``market_creator``, ``error``,
            ``currentAnswer`` and ``vote``.
        inc_tools: Names of the tools to keep.

    Returns:
        One row per (tool, week, market creator) group holding the grouping
        columns, the count columns ``"0"`` (losses) and ``"1"`` (wins),
        ``win_perc`` (share of wins, in percent) and ``total_request``.
        All column labels are strings.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]

    # Explicit copies: the frames below are modified, and writing into a
    # filtered slice triggers pandas' chained-assignment warnings.
    tools_non_error = tools_inc[tools_inc["error"] != 1].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )

    valid_outcomes = ["Yes", "No"]
    tools_non_error = tools_non_error[
        tools_non_error["currentAnswer"].isin(valid_outcomes)
    ]
    tools_non_error = tools_non_error[
        tools_non_error["vote"].isin(valid_outcomes)
    ].copy()

    tools_non_error["win"] = (
        tools_non_error["currentAnswer"] == tools_non_error["vote"]
    ).astype(int)
    tools_non_error.columns = tools_non_error.columns.astype(str)

    wins = (
        tools_non_error.groupby(
            ["tool", "request_month_year_week", "market_creator", "win"], sort=False
        )
        .size()
        .unstack()
        .fillna(0)
    )

    # unstack() only creates a column for outcomes that actually occur, so
    # data made only of wins (or only of losses) would otherwise raise a
    # KeyError below. A missing outcome means "zero requests of that kind".
    for outcome in (0, 1):
        if outcome not in wins.columns:
            wins[outcome] = 0

    wins["win_perc"] = (wins[1] / (wins[0] + wins[1])) * 100
    wins.reset_index(inplace=True)
    wins["total_request"] = wins[0] + wins[1]
    # Plain string labels for every column, including the 0/1 count columns.
    wins.columns = wins.columns.astype(str)
    return wins
|
|
|
|
|
def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame: |
|
tools["request_time"] = pd.to_datetime(tools["request_time"]) |
|
tools = tools.sort_values(by="request_time", ascending=True) |
|
|
|
tools["request_month_year_week"] = ( |
|
pd.to_datetime(tools["request_time"]).dt.to_period("W").dt.strftime("%b-%d") |
|
) |
|
|
|
|
|
tools_all = tools.copy(deep=True) |
|
tools_all["market_creator"] = "all" |
|
|
|
tools = pd.concat([tools, tools_all], ignore_index=True) |
|
tools = tools.sort_values(by="request_time", ascending=True) |
|
return tools |
|
|
|
|
|
def compute_tools_based_datasets() -> None:
    """Build the per-market error and winning-rate datasets from the tools file.

    Reads ``tools.parquet`` from ``TMP_DIR``, prepares it with
    ``prepare_tools`` and writes ``error_by_markets.parquet`` and
    ``winning_df.parquet`` into ``DATA_DIR``. If the file cannot be read or
    prepared, the error is printed and the function returns without writing
    anything.
    """
    try:
        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
        tools_df = prepare_tools(tools_df)
    except Exception as e:
        # Best-effort step: report the problem and skip dataset generation.
        print(f"Error reading old tools parquet file {e}")
        return None

    # Both aggregations only work on filtered copies of ``tools_df``, so the
    # frame loaded above is reused instead of reading the parquet file twice.
    error_by_markets = get_error_data_by_market(tools_df=tools_df, inc_tools=INC_TOOLS)
    error_by_markets.to_parquet(DATA_DIR / "error_by_markets.parquet", index=False)

    winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
    winning_df.to_parquet(DATA_DIR / "winning_df.parquet", index=False)
|
|