from typing import List

import pandas as pd

from utils import TMP_DIR, INC_TOOLS, DATA_DIR


def _ensure_binary_columns(counts: pd.DataFrame) -> pd.DataFrame:
    """Make sure both the ``0`` and ``1`` count columns exist in ``counts``.

    ``unstack()`` only creates a column for values that actually occur, so data
    containing a single outcome (e.g. no errors at all) would otherwise make
    ``counts[0]`` or ``counts[1]`` raise ``KeyError``. Missing columns are added
    in place, filled with zeros; existing columns are left untouched.
    """
    for outcome in (0, 1):
        if outcome not in counts.columns:
            counts[outcome] = 0
    return counts


def get_error_data_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Count errors per tool, week and market creator and compute the error rate.

    Args:
        tools_df: Requests with at least the columns ``tool``,
            ``request_month_year_week``, ``market_creator`` and ``error``
            (``error`` is expected to hold 0/1 values).
        inc_tools: Names of the tools to keep; every other tool is dropped.

    Returns:
        One row per (tool, request_month_year_week, market_creator) group with
        the columns ``0`` and ``1`` (number of requests with ``error == 0`` and
        ``error == 1``), ``error_perc`` (share of errors, in percent) and
        ``total_requests``. The labels of the two count columns are the
        integers 0 and 1.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    counts = (
        tools_inc.groupby(
            ["tool", "request_month_year_week", "market_creator", "error"],
            sort=False,
        )
        .size()
        .unstack()
        .fillna(0)
    )
    # Guard against data where only one of the two outcomes is present.
    error = _ensure_binary_columns(counts).reset_index()
    error["error_perc"] = (error[1] / (error[0] + error[1])) * 100
    error["total_requests"] = error[0] + error[1]
    return error


def get_tool_winning_rate_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Compute the winning rate per tool, week and market creator.

    Only requests without error (``error != 1``) whose ``currentAnswer`` and
    ``vote`` are both "Yes" or "No" are considered (lower-case "yes"/"no"
    answers are normalised first). A request counts as a win when ``vote``
    equals ``currentAnswer``.

    Args:
        tools_df: Requests with at least the columns ``tool``,
            ``request_month_year_week``, ``market_creator``, ``error``,
            ``currentAnswer`` and ``vote``.
        inc_tools: Names of the tools to keep; every other tool is dropped.

    Returns:
        One row per (tool, request_month_year_week, market_creator) group with
        the columns ``"0"`` and ``"1"`` (number of losses and wins),
        ``win_perc`` (share of wins, in percent) and ``total_request``. All
        column labels are strings.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    # ``assign`` returns a new frame, so nothing is written into a slice of
    # ``tools_df`` (the original chained assignment raised
    # SettingWithCopyWarning).
    tools_non_error = tools_inc[tools_inc["error"] != 1].assign(
        currentAnswer=lambda df: df["currentAnswer"].replace(
            {"no": "No", "yes": "Yes"}
        )
    )
    has_binary_outcome = tools_non_error["currentAnswer"].isin(
        ["Yes", "No"]
    ) & tools_non_error["vote"].isin(["Yes", "No"])
    tools_non_error = tools_non_error[has_binary_outcome].assign(
        win=lambda df: (df["currentAnswer"] == df["vote"]).astype(int)
    )

    wins = (
        tools_non_error.groupby(
            ["tool", "request_month_year_week", "market_creator", "win"],
            sort=False,
        )
        .size()
        .unstack()
        .fillna(0)
    )
    # Guard against data where a tool only ever won or only ever lost.
    wins = _ensure_binary_columns(wins)
    wins["win_perc"] = (wins[1] / (wins[0] + wins[1])) * 100
    wins = wins.reset_index()
    wins["total_request"] = wins[0] + wins[1]
    # The integer labels 0/1 are turned into strings so that every column
    # label has the same type.
    wins.columns = wins.columns.astype(str)
    return wins


def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame:
    """Prepare the raw tools data for the per-market aggregations.

    The ``request_time`` column is parsed as datetime, a
    ``request_month_year_week`` column is added (start of the week of the
    request, formatted as ``"%b-%d-%Y"``) and every row is duplicated with
    ``market_creator`` set to ``"all"`` so that totals can be aggregated like
    any other market creator.

    Args:
        tools: Requests with at least a ``request_time`` column. The frame
            passed in is not modified.

    Returns:
        A new DataFrame with the original rows plus their ``"all"`` duplicates,
        sorted by ``request_time`` in ascending order.
    """
    # ``assign`` builds a new frame instead of overwriting the caller's column.
    tools = tools.assign(
        request_time=pd.to_datetime(tools["request_time"])
    ).sort_values(by="request_time", ascending=True)
    tools["request_month_year_week"] = (
        tools["request_time"]
        .dt.to_period("W")
        .dt.start_time.dt.strftime("%b-%d-%Y")
    )
    # Duplicate every request under the synthetic market creator "all".
    tools_all = tools.assign(market_creator="all")
    tools = pd.concat([tools, tools_all], ignore_index=True)
    return tools.sort_values(by="request_time", ascending=True)


def compute_tools_based_datasets() -> None:
    """Build the error and winning-rate datasets and store them as parquet.

    Reads ``tools.parquet`` from ``TMP_DIR``, prepares it with
    ``prepare_tools`` and writes ``error_by_markets.parquet`` and
    ``winning_df.parquet`` to ``DATA_DIR``, both restricted to ``INC_TOOLS``.
    If the input cannot be read or prepared, the error is printed and nothing
    is written.
    """
    try:
        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
        tools_df = prepare_tools(tools_df)
    except Exception as e:
        print(f"Error reading old tools parquet file {e}")
        return

    # Both aggregations only filter ``tools_df`` (which yields copies), so the
    # prepared frame can be shared instead of reading the file a second time.
    error_by_markets = get_error_data_by_market(
        tools_df=tools_df, inc_tools=INC_TOOLS
    )
    # The count columns are labelled with the integers 0 and 1; parquet column
    # names are strings and older pandas versions refuse non-string labels.
    error_by_markets.columns = error_by_markets.columns.astype(str)
    error_by_markets.to_parquet(DATA_DIR / "error_by_markets.parquet", index=False)

    winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
    winning_df.to_parquet(DATA_DIR / "winning_df.parquet", index=False)