File size: 3,585 Bytes
278fab8 285f2a6 278fab8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import pandas as pd
from typing import List
from utils import TMP_DIR, INC_TOOLS, DATA_DIR
def get_error_data_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Count requests per tool, week and market creator and derive the error rate.

    Args:
        tools_df: Tool requests. Must contain the columns ``tool``,
            ``request_month_year_week``, ``market_creator`` and ``error``
            (``error`` is read as a 0/1 flag).
        inc_tools: Names of the tools to keep; rows for other tools are ignored.

    Returns:
        One row per (tool, week, market creator) combination with the count
        columns ``0`` (rows where ``error == 0``) and ``1`` (rows where
        ``error == 1``), plus ``error_perc`` (share of ``1`` rows, in percent)
        and ``total_requests`` (sum of both counts).
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    counts = (
        tools_inc.groupby(
            ["tool", "request_month_year_week", "market_creator", "error"], sort=False
        )
        .size()
        .unstack()
    )
    # unstack() only creates a column for flag values that actually occur, so
    # data with no failures (or no successes) would otherwise raise KeyError
    # on the lookups below.
    for flag in (0, 1):
        if flag not in counts.columns:
            counts[flag] = 0
    error = counts.fillna(0).reset_index()
    error["error_perc"] = (error[1] / (error[0] + error[1])) * 100
    error["total_requests"] = error[0] + error[1]
    return error
def get_tool_winning_rate_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Compute the winning rate per tool, week and market creator.

    A request counts as a win when its ``vote`` equals the market's
    ``currentAnswer``. Rows flagged with ``error == 1`` and rows whose answer
    or vote is not ``Yes``/``No`` (after normalising lowercase ``yes``/``no``
    answers) are left out.

    Args:
        tools_df: Tool requests. Must contain the columns ``tool``,
            ``request_month_year_week``, ``market_creator``, ``error``,
            ``currentAnswer`` and ``vote``.
        inc_tools: Names of the tools to keep; rows for other tools are ignored.

    Returns:
        One row per (tool, week, market creator) combination with the count
        columns ``"0"`` (losses) and ``"1"`` (wins), ``win_perc`` (share of
        wins, in percent) and ``total_request`` (sum of both counts). All
        column labels are strings.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    # Explicit copy: the answer normalisation below must write to our own
    # frame, not to a slice of the input (avoids SettingWithCopyWarning).
    tools_non_error = tools_inc[tools_inc["error"] != 1].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )

    valid_answers = ["Yes", "No"]
    is_comparable = tools_non_error["currentAnswer"].isin(
        valid_answers
    ) & tools_non_error["vote"].isin(valid_answers)
    tools_non_error = tools_non_error[is_comparable]
    tools_non_error = tools_non_error.assign(
        win=(tools_non_error["currentAnswer"] == tools_non_error["vote"]).astype(int)
    )

    wins = (
        tools_non_error.groupby(
            ["tool", "request_month_year_week", "market_creator", "win"], sort=False
        )
        .size()
        .unstack()
    )
    # unstack() only creates a column for outcomes that actually occur, so
    # data with only wins (or only losses) would otherwise raise KeyError.
    for outcome in (0, 1):
        if outcome not in wins.columns:
            wins[outcome] = 0
    wins = wins.fillna(0)

    wins["win_perc"] = (wins[1] / (wins[0] + wins[1])) * 100
    wins.reset_index(inplace=True)
    wins["total_request"] = wins[0] + wins[1]
    # The integer labels 0/1 become "0"/"1" so every column name is a string.
    wins.columns = wins.columns.astype(str)
    return wins
def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame:
    """Add a weekly bucket column and an aggregated ``"all"`` market creator.

    Args:
        tools: Tool requests. Must contain ``request_time`` (anything
            ``pd.to_datetime`` can parse). The frame is not modified.

    Returns:
        A new frame sorted by ``request_time`` in which every input row
        appears twice: once unchanged and once with ``market_creator`` set to
        ``"all"``. ``request_time`` is converted to datetime and
        ``request_month_year_week`` holds the start of the row's pandas
        weekly (``"W"``) period, formatted as ``%b-%d-%Y``.
    """
    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original request_time column instead of being overwritten in place.
    tools = tools.assign(request_time=pd.to_datetime(tools["request_time"]))
    tools = tools.sort_values(by="request_time", ascending=True)
    tools["request_month_year_week"] = (
        tools["request_time"]
        .dt.to_period("W")
        .dt.start_time.dt.strftime("%b-%d-%Y")
    )

    # Duplicate every row under the synthetic creator "all" so later group-bys
    # on market_creator also yield a total across creators.
    tools_all = tools.copy(deep=True)
    tools_all["market_creator"] = "all"

    tools = pd.concat([tools, tools_all], ignore_index=True)
    tools = tools.sort_values(by="request_time", ascending=True)
    return tools
def compute_tools_based_datasets() -> None:
    """Build the tools-based datasets and store them as parquet files.

    Reads ``tools.parquet`` from ``TMP_DIR``, prepares it with
    ``prepare_tools`` and writes two files to ``DATA_DIR``:
    ``error_by_markets.parquet`` and ``winning_df.parquet``.

    If the input cannot be read or prepared, the error is printed and the
    function returns without writing anything.
    """
    try:
        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
        tools_df = prepare_tools(tools_df)
    except Exception as e:
        print(f"Error reading old tools parquet file {e}")
        return None

    # Neither aggregation below modifies tools_df, so a single load and
    # preparation serves both datasets.

    # error by markets
    error_by_markets = get_error_data_by_market(tools_df=tools_df, inc_tools=INC_TOOLS)
    error_by_markets.to_parquet(DATA_DIR / "error_by_markets.parquet", index=False)

    # winning rate by markets
    winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
    winning_df.to_parquet(DATA_DIR / "winning_df.parquet", index=False)
|