rosacastillo's picture
removed dependency with tools.parquet and new mech calls computation timestamps based
278fab8
raw
history blame
3.55 kB
import pandas as pd
from typing import List
from utils import TMP_DIR, INC_TOOLS, DATA_DIR
def get_error_data_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Get the request counts and the error percentage for the given tools.

    The requests are grouped by tool, week and market creator.

    Args:
        tools_df: Tool requests. Must contain the columns ``tool``,
            ``request_month_year_week``, ``market_creator`` and ``error``.
            The ``error`` flag is read as 1 for a failed request and 0
            otherwise.
        inc_tools: Names of the tools to keep; rows of other tools are ignored.

    Returns:
        One row per (tool, week, market creator) seen in the data, with the
        count columns ``0`` (requests without error) and ``1`` (requests with
        error) plus ``error_perc`` and ``total_requests``.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    error = (
        tools_inc.groupby(
            ["tool", "request_month_year_week", "market_creator", "error"], sort=False
        )
        .size()
        .unstack()
        .fillna(0)
        .reset_index()
    )
    # unstack() only creates a column for the error flags that actually occur.
    # When every selected request succeeded (or every one failed) the other
    # count column is missing, so add it as zeros instead of raising KeyError.
    for flag in (0, 1):
        if flag not in error.columns:
            error[flag] = 0
    total = error[0] + error[1]
    error["error_perc"] = (error[1] / total) * 100
    error["total_requests"] = total
    return error
def get_tool_winning_rate_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Get the winning rate of the given tools by week and market creator.

    Requests flagged as errors are discarded. The ``currentAnswer`` values
    "yes"/"no" are normalised to "Yes"/"No", and only rows where both
    ``currentAnswer`` and ``vote`` are "Yes" or "No" are kept. A request
    counts as a win when ``currentAnswer`` equals ``vote``.

    Args:
        tools_df: Tool requests. Must contain the columns ``tool``,
            ``request_month_year_week``, ``market_creator``, ``error``,
            ``currentAnswer`` and ``vote``. The frame is not modified.
        inc_tools: Names of the tools to keep; rows of other tools are ignored.

    Returns:
        One row per (tool, week, market creator) seen in the data, with the
        count columns ``"0"`` (losses) and ``"1"`` (wins) plus ``win_perc``
        and ``total_request``. All column names are strings.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    # Work on explicit copies: the frame is modified below, and assigning into
    # a filtered selection triggers pandas chained-assignment warnings.
    tools_non_error = tools_inc[tools_inc["error"] != 1].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )
    valid_answers = ["Yes", "No"]
    tools_non_error = tools_non_error[
        tools_non_error["currentAnswer"].isin(valid_answers)
        & tools_non_error["vote"].isin(valid_answers)
    ].copy()
    tools_non_error["win"] = (
        tools_non_error["currentAnswer"] == tools_non_error["vote"]
    ).astype(int)
    tools_non_error.columns = tools_non_error.columns.astype(str)
    wins = (
        tools_non_error.groupby(
            ["tool", "request_month_year_week", "market_creator", "win"], sort=False
        )
        .size()
        .unstack()
        .fillna(0)
    )
    # unstack() only creates a column for the outcomes that actually occur.
    # When every remaining request is a win (or every one a loss) the other
    # count column is missing, so add it as zeros instead of raising KeyError.
    for outcome in (0, 1):
        if outcome not in wins.columns:
            wins[outcome] = 0
    total = wins[0] + wins[1]
    wins["win_perc"] = (wins[1] / total) * 100
    wins["total_request"] = total
    wins = wins.reset_index()
    # The count columns are named with the integers 0 and 1; expose every
    # column name as a string.
    wins.columns = wins.columns.astype(str)
    return wins
def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame:
    """Add the weekly label and an aggregated "all" market to the tools data.

    The ``request_time`` column of the frame passed in is converted to
    datetime in place. The returned frame is a new object that contains every
    request twice: once with its own ``market_creator`` and once with
    ``market_creator`` set to "all". It is sorted by ``request_time`` and
    carries a ``request_month_year_week`` column, the weekly period of the
    request formatted as "%b-%d".
    """
    tools["request_time"] = pd.to_datetime(tools["request_time"])
    ordered = tools.sort_values("request_time")
    # request_time is already datetime here, so the weekly period can be
    # derived from the column directly
    week_label = ordered["request_time"].dt.to_period("W").dt.strftime("%b-%d")
    ordered["request_month_year_week"] = week_label
    # duplicate every request under the synthetic "all" market to get totals
    all_markets = ordered.assign(market_creator="all")
    combined = pd.concat([ordered, all_markets], ignore_index=True)
    return combined.sort_values("request_time")
def compute_tools_based_datasets():
    """Compute and store the datasets that are derived from the tools file.

    Reads ``tools.parquet`` from ``TMP_DIR``, prepares it with
    ``prepare_tools`` and writes two files into ``DATA_DIR``:
    ``error_by_markets.parquet`` and ``winning_df.parquet``, both restricted
    to the tools listed in ``INC_TOOLS``.

    Returns:
        None. If the tools file cannot be read or prepared, the error is
        printed and nothing is written.
    """
    # The tools file is loaded and prepared only once: neither of the two
    # computations below modifies the frame, so both can share it.
    try:
        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
        tools_df = prepare_tools(tools_df)
    except Exception as e:
        print(f"Error reading old tools parquet file {e}")
        return None
    # error by markets
    error_by_markets = get_error_data_by_market(tools_df=tools_df, inc_tools=INC_TOOLS)
    error_by_markets.to_parquet(DATA_DIR / "error_by_markets.parquet", index=False)
    # winning rate by markets
    winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
    winning_df.to_parquet(DATA_DIR / "winning_df.parquet", index=False)