File size: 3,585 Bytes
278fab8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285f2a6
 
 
278fab8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
from typing import List
from utils import TMP_DIR, INC_TOOLS, DATA_DIR


def get_error_data_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Count failed and successful requests per tool, week and market creator.

    Args:
        tools_df: Requests table providing the columns ``tool``,
            ``request_month_year_week``, ``market_creator`` and ``error``.
            ``error`` is read as a flag whose values are 0 and 1.
        inc_tools: Tool names to keep; rows of any other tool are ignored.

    Returns:
        One row per (tool, week, market creator) combination with the count
        columns ``0`` and ``1`` (requests per ``error`` value),
        ``error_perc`` (share of ``error == 1`` requests, in percent) and
        ``total_requests``.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    counts = (
        tools_inc.groupby(
            ["tool", "request_month_year_week", "market_creator", "error"], sort=False
        )
        .size()
        .unstack()
        .fillna(0)
    )
    # unstack() only creates a column for each `error` value that actually
    # occurs. Without this, data with no failures (or only failures) would
    # raise KeyError on the lookups below.
    for flag in (0, 1):
        if flag not in counts.columns:
            counts[flag] = 0
    error = counts.reset_index()
    total = error[0] + error[1]
    error["error_perc"] = (error[1] / total) * 100
    error["total_requests"] = total
    return error


def get_tool_winning_rate_by_market(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Compute the weekly winning rate of each tool per market creator.

    Only rows whose ``error`` is not 1 and whose ``currentAnswer`` and
    ``vote`` are both "Yes" or "No" are considered. Lower-case "yes"/"no"
    in ``currentAnswer`` are normalised first. A request counts as a win
    when ``currentAnswer`` equals ``vote``.

    Args:
        tools_df: Requests table providing the columns ``tool``,
            ``request_month_year_week``, ``market_creator``, ``error``,
            ``currentAnswer`` and ``vote``.
        inc_tools: Tool names to keep; rows of any other tool are ignored.

    Returns:
        One row per (tool, week, market creator) combination with the
        columns ``"0"`` (losses), ``"1"`` (wins), ``win_perc`` (percentage
        of wins) and ``total_request``. All column labels are strings.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    # Explicit copy: the column assignments below must act on an independent
    # frame, not on a slice of the caller's data (avoids SettingWithCopyWarning).
    tools_non_error = tools_inc[tools_inc["error"] != 1].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )
    binary_answers = tools_non_error["currentAnswer"].isin(["Yes", "No"])
    binary_votes = tools_non_error["vote"].isin(["Yes", "No"])
    tools_non_error = tools_non_error[binary_answers & binary_votes].copy()
    tools_non_error["win"] = (
        tools_non_error["currentAnswer"] == tools_non_error["vote"]
    ).astype(int)
    tools_non_error.columns = tools_non_error.columns.astype(str)
    wins = (
        tools_non_error.groupby(
            ["tool", "request_month_year_week", "market_creator", "win"], sort=False
        )
        .size()
        .unstack()
        .fillna(0)
    )
    # unstack() only creates a column for each outcome that actually occurs.
    # Without this, data containing only wins (or only losses) would raise
    # KeyError on the lookups below.
    for outcome in (0, 1):
        if outcome not in wins.columns:
            wins[outcome] = 0
    wins["win_perc"] = (wins[1] / (wins[0] + wins[1])) * 100
    wins.reset_index(inplace=True)
    wins["total_request"] = wins[0] + wins[1]
    # The count columns are labelled with the ints 0 and 1; make every label
    # a string so the frame has uniform column names.
    wins.columns = wins.columns.astype(str)
    return wins


def prepare_tools(tools: pd.DataFrame) -> pd.DataFrame:
    """Add the weekly bucket column and an aggregated "all" market creator.

    The caller's DataFrame is left untouched; all work happens on a new frame.

    Args:
        tools: Requests table with a ``request_time`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A frame sorted by ``request_time`` that contains every input row
        twice: once as given and once with ``market_creator`` set to
        ``"all"``. Each row carries ``request_month_year_week``, the start
        of its ``"W"`` period formatted as ``%b-%d-%Y``.
    """
    # assign() returns a new frame, so the datetime conversion does not
    # overwrite the column on the caller's object.
    tools = tools.assign(request_time=pd.to_datetime(tools["request_time"]))
    tools = tools.sort_values(by="request_time", ascending=True)

    # request_time is already datetime here; no second conversion is needed.
    tools["request_month_year_week"] = (
        tools["request_time"]
        .dt.to_period("W")
        .dt.start_time.dt.strftime("%b-%d-%Y")
    )
    # Duplicate every row under the synthetic "all" market creator so that
    # per-market aggregations also yield an overall total.
    tools_all = tools.copy(deep=True)
    tools_all["market_creator"] = "all"
    tools = pd.concat([tools, tools_all], ignore_index=True)
    return tools.sort_values(by="request_time", ascending=True)


def compute_tools_based_datasets():
    """Build the error and winning-rate datasets and write them as parquet.

    Reads ``tools.parquet`` from ``TMP_DIR``, prepares it with
    ``prepare_tools`` and writes ``error_by_markets.parquet`` and
    ``winning_df.parquet`` into ``DATA_DIR``.

    Returns:
        None. If the input file cannot be read or prepared, the error is
        printed and nothing is written.
    """
    # Load and prepare the input once: neither aggregation below modifies
    # tools_df, so both can work from the same frame.
    try:
        tools_df = pd.read_parquet(TMP_DIR / "tools.parquet")
        tools_df = prepare_tools(tools_df)
    except Exception as e:
        print(f"Error reading old tools parquet file {e}")
        return None

    # error by markets
    error_by_markets = get_error_data_by_market(tools_df=tools_df, inc_tools=INC_TOOLS)
    error_by_markets.to_parquet(DATA_DIR / "error_by_markets.parquet", index=False)

    # winning rate by markets
    winning_df = get_tool_winning_rate_by_market(tools_df, inc_tools=INC_TOOLS)
    winning_df.to_parquet(DATA_DIR / "winning_df.parquet", index=False)
    return None