File size: 4,675 Bytes
0120490 1df3214 0120490 24d09a6 0120490 1df3214 0120490 24d09a6 9382590 24d09a6 9382590 24d09a6 9382590 24d09a6 0120490 1df3214 0120490 544f140 0120490 544f140 0120490 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import os
import pandas as pd
import ipfshttpclient
from pathlib import Path
from utils import INC_TOOLS
from typing import List
# Name of the CSV file (stored inside DATA_DIR) that holds the per-tool accuracy table.
ACCURACY_FILENAME = "tools_accuracy.csv"
# Address handed to ipfshttpclient.connect().
# NOTE(review): "/dns/" looks truncated (no host, port or protocol) - confirm
# the full multiaddr before relying on the IPFS upload.
IPFS_SERVER = "/dns/"
# Directory that contains this script.
SCRIPTS_DIR = Path(__file__).parent
# ROOT_DIR was used to build DATA_DIR but never defined, which raised a
# NameError at import time.
# NOTE(review): assumes the script lives one level below the project root - confirm.
ROOT_DIR = SCRIPTS_DIR.parent
# Directory holding tools.parquet and the accuracy CSV.
DATA_DIR = ROOT_DIR / "data"
def update_tools_accuracy(
    tools_acc: pd.DataFrame, tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """To compute/update the latest accuracy information for the different mech tools.

    Args:
        tools_acc: Previously stored accuracy table (one row per tool, with a
            "tool" column), or None when no table exists yet. Rows of tools
            that are recomputed are updated in place on this frame.
        tools_df: Raw tool requests. Must provide the columns "tool", "error",
            "currentAnswer" and "vote"; "request_time" is optional.
        inc_tools: Names of the tools to include in the computation.

    Returns:
        A frame with the columns "tool", "tool_accuracy" (percentage of
        requests whose vote equals the current answer) and "total_requests",
        plus "min"/"max" of "request_time" when that column is available.
        When ``tools_acc`` is given, the result is ``tools_acc`` with the
        recomputed tools updated and previously unknown tools appended.
    """
    # computation of the accuracy information
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]
    # filtering errors; copy so the assignments below never write into a view
    # of the caller's frame
    tools_non_error = tools_inc[tools_inc["error"] != 1].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )
    valid_answers = ["Yes", "No"]
    tools_non_error = tools_non_error[
        tools_non_error["currentAnswer"].isin(valid_answers)
        & tools_non_error["vote"].isin(valid_answers)
    ].copy()
    # integer flag so the unstacked columns below are labelled 0 (loss) and 1 (win)
    tools_non_error["win"] = (
        tools_non_error["currentAnswer"] == tools_non_error["vote"]
    ).astype(int)
    tools_non_error.columns = tools_non_error.columns.astype(str)
    print("Tools dataset after filtering")

    wins = tools_non_error.groupby(["tool", "win"]).size().unstack().fillna(0)
    # guarantee both outcome columns exist even if no request at all was
    # lost (or won); otherwise wins[0] / wins[1] raises KeyError
    wins = wins.reindex(columns=[0, 1], fill_value=0)
    total_requests = wins[0] + wins[1]
    wins["tool_accuracy"] = (wins[1] / total_requests) * 100
    wins["total_requests"] = total_requests
    # "tool" is the group index at this point; turn it into a regular column
    wins = wins.reset_index()
    wins.columns = wins.columns.astype(str)
    wins.columns.name = None
    wins = wins[["tool", "tool_accuracy", "total_requests"]]
    print("Wins dataset")

    # the request timeline can only be computed when the column is present
    no_timeline_info = "request_time" not in tools_non_error.columns
    if no_timeline_info:
        acc_info = wins
    else:
        timeline = tools_non_error.groupby(["tool"])["request_time"].agg(
            ["min", "max"]
        )
        print("timeline dataset")
        acc_info = wins.merge(timeline, how="left", on="tool")

    if tools_acc is None:
        print("Creating accuracy file for the first time")
        return acc_info

    # update the old information
    print("Updating accuracy information")
    print("tools to update")
    existing_tools = set(tools_acc["tool"].values)
    new_rows = []
    for record in acc_info.to_dict("records"):
        tool = record["tool"]
        # "min"/"max" are absent from the record when there is no timeline
        # information, in which case they are stored as None
        new_values = {
            "tool_accuracy": record["tool_accuracy"],
            "total_requests": record["total_requests"],
            "min": record.get("min"),
            "max": record.get("max"),
        }
        if tool in existing_tools:
            tool_mask = tools_acc["tool"] == tool
            for column, value in new_values.items():
                tools_acc.loc[tool_mask, column] = value
        else:
            # new tool to add to the file
            # tool,tool_accuracy,total_requests,min,max
            new_rows.append({"tool": tool, **new_values})

    if new_rows:
        # a list of row dicts is required here: a dict of scalars alone
        # cannot be turned into a DataFrame
        tools_acc = pd.concat(
            [tools_acc, pd.DataFrame(new_rows)], ignore_index=True
        )
    return tools_acc
def compute_tools_accuracy():
    """Recompute the tools accuracy table, store it as CSV and add it to IPFS.

    Reads ``tools.parquet`` from DATA_DIR, merges the freshly computed figures
    with the accuracy CSV already in DATA_DIR (when one exists), overwrites
    that CSV and finally adds the file through the IPFS client, printing the
    hash reported for it.
    """
    print("Computing accuracy of tools")
    print("Reading tools parquet file")
    tools_path = DATA_DIR / "tools.parquet"
    accuracy_path = DATA_DIR / ACCURACY_FILENAME
    tools = pd.read_parquet(tools_path)

    # Computing tools accuracy information
    print("Computing tool accuracy information")
    # Start from the previous table only when the file is already there
    previous_acc = pd.read_csv(accuracy_path) if os.path.exists(accuracy_path) else None
    acc_data = update_tools_accuracy(previous_acc, tools, INC_TOOLS)

    # save acc_data into a CSV file
    print("Saving into a csv file")
    acc_data.to_csv(accuracy_path, index=False)

    # save the data into IPFS
    client = ipfshttpclient.connect(IPFS_SERVER)
    result = client.add(accuracy_path)
    print(f"HASH of the tools accuracy file: {result['Hash']}")
if __name__ == "__main__":