rosacastillo committed
Commit b3c2f09 · 1 Parent(s): e6dc6fa

adding new scripts
scripts/get_mech_info.py CHANGED
@@ -14,6 +14,7 @@ from mech_request_utils import (
     get_ipfs_data,
     merge_json_files,
 )
+from web3_utils import updating_timestamps

 OLD_MECH_SUBGRAPH_URL = (
     "https://api.thegraph.com/subgraphs/name/stakewise/ethereum-gnosis"
@@ -175,7 +176,7 @@ def update_all_trades_parquet(new_trades_df: pd.DataFrame) -> pd.DataFrame:
     return merge_df


-def update_tools_parquet(new_tools_filename: pd.DataFrame):
+def update_tools_parquet(rpc: str, new_tools_filename: pd.DataFrame):
     try:
         old_tools_df = pd.read_parquet(DATA_DIR / "tools.parquet")
     except Exception as e:
@@ -183,6 +184,8 @@ def update_tools_parquet(new_tools_filename: pd.DataFrame):
         return None
     try:
         new_tools_df = pd.read_parquet(DATA_DIR / new_tools_filename)
+        # the new file has no request_time yet
+        updating_timestamps(rpc, new_tools_filename)
     except Exception as e:
         print(f"Error reading new trades parquet file {e}")
         return None
@@ -194,7 +197,9 @@ def update_tools_parquet(new_tools_filename: pd.DataFrame):
     print(f"Initial length before removing duplicates in tools= {len(merge_df)}")

     # Remove duplicates
-    merge_df.drop_duplicates(inplace=True)
+    merge_df.drop_duplicates(
+        subset=["request_id", "request_time"], keep="last", inplace=True
+    )
     print(f"Final length after removing duplicates in tools= {len(merge_df)}")

     # save the parquet file
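The deduplication in update_tools_parquet is now keyed on ["request_id", "request_time"] with keep="last", so when the freshly timestamped new tools file is appended after the existing tools.parquet, the newer copy of a repeated request wins instead of only fully identical rows being dropped. A minimal, self-contained sketch of that behaviour with toy data (column names other than request_id and request_time are illustrative only, not the project's schema):

import pandas as pd

# Toy merge result: the same request appears in both the old and the new file.
merge_df = pd.DataFrame(
    {
        "request_id": ["0xaaa", "0xaaa", "0xbbb"],
        "request_time": [
            "2024-10-01 12:00:00",
            "2024-10-01 12:00:00",
            "2024-10-02 08:30:00",
        ],
        "source": ["tools.parquet", "new_tools.parquet", "new_tools.parquet"],
    }
)

# keep="last" keeps the row appended last (the new file's copy) for each
# (request_id, request_time) pair, matching the commit's new behaviour.
merge_df.drop_duplicates(
    subset=["request_id", "request_time"], keep="last", inplace=True
)
print(merge_df)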
scripts/manage_space_files.py ADDED
@@ -0,0 +1,40 @@
+import os
+import shutil
+
+# Define the file names to move
+files_to_move = [
+    "new_tools.parquet",
+    "new_fpmmTrades.parquet",
+    "fpmms.parquet",
+    "fpmmTrades.parquet",
+]
+
+# Get the current working directory
+current_dir = os.getcwd()
+
+# Define source and destination paths
+source_dir = os.path.join(current_dir, "data")
+dest_dir = os.path.join(current_dir, "tmp")
+
+
+def move_files():
+    # Create tmp directory if it doesn't exist
+    if not os.path.exists(dest_dir):
+        os.makedirs(dest_dir)
+    # Move each file
+    for file_name in files_to_move:
+        source_file = os.path.join(source_dir, file_name)
+        dest_file = os.path.join(dest_dir, file_name)
+
+        try:
+            if os.path.exists(source_file):
+                shutil.move(source_file, dest_file)
+                print(f"Moved {file_name} successfully")
+            else:
+                print(f"File not found: {file_name}")
+        except Exception as e:
+            print(f"Error moving {file_name}: {str(e)}")
+
+
+if __name__ == "__main__":
+    move_files()
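manage_space_files.py is a small standalone helper: it resolves data/ and tmp/ from the current working directory and moves the four listed parquet files, reporting (rather than failing on) any that are missing. A usage sketch, assuming it is run from the repository root so both paths resolve and that scripts/ is importable:

# From a shell:
#   python scripts/manage_space_files.py
#
# Or programmatically (assumes scripts/ is on sys.path):
from manage_space_files import move_files

move_files()  # creates ./tmp if needed, then moves the parquet files out of ./data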
scripts/profitability.py CHANGED
@@ -28,7 +28,7 @@ from enum import Enum
 from tqdm import tqdm
 import numpy as np
 import os
-from pathlib import Path
+
 from get_mech_info import (
     DATETIME_60_DAYS_AGO,
     update_fpmmTrades_parquet,
@@ -41,6 +41,7 @@ from utils import (
     convert_hex_to_int,
     _to_content,
     JSON_DATA_DIR,
+    DATA_DIR,
 )
 from queries import omen_xdai_trades_query, conditional_tokens_gc_user_query
 from staking import label_trades_by_staking
@@ -58,9 +59,6 @@ DEFAULT_TO_TIMESTAMP = 2147483647 # around year 2038
 WXDAI_CONTRACT_ADDRESS = "0xe91D153E0b41518A2Ce8Dd3D7944Fa863463a97d"
 DEFAULT_MECH_FEE = 0.01
 DUST_THRESHOLD = 10000000000000
-SCRIPTS_DIR = Path(__file__).parent
-ROOT_DIR = SCRIPTS_DIR.parent
-DATA_DIR = ROOT_DIR / "data"


 class MarketState(Enum):
@@ -331,7 +329,7 @@ def prepare_profitalibity_data(
     tools_filename: str,
     trades_filename: str,
     from_timestamp: float,
-):
+) -> pd.DataFrame:
     """Prepare data for profitalibity analysis."""

     # Check if tools.parquet is in the same directory
@@ -344,9 +342,10 @@ def prepare_profitalibity_data(
         # lowercase and strip creator_address
         tools["trader_address"] = tools["trader_address"].str.lower().str.strip()

-        # drop duplicates
-        tools.drop_duplicates(inplace=True)
-
+        tools.drop_duplicates(
+            subset=["request_id", "request_block"], keep="last", inplace=True
+        )
+        tools.to_parquet(DATA_DIR / tools_filename)
         print(f"{tools_filename} loaded")
     except FileNotFoundError:
         print("tools.parquet not found. Please run tools.py first.")
@@ -366,7 +365,7 @@ def prepare_profitalibity_data(
     # lowercase and strip creator_address
     fpmmTrades["trader_address"] = fpmmTrades["trader_address"].str.lower().str.strip()

-    return fpmmTrades, tools
+    return fpmmTrades


 def determine_market_status(trade, current_answer):
@@ -455,6 +454,7 @@ def analyse_trader(

     # Compute mech calls
     if len(tools_usage) == 0:
+        print("No tools usage information")
         num_mech_calls = 0
     else:
         try:
@@ -582,21 +582,25 @@ def run_profitability_analysis(

     # load dfs from data folder for analysis
     print(f"Preparing data with {tools_filename} and {trades_filename}")
-    fpmmTrades, tools = prepare_profitalibity_data(
+    fpmmTrades = prepare_profitalibity_data(
         rpc, tools_filename, trades_filename, from_timestamp
     )
+    if merge:
+        update_tools_parquet(rpc, tools_filename)
+    tools = pd.read_parquet(DATA_DIR / "tools.parquet")

     print("Analysing trades...")
     all_trades_df = analyse_all_traders(fpmmTrades, tools)

-    # merge previous files if requested
+    # # merge previous files if requested
     if merge:
         update_fpmmTrades_parquet(trades_filename)
-        update_tools_parquet(tools_filename)
         all_trades_df = update_all_trades_parquet(all_trades_df)

     # debugging purposes
     all_trades_df.to_parquet(JSON_DATA_DIR / "all_trades_df.parquet")
+
+    # all_trades_df = pd.read_parquet(JSON_DATA_DIR / "all_trades_df.parquet")
     # filter invalid markets. Condition: "is_invalid" is True
     invalid_trades = all_trades_df.loc[all_trades_df["is_invalid"] == True]
     if len(invalid_trades) == 0:
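With this change prepare_profitalibity_data deduplicates and rewrites the tools file itself and returns only fpmmTrades; the tools dataframe used for analysis is read back from data/tools.parquet after update_tools_parquet(rpc, tools_filename) has folded the new file in when merge=True. A condensed sketch of the resulting call order inside run_profitability_analysis, with names taken from the diff and error handling omitted:

import pandas as pd

from get_mech_info import update_tools_parquet
from profitability import prepare_profitalibity_data
from utils import DATA_DIR


def merged_flow(rpc, tools_filename, trades_filename, from_timestamp, merge=True):
    # 1. dedupe/save the new tools file and build the trades frame
    fpmmTrades = prepare_profitalibity_data(
        rpc, tools_filename, trades_filename, from_timestamp
    )
    # 2. optionally merge the new tools into tools.parquet, then reload the merged copy
    if merge:
        update_tools_parquet(rpc, tools_filename)
    tools = pd.read_parquet(DATA_DIR / "tools.parquet")
    return fpmmTrades, tools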
scripts/pull_data.py CHANGED
@@ -1,19 +1,20 @@
 import logging
-import pickle
 from datetime import datetime
-from concurrent.futures import ThreadPoolExecutor
-from tqdm import tqdm
-from web3 import Web3
 import pandas as pd
-from pathlib import Path
-from functools import partial
 from markets import (
     etl as mkt_etl,
     DEFAULT_FILENAME as MARKETS_FILENAME,
 )
 from tools import DEFAULT_FILENAME as TOOLS_FILENAME, generate_tools_file
 from profitability import run_profitability_analysis, DEFAULT_60_DAYS_AGO_TIMESTAMP
-from utils import get_question, current_answer, RPC, measure_execution_time
+from utils import (
+    get_question,
+    current_answer,
+    RPC,
+    measure_execution_time,
+    DATA_DIR,
+    HIST_DIR,
+)
 from get_mech_info import (
     get_mech_events_last_60_days,
     get_mech_events_since_last_run,
@@ -21,31 +22,10 @@ from get_mech_info import (
 )
 from update_tools_accuracy import compute_tools_accuracy
 from cleaning_old_info import clean_old_data_from_parquet_files
-import gc
-
-logging.basicConfig(level=logging.INFO)
-
-SCRIPTS_DIR = Path(__file__).parent
-ROOT_DIR = SCRIPTS_DIR.parent
-DATA_DIR = ROOT_DIR / "data"
-HIST_DIR = ROOT_DIR / "historical_data"
+from web3_utils import updating_timestamps


-def block_number_to_timestamp(block_number: int, web3: Web3) -> str:
-    """Convert a block number to a timestamp."""
-    block = web3.eth.get_block(block_number)
-    timestamp = datetime.utcfromtimestamp(block["timestamp"])
-    return timestamp.strftime("%Y-%m-%d %H:%M:%S")
-
-
-def parallelize_timestamp_conversion(df: pd.DataFrame, function: callable) -> list:
-    """Parallelize the timestamp conversion."""
-    block_numbers = df["request_block"].tolist()
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        results = list(
-            tqdm(executor.map(function, block_numbers), total=len(block_numbers))
-        )
-    return results
+logging.basicConfig(level=logging.INFO)


 def add_current_answer(tools_filename: str):
@@ -65,61 +45,6 @@ def add_current_answer(tools_filename: str):
     del fpmms


-def updating_timestamps(rpc: str, tools_filename: str):
-    web3 = Web3(Web3.HTTPProvider(rpc))
-
-    tools = pd.read_parquet(DATA_DIR / tools_filename)
-
-    # Convert block number to timestamp
-    logging.info("Converting block number to timestamp")
-    t_map = pickle.load(open(DATA_DIR / "t_map.pkl", "rb"))
-    tools["request_time"] = tools["request_block"].map(t_map)
-
-    no_data = tools["request_time"].isna().sum()
-    logging.info(f"Total rows with no request time info = {no_data}")
-
-    # Identify tools with missing request_time and fill them
-    missing_time_indices = tools[tools["request_time"].isna()].index
-    if not missing_time_indices.empty:
-        partial_block_number_to_timestamp = partial(
-            block_number_to_timestamp, web3=web3
-        )
-        missing_timestamps = parallelize_timestamp_conversion(
-            tools.loc[missing_time_indices], partial_block_number_to_timestamp
-        )
-
-        # Update the original DataFrame with the missing timestamps
-        for i, timestamp in zip(missing_time_indices, missing_timestamps):
-            tools.at[i, "request_time"] = timestamp
-
-    tools["request_month_year"] = pd.to_datetime(tools["request_time"]).dt.strftime(
-        "%Y-%m"
-    )
-    tools["request_month_year_week"] = (
-        pd.to_datetime(tools["request_time"]).dt.to_period("W").astype(str)
-    )
-
-    # Save the tools data after the updates on the content
-    tools.to_parquet(DATA_DIR / tools_filename, index=False)
-
-    # Update t_map with new timestamps
-    new_timestamps = (
-        tools[["request_block", "request_time"]]
-        .dropna()
-        .set_index("request_block")
-        .to_dict()["request_time"]
-    )
-    t_map.update(new_timestamps)
-
-    with open(DATA_DIR / "t_map.pkl", "wb") as f:
-        pickle.dump(t_map, f)
-
-    # clean and release all memory
-    del tools
-    del t_map
-    gc.collect()
-
-
 def save_historical_data():
     """Function to save a copy of the main trades and tools file
     into the historical folder"""
@@ -196,14 +121,14 @@ def only_new_weekly_analysis():

     save_historical_data()

-    clean_old_data_from_parquet_files("2024-09-29")
+    clean_old_data_from_parquet_files("2024-10-06")

     compute_tools_accuracy()

     logging.info("Weekly analysis files generated and saved")


-def weekly_analysis():
+def old_weekly_analysis():
     """Run weekly analysis for the FPMMS project."""
     rpc = RPC
     # Run markets ETL
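The block-to-timestamp helpers (block_number_to_timestamp, parallelize_timestamp_conversion, updating_timestamps) are removed from pull_data.py and imported from a new web3_utils module instead. That module is not part of this diff, so its exact contents are an assumption here, but the import and the calls in get_mech_info.py pin down the interface. A sketch of the expected call, mirroring the function body deleted above:

from web3_utils import updating_timestamps  # new module, not shown in this commit
from utils import RPC

# Fills request_time (and the derived request_month_year / request_month_year_week
# columns) in the given tools parquet file, using the cached t_map.pkl
# block->timestamp map plus RPC lookups for any blocks still missing.
updating_timestamps(RPC, "new_tools.parquet")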
scripts/staking.py CHANGED
@@ -110,7 +110,7 @@ def update_service_map(start: int = 1, end: int = 1000):
             service_map = pickle.load(f)
     else:
         service_map = {}
-
+    print(f"updating service map from service id={start}")
     # we do not know which is the last service id right now
     service_registry = _get_contract(SERVICE_REGISTRY_ADDRESS)
     with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
@@ -174,16 +174,16 @@ def get_trader_address_staking(trader_address: str, service_map: dict) -> str:
     return check_owner_staking_contract(owner_address=owner)


-def label_trades_by_staking(
-    trades_df: pd.DataFrame, update: bool = True
-) -> pd.DataFrame:
+def label_trades_by_staking(trades_df: pd.DataFrame, start: int = None) -> pd.DataFrame:
     with open(DATA_DIR / "service_map.pkl", "rb") as f:
         service_map = pickle.load(f)
     # get the last service id
     keys = service_map.keys()
-    last_key = max(keys)
-    if update:
-        update_service_map(start=last_key)
+    if start is None:
+        last_key = max(keys)
+    else:
+        last_key = start
+    update_service_map(start=last_key)
     all_traders = trades_df.trader_address.unique()
     trades_df["staking"] = ""
     for trader in tqdm(all_traders, desc="Labeling traders by staking", unit="trader"):
@@ -200,17 +200,6 @@
 if __name__ == "__main__":
     # create_service_map()
     trades_df = pd.read_parquet(DATA_DIR / "all_trades_profitability.parquet")
-    label_trades_by_staking(trades_df=trades_df)
-    print(
-        trades_df[
-            [
-                "trader_address",
-                "creation_timestamp",
-                "market_creator",
-                "staking",
-                "collateral_amount",
-            ]
-        ]
-    )
+    label_trades_by_staking(trades_df=trades_df, start=20)
     print(trades_df.staking.value_counts())
     trades_df.to_parquet(DATA_DIR / "all_trades_profitability.parquet", index=False)
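label_trades_by_staking now takes an optional start service id instead of an update flag: when start is None the service map is refreshed from its highest known id, otherwise from the given id, and update_service_map is always called before labeling. A minimal usage sketch (DATA_DIR is assumed to come from utils, as in the other scripts touched by this commit):

import pandas as pd

from staking import label_trades_by_staking
from utils import DATA_DIR

trades_df = pd.read_parquet(DATA_DIR / "all_trades_profitability.parquet")

# Default: refresh the service map starting from the last known service id.
label_trades_by_staking(trades_df=trades_df)
# Or pin the starting id explicitly, as the __main__ block above does with start=20.
label_trades_by_staking(trades_df=trades_df, start=20)

print(trades_df.staking.value_counts())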