rosacastillo committed
Commit ae57283 · 1 Parent(s): 3859cbd

updating data and new cleaning script
data/all_trades_profitability.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1df952a693ba00cc0b11bca8ff4c6805415f2d006b3cd47242e43e7cdc7d5fe1
- size 3266876
+ oid sha256:804722496702da46e6034175b54c73778fd4c5b7794d29967dccb2f2f6432603
+ size 3290989
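
Note: every data file in this commit is stored via Git LFS, so each diff only touches the three-line pointer (version, oid, size); the parquet bytes themselves live in LFS storage. A minimal sketch of inspecting such a pointer locally (the parse_lfs_pointer helper is illustrative, not part of this repo):

    from pathlib import Path

    def parse_lfs_pointer(path: Path) -> dict:
        """Split a Git LFS pointer file into its version/oid/size fields."""
        fields = {}
        for line in path.read_text().splitlines():
            key, _, value = line.partition(" ")
            fields[key] = value
        return fields

    pointer = parse_lfs_pointer(Path("data/all_trades_profitability.parquet"))
    print(pointer["oid"], pointer["size"])  # sha256:8047... 3290989
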
data/fpmmTrades.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:390f96495582e69ae82225a61e6473c1fe6536081b326a6bd11617be45ce672a
- size 10816943
+ oid sha256:59bd9c58e15de9dcb4ae76cd8adca7750b460abfa2bdf79ee5042d3e3b5c396e
+ size 13934569
data/fpmms.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b5dcd19c7922e3f7168a139b0d63c335c921343faa15852b6ae04888f7e006a
3
- size 504817
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b0570a9c07e0ef5d137ede96584fcfe1645a784a7380a83b9bdfa5829ad3e2
3
+ size 515347
data/invalid_trades.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:30e0fa52d0c605961b5a12bec47bc3b0288b02b814c61cc7f8a33ad793f8bd30
- size 84013
+ oid sha256:18f4b47e3c764b8c7f157b4b408d0c97e3436f58d86eb39edecf2a7cf2748a21
+ size 84033
data/new_fpmmTrades.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84442b8ab800d01ec66e0c78efebfc31a5d954d3fdddfaf5fab41a75030a3967
+ size 3267040
data/new_tools.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67da51ef342b56056a5bfb49f78ecab7354731e2cd88d16ccbffeaa141e175ec
+ size 64443733
data/outliers.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3910fd14580aac1b02c49152dbc5fb7b282aaa52b81e3e634801bf673590c8fb
+ oid sha256:9fa72b86e20493b5a02dff2cf9173d394546b5eaba1de21469bb66593f7939e1
  size 18274
data/summary_profitability.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a55a6c4c7ef5da8db27b61c268eccbd1d426c456a0d17efa4b22b7c69ed1454d
- size 78788
+ oid sha256:522003ca4b76df815bf662ebc92478bc103652ac9f82dc82718578c26c650509
+ size 87497
data/t_map.pkl CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7b67cf178943b82b5286b7c2adb6329e1e23fffce807ebf299684746813f55de
- size 22992649
+ oid sha256:69d6fcd0360c5bbd646fa748b3f5a1e4bcccae358f32c85aa96509cdb6319c76
+ size 24153722
data/tools.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7e4ace8d172836c379ee23bde678f19d9eeec28e7bd31bf9e95dc914ac5c9bc5
- size 407088092
+ oid sha256:aa6d6407da787ae9d2ed80233d939f57feae82cd66a8193937b861c601f24828
+ size 406224765
data/tools_accuracy.csv CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f7a3622338d1eb2f23824031733ecdd77ae77eff7cb2b1c879aba05b0966d2cc
- size 1133
+ oid sha256:b0113d5cbc3c5eb981b5f1c5a7776f616fc52f44b15b5f96880a16989fa07d16
+ size 1240
scripts/cleaning_old_info.py ADDED
@@ -0,0 +1,66 @@
+ import requests
+ import json
+ import pandas as pd
+ import numpy as np
+ from profitability import DATA_DIR, summary_analyse
+ from staking import label_trades_by_staking
+
+
+ def clean_old_data_from_parquet_files(cutoff_date: str):
+     # Convert the string to datetime64[ns, UTC]
+     min_date_utc = pd.to_datetime(cutoff_date, format="%Y-%m-%d", utc=True)
+
+     # clean tools.parquet
+     try:
+         tools = pd.read_parquet(DATA_DIR / "tools.parquet")
+
+         # make sure trader_address is in the columns
+         assert "trader_address" in tools.columns, "trader_address column not found"
+
+         # lowercase and strip trader_address
+         tools["trader_address"] = tools["trader_address"].str.lower().str.strip()
+
+         tools["request_time"] = pd.to_datetime(tools["request_time"], utc=True)
+
+         print(f"length before filtering {len(tools)}")
+         tools = tools.loc[tools["request_time"] > min_date_utc]
+         print(f"length after filtering {len(tools)}")
+         tools.to_parquet(DATA_DIR / "tools.parquet", index=False)
+
+     except Exception as e:
+         print(f"Error cleaning tools file {e}")
+
+     # clean all_trades_profitability.parquet
+     try:
+         all_trades = pd.read_parquet(DATA_DIR / "all_trades_profitability.parquet")
+
+         all_trades["creation_timestamp"] = pd.to_datetime(
+             all_trades["creation_timestamp"], utc=True
+         )
+
+         print(f"length before filtering {len(all_trades)}")
+         all_trades = all_trades.loc[all_trades["creation_timestamp"] > min_date_utc]
+         print(f"length after filtering {len(all_trades)}")
+
+     except Exception as e:
+         print(f"Error cleaning all trades profitability file {e}")
+
+     # generate again summary_profitability.parquet
+     try:
+         print("Summarising trades...")
+         summary_df = summary_analyse(all_trades)
+
+         # add staking labels
+         label_trades_by_staking(trades_df=all_trades, update=False)
+
+         # save to parquet
+         all_trades.to_parquet(
+             DATA_DIR / "all_trades_profitability.parquet", index=False
+         )
+         summary_df.to_parquet(DATA_DIR / "summary_profitability.parquet", index=False)
+     except Exception as e:
+         print(f"Error generating summary and saving all trades profitability file {e}")
+
+
+ if __name__ == "__main__":
+     clean_old_data_from_parquet_files("2024-09-15")
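
Note: the script keeps only rows newer than a tz-aware cutoff; pandas raises a TypeError when a naive timestamp column is compared against a tz-aware one, which is why every column is converted with utc=True before filtering. A standalone sketch of the same technique (the column values are made up):

    import pandas as pd

    # tz-aware cutoff, matching clean_old_data_from_parquet_files
    cutoff = pd.to_datetime("2024-09-15", format="%Y-%m-%d", utc=True)

    df = pd.DataFrame({"request_time": ["2024-09-10 12:00", "2024-09-20 08:30"]})
    df["request_time"] = pd.to_datetime(df["request_time"], utc=True)

    # keep only rows strictly newer than the cutoff
    df = df.loc[df["request_time"] > cutoff]
    print(len(df))  # 1
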
scripts/get_mech_info.py CHANGED
@@ -3,7 +3,6 @@ from typing import Any
  from datetime import datetime, timedelta, UTC
  from utils import SUBGRAPH_API_KEY, measure_execution_time, DATA_DIR
  import requests
- import json
  import pandas as pd
  import numpy as np
  from mech_request_utils import (
@@ -13,7 +12,6 @@ from mech_request_utils import (
      fix_duplicate_requestIds,
      merge_requests_delivers,
      get_ipfs_data,
-     only_delivers_loop,
      merge_json_files,
  )

scripts/profitability.py CHANGED
@@ -40,7 +40,6 @@ from utils import (
      wei_to_unit,
      convert_hex_to_int,
      _to_content,
-     read_parquet_files,
      JSON_DATA_DIR,
  )
  from queries import omen_xdai_trades_query, conditional_tokens_gc_user_query
@@ -581,13 +580,19 @@ def run_profitability_analysis(
      update_tools_parquet(tools_filename)
      all_trades_df = update_all_trades_parquet(all_trades_df)

+     # debugging purposes
+     all_trades_df.to_parquet(JSON_DATA_DIR / "all_trades_df.parquets")
      # filter invalid markets. Condition: "is_invalid" is True
      invalid_trades = all_trades_df.loc[all_trades_df["is_invalid"] == True]
      if merge:
          try:
              old_invalid_trades = pd.read_parquet(DATA_DIR / "invalid_trades.parquet")
-             merge_df = pd.concat([old_invalid_trades, invalid_trades], ignore_index=True)
+             merge_df = pd.concat(
+                 [old_invalid_trades, invalid_trades], ignore_index=True
+             )
              invalid_trades = merge_df.drop_duplicates()
+         except Exception as e:
+             print(f"Error updating the invalid trades parquet {e}")
          invalid_trades.to_parquet(DATA_DIR / "invalid_trades.parquet", index=False)

      all_trades_df = all_trades_df.loc[all_trades_df["is_invalid"] == False]
scripts/pull_data.py CHANGED
@@ -124,7 +124,7 @@ def only_new_weekly_analysis():
      rpc = RPC
      # Run markets ETL
      logging.info("Running markets ETL")
-     mkt_etl(MARKETS_FILENAME)
+     # mkt_etl(MARKETS_FILENAME)
      logging.info("Markets ETL completed")

      # New tools ETL
scripts/staking.py CHANGED
@@ -174,13 +174,16 @@ def get_trader_address_staking(trader_address: str, service_map: dict) -> str:
      return check_owner_staking_contract(owner_address=owner)


- def label_trades_by_staking(trades_df: pd.DataFrame) -> pd.DataFrame:
+ def label_trades_by_staking(
+     trades_df: pd.DataFrame, update: bool = True
+ ) -> pd.DataFrame:
      with open(DATA_DIR / "service_map.pkl", "rb") as f:
          service_map = pickle.load(f)
      # get the last service id
      keys = service_map.keys()
      last_key = max(keys)
-     update_service_map(start=last_key)
+     if update:
+         update_service_map(start=last_key)
      all_traders = trades_df.trader_address.unique()
      trades_df["staking"] = ""
      for trader in tqdm(all_traders, desc="Labeling traders by staking", unit="trader"):
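
Note: the new update flag lets offline jobs such as scripts/cleaning_old_info.py relabel trades from the cached service_map.pkl without re-querying the chain, while the default keeps the previous behaviour. Sketch of the two call styles:

    # weekly pipeline: refresh the service map first (previous behaviour)
    label_trades_by_staking(trades_df=all_trades)

    # offline cleanup: reuse the cached map, as cleaning_old_info.py does
    label_trades_by_staking(trades_df=all_trades, update=False)
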
scripts/tools.py CHANGED
@@ -98,7 +98,8 @@ HTTP_TIMEOUT = 10
  N_IPFS_RETRIES = 1
  N_RPC_RETRIES = 100
  RPC_POLL_INTERVAL = 0.05
- IPFS_POLL_INTERVAL = 0.05
+ # IPFS_POLL_INTERVAL = 0.05  # low speed
+ IPFS_POLL_INTERVAL = 0.2  # high speed
  IRRELEVANT_TOOLS = [
      "openai-text-davinci-002",
      "openai-text-davinci-003",
@@ -585,7 +586,7 @@ def parse_store_json_events_parallel(json_events: Dict[str, Any], output_filenam
          contents.append(current_mech_contents)

      tools = pd.concat(contents, ignore_index=True)
-     print(f"Adding market creators info. Length of the tools file = {tools}")
+     print(f"Adding market creators info. Length of the tools file = {len(tools)}")
      tools = add_market_creator(tools)
      print(
          f"Length of the tools dataframe after adding market creators info= {len(tools)}"