gyigit committed on
Commit 5b94380 · 1 Parent(s): de474d4
Files changed (2)
  1. src/data/function_results.csv +2 -2
  2. src/utils.py +213 -0
src/data/function_results.csv CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3064f439b465b4013f0116ddb2240196d794e1c2a85fd02627d054f9d1e528fe
- size 41071
+ oid sha256:2bb6e504784eeae2a09313539759a7bf02757c08fcc7d1dabf5ba4efeab3eb6a
+ size 3475
src/utils.py ADDED
@@ -0,0 +1,213 @@
+ import pandas as pd
+ import os
+
+ import sys
+
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append('..')
+ sys.path.append('.')
+
+ def save_similarity_output(output_dict, method_name, leaderboard_path="./data/leaderboard_results.csv", similarity_path="./data/similarity_results.csv"):
+     # Load or initialize the DataFrames
+     if os.path.exists(leaderboard_path):
+         leaderboard_df = pd.read_csv(leaderboard_path)
+     else:
+         leaderboard_df = pd.DataFrame()
+
+     if os.path.exists(similarity_path):
+         similarity_df = pd.read_csv(similarity_path)
+     else:
+         similarity_df = pd.DataFrame(columns=['Method'])
+
+     # Check if method exists in similarity results
+     if method_name not in similarity_df['Method'].values:
+         similarity_df = pd.concat([similarity_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
+
+     # Initialize storage for averages
+     averages = {}
+
+     # Iterate through the output_dict and calculate averages if all aspects (MF, CC, BP) are present
+     for dataset in ['sparse', '200', '500']:
+         correlation_values = []
+         pvalue_values = []
+
+         # Check each aspect within the dataset (MF, BP, CC)
+         for aspect in ['MF', 'BP', 'CC']:
+             correlation_key = f"{dataset}_{aspect}_correlation"
+             pvalue_key = f"{dataset}_{aspect}_pvalue"
+
+             # Process correlation if present
+             if correlation_key in output_dict:
+                 correlation_values.append(output_dict[correlation_key])
+                 similarity_df.loc[similarity_df['Method'] == method_name, f"{dataset}_{aspect}_correlation"] = output_dict[correlation_key]
+                 leaderboard_df.at[0, f"sim_{dataset}_{aspect}_correlation"] = output_dict[correlation_key]
+
+             # Process pvalue if present
+             if pvalue_key in output_dict:
+                 pvalue_values.append(output_dict[pvalue_key])
+                 similarity_df.loc[similarity_df['Method'] == method_name, f"{dataset}_{aspect}_pvalue"] = output_dict[pvalue_key]
+                 leaderboard_df.at[0, f"sim_{dataset}_{aspect}_pvalue"] = output_dict[pvalue_key]
+
+         # Calculate averages if all three aspects (MF, BP, CC) are present
+         if len(correlation_values) == 3:
+             averages[f"{dataset}_Ave_correlation"] = sum(correlation_values) / 3
+             similarity_df.loc[similarity_df['Method'] == method_name, f"{dataset}_Ave_correlation"] = averages[f"{dataset}_Ave_correlation"]
+             leaderboard_df.at[0, f"sim_{dataset}_Ave_correlation"] = averages[f"{dataset}_Ave_correlation"]
+
+         if len(pvalue_values) == 3:
+             averages[f"{dataset}_Ave_pvalue"] = sum(pvalue_values) / 3
+             similarity_df.loc[similarity_df['Method'] == method_name, f"{dataset}_Ave_pvalue"] = averages[f"{dataset}_Ave_pvalue"]
+             leaderboard_df.at[0, f"sim_{dataset}_Ave_pvalue"] = averages[f"{dataset}_Ave_pvalue"]
+
+     # Save the updated DataFrames back to CSV
+     leaderboard_df.to_csv(leaderboard_path, index=False)
+     similarity_df.to_csv(similarity_path, index=False)
+
+     return 0
+
+ def save_function_output(model_output, method_name, func_results_path="./data/function_results.csv", leaderboard_path="./data/leaderboard_results.csv"):
+     # Load or initialize the DataFrames
+     if os.path.exists(func_results_path):
+         func_results_df = pd.read_csv(func_results_path)
+     else:
+         func_results_df = pd.DataFrame(columns=['Method'])
+
+     if os.path.exists(leaderboard_path):
+         leaderboard_df = pd.read_csv(leaderboard_path)
+     else:
+         leaderboard_df = pd.DataFrame()
+
+     # Ensure the method_name row exists in function results
+     if method_name not in func_results_df['Method'].values:
+         func_results_df = pd.concat([func_results_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
+
+     # Storage for averaging in leaderboard results
+     metrics_sum = {
+         'accuracy': {'BP': [], 'CC': [], 'MF': []},
+         'F1': {'BP': [], 'CC': [], 'MF': []},
+         'precision': {'BP': [], 'CC': [], 'MF': []},
+         'recall': {'BP': [], 'CC': [], 'MF': []}
+     }
+
+     # Iterate over each entry in model_output
+     for entry in model_output:
+         key = entry[0]
+         accuracy, f1, precision, recall = entry[1], entry[4], entry[7], entry[10]
+
+         # Parse the key to extract the aspect and datasets
+         aspect, dataset1, dataset2 = key.split('_')
+
+         # Save each metric to function_results under its respective column
+         func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_accuracy"] = accuracy
+         func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_F1"] = f1
+         func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_precision"] = precision
+         func_results_df.loc[func_results_df['Method'] == method_name, f"{aspect}_{dataset1}_{dataset2}_recall"] = recall
+
+         # Add values for leaderboard averaging
+         metrics_sum['accuracy'][aspect].append(accuracy)
+         metrics_sum['F1'][aspect].append(f1)
+         metrics_sum['precision'][aspect].append(precision)
+         metrics_sum['recall'][aspect].append(recall)
+
+     # Calculate averages for each aspect and overall (if all aspects have entries)
+     for metric in ['accuracy', 'F1', 'precision', 'recall']:
+         for aspect in ['BP', 'CC', 'MF']:
+             if metrics_sum[metric][aspect]:
+                 aspect_average = sum(metrics_sum[metric][aspect]) / len(metrics_sum[metric][aspect])
+                 leaderboard_df.at[0, f"func_{aspect}_{metric}"] = aspect_average
+
+         # Calculate overall average if each aspect has entries
+         if all(metrics_sum[metric][aspect] for aspect in ['BP', 'CC', 'MF']):
+             overall_average = sum(
+                 sum(metrics_sum[metric][aspect]) / len(metrics_sum[metric][aspect])
+                 for aspect in ['BP', 'CC', 'MF']
+             ) / 3
+             leaderboard_df.at[0, f"func_Ave_{metric}"] = overall_average
+
+     # Save updated DataFrames to CSV
+     func_results_df.to_csv(func_results_path, index=False)
+     leaderboard_df.to_csv(leaderboard_path, index=False)
+
+     return 0
+
+ def save_family_output(model_output, method_name, leaderboard_path="./data/leaderboard_results.csv", family_results_path="./data/family_results.csv"):
+     # Load or initialize the DataFrames
+     if os.path.exists(leaderboard_path):
+         leaderboard_df = pd.read_csv(leaderboard_path)
+     else:
+         leaderboard_df = pd.DataFrame(columns=['Method'])
+
+     if os.path.exists(family_results_path):
+         family_results_df = pd.read_csv(family_results_path)
+     else:
+         family_results_df = pd.DataFrame(columns=['Method'])
+
+     # Ensure the method_name row exists in the leaderboard results
+     if method_name not in leaderboard_df['Method'].values:
+         leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
+
+     # Ensure the method_name row exists in family results
+     if method_name not in family_results_df['Method'].values:
+         family_results_df = pd.concat([family_results_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
+
+     # Iterate through the datasets and metrics
+     for dataset, metrics in model_output.items():
+         for metric, values in metrics.items():
+             # Calculate the average for each metric in leaderboard results
+             avg_value = sum(values) / len(values) if values else None
+             leaderboard_df.loc[leaderboard_df['Method'] == method_name, f"fam_{dataset}_{metric}_ave"] = avg_value
+
+             # Save each fold result for family results
+             for i, value in enumerate(values):
+                 family_results_df.loc[family_results_df['Method'] == method_name, f"{dataset}_{metric}_{i}"] = value
+
+     # Save updated DataFrames to CSV
+     leaderboard_df.to_csv(leaderboard_path, index=False)
+     family_results_df.to_csv(family_results_path, index=False)
+
+     return leaderboard_df, family_results_df
+
+ def save_affinity_output(model_output, method_name, leaderboard_path="./data/leaderboard_results.csv", affinity_results_path="./data/affinity_results.csv"):
+     # Load or initialize DataFrames
+     if os.path.exists(leaderboard_path):
+         leaderboard_df = pd.read_csv(leaderboard_path)
+     else:
+         leaderboard_df = pd.DataFrame(columns=['Method'])
+
+     if os.path.exists(affinity_results_path):
+         affinity_results_df = pd.read_csv(affinity_results_path)
+     else:
+         affinity_results_df = pd.DataFrame(columns=['Method'])
+
+     # Ensure the method_name row exists in the leaderboard results
+     if method_name not in leaderboard_df['Method'].values:
+         leaderboard_df = pd.concat([leaderboard_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
+
+     # Ensure the method_name row exists in affinity results
+     if method_name not in affinity_results_df['Method'].values:
+         affinity_results_df = pd.concat([affinity_results_df, pd.DataFrame({'Method': [method_name]})], ignore_index=True)
+
+     # Process 'summary' section for leaderboard results
+     summary = model_output.get('summary', {})
+     if summary:
+         leaderboard_df.loc[leaderboard_df['Method'] == method_name, 'aff_mse_ave'] = summary.get('val_mse_error')
+         leaderboard_df.loc[leaderboard_df['Method'] == method_name, 'aff_mae_ave'] = summary.get('val_mae_error')
+         leaderboard_df.loc[leaderboard_df['Method'] == method_name, 'aff_corr_ave'] = summary.get('validation_corr')
+
+     # Process 'detail' section for affinity results
+     detail = model_output.get('detail', {})
+     if detail:
+         # Save each 10-fold cross-validation result for mse, mae, and corr
+         for i in range(10):
+             if 'val_mse_errors' in detail:
+                 affinity_results_df.loc[affinity_results_df['Method'] == method_name, f"mse_{i}"] = detail['val_mse_errors'][i]
+             if 'val_mae_errors' in detail:
+                 affinity_results_df.loc[affinity_results_df['Method'] == method_name, f"mae_{i}"] = detail['val_mae_errors'][i]
+             if 'validation_corrs' in detail:
+                 affinity_results_df.loc[affinity_results_df['Method'] == method_name, f"corr_{i}"] = detail['validation_corrs'][i]
+
+     # Save updated DataFrames to CSV
+     leaderboard_df.to_csv(leaderboard_path, index=False)
+     affinity_results_df.to_csv(affinity_results_path, index=False)
+
+     return 0
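
A minimal usage sketch for the new helpers (not part of the commit): the snippet below calls save_similarity_output with a made-up method name, illustrative correlation and p-value numbers, and temporary CSV paths, and assumes it is run from src/ so that utils.py is importable. The dictionary keys follow the f"{dataset}_{aspect}_correlation" / f"{dataset}_{aspect}_pvalue" pattern the function looks up.

    # Hypothetical usage sketch; method name, values, and paths are illustrative only.
    import os
    import tempfile

    import pandas as pd

    from utils import save_similarity_output  # assumes the working directory is src/

    # Only the 'sparse' dataset is filled in; the other datasets ('200', '500') are skipped.
    output_dict = {
        "sparse_MF_correlation": 0.61, "sparse_MF_pvalue": 1e-4,
        "sparse_BP_correlation": 0.58, "sparse_BP_pvalue": 3e-4,
        "sparse_CC_correlation": 0.55, "sparse_CC_pvalue": 5e-4,
    }

    with tempfile.TemporaryDirectory() as tmp:
        leaderboard_path = os.path.join(tmp, "leaderboard_results.csv")
        similarity_path = os.path.join(tmp, "similarity_results.csv")

        # Seed a one-row leaderboard so the function's leaderboard_df.at[0, ...] writes land on an existing row.
        pd.DataFrame({"Method": ["dummy_method"]}).to_csv(leaderboard_path, index=False)

        save_similarity_output(output_dict, "dummy_method",
                               leaderboard_path=leaderboard_path,
                               similarity_path=similarity_path)

        # Per-aspect values plus the sparse_Ave_* averages are now in the similarity CSV.
        print(pd.read_csv(similarity_path).T)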