ctheodoris
committed on
Commit
·
f4fea1e
1
Parent(s):
9f2c6cc
Reorder/sort isp stats output in vs_null mode
Browse files
geneformer/in_silico_perturber_stats.py
CHANGED
@@ -19,6 +19,7 @@ import logging
|
|
19 |
import numpy as np
|
20 |
import pandas as pd
|
21 |
import pickle
|
|
|
22 |
import statsmodels.stats.multitest as smt
|
23 |
from pathlib import Path
|
24 |
from scipy.stats import ranksums
|
@@ -66,8 +67,7 @@ def n_detections(token, dict_list):
|
|
66 |
def get_fdr(pvalues):
    """Return FDR-adjusted p-values as a plain list.

    The adjustment is delegated to ``smt.multipletests`` with
    ``method="fdr_bh"`` and ``alpha=0.05``; only the second element of its
    result (the corrected p-values) is kept.
    """
    multitest_result = smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")
    corrected_pvalues = multitest_result[1]
    return list(corrected_pvalues)
|
68 |
|
69 |
-
def isp_stats(cos_sims_df, dict_list
|
70 |
-
|
71 |
random_tuples = []
|
72 |
for i in trange(cos_sims_df.shape[0]):
|
73 |
token = cos_sims_df["Gene"][i]
|
@@ -131,6 +131,40 @@ def isp_stats(cos_sims_df, dict_list, cell_states_to_model):
|
|
131 |
|
132 |
return cos_sims_full_df
|
133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
class InSilicoPerturberStats:
|
135 |
valid_option_dict = {
|
136 |
"mode": {"goal_state_shift","vs_null","vs_random"},
|
@@ -255,17 +289,16 @@ class InSilicoPerturberStats:
|
|
255 |
output_prefix : str
|
256 |
Prefix for output .dataset
|
257 |
"""
|
258 |
-
|
259 |
-
self.
|
260 |
-
self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)
|
261 |
-
|
262 |
-
if self.mode == "goal_state_shift":
|
263 |
-
dict_list = read_dictionaries(input_data_directory,"cell")
|
264 |
-
else:
|
265 |
logger.error(
|
266 |
-
|
|
|
267 |
raise
|
268 |
-
|
|
|
|
|
|
|
269 |
# obtain total gene list
|
270 |
gene_list = get_gene_list(dict_list)
|
271 |
|
@@ -278,18 +311,24 @@ class InSilicoPerturberStats:
|
|
278 |
self.gene_token_id_dict[genes] \
|
279 |
for genes in gene_list]}, \
|
280 |
index=[i for i in range(len(gene_list))])
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
|
294 |
# save perturbation stats to output_path
|
295 |
output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
|
|
|
19 |
import numpy as np
|
20 |
import pandas as pd
|
21 |
import pickle
|
22 |
+
import random
|
23 |
import statsmodels.stats.multitest as smt
|
24 |
from pathlib import Path
|
25 |
from scipy.stats import ranksums
|
|
|
67 |
def get_fdr(pvalues):
    """Return FDR-adjusted p-values as a plain list.

    The adjustment is delegated to ``smt.multipletests`` with
    ``method="fdr_bh"`` and ``alpha=0.05``; only the second element of its
    result (the corrected p-values) is kept.
    """
    multitest_result = smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")
    corrected_pvalues = multitest_result[1]
    return list(corrected_pvalues)
|
69 |
|
70 |
+
def isp_stats(cos_sims_df, dict_list):
|
|
|
71 |
random_tuples = []
|
72 |
for i in trange(cos_sims_df.shape[0]):
|
73 |
token = cos_sims_df["Gene"][i]
|
|
|
131 |
|
132 |
return cos_sims_full_df
|
133 |
|
134 |
+
def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
    """Compare each gene's perturbation shifts against a null distribution.

    For every row of ``cos_sims_df`` the entries stored under the key
    ``(token, "cell_emb")`` are pooled across ``dict_list`` (test) and across
    ``null_dict_list`` (null). The two pools are summarised by their means,
    compared with a rank-sum test, and the resulting p-values are
    FDR-adjusted across all rows with ``get_fdr``.

    Parameters
    ----------
    cos_sims_df : pandas.DataFrame
        Must contain a "Gene" column. Rows are read as
        ``cos_sims_df["Gene"][i]`` for ``i`` in ``range(n_rows)``, so the
        index is expected to be 0..n_rows-1.
    dict_list : list of dict
        Test dictionaries keyed by ``(token, "cell_emb")`` with list values.
    null_dict_list : list of dict
        Null-distribution dictionaries with the same key layout.

    Returns
    -------
    pandas.DataFrame
        Copy of ``cos_sims_df`` with the added columns "Test_avg_shift",
        "Null_avg_shift", "Test_v_null_avg_shift", "Test_v_null_pval",
        "Test_v_null_FDR", "N_Detections_test" and "N_Detections_null".
    """
    cos_sims_full_df = cos_sims_df.copy()
    n_rows = cos_sims_df.shape[0]

    cos_sims_full_df["Test_avg_shift"] = np.zeros(n_rows, dtype=float)
    cos_sims_full_df["Null_avg_shift"] = np.zeros(n_rows, dtype=float)
    cos_sims_full_df["Test_v_null_avg_shift"] = np.zeros(n_rows, dtype=float)
    cos_sims_full_df["Test_v_null_pval"] = np.zeros(n_rows, dtype=float)
    cos_sims_full_df["Test_v_null_FDR"] = np.zeros(n_rows, dtype=float)
    cos_sims_full_df["N_Detections_test"] = np.zeros(n_rows, dtype="uint32")
    cos_sims_full_df["N_Detections_null"] = np.zeros(n_rows, dtype="uint32")

    for i in trange(n_rows):
        token = cos_sims_df["Gene"][i]
        test_shifts = []
        null_shifts = []

        # Pool the shifts into the lists the statistics below read from.
        # The original appended to the undefined names token_tuples and
        # null_tuples, which raised on the first iteration and left both
        # lists empty.
        for dict_i in dict_list:
            test_shifts += dict_i.get((token, "cell_emb"), [])

        for dict_i in null_dict_list:
            null_shifts += dict_i.get((token, "cell_emb"), [])

        # NOTE(review): a gene missing from every null dictionary leaves
        # null_shifts empty, so its mean is NaN -- confirm that the
        # downstream FDR step tolerates NaN p-values.
        test_mean = np.mean(test_shifts)
        null_mean = np.mean(null_shifts)

        cos_sims_full_df.loc[i, "Test_avg_shift"] = test_mean
        cos_sims_full_df.loc[i, "Null_avg_shift"] = null_mean
        cos_sims_full_df.loc[i, "Test_v_null_avg_shift"] = test_mean - null_mean
        cos_sims_full_df.loc[i, "Test_v_null_pval"] = ranksums(
            test_shifts, null_shifts, nan_policy="omit"
        ).pvalue

        cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
        cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)

    cos_sims_full_df["Test_v_null_FDR"] = get_fdr(cos_sims_full_df["Test_v_null_pval"])
    return cos_sims_full_df
|
167 |
+
|
168 |
class InSilicoPerturberStats:
|
169 |
valid_option_dict = {
|
170 |
"mode": {"goal_state_shift","vs_null","vs_random"},
|
|
|
289 |
output_prefix : str
|
290 |
Prefix for output .dataset
|
291 |
"""
|
292 |
+
|
293 |
+
if self.mode not in ["goal_state_shift", "vs_null"]:
|
|
|
|
|
|
|
|
|
|
|
294 |
logger.error(
|
295 |
+
"Currently, only modes available are stats for goal_state_shift \
|
296 |
+
and comparing vs a null distribution.")
|
297 |
raise
|
298 |
+
|
299 |
+
self.gene_token_id_dict = invert_dict(self.gene_token_dict)
|
300 |
+
self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)
|
301 |
+
|
302 |
# obtain total gene list
|
303 |
gene_list = get_gene_list(dict_list)
|
304 |
|
|
|
311 |
self.gene_token_id_dict[genes] \
|
312 |
for genes in gene_list]}, \
|
313 |
index=[i for i in range(len(gene_list))])
|
314 |
+
|
315 |
+
dict_list = read_dictionaries(input_data_directory, "cell")
|
316 |
+
if self.mode == "goal_state_shift":
|
317 |
+
cos_sims_df = isp_stats(cos_sims_df_initial, dict_list)
|
318 |
+
|
319 |
+
# quantify number of detections of each gene
|
320 |
+
cos_sims_df["N_Detections"] = [n_detections(i, dict_list) for i in cos_sims_df["Gene"]]
|
321 |
+
|
322 |
+
# sort by shift to desired state
|
323 |
+
cos_sims_df = cos_sims_df.sort_values(by=["Shift_from_goal_end",
|
324 |
+
"Goal_end_FDR"])
|
325 |
+
elif self.mode == "vs_null":
|
326 |
+
dict_list = read_dictionaries(input_data_directory, "cell")
|
327 |
+
null_dict_list = read_dictionaries(null_dist_data_directory, "cell")
|
328 |
+
cos_sims_df = isp_stats_vs_null(cos_sims_df_initial, dict_list,
|
329 |
+
null_dict_list)
|
330 |
+
cos_sims_df = cos_sims_df.sort_values(by=["Test_v_null_avg_shift",
|
331 |
+
"Test_v_null_FDR"])
|
332 |
|
333 |
# save perturbation stats to output_path
|
334 |
output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
|