ctheodoris commited on
Commit
f4fea1e
·
1 Parent(s): 9f2c6cc

Reorder/sort isp stats output in vs_null mode

Browse files
geneformer/in_silico_perturber_stats.py CHANGED
@@ -19,6 +19,7 @@ import logging
19
  import numpy as np
20
  import pandas as pd
21
  import pickle
 
22
  import statsmodels.stats.multitest as smt
23
  from pathlib import Path
24
  from scipy.stats import ranksums
@@ -66,8 +67,7 @@ def n_detections(token, dict_list):
66
  def get_fdr(pvalues):
67
  return list(smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1])
68
 
69
- def isp_stats(cos_sims_df, dict_list, cell_states_to_model):
70
-
71
  random_tuples = []
72
  for i in trange(cos_sims_df.shape[0]):
73
  token = cos_sims_df["Gene"][i]
@@ -131,6 +131,40 @@ def isp_stats(cos_sims_df, dict_list, cell_states_to_model):
131
 
132
  return cos_sims_full_df
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  class InSilicoPerturberStats:
135
  valid_option_dict = {
136
  "mode": {"goal_state_shift","vs_null","vs_random"},
@@ -255,17 +289,16 @@ class InSilicoPerturberStats:
255
  output_prefix : str
256
  Prefix for output .dataset
257
  """
258
-
259
- self.gene_token_id_dict = invert_dict(self.gene_token_dict)
260
- self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)
261
-
262
- if self.mode == "goal_state_shift":
263
- dict_list = read_dictionaries(input_data_directory,"cell")
264
- else:
265
  logger.error(
266
- "Currently, only mode available is stats for goal_state_shift.")
 
267
  raise
268
-
 
 
 
269
  # obtain total gene list
270
  gene_list = get_gene_list(dict_list)
271
 
@@ -278,18 +311,24 @@ class InSilicoPerturberStats:
278
  self.gene_token_id_dict[genes] \
279
  for genes in gene_list]}, \
280
  index=[i for i in range(len(gene_list))])
281
-
282
- # # add ENSEMBL ID for genes
283
- # cos_sims_df_initial["Ensembl_ID"] = [self.gene_token_id_dict[genes[1]] if isinstance(genes,tuple) else self.gene_token_id_dict[genes] for genes in list(cos_sims_df_initial["Gene"])]
284
-
285
- cos_sims_df = isp_stats(cos_sims_df_initial, dict_list, self.cell_states_to_model)
286
-
287
- # quantify number of detections of each gene
288
- cos_sims_df["N_Detections"] = [n_detections(i, dict_list) for i in cos_sims_df["Gene"]]
289
-
290
- # sort by shift to desired state
291
- cos_sims_df = cos_sims_df.sort_values(by=["Shift_from_goal_end",
292
- "Goal_end_FDR"])
 
 
 
 
 
 
293
 
294
  # save perturbation stats to output_path
295
  output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
 
19
  import numpy as np
20
  import pandas as pd
21
  import pickle
22
+ import random
23
  import statsmodels.stats.multitest as smt
24
  from pathlib import Path
25
  from scipy.stats import ranksums
 
67
  def get_fdr(pvalues):
68
  return list(smt.multipletests(pvalues, alpha=0.05, method="fdr_bh")[1])
69
 
70
+ def isp_stats(cos_sims_df, dict_list):
 
71
  random_tuples = []
72
  for i in trange(cos_sims_df.shape[0]):
73
  token = cos_sims_df["Gene"][i]
 
131
 
132
  return cos_sims_full_df
133
 
134
+ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
135
+ cos_sims_full_df = cos_sims_df.copy()
136
+
137
+ cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
138
+ cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
139
+ cos_sims_full_df["Test_v_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
140
+ cos_sims_full_df["Test_v_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
141
+ cos_sims_full_df["Test_v_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
142
+ cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
143
+ cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
144
+
145
+ for i in trange(cos_sims_df.shape[0]):
146
+ token = cos_sims_df["Gene"][i]
147
+ test_shifts = []
148
+ null_shifts = []
149
+
150
+ for dict_i in dict_list:
151
+ token_tuples += dict_i.get((token, "cell_emb"),[])
152
+
153
+ for dict_i in null_dict_list:
154
+ null_tuples += dict_i.get((token, "cell_emb"),[])
155
+
156
+ cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
157
+ cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
158
+ cos_sims_full_df.loc[i, "Test_v_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
159
+ cos_sims_full_df.loc[i, "Test_v_null_pval"] = ranksums(test_shifts,
160
+ null_shifts, nan_policy="omit").pvalue
161
+
162
+ cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
163
+ cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
164
+
165
+ cos_sims_full_df["Test_v_null_FDR"] = get_fdr(cos_sims_full_df["Test_v_null_pval"])
166
+ return cos_sims_full_df
167
+
168
  class InSilicoPerturberStats:
169
  valid_option_dict = {
170
  "mode": {"goal_state_shift","vs_null","vs_random"},
 
289
  output_prefix : str
290
  Prefix for output .dataset
291
  """
292
+
293
+ if self.mode not in ["goal_state_shift", "vs_null"]:
 
 
 
 
 
294
  logger.error(
295
+ "Currently, only modes available are stats for goal_state_shift \
296
+ and comparing vs a null distribution.")
297
  raise
298
+
299
+ self.gene_token_id_dict = invert_dict(self.gene_token_dict)
300
+ self.gene_id_name_dict = invert_dict(self.gene_name_id_dict)
301
+
302
  # obtain total gene list
303
  gene_list = get_gene_list(dict_list)
304
 
 
311
  self.gene_token_id_dict[genes] \
312
  for genes in gene_list]}, \
313
  index=[i for i in range(len(gene_list))])
314
+
315
+ dict_list = read_dictionaries(input_data_directory, "cell")
316
+ if self.mode == "goal_state_shift":
317
+ cos_sims_df = isp_stats(cos_sims_df_initial, dict_list)
318
+
319
+ # quantify number of detections of each gene
320
+ cos_sims_df["N_Detections"] = [n_detections(i, dict_list) for i in cos_sims_df["Gene"]]
321
+
322
+ # sort by shift to desired state
323
+ cos_sims_df = cos_sims_df.sort_values(by=["Shift_from_goal_end",
324
+ "Goal_end_FDR"])
325
+ elif self.mode == "vs_null":
326
+ dict_list = read_dictionaries(input_data_directory, "cell")
327
+ null_dict_list = read_dictionaries(null_dist_data_directory, "cell")
328
+ cos_sims_df = isp_stats_vs_null(cos_sims_df_initial, dict_list,
329
+ null_dict_list)
330
+ cos_sims_df = cos_sims_df.sort_values(by=["Test_v_null_avg_shift",
331
+ "Test_v_null_FDR"])
332
 
333
  # save perturbation stats to output_path
334
  output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")