Christina Theodoris
committed on
Commit
·
3072225
1
Parent(s):
98ce6d7
Add explanation of output columns and sort by largest shift
Browse files
geneformer/in_silico_perturber_stats.py
CHANGED
@@ -142,12 +142,12 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
|
|
142 |
names=["Gene",
|
143 |
"Gene_name",
|
144 |
"Ensembl_ID",
|
145 |
-
"
|
146 |
-
"
|
147 |
"Goal_end_vs_random_pval",
|
148 |
"Alt_end_vs_random_pval"]
|
149 |
if alt_end_state_exists == False:
|
150 |
-
names.remove("
|
151 |
names.remove("Alt_end_vs_random_pval")
|
152 |
cos_sims_full_df = pd.DataFrame(columns=names)
|
153 |
|
@@ -197,8 +197,9 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
|
|
197 |
cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
|
198 |
|
199 |
# sort by shift to desired state
|
200 |
-
cos_sims_full_df = cos_sims_full_df.sort_values(by=["
|
201 |
-
"Goal_end_FDR"]
|
|
|
202 |
|
203 |
return cos_sims_full_df
|
204 |
|
@@ -208,9 +209,9 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
|
|
208 |
|
209 |
cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
210 |
cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
211 |
-
cos_sims_full_df["
|
212 |
-
cos_sims_full_df["
|
213 |
-
cos_sims_full_df["
|
214 |
cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
215 |
cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
216 |
|
@@ -227,17 +228,18 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
|
|
227 |
|
228 |
cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
|
229 |
cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
|
230 |
-
cos_sims_full_df.loc[i, "
|
231 |
-
cos_sims_full_df.loc[i, "
|
232 |
null_shifts, nan_policy="omit").pvalue
|
233 |
|
234 |
cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
|
235 |
cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
|
236 |
|
237 |
-
cos_sims_full_df["
|
238 |
|
239 |
-
cos_sims_full_df = cos_sims_full_df.sort_values(by=["
|
240 |
-
"
|
|
|
241 |
return cos_sims_full_df
|
242 |
|
243 |
# stats for identifying perturbations with largest effect within a given set of cells
|
@@ -498,6 +500,46 @@ class InSilicoPerturberStats:
|
|
498 |
Path to directory where perturbation data will be saved as .csv
|
499 |
output_prefix : str
|
500 |
Prefix for output .dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
"""
|
502 |
|
503 |
if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]:
|
|
|
142 |
names=["Gene",
|
143 |
"Gene_name",
|
144 |
"Ensembl_ID",
|
145 |
+
"Shift_to_goal_end",
|
146 |
+
"Shift_to_alt_end",
|
147 |
"Goal_end_vs_random_pval",
|
148 |
"Alt_end_vs_random_pval"]
|
149 |
if alt_end_state_exists == False:
|
150 |
+
names.remove("Shift_to_alt_end")
|
151 |
names.remove("Alt_end_vs_random_pval")
|
152 |
cos_sims_full_df = pd.DataFrame(columns=names)
|
153 |
|
|
|
197 |
cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
|
198 |
|
199 |
# sort by shift to desired state
|
200 |
+
cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end",
|
201 |
+
"Goal_end_FDR"],
|
202 |
+
ascending=[False,True])
|
203 |
|
204 |
return cos_sims_full_df
|
205 |
|
|
|
209 |
|
210 |
cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
211 |
cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
212 |
+
cos_sims_full_df["Test_vs_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
213 |
+
cos_sims_full_df["Test_vs_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
214 |
+
cos_sims_full_df["Test_vs_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
|
215 |
cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
216 |
cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
|
217 |
|
|
|
228 |
|
229 |
cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
|
230 |
cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
|
231 |
+
cos_sims_full_df.loc[i, "Test_vs_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
|
232 |
+
cos_sims_full_df.loc[i, "Test_vs_null_pval"] = ranksums(test_shifts,
|
233 |
null_shifts, nan_policy="omit").pvalue
|
234 |
|
235 |
cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
|
236 |
cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
|
237 |
|
238 |
+
cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(cos_sims_full_df["Test_vs_null_pval"])
|
239 |
|
240 |
+
cos_sims_full_df = cos_sims_full_df.sort_values(by=["Test_vs_null_avg_shift",
|
241 |
+
"Test_vs_null_FDR"],
|
242 |
+
ascending=[False,True])
|
243 |
return cos_sims_full_df
|
244 |
|
245 |
# stats for identifying perturbations with largest effect within a given set of cells
|
|
|
500 |
Path to directory where perturbation data will be saved as .csv
|
501 |
output_prefix : str
|
502 |
Prefix for output .dataset
|
503 |
+
|
504 |
+
Outputs
|
505 |
+
----------
|
506 |
+
Definition of possible columns in .csv output file.
|
507 |
+
|
508 |
+
Of note, not all columns will be present in all output files.
|
509 |
+
Some columns are specific to particular perturbation modes.
|
510 |
+
|
511 |
+
"Gene": gene token
|
512 |
+
"Gene_name": gene name
|
513 |
+
"Ensembl_ID": gene Ensembl ID
|
514 |
+
"N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
|
515 |
+
|
516 |
+
"Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
|
517 |
+
"Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation
|
518 |
+
"Goal_end_vs_random_pval": pvalue of cosine shift from start state towards goal end state by Wilcoxon
|
519 |
+
pvalue compares shift caused by perturbing given gene to shift caused by perturbing random genes
|
520 |
+
"Alt_end_vs_random_pval": pvalue of cosine shift from start state towards alternate end state by Wilcoxon
|
521 |
+
pvalue compares shift caused by perturbing given gene to shift caused by perturbing random genes
|
522 |
+
"Goal_end_FDR": Benjamini-Hochberg correction of "Goal_end_vs_random_pval"
|
523 |
+
"Alt_end_FDR": Benjamini-Hochberg correction of "Alt_end_vs_random_pval"
|
524 |
+
|
525 |
+
"Test_avg_shift": cosine shift in response to given perturbation in cells from test distribution
|
526 |
+
"Null_avg_shift": cosine shift in response to given perturbation in cells from null distribution (e.g. random cells)
|
527 |
+
"Test_vs_null_avg_shift": difference in cosine shift in cells from test vs. null distribution
|
528 |
+
(i.e. "Test_avg_shift" minus "Null_avg_shift")
|
529 |
+
"Test_vs_null_pval": pvalue of cosine shift in test vs. null distribution by Wilcoxon rank-sum test
|
530 |
+
"Test_vs_null_FDR": Benjamini-Hochberg correction of "Test_vs_null_pval"
|
531 |
+
"N_Detections_test": "N_Detections" in cells from test distribution
|
532 |
+
"N_Detections_null": "N_Detections" in cells from null distribution
|
533 |
+
|
534 |
+
"Anchor_shift": cosine shift in response to given perturbation of anchor gene
|
535 |
+
"Test_token_shift": cosine shift in response to given perturbation of test gene
|
536 |
+
"Sum_of_indiv_shifts": sum of cosine shifts in response to individually perturbing test and anchor genes
|
537 |
+
"Combo_shift": cosine shift in response to given perturbation of both anchor and test gene(s) in combination
|
538 |
+
"Combo_minus_sum_shift": difference of cosine shifts in response to combo perturbation vs. sum of individual perturbations
|
539 |
+
(i.e. "Combo_shift" minus "Sum_of_indiv_shifts")
|
540 |
+
"Impact_component": whether the given perturbation was modeled to be within the impact component by the mixture model
|
541 |
+
1: within impact component; 0: not within impact component
|
542 |
+
"Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
|
543 |
"""
|
544 |
|
545 |
if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]:
|