Christina Theodoris committed on
Commit
e2ee685
·
1 Parent(s): b9028ba

further emphasize different token dictionaries in examples

Browse files
examples/cell_classification.ipynb CHANGED
@@ -68,6 +68,10 @@
68
  " \"per_device_train_batch_size\": 12,\n",
69
  " \"seed\": 73,\n",
70
  "}\n",
 
 
 
 
71
  "cc = Classifier(classifier=\"cell\",\n",
72
  " cell_state_dict = {\"state_key\": \"disease\", \"states\": \"all\"},\n",
73
  " filter_data=filter_data_dict,\n",
@@ -125,7 +129,7 @@
125
  " \"train\": train_ids+eval_ids,\n",
126
  " \"test\": test_ids}\n",
127
  "\n",
128
- "# Example input_data_file: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
129
  "cc.prepare_data(input_data_file=\"/path/to/human_dcm_hcm_nf_2048_w_length.dataset\",\n",
130
  " output_directory=output_dir,\n",
131
  " output_prefix=output_prefix,\n",
@@ -260,7 +264,7 @@
260
  " \"train\": train_ids,\n",
261
  " \"eval\": eval_ids}\n",
262
  "\n",
263
- "# 6 layer Geneformer: https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors\n",
264
  "all_metrics = cc.validate(model_directory=\"/path/to/Geneformer\",\n",
265
  " prepared_input_data_file=f\"{output_dir}/{output_prefix}_labeled_train.dataset\",\n",
266
  " id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
@@ -446,7 +450,7 @@
446
  "name": "python",
447
  "nbconvert_exporter": "python",
448
  "pygments_lexer": "ipython3",
449
- "version": "3.11.5"
450
  }
451
  },
452
  "nbformat": 4,
 
68
  " \"per_device_train_batch_size\": 12,\n",
69
  " \"seed\": 73,\n",
70
  "}\n",
71
+ "\n",
72
+ "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
73
+ "# (otherwise the Classifier will use the current default model dictionary)\n",
74
+ "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
75
  "cc = Classifier(classifier=\"cell\",\n",
76
  " cell_state_dict = {\"state_key\": \"disease\", \"states\": \"all\"},\n",
77
  " filter_data=filter_data_dict,\n",
 
129
  " \"train\": train_ids+eval_ids,\n",
130
  " \"test\": test_ids}\n",
131
  "\n",
132
+ "# Example input_data_file for 30M model: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
133
  "cc.prepare_data(input_data_file=\"/path/to/human_dcm_hcm_nf_2048_w_length.dataset\",\n",
134
  " output_directory=output_dir,\n",
135
  " output_prefix=output_prefix,\n",
 
264
  " \"train\": train_ids,\n",
265
  " \"eval\": eval_ids}\n",
266
  "\n",
267
+ "# Example 6 layer 30M Geneformer model: https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-6L-30M-i2048/model.safetensors\n",
268
  "all_metrics = cc.validate(model_directory=\"/path/to/Geneformer\",\n",
269
  " prepared_input_data_file=f\"{output_dir}/{output_prefix}_labeled_train.dataset\",\n",
270
  " id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
 
450
  "name": "python",
451
  "nbconvert_exporter": "python",
452
  "pygments_lexer": "ipython3",
453
+ "version": "3.10.15"
454
  }
455
  },
456
  "nbformat": 4,
examples/extract_and_plot_cell_embeddings.ipynb CHANGED
@@ -18,6 +18,8 @@
18
  "outputs": [],
19
  "source": [
20
  "# initiate EmbExtractor\n",
 
 
21
  "embex = EmbExtractor(model_type=\"CellClassifier\",\n",
22
  " num_classes=3,\n",
23
  " filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
@@ -26,12 +28,13 @@
26
  " emb_label=[\"disease\",\"cell_type\"],\n",
27
  " labels_to_plot=[\"disease\"],\n",
28
  " forward_batch_size=200,\n",
29
- " nproc=16)\n",
 
30
  "\n",
31
  "# extracts embedding from input data\n",
32
  "# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
33
- "# example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
34
- "embs = embex.extract_embs(\"../fine_tuned_models/geneformer-6L-30M_CellClassifier_cardiomyopathies_220224\",\n",
35
  " \"path/to/input_data/\",\n",
36
  " \"path/to/output_directory/\",\n",
37
  " \"output_prefix\")\n"
@@ -129,7 +132,7 @@
129
  "name": "python",
130
  "nbconvert_exporter": "python",
131
  "pygments_lexer": "ipython3",
132
- "version": "3.11.5"
133
  }
134
  },
135
  "nbformat": 4,
 
18
  "outputs": [],
19
  "source": [
20
  "# initiate EmbExtractor\n",
21
+ "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
22
+ "# (otherwise the EmbExtractor will use the current default model dictionary)\n",
23
  "embex = EmbExtractor(model_type=\"CellClassifier\",\n",
24
  " num_classes=3,\n",
25
  " filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
 
28
  " emb_label=[\"disease\",\"cell_type\"],\n",
29
  " labels_to_plot=[\"disease\"],\n",
30
  " forward_batch_size=200,\n",
31
+ " nproc=16,\n",
32
+ " token_dictionary_file=\"./gene_dictionaries_30m/token_dictionary_gc30M.pkl\") # change from current default dictionary for 30M model series\n",
33
  "\n",
34
  "# extracts embedding from input data\n",
35
  "# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
36
+ "# example dataset for 30M model series: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
37
+ "embs = embex.extract_embs(\"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
38
  " \"path/to/input_data/\",\n",
39
  " \"path/to/output_directory/\",\n",
40
  " \"output_prefix\")\n"
 
132
  "name": "python",
133
  "nbconvert_exporter": "python",
134
  "pygments_lexer": "ipython3",
135
+ "version": "3.10.15"
136
  }
137
  },
138
  "nbformat": 4,
examples/gene_classification.ipynb CHANGED
@@ -71,6 +71,9 @@
71
  }
72
  ],
73
  "source": [
 
 
 
74
  "cc = Classifier(classifier=\"gene\",\n",
75
  " gene_class_dict = gene_class_dict,\n",
76
  " max_ncells = 10_000,\n",
@@ -102,7 +105,7 @@
102
  }
103
  ],
104
  "source": [
105
- "# Example input_data_file: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/gene_classification/dosage_sensitive_tfs/gc-30M_sample50k.dataset\n",
106
  "cc.prepare_data(input_data_file=\"/path/to/gc-30M_sample50k.dataset\",\n",
107
  " output_directory=output_dir,\n",
108
  " output_prefix=output_prefix)"
@@ -840,7 +843,7 @@
840
  }
841
  ],
842
  "source": [
843
- "# 6 layer Geneformer: https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors\n",
844
  "all_metrics = cc.validate(model_directory=\"/path/to/Geneformer\",\n",
845
  " prepared_input_data_file=f\"{output_dir}/{output_prefix}_labeled.dataset\",\n",
846
  " id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
@@ -1240,7 +1243,7 @@
1240
  "name": "python",
1241
  "nbconvert_exporter": "python",
1242
  "pygments_lexer": "ipython3",
1243
- "version": "3.11.5"
1244
  }
1245
  },
1246
  "nbformat": 4,
 
71
  }
72
  ],
73
  "source": [
74
+ "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
75
+ "# (otherwise the Classifier will use the current default model dictionary)\n",
76
+ "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
77
  "cc = Classifier(classifier=\"gene\",\n",
78
  " gene_class_dict = gene_class_dict,\n",
79
  " max_ncells = 10_000,\n",
 
105
  }
106
  ],
107
  "source": [
108
+ "# Example input_data_file for 30M model series: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/gene_classification/dosage_sensitive_tfs/gc-30M_sample50k.dataset\n",
109
  "cc.prepare_data(input_data_file=\"/path/to/gc-30M_sample50k.dataset\",\n",
110
  " output_directory=output_dir,\n",
111
  " output_prefix=output_prefix)"
 
843
  }
844
  ],
845
  "source": [
846
+ "# 6 layer 30M Geneformer model: https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-6L-30M-i2048/model.safetensors\n",
847
  "all_metrics = cc.validate(model_directory=\"/path/to/Geneformer\",\n",
848
  " prepared_input_data_file=f\"{output_dir}/{output_prefix}_labeled.dataset\",\n",
849
  " id_class_dict_file=f\"{output_dir}/{output_prefix}_id_class_dict.pkl\",\n",
 
1243
  "name": "python",
1244
  "nbconvert_exporter": "python",
1245
  "pygments_lexer": "ipython3",
1246
+ "version": "3.10.15"
1247
  }
1248
  },
1249
  "nbformat": 4,
examples/in_silico_perturbation.ipynb CHANGED
@@ -39,7 +39,10 @@
39
  "\n",
40
  "filter_data_dict={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]}\n",
41
  "\n",
42
- "embex = EmbExtractor(model_type=\"CellClassifier\",\n",
 
 
 
43
  " num_classes=3,\n",
44
  " filter_data=filter_data_dict,\n",
45
  " max_ncells=1000,\n",
@@ -49,7 +52,7 @@
49
  " nproc=16)\n",
50
  "\n",
51
  "state_embs_dict = embex.get_state_embs(cell_states_to_model,\n",
52
- " \"path/to/model\",\n",
53
  " \"path/to/input_data\",\n",
54
  " \"path/to/output_directory\",\n",
55
  " \"output_prefix\")"
@@ -64,12 +67,15 @@
64
  },
65
  "outputs": [],
66
  "source": [
 
 
 
67
  "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
68
  " perturb_rank_shift=None,\n",
69
  " genes_to_perturb=\"all\",\n",
70
  " combos=0,\n",
71
  " anchor_gene=None,\n",
72
- " model_type=\"CellClassifier\",\n",
73
  " num_classes=3,\n",
74
  " emb_mode=\"cell\",\n",
75
  " cell_emb_style=\"mean_pool\",\n",
@@ -90,9 +96,10 @@
90
  "outputs": [],
91
  "source": [
92
  "# outputs intermediate files from in silico perturbation\n",
93
- "isp.perturb_data(\"path/to/model\",\n",
 
94
  " \"path/to/input_data\",\n",
95
- " \"path/to/output_directory\",\n",
96
  " \"output_prefix\")"
97
  ]
98
  },
@@ -103,6 +110,9 @@
103
  "metadata": {},
104
  "outputs": [],
105
  "source": [
 
 
 
106
  "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
107
  " genes_perturbed=\"all\",\n",
108
  " combos=0,\n",
@@ -118,9 +128,9 @@
118
  "outputs": [],
119
  "source": [
120
  "# extracts data from intermediate files and processes stats to output in final .csv\n",
121
- "ispstats.get_stats(\"path/to/input_data\",\n",
122
  " None,\n",
123
- " \"path/to/output_directory\",\n",
124
  " \"output_prefix\")"
125
  ]
126
  }
@@ -141,7 +151,7 @@
141
  "name": "python",
142
  "nbconvert_exporter": "python",
143
  "pygments_lexer": "ipython3",
144
- "version": "3.10.11"
145
  }
146
  },
147
  "nbformat": 4,
 
39
  "\n",
40
  "filter_data_dict={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]}\n",
41
  "\n",
42
+ "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
43
+ "# (otherwise the EmbExtractor will use the current default model dictionary)\n",
44
+ "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
45
+ "embex = EmbExtractor(model_type=\"CellClassifier\", # if using previously fine-tuned cell classifier model\n",
46
  " num_classes=3,\n",
47
  " filter_data=filter_data_dict,\n",
48
  " max_ncells=1000,\n",
 
52
  " nproc=16)\n",
53
  "\n",
54
  "state_embs_dict = embex.get_state_embs(cell_states_to_model,\n",
55
+ " \"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
56
  " \"path/to/input_data\",\n",
57
  " \"path/to/output_directory\",\n",
58
  " \"output_prefix\")"
 
67
  },
68
  "outputs": [],
69
  "source": [
70
+ "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
71
+ "# (otherwise the InSilicoPerturber will use the current default model dictionary)\n",
72
+ "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
73
  "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
74
  " perturb_rank_shift=None,\n",
75
  " genes_to_perturb=\"all\",\n",
76
  " combos=0,\n",
77
  " anchor_gene=None,\n",
78
+ " model_type=\"CellClassifier\", # if using previously fine-tuned cell classifier model\n",
79
  " num_classes=3,\n",
80
  " emb_mode=\"cell\",\n",
81
  " cell_emb_style=\"mean_pool\",\n",
 
96
  "outputs": [],
97
  "source": [
98
  "# outputs intermediate files from in silico perturbation\n",
99
+ "\n",
100
+ "isp.perturb_data(\"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
101
  " \"path/to/input_data\",\n",
102
+ " \"path/to/isp_output_directory\",\n",
103
  " \"output_prefix\")"
104
  ]
105
  },
 
110
  "metadata": {},
111
  "outputs": [],
112
  "source": [
113
+ "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
114
+ "# (otherwise the InSilicoPerturberStats will use the current default model dictionary)\n",
115
+ "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
116
  "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
117
  " genes_perturbed=\"all\",\n",
118
  " combos=0,\n",
 
128
  "outputs": [],
129
  "source": [
130
  "# extracts data from intermediate files and processes stats to output in final .csv\n",
131
+ "ispstats.get_stats(\"path/to/isp_output_directory\", # this should be the directory of intermediate files output by isp.perturb_data\n",
132
  " None,\n",
133
+ " \"path/to/isp_stats_output_directory\",\n",
134
  " \"output_prefix\")"
135
  ]
136
  }
 
151
  "name": "python",
152
  "nbconvert_exporter": "python",
153
  "pygments_lexer": "ipython3",
154
+ "version": "3.10.15"
155
  }
156
  },
157
  "nbformat": 4,
examples/tokenizing_scRNAseq_data.ipynb CHANGED
@@ -12,7 +12,7 @@
12
  },
13
  {
14
  "cell_type": "markdown",
15
- "id": "350e6252-b783-494b-9767-f087eb868a15",
16
  "metadata": {},
17
  "source": [
18
  "#### Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.\n",
@@ -25,11 +25,21 @@
25
  "\n",
26
  "#### Additionally, if the original .loom file contains a cell column attribute called \"filter_pass\", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with \"1\" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.\n",
27
  "\n",
28
- "#### If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.\n",
29
- "\n",
 
 
 
 
 
 
 
30
  "#### OF NOTE: PLEASE ENSURE THE TOKEN DICTIONARY AND GENE MEDIAN FILE MATCH THE MODEL BEING USED.\n",
 
31
  "\n",
32
- "#### The 95M model series also require the special_token argument to be set to True and model_input_size to be 4096."
 
 
33
  ]
34
  },
35
  {
@@ -73,7 +83,7 @@
73
  "name": "python",
74
  "nbconvert_exporter": "python",
75
  "pygments_lexer": "ipython3",
76
- "version": "3.10.11"
77
  }
78
  },
79
  "nbformat": 4,
 
12
  },
13
  {
14
  "cell_type": "markdown",
15
+ "id": "1fe86f48-5578-47df-b373-58c21ec170ab",
16
  "metadata": {},
17
  "source": [
18
  "#### Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.\n",
 
25
  "\n",
26
  "#### Additionally, if the original .loom file contains a cell column attribute called \"filter_pass\", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with \"1\" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.\n",
27
  "\n",
28
+ "#### If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer."
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "id": "32c69493-4e5a-4b07-8dc1-958ff2ee7d0b",
34
+ "metadata": {},
35
+ "source": [
36
+ "**********************************************************************************************************\n",
37
  "#### OF NOTE: PLEASE ENSURE THE TOKEN DICTIONARY AND GENE MEDIAN FILE MATCH THE MODEL BEING USED.\n",
38
+ "#### 95M: current defaults; 30M: https://huggingface.co/ctheodoris/Geneformer/tree/main/geneformer/gene_dictionaries_30m\n",
39
  "\n",
40
+ "#### ADDITIONALLY:\n",
41
+ "#### The 95M model series requires the special_token argument to be set to True and model_input_size to be 4096 (current defaults).\n",
42
+ "#### The 30M model series requires the special_token argument to be set to False and model_input_size to be 2048."
43
  ]
44
  },
45
  {
 
83
  "name": "python",
84
  "nbconvert_exporter": "python",
85
  "pygments_lexer": "ipython3",
86
+ "version": "3.10.15"
87
  }
88
  },
89
  "nbformat": 4,