ctheodoris
committed on
update tokenizer to defaults for 95M models for special token and input size
Browse files- geneformer/tokenizer.py +17 -4
geneformer/tokenizer.py
CHANGED
@@ -18,6 +18,9 @@ Geneformer tokenizer.
|
|
18 |
| No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. For example, if the original .loom dataset has column attributes "cell_type" and "organ_major" and one would like to retain these attributes as labels in the tokenized dataset with the new names "cell_type" and "organ", respectively, the following custom attribute dictionary should be provided: {"cell_type": "cell_type", "organ_major": "organ"}.
|
19 |
| Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.
|
20 |
| If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.
|
|
|
|
|
|
|
21 |
"""
|
22 |
|
23 |
from __future__ import annotations
|
@@ -255,8 +258,8 @@ class TranscriptomeTokenizer:
|
|
255 |
custom_attr_name_dict=None,
|
256 |
nproc=1,
|
257 |
chunk_size=512,
|
258 |
-
model_input_size=
|
259 |
-
special_token=
|
260 |
collapse_gene_ids=True,
|
261 |
gene_median_file=GENE_MEDIAN_FILE,
|
262 |
token_dictionary_file=TOKEN_DICTIONARY_FILE,
|
@@ -273,10 +276,12 @@ class TranscriptomeTokenizer:
|
|
273 |
| Number of processes to use for dataset mapping.
|
274 |
chunk_size : int = 512
|
275 |
| Chunk size for anndata tokenizer.
|
276 |
-
model_input_size : int =
|
277 |
| Max input size of model to truncate input to.
|
278 |
-
|
|
|
279 |
| Adds CLS token before and EOS token after rank value encoding.
|
|
|
280 |
collapse_gene_ids : bool = True
|
281 |
| Whether to collapse gene IDs based on gene mapping dictionary.
|
282 |
gene_median_file : Path
|
@@ -321,6 +326,14 @@ class TranscriptomeTokenizer:
|
|
321 |
)
|
322 |
raise
|
323 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
324 |
# if collapsing duplicate gene IDs
|
325 |
self.collapse_gene_ids = collapse_gene_ids
|
326 |
|
|
|
18 |
| No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. For example, if the original .loom dataset has column attributes "cell_type" and "organ_major" and one would like to retain these attributes as labels in the tokenized dataset with the new names "cell_type" and "organ", respectively, the following custom attribute dictionary should be provided: {"cell_type": "cell_type", "organ_major": "organ"}.
|
19 |
| Additionally, if the original .loom file contains a cell column attribute called "filter_pass", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with "1" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.
|
20 |
| If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer.
|
21 |
+
| OF NOTE: Take care that the correct token dictionary and gene median file are used for the corresponding model.
|
22 |
+
| OF NOTE: For the 95M model series, special_token should be True and model_input_size should be 4096.
|
23 |
+
| OF NOTE: For the 30M model series, special_token should be False and model_input_size should be 2048.
|
24 |
"""
|
25 |
|
26 |
from __future__ import annotations
|
|
|
258 |
custom_attr_name_dict=None,
|
259 |
nproc=1,
|
260 |
chunk_size=512,
|
261 |
+
model_input_size=4096,
|
262 |
+
special_token=True,
|
263 |
collapse_gene_ids=True,
|
264 |
gene_median_file=GENE_MEDIAN_FILE,
|
265 |
token_dictionary_file=TOKEN_DICTIONARY_FILE,
|
|
|
276 |
| Number of processes to use for dataset mapping.
|
277 |
chunk_size : int = 512
|
278 |
| Chunk size for anndata tokenizer.
|
279 |
+
model_input_size : int = 4096
|
280 |
| Max input size of model to truncate input to.
|
281 |
+
| For the 30M model series, should be 2048. For the 95M model series, should be 4096.
|
282 |
+
special_token : bool = True
|
283 |
| Adds CLS token before and EOS token after rank value encoding.
|
284 |
+
| For the 30M model series, should be False. For the 95M model series, should be True.
|
285 |
collapse_gene_ids : bool = True
|
286 |
| Whether to collapse gene IDs based on gene mapping dictionary.
|
287 |
gene_median_file : Path
|
|
|
326 |
)
|
327 |
raise
|
328 |
|
329 |
+
if not self.special_token:
|
330 |
+
if ("<cls>" in self.gene_token_dict.keys()) and (
|
331 |
+
"<eos>" in self.gene_token_dict.keys()
|
332 |
+
):
|
333 |
+
logger.warning(
|
334 |
+
"<cls> and <eos> are in gene_token_dict but special_token = False. Please note that for 95M model series, special_token should be True."
|
335 |
+
)
|
336 |
+
|
337 |
# if collapsing duplicate gene IDs
|
338 |
self.collapse_gene_ids = collapse_gene_ids
|
339 |
|