Update geneformer/tokenizer.py
Browse files
Add ensembl_id_check under `if not collapse_gene_ids`
- geneformer/tokenizer.py +8 -2
geneformer/tokenizer.py
CHANGED
@@ -110,7 +110,10 @@ def sum_ensembl_ids(
|
|
110 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
111 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
112 |
if not collapse_gene_ids:
|
113 |
-
|
|
|
|
|
|
|
114 |
return data_directory
|
115 |
else:
|
116 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
@@ -212,7 +215,10 @@ def sum_ensembl_ids(
|
|
212 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
213 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
214 |
if not collapse_gene_ids:
|
215 |
-
|
|
|
|
|
|
|
216 |
return data_directory
|
217 |
else:
|
218 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
|
|
110 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
111 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
112 |
if not collapse_gene_ids:
|
113 |
+
ensembl_id_check = [
|
114 |
+
gene for gene in ensembl_ids if gene in gene_token_dict.keys()
|
115 |
+
]
|
116 |
+
if len(ensembl_id_check) == len(set(ensembl_id_check)):
|
117 |
return data_directory
|
118 |
else:
|
119 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|
|
|
215 |
# Check for duplicate Ensembl IDs if collapse_gene_ids is False.
|
216 |
# Comparing to gene_token_dict here, would not perform any mapping steps
|
217 |
if not collapse_gene_ids:
|
218 |
+
ensembl_id_check = [
|
219 |
+
gene for gene in ensembl_ids if gene in gene_token_dict.keys()
|
220 |
+
]
|
221 |
+
if len(ensembl_id_check) == len(set(ensembl_id_check)):
|
222 |
return data_directory
|
223 |
else:
|
224 |
raise ValueError("Error: data Ensembl IDs non-unique.")
|