hchen725 committed on
Commit
b880879
·
verified ·
1 Parent(s): 889cadc

Update geneformer/tokenizer.py

Browse files

Add ensembl_id_check under if not collapse_gene_ids

Files changed (1) hide show
  1. geneformer/tokenizer.py +8 -2
geneformer/tokenizer.py CHANGED
@@ -110,7 +110,10 @@ def sum_ensembl_ids(
110
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
111
  # Comparing to gene_token_dict here, would not perform any mapping steps
112
  if not collapse_gene_ids:
113
- if len(ensembl_ids) == len(set(ensembl_ids)):
 
 
 
114
  return data_directory
115
  else:
116
  raise ValueError("Error: data Ensembl IDs non-unique.")
@@ -212,7 +215,10 @@ def sum_ensembl_ids(
212
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
213
  # Comparing to gene_token_dict here, would not perform any mapping steps
214
  if not collapse_gene_ids:
215
- if len(ensembl_ids) == len(set(ensembl_ids)):
 
 
 
216
  return data_directory
217
  else:
218
  raise ValueError("Error: data Ensembl IDs non-unique.")
 
110
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
111
  # Comparing to gene_token_dict here, would not perform any mapping steps
112
  if not collapse_gene_ids:
113
+ ensembl_id_check = [
114
+ gene for gene in ensembl_ids if gene in gene_token_dict.keys()
115
+ ]
116
+ if len(ensembl_id_check) == len(set(ensembl_id_check)):
117
  return data_directory
118
  else:
119
  raise ValueError("Error: data Ensembl IDs non-unique.")
 
215
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
216
  # Comparing to gene_token_dict here, would not perform any mapping steps
217
  if not collapse_gene_ids:
218
+ ensembl_id_check = [
219
+ gene for gene in ensembl_ids if gene in gene_token_dict.keys()
220
+ ]
221
+ if len(ensembl_id_check) == len(set(ensembl_id_check)):
222
  return data_directory
223
  else:
224
  raise ValueError("Error: data Ensembl IDs non-unique.")