hchen725 committed on
Commit
cf2a3b5
·
verified ·
1 Parent(s): 704ef0d

Update geneformer/tokenizer.py

Browse files
Files changed (1) hide show
  1. geneformer/tokenizer.py +13 -0
geneformer/tokenizer.py CHANGED
@@ -100,6 +100,9 @@ def sum_ensembl_ids(
100
  "ensembl_id" in data.ra.keys()
101
  ), "'ensembl_id' column missing from data.ra.keys()"
102
 
 
 
 
103
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
104
  # Comparing to gene_token_dict here, would not perform any mapping steps
105
  gene_ids_in_dict = [
@@ -197,6 +200,10 @@ def sum_ensembl_ids(
197
  "ensembl_id" in data.var.columns
198
  ), "'ensembl_id' column missing from data.var"
199
 
 
 
 
 
200
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
201
  # Comparing to gene_token_dict here, would not perform any mapping steps
202
  gene_ids_in_dict = [
@@ -516,6 +523,7 @@ class TranscriptomeTokenizer:
516
  file_cell_metadata = {
517
  attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
518
  }
 
519
 
520
  dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom")
521
  loom_file_path = sum_ensembl_ids(
@@ -591,6 +599,11 @@ class TranscriptomeTokenizer:
591
  if str(dedup_filename) == str(loom_file_path):
592
  os.remove(str(dedup_filename))
593
 
 
 
 
 
 
594
  return tokenized_cells, file_cell_metadata
595
 
596
  def create_dataset(
 
100
  "ensembl_id" in data.ra.keys()
101
  ), "'ensembl_id' column missing from data.ra.keys()"
102
 
103
+ assert (
104
+ "ensembl_id_collapsed" not in data.ra.keys()
105
+ ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
106
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
107
  # Comparing to gene_token_dict here, would not perform any mapping steps
108
  gene_ids_in_dict = [
 
200
  "ensembl_id" in data.var.columns
201
  ), "'ensembl_id' column missing from data.var"
202
 
203
+ assert (
204
+ "ensembl_id_collapsed" not in data.var.columns
205
+ ), "'ensembl_id_collapsed' column already exists in data.var"
206
+
207
  # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
208
  # Comparing to gene_token_dict here, would not perform any mapping steps
209
  gene_ids_in_dict = [
 
523
  file_cell_metadata = {
524
  attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
525
  }
526
+ loom_file_path_original = loom_file_path
527
 
528
  dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom")
529
  loom_file_path = sum_ensembl_ids(
 
599
  if str(dedup_filename) == str(loom_file_path):
600
  os.remove(str(dedup_filename))
601
 
602
+ with lp.connect(str(loom_file_path_original)) as data:
603
+ if "ensembl_id_collapsed" in data.ra.keys():
604
+ del data.ra["ensembl_id_collapsed"]
605
+
606
+
607
  return tokenized_cells, file_cell_metadata
608
 
609
  def create_dataset(