Update geneformer/perturber_utils.py

#362

by hchen725 - opened Jul 4, 2024

base: refs/heads/main

←

from: refs/pr/362

Discussion Files changed

+836

-63

Files changed (3) hide show

geneformer/in_silico_perturber.py +581 -43
geneformer/perturber_utils.py +120 -16
geneformer/tokenizer.py +135 -4

geneformer/in_silico_perturber.py CHANGED Viewed

@@ -38,21 +38,17 @@ import logging
 import os
 import pickle
 from collections import defaultdict
-from typing import List
 from multiprocess import set_start_method
-import seaborn as sns
 import torch
-from datasets import Dataset
 from tqdm.auto import trange
 from . import perturber_utils as pu
 from .emb_extractor import get_embs
 from .perturber_utils import TOKEN_DICTIONARY_FILE
-sns.set()
 logger = logging.getLogger(__name__)
@@ -66,7 +62,7 @@ class InSilicoPerturber:
         "anchor_gene": {None, str},
         "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
         "num_classes": {int},
-        "emb_mode": {"cell", "cell_and_gene"},
         "cell_emb_style": {"mean_pool"},
         "filter_data": {None, dict},
         "cell_states_to_model": {None, dict},
@@ -74,6 +70,7 @@ class InSilicoPerturber:
         "max_ncells": {None, int},
         "cell_inds_to_perturb": {"all", dict},
         "emb_layer": {-1, 0},
         "forward_batch_size": {int},
         "nproc": {int},
     }
@@ -97,7 +94,8 @@ class InSilicoPerturber:
         emb_layer=-1,
         forward_batch_size=100,
         nproc=4,
-        token_dictionary_file=TOKEN_DICTIONARY_FILE,
     ):
         """
         Initialize in silico perturber.
@@ -137,11 +135,11 @@ class InSilicoPerturber:
         num_classes : int
             | If model is a gene or cell classifier, specify number of classes it was trained to classify.
             | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
-        emb_mode : {"cell", "cell_and_gene"}
-            | Whether to output impact of perturbation on cell and/or gene embeddings.
             | Gene embedding shifts only available as compared to original cell, not comparing to goal state.
         cell_emb_style : "mean_pool"
-            | Method for summarizing cell embeddings.
             | Currently only option is mean pooling of gene embeddings for given cell.
         filter_data : None, dict
             | Default is to use all input data for in silico perturbation study.
@@ -186,6 +184,8 @@ class InSilicoPerturber:
             | Number of CPU processes to use.
         token_dictionary_file : Path
             | Path to pickle file containing token dictionary (Ensembl ID:token).
         """
         try:
             set_start_method("spawn")
@@ -222,14 +222,31 @@ class InSilicoPerturber:
         self.emb_layer = emb_layer
         self.forward_batch_size = forward_batch_size
         self.nproc = nproc
         self.validate_options()
         # load token dictionary (Ensembl IDs:token)
         with open(token_dictionary_file, "rb") as f:
             self.gene_token_dict = pickle.load(f)
         self.pad_token_id = self.gene_token_dict.get("<pad>")
         if self.anchor_gene is None:
             self.anchor_token = None
@@ -287,7 +304,7 @@ class InSilicoPerturber:
                         continue
             valid_type = False
             for option in valid_options:
-                if (option in [bool, int, list, dict]) and isinstance(
                     attr_value, option
                 ):
                     valid_type = True
@@ -428,22 +445,46 @@ class InSilicoPerturber:
         self.max_len = pu.get_model_input_size(model)
         layer_to_quant = pu.quant_layers(model) + self.emb_layer
         ### filter input data ###
         # general filtering of input data based on filter_data argument
         filtered_input_data = pu.load_and_filter(
             self.filter_data, self.nproc, input_data_file
         )
         filtered_input_data = self.apply_additional_filters(filtered_input_data)
         if self.perturb_group is True:
-            self.isp_perturb_set(
-                model, filtered_input_data, layer_to_quant, output_path_prefix
-            )
         else:
-            self.isp_perturb_all(
-                model, filtered_input_data, layer_to_quant, output_path_prefix
-            )
     def apply_additional_filters(self, filtered_input_data):
         # additional filtering of input data dependent on isp mode
@@ -488,6 +529,7 @@ class InSilicoPerturber:
         layer_to_quant: int,
         output_path_prefix: str,
     ):
         def make_group_perturbation_batch(example):
             example_input_ids = example["input_ids"]
             example["tokens_to_perturb"] = self.tokens_to_perturb
@@ -506,7 +548,7 @@ class InSilicoPerturber:
             if self.perturb_type == "delete":
                 example = pu.delete_indices(example)
             elif self.perturb_type == "overexpress":
-                example = pu.overexpress_tokens(example, self.max_len)
                 example["n_overflow"] = pu.calc_n_overflow(
                     self.max_len,
                     example["length"],
@@ -560,6 +602,7 @@ class InSilicoPerturber:
                     layer_to_quant,
                     self.pad_token_id,
                     self.forward_batch_size,
                     summary_stat=None,
                     silent=True,
                 )
@@ -579,6 +622,7 @@ class InSilicoPerturber:
                     layer_to_quant,
                     self.pad_token_id,
                     self.forward_batch_size,
                     summary_stat=None,
                     silent=True,
                 )
@@ -678,8 +722,6 @@ class InSilicoPerturber:
                 cos_sims_dict = self.update_perturbation_dictionary(
                     cos_sims_dict,
                     cos_sims_data,
-                    filtered_input_data,
-                    indices_to_perturb,
                     gene_list,
                 )
             else:
@@ -688,8 +730,6 @@ class InSilicoPerturber:
                     cos_sims_dict[state] = self.update_perturbation_dictionary(
                         cos_sims_dict[state],
                         cos_sims_data[state],
-                        filtered_input_data,
-                        indices_to_perturb,
                         gene_list,
                     )
             del minibatch
@@ -711,6 +751,256 @@ class InSilicoPerturber:
                 f"{output_path_prefix}_gene_embs_dict_{self.tokens_to_perturb}",
             )
     def isp_perturb_all(
         self,
         model,
@@ -738,10 +1028,10 @@ class InSilicoPerturber:
                 layer_to_quant,
                 self.pad_token_id,
                 self.forward_batch_size,
                 summary_stat=None,
                 silent=True,
             )
             # gene_list is used to assign cos sims back to genes
             # need to remove the anchor gene
             gene_list = example_cell["input_ids"][0][:]
@@ -765,10 +1055,13 @@ class InSilicoPerturber:
                 layer_to_quant,
                 self.pad_token_id,
                 self.forward_batch_size,
                 summary_stat=None,
                 silent=True,
             )
             num_inds_perturbed = 1 + self.combos
             # need to remove overexpressed gene to quantify cosine shifts
             if self.perturb_type == "overexpress":
@@ -780,11 +1073,11 @@ class InSilicoPerturber:
             elif self.perturb_type == "delete":
                 perturbation_emb = full_perturbation_emb
-            original_batch = pu.make_comparison_batch(
-                full_original_emb, indices_to_perturb, perturb_group=False
-            )
             if self.cell_states_to_model is None or self.emb_mode == "cell_and_gene":
                 gene_cos_sims = pu.quant_cos_sims(
                     perturbation_emb,
                     original_batch,
@@ -792,6 +1085,8 @@ class InSilicoPerturber:
                     self.state_embs_dict,
                     emb_mode="gene",
                 )
             if self.cell_states_to_model is not None:
                 original_cell_emb = pu.compute_nonpadded_cell_embedding(
                     full_original_emb, "mean_pool"
@@ -807,6 +1102,8 @@ class InSilicoPerturber:
                     self.state_embs_dict,
                     emb_mode="cell",
                 )
             if self.emb_mode == "cell_and_gene":
                 # remove perturbed index for gene list
@@ -828,13 +1125,14 @@ class InSilicoPerturber:
                                 (perturbed_gene, affected_gene)
                             ] = gene_cos_sims[perturbation_i, gene_j].item()
             if self.cell_states_to_model is None:
                 cos_sims_data = torch.mean(gene_cos_sims, dim=1)
                 cos_sims_dict = self.update_perturbation_dictionary(
                     cos_sims_dict,
                     cos_sims_data,
-                    filtered_input_data,
-                    indices_to_perturb,
                     gene_list,
                 )
             else:
@@ -843,25 +1141,23 @@ class InSilicoPerturber:
                     cos_sims_dict[state] = self.update_perturbation_dictionary(
                         cos_sims_dict[state],
                         cos_sims_data[state],
-                        filtered_input_data,
-                        indices_to_perturb,
                         gene_list,
                     )
             # save dict to disk every 100 cells
-            if i % 100 == 0:
                 pu.write_perturbation_dictionary(
                     cos_sims_dict,
-                    f"{output_path_prefix}_dict_cell_embs_1Kbatch{pickle_batch}",
                 )
                 if self.emb_mode == "cell_and_gene":
                     pu.write_perturbation_dictionary(
                         stored_gene_embs_dict,
-                        f"{output_path_prefix}_dict_gene_embs_1Kbatch{pickle_batch}",
                     )
             # reset and clear memory every 1000 cells
-            if i % 1000 == 0:
                 pickle_batch += 1
                 if self.cell_states_to_model is None:
                     cos_sims_dict = defaultdict(list)
@@ -877,28 +1173,270 @@ class InSilicoPerturber:
                 torch.cuda.empty_cache()
         pu.write_perturbation_dictionary(
-            cos_sims_dict, f"{output_path_prefix}_dict_cell_embs_1Kbatch{pickle_batch}"
         )
         if self.emb_mode == "cell_and_gene":
             pu.write_perturbation_dictionary(
                 stored_gene_embs_dict,
-                f"{output_path_prefix}_dict_gene_embs_1Kbatch{pickle_batch}",
             )
     def update_perturbation_dictionary(
         self,
         cos_sims_dict: defaultdict,
         cos_sims_data: torch.Tensor,
-        filtered_input_data: Dataset,
-        indices_to_perturb: List[List[int]],
         gene_list=None,
     ):
         if gene_list is not None and cos_sims_data.shape[0] != len(gene_list):
             logger.error(
                 f"len(cos_sims_data.shape[0]) != len(gene_list). \n \
-                            cos_sims_data.shape[0] = {cos_sims_data.shape[0]}.\n \
-                            len(gene_list) = {len(gene_list)}."
             )
             raise
@@ -922,4 +1460,4 @@ class InSilicoPerturber:
             for i, cos in enumerate(cos_sims_data.tolist()):
                 cos_sims_dict[(gene_list[i], "cell_emb")].append(cos)
-        return cos_sims_dict

 import os
 import pickle
 from collections import defaultdict
 from multiprocess import set_start_method
 import torch
+from datasets import Dataset, disable_progress_bars
 from tqdm.auto import trange
 from . import perturber_utils as pu
 from .emb_extractor import get_embs
 from .perturber_utils import TOKEN_DICTIONARY_FILE
+disable_progress_bars()
 logger = logging.getLogger(__name__)
         "anchor_gene": {None, str},
         "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
         "num_classes": {int},
+        "emb_mode": {"cls", "cell", "cls_and_gene", "cell_and_gene"},
         "cell_emb_style": {"mean_pool"},
         "filter_data": {None, dict},
         "cell_states_to_model": {None, dict},
         "max_ncells": {None, int},
         "cell_inds_to_perturb": {"all", dict},
         "emb_layer": {-1, 0},
+        "token_dictionary_file" : {None, str},
         "forward_batch_size": {int},
         "nproc": {int},
     }
         emb_layer=-1,
         forward_batch_size=100,
         nproc=4,
+        token_dictionary_file=None,
+        clear_mem_ncells=1000,
     ):
         """
         Initialize in silico perturber.
         num_classes : int
             | If model is a gene or cell classifier, specify number of classes it was trained to classify.
             | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
+        emb_mode : {"cls", "cell", "cls_and_gene","cell_and_gene"}
+            | Whether to output impact of perturbation on CLS token, cell, and/or gene embeddings.
             | Gene embedding shifts only available as compared to original cell, not comparing to goal state.
         cell_emb_style : "mean_pool"
+            | Method for summarizing cell embeddings if not using CLS token.
             | Currently only option is mean pooling of gene embeddings for given cell.
         filter_data : None, dict
             | Default is to use all input data for in silico perturbation study.
             | Number of CPU processes to use.
         token_dictionary_file : Path
             | Path to pickle file containing token dictionary (Ensembl ID:token).
+        clear_mem_ncells : int
+            | Clear memory every n cells.
         """
         try:
             set_start_method("spawn")
         self.emb_layer = emb_layer
         self.forward_batch_size = forward_batch_size
         self.nproc = nproc
+        self.token_dictionary_file = token_dictionary_file
+        self.clear_mem_ncells = clear_mem_ncells
         self.validate_options()
         # load token dictionary (Ensembl IDs:token)
+        if self.token_dictionary_file is None:
+            token_dictionary_file = TOKEN_DICTIONARY_FILE
         with open(token_dictionary_file, "rb") as f:
             self.gene_token_dict = pickle.load(f)
+        self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
         self.pad_token_id = self.gene_token_dict.get("<pad>")
+        self.cls_token_id = self.gene_token_dict.get("<cls>")
+        self.eos_token_id = self.gene_token_dict.get("<eos>")
+        # Identify if special token is present in the token dictionary
+        if (self.cls_token_id is not None) and (self.eos_token_id is not None):
+            self.special_token = True
+        else:
+            if "cls" in self.emb_mode:
+                logger.error(f"emb_mode set to {self.emb_mode} but <cls> or <eos> token not in token dictionary.")
+                raise
+            self.special_token = False
         if self.anchor_gene is None:
             self.anchor_token = None
                         continue
             valid_type = False
             for option in valid_options:
+                if (option in [bool, int, list, dict, str]) and isinstance(
                     attr_value, option
                 ):
                     valid_type = True
         self.max_len = pu.get_model_input_size(model)
         layer_to_quant = pu.quant_layers(model) + self.emb_layer
         ### filter input data ###
         # general filtering of input data based on filter_data argument
         filtered_input_data = pu.load_and_filter(
             self.filter_data, self.nproc, input_data_file
         )
+        # Ensure emb_mode is cls if first token of the filtered input data is cls token
+        if self.special_token:
+            if (filtered_input_data["input_ids"][0][0] == self.cls_token_id) and ("cls" not in self.emb_mode):
+                logger.error(
+                            "Emb mode 'cls' or 'cls_and_gene' required when first token is <cls>."
+                        )
+                raise
+            if ("cls" in self.emb_mode):
+                if (filtered_input_data["input_ids"][0][0] != self.cls_token_id) or (filtered_input_data["input_ids"][0][-1] != self.eos_token_id):
+                    logger.error(
+                                "Emb mode 'cls' and 'cls_and_gene' require that first token is <cls> and last token is <eos>."
+                            )
+                    raise
         filtered_input_data = self.apply_additional_filters(filtered_input_data)
         if self.perturb_group is True:
+            if (self.special_token) and ("cls" in self.emb_mode):
+                self.isp_perturb_set_special(
+                    model, filtered_input_data, layer_to_quant, output_path_prefix
+                )
+            else:
+                self.isp_perturb_set(
+                    model, filtered_input_data, layer_to_quant, output_path_prefix
+                )
         else:
+            if (self.special_token) and ("cls" in self.emb_mode):
+                self.isp_perturb_all_special(
+                    model, filtered_input_data, layer_to_quant, output_path_prefix
+                )
+            else:
+                self.isp_perturb_all(
+                    model, filtered_input_data, layer_to_quant, output_path_prefix
+                )
     def apply_additional_filters(self, filtered_input_data):
         # additional filtering of input data dependent on isp mode
         layer_to_quant: int,
         output_path_prefix: str,
     ):
         def make_group_perturbation_batch(example):
             example_input_ids = example["input_ids"]
             example["tokens_to_perturb"] = self.tokens_to_perturb
             if self.perturb_type == "delete":
                 example = pu.delete_indices(example)
             elif self.perturb_type == "overexpress":
+                example = pu.overexpress_tokens(example, self.max_len, self.special_token)
                 example["n_overflow"] = pu.calc_n_overflow(
                     self.max_len,
                     example["length"],
                     layer_to_quant,
                     self.pad_token_id,
                     self.forward_batch_size,
+                    token_gene_dict=self.token_gene_dict,
                     summary_stat=None,
                     silent=True,
                 )
                     layer_to_quant,
                     self.pad_token_id,
                     self.forward_batch_size,
+                    token_gene_dict=self.token_gene_dict,
                     summary_stat=None,
                     silent=True,
                 )
                 cos_sims_dict = self.update_perturbation_dictionary(
                     cos_sims_dict,
                     cos_sims_data,
                     gene_list,
                 )
             else:
                     cos_sims_dict[state] = self.update_perturbation_dictionary(
                         cos_sims_dict[state],
                         cos_sims_data[state],
                         gene_list,
                     )
             del minibatch
                 f"{output_path_prefix}_gene_embs_dict_{self.tokens_to_perturb}",
             )
+    def isp_perturb_set_special(
+        self,
+        model,
+        filtered_input_data: Dataset,
+        layer_to_quant: int,
+        output_path_prefix: str,
+    ):
+        """
+        Perturb the same set of tokens in every cell and quantify the shift in the <cls> embedding.
+
+        Used for emb_mode "cls" or "cls_and_gene". Each cell is perturbed
+        (perturb_type "delete" or "overexpress") at self.tokens_to_perturb, embeddings of the
+        original and perturbed cells are extracted batch-wise with get_embs, and their cosine
+        similarities are accumulated and written with pu.write_perturbation_dictionary.
+
+        model
+            | Model handed to get_embs.
+        filtered_input_data : Dataset
+            | Cells to perturb; the "input_ids" and "length" columns are read here.
+        layer_to_quant : int
+            | Layer index handed to get_embs.
+        output_path_prefix : str
+            | Prefix of the output files:
+            | "{output_path_prefix}_cell_embs_dict_{tokens_to_perturb}" and, for "cls_and_gene",
+            | "{output_path_prefix}_gene_embs_dict_{tokens_to_perturb}".
+        """
+        def make_group_perturbation_batch(example):
+            example_input_ids = example["input_ids"]
+            example["tokens_to_perturb"] = self.tokens_to_perturb
+            # positions of the tokens to perturb that are actually present in this cell
+            indices_to_perturb = [
+                example_input_ids.index(token)
+                for token in self.tokens_to_perturb
+                if token in example_input_ids
+            ]
+            if indices_to_perturb:
+                example["perturb_index"] = indices_to_perturb
+            else:
+                # -100 indicates tokens to overexpress are not present in rank value encoding
+                example["perturb_index"] = [-100]
+            if self.perturb_type == "delete":
+                example = pu.delete_indices(example)
+            elif self.perturb_type == "overexpress":
+                example = pu.overexpress_tokens(example, self.max_len, self.special_token)
+                example["n_overflow"] = pu.calc_n_overflow(
+                    self.max_len,
+                    example["length"],
+                    self.tokens_to_perturb,
+                    indices_to_perturb,
+                )
+            return example
+        total_batch_length = len(filtered_input_data)
+        if self.cell_states_to_model is None:
+            cos_sims_dict = defaultdict(list)
+        else:
+            # one accumulator per possible cell state
+            cos_sims_dict = {
+                state: defaultdict(list)
+                for state in pu.get_possible_states(self.cell_states_to_model)
+            }
+        perturbed_data = filtered_input_data.map(
+            make_group_perturbation_batch, num_proc=self.nproc
+        )
+        if self.perturb_type == "overexpress":
+            # carry n_overflow over to the unperturbed cells so they can be truncated to match
+            filtered_input_data = filtered_input_data.add_column(
+                "n_overflow", perturbed_data["n_overflow"]
+            )
+            filtered_input_data = filtered_input_data.map(
+                pu.truncate_by_n_overflow_special, num_proc=self.nproc
+            )
+        if self.emb_mode == "cls_and_gene":
+            stored_gene_embs_dict = defaultdict(list)
+            # The perturbation is the same for every cell, so its dictionary key and the
+            # tokens to drop from the gene lists are computed once, not per gene / per batch.
+            if len(self.genes_to_perturb) > 1:
+                perturbation_key = tuple(self.tokens_to_perturb)
+            else:
+                perturbation_key = self.tokens_to_perturb[0]
+            genes_to_exclude = set(self.tokens_to_perturb) | {
+                self.cls_token_id,
+                self.eos_token_id,
+            }
+        # iterate through batches
+        for i in trange(0, total_batch_length, self.forward_batch_size):
+            max_range = min(i + self.forward_batch_size, total_batch_length)
+            inds_select = list(range(i, max_range))
+            minibatch = filtered_input_data.select(inds_select)
+            perturbation_batch = perturbed_data.select(inds_select)
+            ##### CLS Embedding Mode #####
+            if self.emb_mode == "cls":
+                original_cls_emb = get_embs(
+                    model,
+                    minibatch,
+                    "cls",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    token_gene_dict=self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                perturbation_cls_emb = get_embs(
+                    model,
+                    perturbation_batch,
+                    "cls",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    token_gene_dict=self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                # Calculate the cosine similarities
+                cls_cos_sims = pu.quant_cos_sims(
+                    perturbation_cls_emb,
+                    original_cls_emb,
+                    self.cell_states_to_model,
+                    self.state_embs_dict,
+                    emb_mode="cell",
+                )
+            ##### CLS and Gene Embedding Mode #####
+            elif self.emb_mode == "cls_and_gene":
+                full_original_emb = get_embs(
+                    model,
+                    minibatch,
+                    "gene",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                indices_to_perturb = perturbation_batch["perturb_index"]
+                # remove indices that were perturbed
+                original_emb = pu.remove_perturbed_indices_set(
+                    full_original_emb,
+                    self.perturb_type,
+                    indices_to_perturb,
+                    self.tokens_to_perturb,
+                    minibatch["length"],
+                )
+                full_perturbation_emb = get_embs(
+                    model,
+                    perturbation_batch,
+                    "gene",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                # drop the first and the last position (special tokens / padding)
+                original_emb = original_emb[:, 1:-1, :]
+                if self.perturb_type == "overexpress":
+                    # additionally skip the positions following the first one that hold
+                    # the len(tokens_to_perturb) overexpressed tokens
+                    perturbation_emb = full_perturbation_emb[:,1+len(self.tokens_to_perturb):-1,:]
+                elif self.perturb_type == "delete":
+                    perturbation_emb = full_perturbation_emb[:,1:max(perturbation_batch["length"])-1,:]
+                n_perturbation_genes = perturbation_emb.size()[1]
+                gene_cos_sims = pu.quant_cos_sims(
+                    perturbation_emb,
+                    original_emb,
+                    self.cell_states_to_model,
+                    self.state_embs_dict,
+                    emb_mode="gene",
+                )
+                # get cls emb (position 0 of the gene-level embeddings)
+                original_cls_emb = full_original_emb[:,0,:]
+                perturbation_cls_emb = full_perturbation_emb[:,0,:]
+                cls_cos_sims = pu.quant_cos_sims(
+                    perturbation_cls_emb,
+                    original_cls_emb,
+                    self.cell_states_to_model,
+                    self.state_embs_dict,
+                    emb_mode="cell",
+                )
+                # gene tokens per cell, without perturbed / special tokens, truncated to the
+                # number of gene positions kept in perturbation_emb
+                gene_list = [
+                    [g for g in genes if g not in genes_to_exclude][
+                        :n_perturbation_genes
+                    ]
+                    for genes in minibatch["input_ids"]
+                ]
+                # fill in the gene cosine similarities; stored_gene_embs_dict is a
+                # defaultdict(list), so a missing key needs no special handling
+                for cell_i, genes in enumerate(gene_list):
+                    for gene_j, affected_gene in enumerate(genes):
+                        stored_gene_embs_dict[
+                            (perturbation_key, affected_gene)
+                        ].append(gene_cos_sims[cell_i, gene_j].item())
+                del full_original_emb
+                del original_emb
+                del full_perturbation_emb
+                del perturbation_emb
+                del gene_cos_sims
+            # Update perturbation dictionary with the <cls> cosine similarities (both modes)
+            if self.cell_states_to_model is None:
+                cos_sims_dict = self.update_perturbation_dictionary(
+                    cos_sims_dict,
+                    cls_cos_sims,
+                    gene_list=None,
+                )
+            else:
+                for state in cos_sims_dict.keys():
+                    cos_sims_dict[state] = self.update_perturbation_dictionary(
+                        cos_sims_dict[state],
+                        cls_cos_sims[state],
+                        gene_list=None,
+                    )
+            # release per-batch objects before the next forward pass
+            del original_cls_emb
+            del perturbation_cls_emb
+            del cls_cos_sims
+            del minibatch
+            del perturbation_batch
+            torch.cuda.empty_cache()
+        pu.write_perturbation_dictionary(
+            cos_sims_dict,
+            f"{output_path_prefix}_cell_embs_dict_{self.tokens_to_perturb}",
+        )
+        if self.emb_mode == "cls_and_gene":
+            pu.write_perturbation_dictionary(
+                stored_gene_embs_dict,
+                f"{output_path_prefix}_gene_embs_dict_{self.tokens_to_perturb}",
+            )
     def isp_perturb_all(
         self,
         model,
                 layer_to_quant,
                 self.pad_token_id,
                 self.forward_batch_size,
+                self.token_gene_dict,
                 summary_stat=None,
                 silent=True,
             )
             # gene_list is used to assign cos sims back to genes
             # need to remove the anchor gene
             gene_list = example_cell["input_ids"][0][:]
                 layer_to_quant,
                 self.pad_token_id,
                 self.forward_batch_size,
+                self.token_gene_dict,
                 summary_stat=None,
                 silent=True,
             )
+            del perturbation_batch
             num_inds_perturbed = 1 + self.combos
             # need to remove overexpressed gene to quantify cosine shifts
             if self.perturb_type == "overexpress":
             elif self.perturb_type == "delete":
                 perturbation_emb = full_perturbation_emb
             if self.cell_states_to_model is None or self.emb_mode == "cell_and_gene":
+                original_batch = pu.make_comparison_batch(
+                    full_original_emb, indices_to_perturb, perturb_group=False
+                )
                 gene_cos_sims = pu.quant_cos_sims(
                     perturbation_emb,
                     original_batch,
                     self.state_embs_dict,
                     emb_mode="gene",
                 )
+                del original_batch
             if self.cell_states_to_model is not None:
                 original_cell_emb = pu.compute_nonpadded_cell_embedding(
                     full_original_emb, "mean_pool"
                     self.state_embs_dict,
                     emb_mode="cell",
                 )
+                del original_cell_emb
+                del perturbation_cell_emb
             if self.emb_mode == "cell_and_gene":
                 # remove perturbed index for gene list
                                 (perturbed_gene, affected_gene)
                             ] = gene_cos_sims[perturbation_i, gene_j].item()
+            del full_original_emb
+            del full_perturbation_emb
             if self.cell_states_to_model is None:
                 cos_sims_data = torch.mean(gene_cos_sims, dim=1)
                 cos_sims_dict = self.update_perturbation_dictionary(
                     cos_sims_dict,
                     cos_sims_data,
                     gene_list,
                 )
             else:
                     cos_sims_dict[state] = self.update_perturbation_dictionary(
                         cos_sims_dict[state],
                         cos_sims_data[state],
                         gene_list,
                     )
             # save dict to disk every 100 cells
+            if i % self.clear_mem_ncells/10 == 0:
                 pu.write_perturbation_dictionary(
                     cos_sims_dict,
+                    f"{output_path_prefix}_dict_cell_embs_batch{pickle_batch}",
                 )
                 if self.emb_mode == "cell_and_gene":
                     pu.write_perturbation_dictionary(
                         stored_gene_embs_dict,
+                        f"{output_path_prefix}_dict_gene_embs_batch{pickle_batch}",
                     )
             # reset and clear memory every 1000 cells
+            if i % self.clear_mem_ncells == 0:
                 pickle_batch += 1
                 if self.cell_states_to_model is None:
                     cos_sims_dict = defaultdict(list)
                 torch.cuda.empty_cache()
         pu.write_perturbation_dictionary(
+            cos_sims_dict, f"{output_path_prefix}_dict_cell_embs_batch{pickle_batch}"
         )
         if self.emb_mode == "cell_and_gene":
             pu.write_perturbation_dictionary(
                 stored_gene_embs_dict,
+                f"{output_path_prefix}_dict_gene_embs_batch{pickle_batch}",
+            )
+    def isp_perturb_all_special(
+        self,
+        model,
+        filtered_input_data: Dataset,
+        layer_to_quant: int,
+        output_path_prefix: str,
+    ):
+        """
+        Run the perturbation one cell at a time for inputs whose token list
+        starts with a CLS token and ends with an EOS token.
+
+        For each cell a batch of perturbed copies is built with
+        pu.make_perturbation_batch_special, embeddings are extracted with
+        get_embs, and the cosine similarities returned by pu.quant_cos_sims
+        are appended to running dictionaries that are periodically written
+        with pu.write_perturbation_dictionary.
+
+        Only emb_mode "cls" and "cls_and_gene" are handled by the if/elif
+        below; for any other value no similarities are computed.
+
+        **Parameters:**
+
+        model
+            | Model passed through to get_embs.
+        filtered_input_data : Dataset
+            | Cells to perturb; each row is selected by index and its
+            | "input_ids" are read.
+        layer_to_quant : int
+            | Layer argument passed through to get_embs.
+        output_path_prefix : str
+            | Prefix of the written files, which are named
+            | "{prefix}_dict_cell_embs_batch{n}" and
+            | "{prefix}_dict_gene_embs_batch{n}".
+
+        Nothing is returned; results are only written to disk.
+        """
+        # index embedded in the output file names; bumped each time memory is cleared
+        pickle_batch = -1
+        if self.cell_states_to_model is None:
+            cos_sims_dict = defaultdict(list)
+        else:
+            cos_sims_dict = {
+                state: defaultdict(list)
+                for state in pu.get_possible_states(self.cell_states_to_model)
+            }
+        if self.emb_mode == "cls_and_gene":
+            stored_gene_embs_dict = defaultdict(list)
+        # number of leading gene positions skipped below when perturb_type is "overexpress"
+        num_inds_perturbed = 1 + self.combos
+        for i in trange(len(filtered_input_data)):
+            example_cell = filtered_input_data.select([i])
+            # gene_list is used to assign cos sims back to genes
+            # need to remove the anchor gene and special tokens
+            # [:] makes a copy so the removals below act on a separate list
+            gene_list = example_cell["input_ids"][0][:]
+            for token in [self.cls_token_id, self.eos_token_id]:
+                gene_list.remove(token)
+            if self.anchor_token is not None:
+                for token in self.anchor_token:
+                    gene_list.remove(token)
+            else:
+                if self.perturb_type == "overexpress":
+                    gene_list = gene_list[
+                        num_inds_perturbed:
+                    ]  # index 0 is not overexpressed
+            perturbation_batch, indices_to_perturb = pu.make_perturbation_batch_special(
+                example_cell,
+                self.perturb_type,
+                self.tokens_to_perturb,
+                self.anchor_token,
+                self.combos,
+                self.nproc,
+            )
+            ##### CLS Embedding Mode #####
+            if self.emb_mode == "cls":
+                # Extract cls embeddings from original and perturbed cells
+                perturbation_cls_emb = get_embs(
+                    model,
+                    perturbation_batch,
+                    "cls",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                original_cls_emb = get_embs(
+                    model,
+                    example_cell,
+                    "cls",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                # Calculate cosine similarities
+                cls_cos_sims = pu.quant_cos_sims(
+                    perturbation_cls_emb,
+                    original_cls_emb,
+                    self.cell_states_to_model,
+                    self.state_embs_dict,
+                    emb_mode="cell",
+                )
+                if self.cell_states_to_model is None:
+                    cos_sims_dict = self.update_perturbation_dictionary(
+                        cos_sims_dict,
+                        cls_cos_sims,
+                        gene_list,
+                    )
+                else:
+                    for state in cos_sims_dict.keys():
+                        cos_sims_dict[state] = self.update_perturbation_dictionary(
+                            cos_sims_dict[state],
+                            cls_cos_sims[state],
+                            gene_list,
+                        )
+                # drop per-cell references before the next iteration
+                del perturbation_batch
+                del original_cls_emb
+                del perturbation_cls_emb
+                del cls_cos_sims
+            ##### CLS and Gene Embedding Mode #####
+            elif self.emb_mode == "cls_and_gene":
+                full_perturbation_emb = get_embs(
+                    model,
+                    perturbation_batch,
+                    "gene",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                # need to remove overexpressed gene and cls/eos to quantify cosine shifts
+                # NOTE(review): perturbation_emb is only bound for "overexpress" and "delete"
+                if self.perturb_type == "overexpress":
+                    perturbation_emb = full_perturbation_emb[:, 1+num_inds_perturbed:-1, :].clone().detach()
+                elif self.perturb_type == "delete":
+                    perturbation_emb = full_perturbation_emb[:, 1:-1, :].clone().detach()
+                full_original_emb = get_embs(
+                    model,
+                    example_cell,
+                    "gene",
+                    layer_to_quant,
+                    self.pad_token_id,
+                    self.forward_batch_size,
+                    self.token_gene_dict,
+                    summary_stat=None,
+                    silent=True,
+                )
+                original_batch = pu.make_comparison_batch(
+                    full_original_emb, indices_to_perturb, perturb_group=False
+                )
+                # strip the first and last positions (cls/eos) to match perturbation_emb
+                original_batch = original_batch[:, 1:-1, :].clone().detach()
+                gene_cos_sims = pu.quant_cos_sims(
+                    perturbation_emb,
+                    original_batch,
+                    self.cell_states_to_model,
+                    self.state_embs_dict,
+                    emb_mode="gene",
+                )
+                # remove perturbed index for gene list
+                # the comprehension's i is local to it and does not rebind the cell index i
+                perturbed_gene_dict = {
+                    gene: gene_list[:i] + gene_list[i + 1 :]
+                    for i, gene in enumerate(gene_list)
+                }
+                for perturbation_i, perturbed_gene in enumerate(gene_list):
+                    for gene_j, affected_gene in enumerate(
+                        perturbed_gene_dict[perturbed_gene]
+                    ):
+                        # NOTE(review): stored_gene_embs_dict is a defaultdict(list), so this
+                        # lookup should not raise KeyError; the except branch looks unreachable
+                        try:
+                            stored_gene_embs_dict[
+                                (perturbed_gene, affected_gene)
+                            ].append(gene_cos_sims[perturbation_i, gene_j].item())
+                        except KeyError:
+                            stored_gene_embs_dict[
+                                (perturbed_gene, affected_gene)
+                            ] = gene_cos_sims[perturbation_i, gene_j].item()
+                # get cls emb
+                original_cls_emb = full_original_emb[:,0,:].clone().detach()
+                perturbation_cls_emb = full_perturbation_emb[:,0,:].clone().detach()
+                cls_cos_sims = pu.quant_cos_sims(
+                    perturbation_cls_emb,
+                    original_cls_emb,
+                    self.cell_states_to_model,
+                    self.state_embs_dict,
+                    emb_mode="cell",
+                )
+                if self.cell_states_to_model is None:
+                    cos_sims_dict = self.update_perturbation_dictionary(
+                        cos_sims_dict,
+                        cls_cos_sims,
+                        gene_list,
+                    )
+                else:
+                    for state in cos_sims_dict.keys():
+                        cos_sims_dict[state] = self.update_perturbation_dictionary(
+                            cos_sims_dict[state],
+                            cls_cos_sims[state],
+                            gene_list,
+                        )
+                # drop per-cell references before the next iteration
+                del perturbation_batch
+                del original_batch
+                del full_original_emb
+                del full_perturbation_emb
+                del perturbation_emb
+                del original_cls_emb
+                del perturbation_cls_emb
+                del cls_cos_sims
+                del gene_cos_sims
+            # save dict to disk every self.clear_mem_ncells/10 (default 100) cells
+            # "/" is true division, so the modulus is a float; max(1, ...) keeps it at least 1
+            if i % max(1,self.clear_mem_ncells/10) == 0:
+                pu.write_perturbation_dictionary(
+                    cos_sims_dict,
+                    f"{output_path_prefix}_dict_cell_embs_batch{pickle_batch}",
+                )
+                if self.emb_mode == "cls_and_gene":
+                    pu.write_perturbation_dictionary(
+                        stored_gene_embs_dict,
+                        f"{output_path_prefix}_dict_gene_embs_batch{pickle_batch}",
+                    )
+            # reset and clear memory every self.clear_mem_ncells (default 1000) cells
+            # NOTE(review): this also fires at i == 0, after the save above has already
+            # written with pickle_batch == -1, so the first cell lands in the "batch-1" file
+            if i % self.clear_mem_ncells == 0:
+                pickle_batch += 1
+                if self.cell_states_to_model is None:
+                    cos_sims_dict = defaultdict(list)
+                else:
+                    cos_sims_dict = {
+                        state: defaultdict(list)
+                        for state in pu.get_possible_states(self.cell_states_to_model)
+                    }
+                if self.emb_mode == "cls_and_gene":
+                    stored_gene_embs_dict = defaultdict(list)
+                torch.cuda.empty_cache()
+        # final write of whatever accumulated since the last reset
+        pu.write_perturbation_dictionary(
+            cos_sims_dict, f"{output_path_prefix}_dict_cell_embs_batch{pickle_batch}"
+        )
+        if self.emb_mode == "cls_and_gene":
+            pu.write_perturbation_dictionary(
+                stored_gene_embs_dict,
+                f"{output_path_prefix}_dict_gene_embs_batch{pickle_batch}",
             )
     def update_perturbation_dictionary(
         self,
         cos_sims_dict: defaultdict,
         cos_sims_data: torch.Tensor,
         gene_list=None,
     ):
         if gene_list is not None and cos_sims_data.shape[0] != len(gene_list):
             logger.error(
                 f"len(cos_sims_data.shape[0]) != len(gene_list). \n \
+                            {cos_sims_data.shape[0]=}.\n \
+                            {len(gene_list)=}."
             )
             raise
             for i, cos in enumerate(cos_sims_data.tolist()):
                 cos_sims_dict[(gene_list[i], "cell_emb")].append(cos)
+        return cos_sims_dict

geneformer/perturber_utils.py CHANGED Viewed

@@ -23,8 +23,6 @@ TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary.pkl"
 ENSEMBL_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
-sns.set()
 logger = logging.getLogger(__name__)
@@ -156,8 +154,12 @@ def quant_layers(model):
     return int(max(layer_nums)) + 1
 def get_model_input_size(model):
-    return int(re.split("\(|,", str(model.bert.embeddings.position_embeddings))[1])
 def flatten_list(megalist):
@@ -222,27 +224,47 @@ def overexpress_indices(example):
     indices = example["perturb_index"]
     if any(isinstance(el, list) for el in indices):
         indices = flatten_list(indices)
-    for index in sorted(indices, reverse=True):
-        example["input_ids"].insert(0, example["input_ids"].pop(index))
     example["length"] = len(example["input_ids"])
     return example
 # for genes_to_perturb = list of genes to overexpress that are not necessarily expressed in cell
-def overexpress_tokens(example, max_len):
     # -100 indicates tokens to overexpress are not present in rank value encoding
     if example["perturb_index"] != [-100]:
         example = delete_indices(example)
-    [
-        example["input_ids"].insert(0, token)
-        for token in example["tokens_to_perturb"][::-1]
-    ]
     # truncate to max input size, must also truncate original emb to be comparable
     if len(example["input_ids"]) > max_len:
-        example["input_ids"] = example["input_ids"][0:max_len]
     example["length"] = len(example["input_ids"])
     return example
@@ -259,6 +281,13 @@ def truncate_by_n_overflow(example):
     example["length"] = len(example["input_ids"])
     return example
 def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
     # indices_to_remove is list of indices to remove
@@ -392,7 +421,81 @@ def make_perturbation_batch(
     return perturbation_dataset, indices_to_perturb
-# perturbed cell emb removing the activated/overexpressed/inhibited gene emb
 # so that only non-perturbed gene embeddings are compared to each other
 # in original or perturbed context
 def make_comparison_batch(original_emb_batch, indices_to_perturb, perturb_group):
@@ -589,9 +692,10 @@ def quant_cos_sims(
         cos = torch.nn.CosineSimilarity(dim=1)
     # if emb_mode == "gene", can only calculate gene cos sims
-    # against original cell anyways
     if cell_states_to_model is None or emb_mode == "gene":
         cos_sims = cos(perturbation_emb, original_emb).to("cuda")
     elif cell_states_to_model is not None and emb_mode == "cell":
         possible_states = get_possible_states(cell_states_to_model)
         cos_sims = dict(zip(possible_states, [[] for _ in range(len(possible_states))]))
@@ -758,4 +862,4 @@ class GeneIdHandler:
         return self.ens_to_symbol(self.token_to_ens(token))
     def symbol_to_token(self, symbol):
-        return self.ens_to_token(self.symbol_to_ens(symbol))

 ENSEMBL_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict.pkl"
 logger = logging.getLogger(__name__)
     return int(max(layer_nums)) + 1
+def get_model_emb_dims(model):
+    """Return the hidden size recorded in the model's config."""
+    model_config = model.config
+    return model_config.hidden_size
 def get_model_input_size(model):
+    """Return the maximum number of positions recorded in the model's config."""
+    model_config = model.config
+    return model_config.max_position_embeddings
 def flatten_list(megalist):
     indices = example["perturb_index"]
     if any(isinstance(el, list) for el in indices):
         indices = flatten_list(indices)
+    insert_pos = 0
+    for index in sorted(indices, reverse=False):
+        example["input_ids"].insert(insert_pos, example["input_ids"].pop(index))
+        insert_pos += 1
     example["length"] = len(example["input_ids"])
     return example
+# if CLS token present, move to 1st rather than 0th position
+def overexpress_indices_special(example):
+    """
+    Move the tokens at example["perturb_index"] to the positions right after
+    the first token, keeping them in ascending order of their original index,
+    and refresh example["length"].
+    """
+    perturb_positions = example["perturb_index"]
+    if any(isinstance(entry, list) for entry in perturb_positions):
+        perturb_positions = flatten_list(perturb_positions)
+    # slot 0 is left for the CLS token, so the first moved token goes to slot 1
+    for target_slot, source_index in enumerate(sorted(perturb_positions), start=1):
+        moved_token = example["input_ids"].pop(source_index)
+        example["input_ids"].insert(target_slot, moved_token)
+    example["length"] = len(example["input_ids"])
+    return example
 # for genes_to_perturb = list of genes to overexpress that are not necessarily expressed in cell
+def overexpress_tokens(example, max_len, special_token):
+    """
+    Put example["tokens_to_perturb"] at the front of example["input_ids"]
+    (after the first token when special_token is truthy), truncate to
+    max_len, and refresh example["length"].
+    """
     # -100 indicates tokens to overexpress are not present in rank value encoding
     if example["perturb_index"] != [-100]:
         example = delete_indices(example)
+    # with special tokens the first slot stays with the CLS token
+    insert_at = 1 if special_token else 0
+    # inserting in reverse keeps the tokens in their given order
+    for token in example["tokens_to_perturb"][::-1]:
+        example["input_ids"].insert(insert_at, token)
     # truncate to max input size, must also truncate original emb to be comparable
     if len(example["input_ids"]) > max_len:
+        if special_token:
+            # cut to max_len - 1 and re-attach the final token
+            kept_ids = example["input_ids"][0:max_len-1]
+            kept_ids = kept_ids + [example["input_ids"][-1]]
+        else:
+            kept_ids = example["input_ids"][0:max_len]
+        example["input_ids"] = kept_ids
     example["length"] = len(example["input_ids"])
     return example
     example["length"] = len(example["input_ids"])
     return example
+def truncate_by_n_overflow_special(example):
+    """
+    Shorten example["input_ids"] by example["n_overflow"] tokens while keeping
+    the final token in last place; rows without overflow are returned as is.
+    """
+    overflow = example["n_overflow"]
+    if not overflow > 0:
+        return example
+    # one slot of the shortened length is taken by the re-attached final token
+    n_kept = example["length"] - overflow - 1
+    final_token = example["input_ids"][-1]
+    example["input_ids"] = example["input_ids"][:n_kept] + [final_token]
+    example["length"] = len(example["input_ids"])
+    return example
 def remove_indices_from_emb(emb, indices_to_remove, gene_dim):
     # indices_to_remove is list of indices to remove
     return perturbation_dataset, indices_to_perturb
+def make_perturbation_batch_special(
+    example_cell, perturb_type, tokens_to_perturb, anchor_token, combo_lvl, num_proc
+) -> tuple[Dataset, List[int]]:
+    """
+    Build the perturbed copies of one cell whose token list starts with a
+    CLS token and ends with an EOS token.
+
+    **Parameters:**
+
+    example_cell : Dataset
+        | Only row 0 is read ("input_ids" and "length").
+    perturb_type : str
+        | "delete" maps delete_indices over the copies and "overexpress" maps
+        | overexpress_indices_special; any other value leaves input_ids as is.
+    tokens_to_perturb : "all" or list
+        | "all", or the tokens whose positions in the cell are perturbed.
+    anchor_token : None or list
+        | When given with combo_lvl > 0, the position of anchor_token[0] is
+        | paired with every other non-special position.
+    combo_lvl : int
+        | 0 for single perturbations, > 0 for combinations.
+    num_proc : int
+        | Processes used by Dataset.map once there are at least 400 copies.
+
+    **Returns:**
+
+    tuple
+        | (perturbation_dataset, indices_to_perturb)
+        | NOTE(review): indices_to_perturb is built as a list of lists, which
+        | does not match the List[int] annotation -- confirm intended type.
+    """
+    if combo_lvl == 0 and tokens_to_perturb == "all":
+        # NOTE(review): range_start is only bound for the four perturb_type values below
+        if perturb_type in ["overexpress", "activate"]:
+            range_start = 1
+        elif perturb_type in ["delete", "inhibit"]:
+            range_start = 0
+        range_start += 1 # Starting after the CLS token
+        indices_to_perturb = [
+            [i] for i in range(range_start, example_cell["length"][0]-1) # And excluding the EOS token
+        ]
+    # elif combo_lvl > 0 and anchor_token is None:
+    ## to implement
+    elif combo_lvl > 0 and (anchor_token is not None):
+        example_input_ids = example_cell["input_ids"][0]
+        anchor_index = example_input_ids.index(anchor_token[0])
+        # pair the anchor position with every other position; the anchor itself is dropped
+        indices_to_perturb = [
+            sorted([anchor_index, i]) if i != anchor_index else None
+            for i in range(1, example_cell["length"][0]-1) # Exclude CLS and EOS tokens
+        ]
+        indices_to_perturb = [item for item in indices_to_perturb if item is not None]
+    else:
+        example_input_ids = example_cell["input_ids"][0]
+        # tokens that do not occur in this cell are skipped
+        indices_to_perturb = [
+            [example_input_ids.index(token)] if token in example_input_ids else None
+            for token in tokens_to_perturb
+        ]
+        indices_to_perturb = [item for item in indices_to_perturb if item is not None]
+    # create all permutations of combo_lvl of modifiers from tokens_to_perturb
+    if combo_lvl > 0 and (anchor_token is None):
+        if tokens_to_perturb != "all":
+            if len(tokens_to_perturb) == combo_lvl + 1:
+                indices_to_perturb = [
+                    list(x) for x in it.combinations(indices_to_perturb, combo_lvl + 1)
+                ]
+        else:
+            all_indices = [[i] for i in range(1, example_cell["length"][0]-1)] # Exclude CLS and EOS tokens
+            all_indices = [
+                index for index in all_indices if index not in indices_to_perturb
+            ]
+            indices_to_perturb = [
+                [[j for i in indices_to_perturb for j in i], x] for x in all_indices
+            ]
+    length = len(indices_to_perturb)
+    # one copy of the cell's input_ids per perturbation
+    perturbation_dataset = Dataset.from_dict(
+        {
+            "input_ids": example_cell["input_ids"] * length,
+            "perturb_index": indices_to_perturb,
+        }
+    )
+    # batches of fewer than 400 copies are mapped in a single process
+    if length < 400:
+        num_proc_i = 1
+    else:
+        num_proc_i = num_proc
+    if perturb_type == "delete":
+        perturbation_dataset = perturbation_dataset.map(
+            delete_indices, num_proc=num_proc_i
+        )
+    elif perturb_type == "overexpress":
+        perturbation_dataset = perturbation_dataset.map(
+                overexpress_indices_special, num_proc=num_proc_i
+        )
+    perturbation_dataset = perturbation_dataset.map(measure_length, num_proc=num_proc_i)
+    return perturbation_dataset, indices_to_perturb
+# original cell emb removing the activated/overexpressed/inhibited gene emb
 # so that only non-perturbed gene embeddings are compared to each other
 # in original or perturbed context
 def make_comparison_batch(original_emb_batch, indices_to_perturb, perturb_group):
         cos = torch.nn.CosineSimilarity(dim=1)
     # if emb_mode == "gene", can only calculate gene cos sims
+    # against original cell
     if cell_states_to_model is None or emb_mode == "gene":
         cos_sims = cos(perturbation_emb, original_emb).to("cuda")
     elif cell_states_to_model is not None and emb_mode == "cell":
         possible_states = get_possible_states(cell_states_to_model)
         cos_sims = dict(zip(possible_states, [[] for _ in range(len(possible_states))]))
         return self.ens_to_symbol(self.token_to_ens(token))
     def symbol_to_token(self, symbol):
+        return self.ens_to_token(self.symbol_to_ens(symbol))

geneformer/tokenizer.py CHANGED Viewed

@@ -36,14 +36,21 @@ Geneformer tokenizer.
 from __future__ import annotations
 import logging
 import pickle
 import warnings
 from pathlib import Path
 from typing import Literal
-import anndata as ad
 import numpy as np
 import scipy.sparse as sp
 from datasets import Dataset
@@ -52,7 +59,7 @@ import loompy as lp  # noqa
 logger = logging.getLogger(__name__)
-from .perturber_utils import GENE_MEDIAN_FILE, TOKEN_DICTIONARY_FILE
 def rank_genes(gene_vector, gene_tokens):
@@ -74,6 +81,115 @@ def tokenize_cell(gene_vector, gene_tokens):
     # rank by median-scaled gene values
     return rank_genes(gene_vector[nonzero_mask], gene_tokens[nonzero_mask])
 class TranscriptomeTokenizer:
     def __init__(
@@ -85,6 +201,7 @@ class TranscriptomeTokenizer:
         special_token=False,
         gene_median_file=GENE_MEDIAN_FILE,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
     ):
         """
         Initialize tokenizer.
@@ -103,11 +220,15 @@ class TranscriptomeTokenizer:
             | Max input size of model to truncate input to.
         special_token : bool = False
             | Adds CLS token before and EOS token after rank value encoding.
         gene_median_file : Path
             | Path to pickle file containing dictionary of non-zero median
             | gene expression values across Genecorpus-30M.
         token_dictionary_file : Path
             | Path to pickle file containing token dictionary (Ensembl IDs:token).
         """
         # dictionary of custom attributes {output dataset column name: input .loom column name}
@@ -134,6 +255,10 @@ class TranscriptomeTokenizer:
         with open(token_dictionary_file, "rb") as f:
             self.gene_token_dict = pickle.load(f)
         # gene keys for full vocabulary
         self.gene_keys = list(self.gene_token_dict.keys())
@@ -214,7 +339,7 @@ class TranscriptomeTokenizer:
         return tokenized_cells, cell_metadata
     def tokenize_anndata(self, adata_file_path, target_sum=10_000):
-        adata = ad.read(adata_file_path, backed="r")
         if self.custom_attr_name_dict is not None:
             file_cell_metadata = {
@@ -256,7 +381,8 @@ class TranscriptomeTokenizer:
             idx = filter_pass_loc[i : i + self.chunk_size]
             n_counts = adata[idx].obs["n_counts"].values[:, None]
-            X_view = adata[idx, coding_miRNA_loc].X
             X_norm = X_view / n_counts * target_sum / norm_factor_vector
             X_norm = sp.csr_matrix(X_norm)
@@ -280,6 +406,8 @@ class TranscriptomeTokenizer:
                 attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
             }
         with lp.connect(str(loom_file_path)) as data:
             # define coordinates of detected protein-coding or miRNA genes and vector of their normalization factors
             coding_miRNA_loc = np.where(
@@ -341,6 +469,9 @@ class TranscriptomeTokenizer:
                 else:
                     file_cell_metadata = None
         return tokenized_cells, file_cell_metadata
     def create_dataset(

 from __future__ import annotations
+import os
 import logging
 import pickle
+import sys
 import warnings
 from pathlib import Path
 from typing import Literal
+from tqdm import tqdm
+from collections import Counter
 import numpy as np
+import scanpy as sc
+import loompy as lp
+import pandas as pd
+import anndata as ad
 import scipy.sparse as sp
 from datasets import Dataset
 logger = logging.getLogger(__name__)
+from .perturber_utils import GENE_MEDIAN_FILE, TOKEN_DICTIONARY_FILE, ENSEMBL_DICTIONARY_FILE
 def rank_genes(gene_vector, gene_tokens):
     # rank by median-scaled gene values
     return rank_genes(gene_vector[nonzero_mask], gene_tokens[nonzero_mask])
+def sum_ensembl_ids(data_directory,
+                    gene_mapping_dict,
+                    file_format = "loom",
+                    chunk_size = 512):
+    """
+    Collapse Ensembl IDs with a gene mapping dictionary and sum the counts of
+    duplicated IDs, for either a .loom or an .h5ad input.
+
+    **Parameters:**
+
+    data_directory : Path
+        | Path of the input file; it is opened as a single file
+        | (lp.connect for loom, sc.read_h5ad for h5ad).
+        | NOTE(review): the loom branch calls .with_name/.stem on it, so it
+        | presumably has to be a pathlib.Path -- confirm against callers.
+    gene_mapping_dict : dict
+        | Looked up via .get() with the upper-cased Ensembl ID of every gene.
+    file_format : {"loom", "h5ad"}
+        | Selects the branch; any other value falls through and returns None.
+    chunk_size : int
+        | Number of columns (loom) or rows (h5ad) processed per chunk.
+
+    **Returns:**
+
+    loom
+        | The input path unchanged when collapsing does not reduce the number
+        | of distinct IDs, otherwise the path of a new "<stem>__dedup.loom".
+    h5ad
+        | The AnnData object as read, or a new one with duplicated IDs summed.
+    """
+    if file_format == "loom":
+        """
+        Map Ensembl IDs from gene mapping dictionary. If duplicate Ensembl IDs are found, sum counts together.
+        """
+        with lp.connect(data_directory) as data:
+            assert "ensembl_id" in data.ra.keys(), "'ensembl_id' column missing from data.ra.keys()"
+            gene_ids_collapsed = [gene_mapping_dict.get(gene_id.upper()) for gene_id in data.ra.ensembl_id]
+            # nothing to merge when collapsing leaves the number of distinct IDs unchanged
+            if len(set(gene_ids_collapsed)) == len(set(data.ra.ensembl_id)):
+                return data_directory
+            else:
+                dedup_filename = data_directory.with_name(data_directory.stem + "__dedup.loom")
+                # NOTE(review): duplicates are detected on the raw "ensembl_id" values here,
+                # not on gene_ids_collapsed -- confirm this is intended
+                dup_genes = [idx for idx, count in Counter(data.ra["ensembl_id"]).items() if count > 1]
+                num_chunks = int(np.ceil(data.shape[1] / chunk_size))
+                first_chunk = True
+                for _, _, view in tqdm(data.scan(axis = 1, batch_size = chunk_size), total = num_chunks):
+                    # sums the rows of duplicated IDs and appends them after the unique rows
+                    def process_chunk(view, duplic_genes):
+                        data_count_view = pd.DataFrame(view, index=data.ra["ensembl_id"])
+                        unique_data_df = data_count_view.loc[~data_count_view.index.isin(duplic_genes)]
+                        dup_data_df = data_count_view.loc[data_count_view.index.isin(duplic_genes)]
+                        summed_data = dup_data_df.groupby(dup_data_df.index).sum()
+                        if not summed_data.index.is_unique:
+                            raise ValueError("Error: summed data frame non-unique.")
+                        data_count_view = pd.concat([unique_data_df, summed_data], axis=0)
+                        if not data_count_view.index.is_unique:
+                            raise ValueError("Error: final data frame non-unique.")
+                        return data_count_view
+                    processed_chunk = process_chunk(view[:, :], dup_genes)
+                    processed_array = processed_chunk.to_numpy()
+                    new_row_attrs = {"ensembl_id": processed_chunk.index.to_numpy()}
+                    # carry the other row attributes over, keyed by ensembl_id
+                    ra_keys = [k for k in data.ra.keys() if k != "ensembl_id"]
+                    for ra_value in ra_keys:
+                        mapping_dict = dict(zip(data.ra["ensembl_id"], data.ra[ra_value]))
+                        values_new = [mapping_dict[i] for i in processed_chunk.index]
+                        new_row_attrs[ra_value] = np.array(values_new)
+                    # add per-column totals when the chunk does not already carry "n_counts"
+                    if "n_counts" not in view.ca.keys():
+                        total_count_view = np.sum(view[:,:], axis=0).astype(int)
+                        view.ca["n_counts"] = total_count_view
+                    if first_chunk: # Create the Loom file with the first chunk
+                        lp.create(f"{dedup_filename}", processed_array, row_attrs=new_row_attrs, col_attrs=view.ca)
+                        first_chunk = False
+                    else: # Append subsequent chunks
+                        with lp.connect(dedup_filename, mode='r+') as dsout:
+                            dsout.add_columns(processed_array, col_attrs=view.ca)
+                return dedup_filename
+    elif file_format == "h5ad":
+        """
+        Map Ensembl IDs from gene mapping dictionary. If duplicate Ensembl IDs are found, sum counts together.
+        Returns adata object with deduplicated Ensembl IDs.
+        """
+        data = sc.read_h5ad(str(data_directory))
+        assert "ensembl_id" in data.var.columns, "'ensembl_id' column missing from data.var"
+        gene_ids_collapsed = [gene_mapping_dict.get(gene_id.upper()) for gene_id in data.var.ensembl_id]
+        # nothing to merge when collapsing leaves the number of distinct IDs unchanged
+        if len(set(gene_ids_collapsed)) == len(set(data.var.ensembl_id)):
+            return data
+        else:
+            data.var["gene_ids_collapsed"] = gene_ids_collapsed
+            data.var_names = gene_ids_collapsed
+            # presumably drops genes without a mapping entry (.get() returned None) -- TODO confirm
+            data = data[:, ~data.var.index.isna()]
+            dup_genes = [idx for idx, count in Counter(data.var_names).items() if count > 1]
+            num_chunks = int(np.ceil(data.shape[0] / chunk_size))
+            processed_genes = []
+            for i in tqdm(range(num_chunks)):
+                start_idx = i * chunk_size
+                end_idx = min((i + 1) * chunk_size, data.shape[0])
+                data_chunk = data[start_idx:end_idx, :]
+                processed_chunks = []
+                # one summed column per duplicated ID for this chunk of rows
+                for dup_gene in dup_genes:
+                    data_dup_gene = data_chunk[:, data_chunk.var_names == dup_gene]
+                    df = pd.DataFrame.sparse.from_spmatrix(data_dup_gene.X,
+                                                           index=data_dup_gene.obs_names,
+                                                           columns=data_dup_gene.var_names)
+                    df_sum = pd.DataFrame(df.sum(axis=1))
+                    df_sum.columns = [dup_gene]
+                    df_sum.index = data_dup_gene.obs.index
+                    processed_chunks.append(df_sum)
+                processed_chunks = pd.concat(processed_chunks, axis=1)
+                processed_genes.append(processed_chunks)
+            processed_genes = pd.concat(processed_genes, axis = 0)
+            var_df = pd.DataFrame({"gene_ids_collapsed" : processed_genes.columns})
+            var_df.index = processed_genes.columns
+            processed_genes = sc.AnnData(X = processed_genes,
+                                        obs = data.obs,
+                                        var = var_df)
+            data_dedup = data[:, ~data.var.index.isin(dup_genes)] # Deduplicated data
+            # append the summed columns to the genes that were never duplicated
+            data_dedup = sc.concat([data_dedup, processed_genes], axis = 1)
+            data_dedup.obs = data.obs
+            data_dedup.var = data_dedup.var.rename(columns = {"gene_ids_collapsed" : "ensembl_id"})
+            return data_dedup
 class TranscriptomeTokenizer:
     def __init__(
         special_token=False,
         gene_median_file=GENE_MEDIAN_FILE,
         token_dictionary_file=TOKEN_DICTIONARY_FILE,
+        gene_mapping_file=ENSEMBL_DICTIONARY_FILE,
     ):
         """
         Initialize tokenizer.
             | Max input size of model to truncate input to.
         special_token : bool = False
             | Adds CLS token before and EOS token after rank value encoding.
+        collapse_gene_ids : bool = False
+            | Whether to collapse gene IDs based on gene mapping dictionary.
         gene_median_file : Path
             | Path to pickle file containing dictionary of non-zero median
             | gene expression values across Genecorpus-30M.
         token_dictionary_file : Path
             | Path to pickle file containing token dictionary (Ensembl IDs:token).
+        gene_mapping_file : Path
+            | Path to pickle file containing dictionary for collapsing gene IDs.
         """
         # dictionary of custom attributes {output dataset column name: input .loom column name}
         with open(token_dictionary_file, "rb") as f:
             self.gene_token_dict = pickle.load(f)
+        # load gene mappings dictionary (Ensembl IDs:Ensembl ID)
+        with open(gene_mapping_file, "rb") as f:
+            self.gene_mapping_dict = pickle.load(f)
         # gene keys for full vocabulary
         self.gene_keys = list(self.gene_token_dict.keys())
         return tokenized_cells, cell_metadata
     def tokenize_anndata(self, adata_file_path, target_sum=10_000):
+        adata = sum_ensembl_ids(adata_file_path, self.gene_mapping_dict, file_format = "h5ad", chunk_size = self.chunk_size)
         if self.custom_attr_name_dict is not None:
             file_cell_metadata = {
             idx = filter_pass_loc[i : i + self.chunk_size]
             n_counts = adata[idx].obs["n_counts"].values[:, None]
+            X_view0 = adata[idx,:].X
+            X_view = X_view0[:, coding_miRNA_loc]
             X_norm = X_view / n_counts * target_sum / norm_factor_vector
             X_norm = sp.csr_matrix(X_norm)
                 attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
             }
+        loom_file_path = sum_ensembl_ids(loom_file_path, self.gene_mapping_dict, file_format = "loom", chunk_size = self.chunk_size)
         with lp.connect(str(loom_file_path)) as data:
             # define coordinates of detected protein-coding or miRNA genes and vector of their normalization factors
             coding_miRNA_loc = np.where(
                 else:
                     file_cell_metadata = None
+        if "__dedup" in str(loom_file_path):
+            os.remove(str(loom_file_path))
         return tokenized_cells, file_cell_metadata
     def create_dataset(