Update modeling_esm_plusplus.py
modeling_esm_plusplus.py  +9 -7
@@ -537,10 +537,10 @@ class ESMplusplusForMaskedLM(PreTrainedModel):
             batch_size: Batch size for processing
             max_len: Maximum sequence length
             full_embeddings: Whether to return full residue-wise (True) embeddings or pooled (False)
-            full_precision: Whether to cast to full precision (float32) before storage
+            full_precision: Whether to cast to full precision (float32) before storage - relevant for dict storage
             pooling_type: Type of pooling ('mean' or 'cls')
             num_workers: Number of workers for data loading, 0 for the main process
-            sql: Whether to store embeddings in SQLite database
+            sql: Whether to store embeddings in SQLite database - will be stored in float32
             sql_db_path: Path to SQLite database

         Returns:
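The arguments above document the model's dataset-embedding helper. A minimal usage sketch follows, assuming the helper is exposed as embed_dataset on the loaded model and using an illustrative checkpoint id; neither name appears in this hunk, so treat both as assumptions.

# Hedged usage sketch: embed_dataset and the checkpoint id are assumed, not shown in the diff.
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained(
    'Synthyra/ESMplusplus_small',  # illustrative checkpoint id
    trust_remote_code=True,
).eval()

sequences = ['MSEQVENCE', 'MPRTEIN']
embeddings = model.embed_dataset(
    sequences,
    batch_size=2,
    max_len=512,
    full_embeddings=False,       # pooled per-sequence vectors instead of residue-wise matrices
    full_precision=True,         # cast to float32 before storing in the returned dict
    pooling_type='mean',         # or 'cls'
    num_workers=0,
    sql=False,                   # True streams results into SQLite instead of returning a dict
    sql_db_path='embeddings.db',
)
# embeddings maps each input sequence to its embedding, as in the dict path shown further below.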
@@ -553,12 +553,12 @@ class ESMplusplusForMaskedLM(PreTrainedModel):
         device = self.device

         def get_embeddings(residue_embeddings: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
-            if full_precision:
-                residue_embeddings = residue_embeddings.float()
             if full_embeddings:
                 return residue_embeddings
-
-
+            elif pooling_type == 'mean':
+                return self.mean_pooling(residue_embeddings, attention_mask)
+            else:
+                return residue_embeddings[:, 0, :]

         if sql:
             import sqlite3
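For the pooled path, the new branches delegate to self.mean_pooling for 'mean' and take the first position for 'cls'. Below is a minimal, standalone sketch of mask-weighted mean pooling, assuming the repository's helper follows this standard recipe; it is illustrative rather than the exact implementation.

import torch

def mean_pooling(residue_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # residue_embeddings: (batch, seq_len, hidden), attention_mask: (batch, seq_len)
    mask = attention_mask.unsqueeze(-1).to(residue_embeddings.dtype)  # (batch, seq_len, 1)
    summed = (residue_embeddings * mask).sum(dim=1)                   # zero out padding, sum over residues
    counts = mask.sum(dim=1).clamp(min=1e-9)                          # number of real tokens per sequence
    return summed / counts                                            # (batch, hidden)

# 'cls' pooling simply takes the first position: residue_embeddings[:, 0, :]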
@@ -575,7 +575,7 @@ class ESMplusplusForMaskedLM(PreTrainedModel):
                 seqs = sequences[i * batch_size:(i + 1) * batch_size]
                 input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                 x = self.embed(input_ids)
-                residue_embeddings = self.transformer(x, attention_mask).last_hidden_state
+                residue_embeddings = self.transformer(x, attention_mask).last_hidden_state.float() # required for sql
                 embeddings = get_embeddings(residue_embeddings, attention_mask)

                 for seq, emb in zip(seqs, embeddings):
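The .float() cast on the SQLite path is marked "required for sql". A plausible reason is that each embedding is serialized to raw bytes (for example through NumPy) before being written as a BLOB, and NumPy has no bfloat16 dtype, so half-precision activations would not round-trip. The sketch below illustrates that pattern with a hypothetical table schema, not the repository's actual one.

import sqlite3
import numpy as np
import torch

conn = sqlite3.connect('embeddings.db')
c = conn.cursor()
c.execute('CREATE TABLE IF NOT EXISTS embeddings (sequence TEXT PRIMARY KEY, embedding BLOB)')

seq = 'MSEQVENCE'
emb = torch.randn(960).to(torch.bfloat16)       # stand-in for one pooled embedding
blob = emb.float().cpu().numpy().tobytes()      # float32 makes the NumPy round trip well defined
c.execute('INSERT OR REPLACE INTO embeddings VALUES (?, ?)', (seq, blob))
conn.commit()

# Reading back: a fixed float32 layout means the element width is known at load time.
row = c.execute('SELECT embedding FROM embeddings WHERE sequence = ?', (seq,)).fetchone()
restored = torch.from_numpy(np.frombuffer(row[0], dtype=np.float32).copy())
conn.close()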
@@ -596,6 +596,8 @@ class ESMplusplusForMaskedLM(PreTrainedModel):
                 input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
                 x = self.embed(input_ids)
                 residue_embeddings = self.transformer(x, attention_mask).last_hidden_state
+                if full_precision:
+                    residue_embeddings = residue_embeddings.float()
                 embeddings = get_embeddings(residue_embeddings, attention_mask)
                 for seq, emb in zip(seqs, embeddings):
                     embeddings_dict[seq] = emb
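On the in-memory path the cast is now applied only when full_precision is set, so embeddings can stay in the model's half precision by default. The sketch below only illustrates the storage trade-off; the shapes and hidden size are made up.

import torch

pooled = torch.randn(1000, 960).to(torch.bfloat16)     # e.g. 1000 pooled embeddings, hidden size 960 (illustrative)
full = pooled.float()                                   # what full_precision=True would keep instead

bytes_half = pooled.numel() * pooled.element_size()     # 2 bytes per value
bytes_full = full.numel() * full.element_size()         # 4 bytes per value
print(f'bfloat16: {bytes_half / 1e6:.2f} MB, float32: {bytes_full / 1e6:.2f} MB')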