chore: update gradient checkpointing
model.py
CHANGED
@@ -311,8 +311,8 @@ class StripedHyena(nn.Module):
         self.embedding_layer = VocabParallelEmbedding(config)
         self.norm = RMSNorm(config) if config.get("final_norm", True) else None
         self.unembed = self.emb if config.tie_embeddings else VocabParallelEmbedding(config)
-        self.
-
+        self.gradient_checkpointing = False
+
         if config.get("use_flashfft", "False"):
             raise NotImplementedError("Please use standalone SH code for other custom kernels")
         else:

@@ -349,8 +349,18 @@ class StripedHyena(nn.Module):
         if type(padding_mask) == torch.Tensor:
             x = x * padding_mask[..., None]
 
-        for
-
+        for block_idx, block in enumerate(self.blocks):
+            if self.gradient_checkpointing and self.training:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, inference_params=None, padding_mask=padding_mask)
+
+                    return custom_forward
+
+                x, _ = checkpoint(create_custom_forward(block), x, use_reentrant=False)
+            else:
+                x, _ = block(x, inference_params=None, padding_mask=padding_mask)
         return x, None
 
     def initialize_inference_params(self):
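The new branch calls checkpoint(...) directly, so it assumes model.py already has that name in scope (for example via from torch.utils.checkpoint import checkpoint). The sketch below shows, hypothetically, how training code might use the flag added in this commit; the config object, the import path, the batch shapes, and the forward-call signature are illustrative assumptions and are not part of the diff.

# Hypothetical training-side sketch (not part of this commit).
# Assumes `config` is a valid StripedHyena config and that the model's forward
# accepts token ids and returns a (hidden_states, state) pair, as suggested by
# the `return x, None` in the hunk above.
import torch
from model import StripedHyena

model = StripedHyena(config)
model.gradient_checkpointing = True   # flag introduced in this commit; defaults to False
model.train()                         # checkpointing only activates when self.training is True

tokens = torch.randint(0, 512, (2, 1024))   # illustrative batch of token ids
out, _ = model(tokens)                      # block activations are recomputed during backward
loss = out.float().pow(2).mean()            # instead of being stored, trading extra compute
loss.backward()                             # for lower peak memory

Passing use_reentrant=False opts into PyTorch's non-reentrant checkpointing, which works with keyword arguments such as padding_mask and avoids the known limitations of the re-entrant implementation.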