Tu2003716 committed on
Commit
9d3c242
·
verified ·
1 Parent(s): 027ecf9

Change flow to allow user to choose the attn type

Browse files
Files changed (1) hide show
  1. modeling_cocom.py +3 -3
modeling_cocom.py CHANGED
@@ -71,7 +71,7 @@ class COCOMConfig(PretrainedConfig):
71
  lora = False,
72
  training_form="both",
73
  lora_r=16,
74
- disable_flash_attention=True,
75
  **kwargs):
76
  super().__init__(**kwargs)
77
 
@@ -85,14 +85,14 @@ class COCOMConfig(PretrainedConfig):
85
  self.lora = lora # boolean type, whether to use lora training
86
  self.training_form = training_form # training form, could be compressor: training only compressor; both:
87
  self.lora_r = lora_r # lora_r for lora training, we use 16 throughout the experiment.
88
- self.disable_flash_attention = disable_flash_attention
89
 
90
  class COCOM(PreTrainedModel):
91
  config_class = COCOMConfig
92
  def __init__(self, cfg):
93
  super().__init__(cfg)
94
  # define models
95
- attn_impl = "flash_attention_2" if not cfg.disable_flash_attention else "default"
96
  # model could be loaded in three quantization modes: no, int4, int8
97
  if cfg.quantization == "no":
98
  self.decoder = AutoModelForCausalLM.from_pretrained(
 
71
  lora = False,
72
  training_form="both",
73
  lora_r=16,
74
+ attn_implementation="eager",
75
  **kwargs):
76
  super().__init__(**kwargs)
77
 
 
85
  self.lora = lora # boolean type, whether to use lora training
86
  self.training_form = training_form # training form, could be compressor: training only compressor; both:
87
  self.lora_r = lora_r # lora_r for lora training, we use 16 throughout the experiment.
88
+ self.attn_implementation = attn_implementation
89
 
90
  class COCOM(PreTrainedModel):
91
  config_class = COCOMConfig
92
  def __init__(self, cfg):
93
  super().__init__(cfg)
94
  # define models
95
+ attn_impl = cfg.attn_implementation
96
  # model could be loaded in three quantization modes: no, int4, int8
97
  if cfg.quantization == "no":
98
  self.decoder = AutoModelForCausalLM.from_pretrained(