Improved precision / reduced frequency of NaN outputs; allow bf16 T5, fp32 RMSNorm, larger clamp
Files changed:
- README.md +7 -0
- modules/conditioner.py +7 -1
- modules/flux_model.py +14 -9
- util.py +2 -1
README.md CHANGED
@@ -79,6 +79,13 @@ pipeline = FluxPipeline.load_pipeline_from_config_path(config_path, **config_ove
 pipeline.load_lora(lora_path, scale=1.0)
 ```
 
+### Updates 09/07/24
+
+- Improve quality by ensuring that the RMSNorm layers use fp32.
+- Raise the clamp range for single blocks & double blocks to +/-32000 to reduce deviation from expected outputs.
+- Make bf16 _not_ clamp, which improves quality; the clamp isn't needed since bf16 is the expected dtype for flux. **I would now recommend always using `"flow_dtype": "bfloat16"` in the config**, though it will slow things down on consumer GPUs - but not by much, since most of the compute still happens via fp8.
+- Allow the T5 model to be run without any quantization by specifying `"text_enc_quantization_dtype": "bfloat16"` in the config - or `"float16"`, though that is not recommended since T5 deviates a bit when running in float16. I noticed that even with qint8/qfloat8 there is a bit of deviation from bf16 text encoder outputs, so for those who want more accurate / expected text encoder outputs, you can use this option.
+
 ## Installation
 
 This repo _requires_ at least pytorch with cuda=12.4 and an ADA gpu with fp8 support, otherwise `torch._scaled_mm` will throw a CUDA error saying it's not supported. To install with conda/mamba:
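The two recommended settings from the update notes can be combined when loading the pipeline. The sketch below is illustrative rather than repo code: it assumes that extra keyword arguments to `load_pipeline_from_config_path` act as config overrides (the kwargs-splat in the README's load call suggests this), and the import path, config path, and LoRA path are placeholders.

```python
# Illustrative only - not repo code. Assumes extra keyword arguments to
# load_pipeline_from_config_path act as config overrides; paths are placeholders.
from flux_pipeline import FluxPipeline  # module name assumed

pipeline = FluxPipeline.load_pipeline_from_config_path(
    "configs/config-dev.json",               # placeholder config path
    flow_dtype="bfloat16",                   # recommended above; avoids the fp16 clamp path
    text_enc_quantization_dtype="bfloat16",  # run the T5 text encoder unquantized
)

lora_path = "my_lora.safetensors"            # placeholder
pipeline.load_lora(lora_path, scale=1.0)     # as in the README snippet above
```

Per the notes above, with the flow dtype set to bfloat16 the single/double stream blocks never clamp, and the fp8 matmuls are unaffected either way.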
modules/conditioner.py CHANGED
@@ -29,6 +29,8 @@ def auto_quantization_config(
         return BitsAndBytesConfig(load_in_8bit=True, llm_int8_has_fp16_weight=False)
     elif quantization_dtype == "qint2":
         return QuantoConfig(weights="int2")
+    elif quantization_dtype is None or quantization_dtype == "bfloat16":
+        return None
     else:
         raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")
 
@@ -57,7 +59,11 @@ class HFEmbedder(nn.Module):
         self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
 
         auto_quant_config = (
-            auto_quantization_config(quantization_dtype)
+            auto_quantization_config(quantization_dtype)
+            if quantization_dtype is not None
+            and quantization_dtype != "bfloat16"
+            and quantization_dtype != "float16"
+            else None
         )
 
         # BNB will move to cuda:0 by default if not specified
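Read together, the two hunks mean the quantization machinery is only consulted for the q* dtypes; `None`, `"bfloat16"`, and `"float16"` now skip it entirely, so the T5 encoder is loaded in a plain torch dtype. A rough standalone restatement of that gate (the helper name is illustrative, not part of the repo):

```python
# Rough restatement of the gating in HFEmbedder above; helper name is illustrative.
def uses_quantization_config(quantization_dtype: str | None) -> bool:
    """True only when auto_quantization_config should be consulted."""
    return quantization_dtype is not None and quantization_dtype not in ("bfloat16", "float16")

assert uses_quantization_config("qint8")         # quanto / bitsandbytes path
assert not uses_quantization_config("bfloat16")  # plain bf16 T5, no quant config
assert not uses_quantization_config(None)        # no dtype given -> no quantization
```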
modules/flux_model.py CHANGED
@@ -159,7 +159,7 @@ class RMSNorm(torch.nn.Module):
         self.scale = nn.Parameter(torch.ones(dim))
 
     def forward(self, x: Tensor):
-        return F.rms_norm(x, self.scale.shape, self.scale, eps=1e-6)
+        return F.rms_norm(x.float(), self.scale.shape, self.scale, eps=1e-6).to(x)
 
 
 class QKNorm(torch.nn.Module):
@@ -344,7 +344,7 @@ class DoubleStreamBlock(nn.Module):
         self.K = 3
         self.H = self.num_heads
         self.KH = self.K * self.H
-
+        self.do_clamp = dtype == torch.float16
     def rearrange_for_norm(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
         B, L, D = x.shape
         q, k, v = x.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
@@ -384,14 +384,16 @@ class DoubleStreamBlock(nn.Module):
         img = img + img_mod1.gate * self.img_attn.proj(img_attn)
         img = img + img_mod2.gate * self.img_mlp(
             (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
-        )
+        )
 
         # calculate the txt bloks
         txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
         txt = txt + txt_mod2.gate * self.txt_mlp(
             (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
-        )
-
+        )
+        if self.do_clamp:
+            img = img.clamp(min=-32000, max=32000)
+            txt = txt.clamp(min=-32000, max=32000)
         return img, txt
 
 
@@ -457,6 +459,7 @@ class SingleStreamBlock(nn.Module):
         self.K = 3
         self.H = self.num_heads
         self.KH = self.K * self.H
+        self.do_clamp = dtype == torch.float16
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
         mod = self.modulation(vec)[0]
@@ -471,10 +474,12 @@ class SingleStreamBlock(nn.Module):
         q, k, v = qkv.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
         q, k = self.norm(q, k, v)
         attn = attention(q, k, v, pe=pe)
-        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-
-
-
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        if self.do_clamp:
+            out = (x + mod.gate * output).clamp(min=-32000, max=32000)
+        else:
+            out = x + mod.gate * output
+        return out
 
 
 class LastLayer(nn.Module):
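The +/-32000 clamp and the `do_clamp = dtype == torch.float16` gate follow from float16's numeric range: fp16 saturates just above 65504, so a large intermediate activation becomes inf and then NaN a few ops later, while bf16 shares fp32's exponent range and needs no guard. A small standalone check (not repo code):

```python
import torch

x = torch.tensor([70000.0])                      # larger than fp16's max finite value (~65504)
print(x.to(torch.float16))                       # inf - this is how NaNs start in fp16 runs
print(x.to(torch.bfloat16))                      # finite: bf16 keeps fp32's exponent range
print(x.to(torch.float16).clamp(-32000, 32000))  # 32000.0 - pulled back to a finite value,
                                                 # so it can't turn into NaN downstream
```

The RMSNorm change is the same idea applied to precision rather than range: the normalization runs on `x.float()` and the result is cast back with `.to(x)`, so the reduction happens in fp32 regardless of the activation dtype.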
util.py CHANGED
@@ -31,7 +31,8 @@ class QuantizationDtype(StrEnum):
     qint2 = "qint2"
     qint4 = "qint4"
     qint8 = "qint8"
-
+    bfloat16 = "bfloat16"
+    float16 = "float16"
 
 class ModelSpec(BaseModel):
     version: ModelVersion
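For completeness, a quick round-trip check (not repo code) of the extended enum. Only the members visible in the hunk are reproduced here, and the comment about config validation assumes `ModelSpec` declares a field of this enum type, which the surrounding code suggests but the hunk does not show.

```python
from enum import StrEnum  # the repo's QuantizationDtype is a StrEnum (Python 3.11+)

class QuantizationDtype(StrEnum):
    qint2 = "qint2"
    qint4 = "qint4"
    qint8 = "qint8"
    bfloat16 = "bfloat16"
    float16 = "float16"

# Pydantic would coerce the string from the JSON config into the enum member,
# so "bfloat16" / "float16" now validate instead of raising.
assert QuantizationDtype("bfloat16") is QuantizationDtype.bfloat16
assert QuantizationDtype.float16 == "float16"  # StrEnum members compare equal to their values
```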