aredden committed · Commit b6617b1 · 1 Parent(s): 2f2c44c

cuda version checks

Files changed (3)
  1. README.md +24 -2
  2. float8_quantize.py +22 -6
  3. requirements.txt +2 -1
README.md CHANGED
@@ -1,4 +1,4 @@
- # Flux FP16 Accumulate Model Implementation with FastAPI
+ # Flux FP8 (true) Matmul Implementation with FastAPI
 
  This repository contains an implementation of the Flux model, along with an API that allows you to generate images based on text prompts. The API can be run via command-line arguments.
 
@@ -13,12 +13,34 @@ This repository contains an implementation of the Flux model, along with an API
 
  ## Installation
 
+ This repo _requires_ at least PyTorch 2.4 built with CUDA 12.4 and an Ada GPU with fp8 support; otherwise `torch._scaled_mm` will throw a CUDA error saying it's not supported. To install with conda/mamba:
+
+ ```bash
+ mamba create -n flux-fp8-matmul-api python=3.11 pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
+ mamba activate flux-fp8-matmul-api
+
+ # or with conda
+ conda create -n flux-fp8-matmul-api python=3.11 pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia
+ conda activate flux-fp8-matmul-api
+
+ # or with the nightly channel (which is what I am using) - switch 'mamba' to 'conda' if you are using conda
+ mamba create -n flux-fp8-matmul-api python=3.11 pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch-nightly -c nvidia
+ mamba activate flux-fp8-matmul-api
+
+ # or with pip
+ python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+ # or pip nightly
+ python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124
+ ```
+
  To install the required dependencies, run:
 
  ```bash
- pip install -r requirements.txt
+ python -m pip install -r requirements.txt
  ```
 
+ If you get errors installing `torch-cublas-hgemm`, feel free to comment it out in requirements.txt; it isn't required, but it speeds up inference for non-fp8 linear layers.
+
  ## Usage
 
  You can run the API server using the following command:
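The README states the hardware and version requirements only in prose. Below is a minimal preflight sketch (a hypothetical helper, not part of this repo) that mirrors the version checks the commit adds to `float8_quantize.py`, plus the Ada compute-capability requirement the README implies:

```python
# preflight.py - hypothetical helper, not part of this repo.
# Mirrors the stated requirements: PyTorch >= 2.4, CUDA >= 12.4,
# and an Ada-or-newer GPU (compute capability >= 8.9) for fp8 matmul.
import torch

def assert_fp8_capable() -> None:
    # torch.__version__ is a TorchVersion, which supports tuple comparison
    if torch.__version__ < (2, 4):
        raise RuntimeError(f"PyTorch >= 2.4 required, found {torch.__version__}")
    if torch.version.cuda is None:
        raise RuntimeError("PyTorch was built without CUDA support")
    # parse as ints so e.g. "12.10" compares correctly
    major, minor = (int(v) for v in torch.version.cuda.split(".")[:2])
    if (major, minor) < (12, 4):
        raise RuntimeError(f"CUDA >= 12.4 required, found {torch.version.cuda}")
    if not torch.cuda.is_available():
        raise RuntimeError("No CUDA device available")
    cc = torch.cuda.get_device_capability()
    if cc < (8, 9):  # sm_89 == Ada; torch._scaled_mm needs fp8 hardware
        raise RuntimeError(f"GPU compute capability {cc} lacks fp8 support")
```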
float8_quantize.py CHANGED
@@ -9,8 +9,21 @@ from torchao.float8.float8_utils import (
  from torch.nn import init
  import math
  from torch.compiler import is_compiling
+ from torch import __version__
+ from torch.version import cuda
 
-
+ IS_TORCH_2_4 = __version__ >= (2, 4) and __version__ < (2, 5)
+ LT_TORCH_2_4 = __version__ < (2, 4)
+ if LT_TORCH_2_4:
+     if not hasattr(torch, "_scaled_mm"):
+         raise RuntimeError(
+             "This version of PyTorch is not supported. Please upgrade to PyTorch 2.4 with CUDA 12.4 or later."
+         )
+ CUDA_VERSION = float(cuda) if cuda else 0
+ if CUDA_VERSION < 12.4:
+     raise RuntimeError(
+         f"This version of PyTorch is not supported. Please upgrade to PyTorch 2.4 with CUDA 12.4 or later; got torch version {__version__} and CUDA version {cuda}."
+     )
  try:
      from cublas_ops import CublasLinear
  except ImportError:
@@ -244,19 +257,22 @@ class F8Linear(nn.Module):
          x = self.quantize_input(x)
 
          prev_dims = x.shape[:-1]
-
          x = x.view(-1, self.in_features)
 
          # float8 matmul, much faster than float16 matmul w/ float32 accumulate on ADA devices!
-         return torch._scaled_mm(
+         out = torch._scaled_mm(
              x,
              self.float8_data.T,
-             self.input_scale_reciprocal,
-             self.scale_reciprocal,
+             scale_a=self.input_scale_reciprocal,
+             scale_b=self.scale_reciprocal,
              bias=self.bias,
              out_dtype=self.weight.dtype,
              use_fast_accum=True,
-         ).view(*prev_dims, self.out_features)
+         )
+         if IS_TORCH_2_4:
+             out = out[0]
+         out = out.view(*prev_dims, self.out_features)
+         return out
 
      @classmethod
      def from_linear(
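For readers unfamiliar with `torch._scaled_mm`, the sketch below shows the pattern the new `forward` uses: quantize both operands to `float8_e4m3fn`, pass the dequantization scales as `scale_a`/`scale_b`, and unwrap the tuple that torch 2.4 returns (the reason for the `IS_TORCH_2_4` branch above). The shapes and the simple tensor-wise scale choice are illustrative, not the repo's exact quantization logic:

```python
# Illustrative only: tensor-wise fp8 matmul in the style of F8Linear.forward,
# assuming PyTorch >= 2.4 with CUDA 12.4 on an Ada GPU.
import torch

device = "cuda"
x = torch.randn(16, 64, device=device, dtype=torch.bfloat16)  # activations
w = torch.randn(32, 64, device=device, dtype=torch.bfloat16)  # (out, in) weight

f8 = torch.float8_e4m3fn
x_scale = x.abs().max().float() / torch.finfo(f8).max  # float32 dequant scale
w_scale = w.abs().max().float() / torch.finfo(f8).max
x_f8 = (x / x_scale).to(f8)  # quantize: multiply by the reciprocal of the scale
w_f8 = (w / w_scale).to(f8)

out = torch._scaled_mm(
    x_f8,
    w_f8.T,              # second operand must be column-major
    scale_a=x_scale,     # dequant scales, multiplied into the result
    scale_b=w_scale,
    out_dtype=torch.bfloat16,
    use_fast_accum=True,
)
if torch.__version__ >= (2, 4) and torch.__version__ < (2, 5):
    out = out[0]  # torch 2.4 returns (out, amax); 2.5+ returns just the tensor
print(out.shape)  # torch.Size([16, 32])
```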
requirements.txt CHANGED
@@ -12,4 +12,5 @@ sentencepiece
  click
  accelerate
  quanto
- pydash
+ pydash
+ pybase64
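`pybase64` is a SIMD-accelerated drop-in replacement for the stdlib `base64` module. The diff doesn't show its call site; a plausible (assumed) use is encoding generated images for the API's JSON responses, sketched below:

```python
# Hypothetical use of the new pybase64 dependency, not shown in this diff:
# fast base64 encoding of PNG bytes for a JSON API response.
import io

import pybase64
from PIL import Image

def encode_image(img: Image.Image) -> str:
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    # pybase64 mirrors the stdlib base64 API but uses an accelerated codec
    return pybase64.b64encode(buf.getvalue()).decode("utf-8")
```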