mrm8488 committed on
Commit
c2ebb01
·
1 Parent(s): 1b5c44e

Create new file

Browse files
Files changed (1) hide show
  1. Utils.py +145 -0
Utils.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import transformers
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+ from torch.cuda.amp import custom_fwd, custom_bwd
6
+
7
+ from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
8
+
9
class FrozenBNBLinear(nn.Module):
    """Linear layer whose weight is kept as frozen, blockwise-quantized data.

    The quantized ``weight`` together with its ``absmax`` and ``code``
    tensors are registered as buffers with gradients disabled. An optional
    ``adapter`` module may be assigned after construction; when it is set,
    its output for the same input is added to the layer output.
    """

    def __init__(self, weight, absmax, code, bias=None):
        """Store the quantized weight state and the optional bias.

        Args:
            weight: 2-D quantized weight; its shape is read as
                ``(out_features, in_features)``.
            absmax: tensor forwarded to ``dequantize_blockwise`` as ``absmax``.
            code: tensor forwarded to ``dequantize_blockwise`` as ``code``.
            bias: an ``nn.Parameter`` or ``None``.
        """
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias

    def forward(self, input):
        """Return the dequantized linear output, plus the adapter output if set."""
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        # Compare against None explicitly: a module's truthiness depends on
        # ``__len__`` when it defines one, which says nothing about whether
        # an adapter has been attached.
        if self.adapter is not None:
            output += self.adapter(input)
        return output

    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        """Build a frozen layer by quantizing ``linear.weight`` and reusing ``linear.bias``."""
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)

    def __repr__(self):
        # Shown as (in_features, out_features).
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
33
+
34
+
35
class DequantizeAndLinear(torch.autograd.Function):
    """Autograd function that dequantizes a weight and applies ``F.linear``.

    Gradients are produced only for ``input`` and ``bias``; the quantized
    weight, ``absmax`` and ``code`` must not require gradients.
    """

    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        """Dequantize ``weights_quantized`` and return ``F.linear(input, weight, bias)``."""
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        # backward() only needs the quantized state to rebuild the weight;
        # ``input`` is never read there, so it is not saved.
        ctx.save_for_backward(weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)

    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        """Return gradients for ``(input, weights_quantized, absmax, code, bias)``.

        The three quantization tensors always receive ``None``.
        """
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        grad_input = None
        if ctx.needs_input_grad[0]:
            # Dequantize again rather than keeping the full-precision weight
            # alive between forward and backward.
            weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
            grad_input = grad_output @ weights_deq
        grad_bias = None
        if ctx._has_bias and ctx.needs_input_grad[4]:
            # Collapse every leading dimension, then sum over it; reshape also
            # accepts a 1-D grad_output.
            grad_bias = grad_output.reshape(-1, grad_output.shape[-1]).sum(dim=0)
        return grad_input, None, None, None, grad_bias
55
+
56
+
57
class FrozenBNBEmbedding(nn.Module):
    """Embedding layer whose table is kept as frozen, blockwise-quantized data.

    The quantized ``weight`` together with its ``absmax`` and ``code``
    tensors are registered as buffers with gradients disabled. An optional
    ``adapter`` module may be assigned after construction; when it is set,
    its output for the same indices is added to the embedding output.
    """

    def __init__(self, weight, absmax, code):
        """Store the quantized embedding state.

        Args:
            weight: 2-D quantized table; its shape is read as
                ``(num_embeddings, embedding_dim)``.
            absmax: tensor forwarded to ``dequantize_blockwise`` as ``absmax``.
            code: tensor forwarded to ``dequantize_blockwise`` as ``code``.
        """
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None

    def forward(self, input, **kwargs):
        """Look up ``input`` in the dequantized table; ``kwargs`` go to ``F.embedding``."""
        with torch.no_grad():
            # note: both quantized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        # The adapter runs outside no_grad so that it stays differentiable.
        # Compare against None explicitly: a module's truthiness depends on
        # ``__len__`` when it defines one.
        if self.adapter is not None:
            output += self.adapter(input)
        return output

    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        """Build a frozen layer by quantizing ``embedding.weight``."""
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)

    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
82
+
83
+
84
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    """Quantize ``matrix`` with ``quantize_blockwise``, one chunk at a time.

    The flattened matrix is processed in slices of ``chunk_size`` elements so
    that only one slice is copied and quantized per call. The ``code``
    returned by each call is passed on to the next one (``None`` for the
    first call).

    Args:
        matrix: tensor to quantize; it does not have to be contiguous.
        chunk_size: number of elements per chunk; must be a positive
            multiple of 4096 (presumably the quantization block size, so
            that chunk edges align with block edges -- TODO confirm).

    Returns:
        ``(matrix_i8, (absmax, code))``: the quantized data reshaped like
        ``matrix``, the concatenated per-chunk ``absmax`` tensors, and the
        ``code`` returned by the last call.
    """
    assert chunk_size > 0 and chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    # reshape (not view) so non-contiguous inputs are accepted as well.
    flat_tensor = matrix.reshape(-1)
    for start in range(0, matrix.numel(), chunk_size):
        input_chunk = flat_tensor[start:start + chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)

    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)
99
+
100
+
101
def convert_to_int8(model):
    """Swap every linear and embedding child of ``model`` for a frozen 8-bit layer.

    Each ``nn.Linear`` child becomes a ``FrozenBNBLinear`` (keeping the
    original bias) and each ``nn.Embedding`` child becomes a
    ``FrozenBNBEmbedding``. The replacements are created with zero-filled
    ``weight``, ``absmax`` and ``code`` tensors, i.e. the original weight
    values are not quantized here. The model is modified in place.
    """
    # Snapshot the module list first: children are replaced while walking it.
    for parent in list(model.modules()):
        for child_name, layer in parent.named_children():
            if isinstance(layer, nn.Linear):
                print(child_name, layer)
                replacement = FrozenBNBLinear(
                    weight=torch.zeros(layer.out_features, layer.in_features, dtype=torch.uint8),
                    # one absmax entry per 4096 weight elements, rounded up
                    absmax=torch.zeros((layer.weight.numel() - 1) // 4096 + 1),
                    code=torch.zeros(256),
                    bias=layer.bias,
                )
            elif isinstance(layer, nn.Embedding):
                replacement = FrozenBNBEmbedding(
                    weight=torch.zeros(layer.num_embeddings, layer.embedding_dim, dtype=torch.uint8),
                    absmax=torch.zeros((layer.weight.numel() - 1) // 4096 + 1),
                    code=torch.zeros(256),
                )
            else:
                continue
            setattr(parent, child_name, replacement)
127
+
128
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    """GPT-J block whose ``attn`` and ``mlp`` sub-modules are run through ``convert_to_int8``."""

    def __init__(self, config):
        super().__init__(config)

        # Same order as before: attention first, then the MLP.
        for submodule in (self.attn, self.mlp):
            convert_to_int8(submodule)
134
+
135
+
136
class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    """GPT-J model that passes itself through ``convert_to_int8`` after construction."""

    def __init__(self, config):
        # Build the regular model first, then replace its layers in place.
        super().__init__(config)
        convert_to_int8(self)
140
+
141
+
142
class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    """GPT-J causal LM that passes itself through ``convert_to_int8`` after construction."""

    def __init__(self, config):
        # Build the regular model first, then replace its layers in place.
        super().__init__(config)
        convert_to_int8(self)