Runtime error
Runtime error
import torch | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from data_utils import * | |
from attention import SelfAttentionHead, MultiHeadAttention, FeedForwardNet, DecoderBlock | |
class BigramLanguageModel(nn.Module):
    """Autoregressive language model built from a stack of ``DecoderBlock`` layers.

    Token ids are embedded, summed with learned position embeddings, passed
    through ``n_layers`` ``DecoderBlock`` modules and a final ``LayerNorm``,
    then projected to vocabulary logits by a linear head.

    Args:
        vocab_size: Number of distinct token ids.
        n_embed: Embedding width used throughout the model.
        block_size: Maximum context length; also the size of the position
            embedding table.
        num_heads: Forwarded to every ``DecoderBlock``.
        n_layers: Number of ``DecoderBlock`` modules in the stack.
    """

    def __init__(self, vocab_size: int, n_embed: int, block_size: int, num_heads: int, n_layers: int) -> None:
        super().__init__()
        # Remembered so generate() can crop its context without relying on a
        # module-level global.
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.decoder_blocks = nn.Sequential(
            *[DecoderBlock(n_embed, num_heads, block_size=block_size) for _ in range(n_layers)]
        )
        self.ln_final = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Integer tensor of token ids, shape (B, T). T must not exceed
                ``block_size`` because positions index the position table.
            targets: Optional integer tensor of shape (B, T) with the expected
                next token at every position.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is ``None`` when ``targets`` is ``None``, otherwise
            the cross-entropy over all B*T positions.
        """
        B, T = idx.shape
        tok_embed = self.token_embedding_table(idx)  # (B, T, n_embed)
        pos_embed = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embed)
        x_in = tok_embed + pos_embed  # position rows broadcast over the batch dimension
        x_in = self.ln_final(self.decoder_blocks(x_in))
        logits = self.lm_head(x_in)  # (B, T, vocab_size)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects the class dimension second, so flatten
            # batch and time into a single leading dimension.
            # NOTE(review): ignore_index=0 drops every position whose target
            # id is 0 from the loss -- confirm id 0 is padding and not a real
            # token in this vocabulary.
            loss = F.cross_entropy(logits.view(B * T, C), targets.view(B * T), ignore_index=0)
        return logits, loss

    @torch.no_grad()  # sampling only produces integer ids; no gradients are needed
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens and append them to ``idx``.

        Args:
            idx: Integer tensor of shape (B, T) holding the current context.
            max_new_tokens: Number of tokens to sample for each sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # The position table only has block_size rows, so feed at most
            # the last block_size tokens.
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            # Only the final time step predicts the next token.
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # Append the sampled ids to the running sequence.
            idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)
        return idx

    def get_num_params(self, non_embedding=True):
        """Return the number of parameters in the model.

        Args:
            non_embedding: When true (default), the position embedding
                weights are subtracted from the total. Token embedding
                weights are always included in the count.

        Returns:
            The parameter count as an int.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.position_embedding_table.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Initialise one submodule; written for use with ``nn.Module.apply``.

        Linear and Embedding weights are drawn from N(0, 0.02); Linear biases
        are zeroed. Other module types are left untouched.

        NOTE(review): nothing in this class calls this method, so default
        PyTorch initialisation is in effect unless a caller runs
        ``model.apply(model._init_weights)``.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
if __name__ == "__main__":
    # Smoke test: run one forward pass on a random training batch and print
    # the logits shape and the loss.
    from data_utils import *

    xb, yb = get_random_batch('train')
    # NOTE(review): the right-hand sides of these two assignments were missing
    # in the source; moving the batch to `device` (the same device the model
    # is moved to below) is the reconstructed intent -- confirm.
    xb = xb.to(device)
    yb = yb.to(device)
    m = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)
    logits, loss = m(xb, yb)
    print(logits.shape)
    print(loss)