from functools import lru_cache
from typing import List, Tuple

import torch

from tokenizer_utils import IntCharTokenizer
from conf import nanogpt_conf

# Hyperparameters
BLOCK_SIZE = 256  # context length
BATCH_SIZE = 128
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_iters = 100
n_embed = 384
n_head = 6
n_layer = 6
dropout = 0.2


def load_text() -> str:
    with open(nanogpt_conf["text_file"], "r") as f:
        return f.read()


def load_int_char_tokenizer(text: str) -> IntCharTokenizer:
    return IntCharTokenizer(text)


def tokenize_char_to_int(text: str) -> List[int]:
    tokenizer = load_int_char_tokenizer(text)
    return tokenizer.encode(text)


def decode_int_to_char(tokens: List[int], text: str) -> str:
    # The tokenizer must be built from the same text that produced the
    # tokens, so the char<->int mapping matches.
    tokenizer = load_int_char_tokenizer(text)
    return tokenizer.decode(tokens)


def load_text_as_tensor(text: str) -> torch.Tensor:
    return torch.tensor(tokenize_char_to_int(text), dtype=torch.long)


def split_train_val(data: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    # Hold out the last 10% of the corpus for validation.
    n = int(0.9 * len(data))
    return data[:n], data[n:]


@lru_cache(maxsize=1)
def _load_splits() -> Tuple[torch.Tensor, torch.Tensor]:
    # Tokenize the corpus once and cache the train/val split, instead of
    # re-reading and re-encoding the whole file on every batch.
    return split_train_val(load_text_as_tensor(load_text()))


def get_random_batch(split: str) -> Tuple[torch.Tensor, torch.Tensor]:
    train_data, val_data = _load_splits()
    data = train_data if split == "train" else val_data
    # Sample BATCH_SIZE random starting offsets, then slice out contiguous
    # blocks; y is x shifted one position to the right (next-char targets).
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE,))
    x = torch.stack([data[i: i + BLOCK_SIZE] for i in ix])
    y = torch.stack([data[i + 1: i + BLOCK_SIZE + 1] for i in ix])
    if device == "cuda":
        # Pin x and y so they can be moved to the GPU asynchronously
        # (non_blocking=True).
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
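

# For reference, a minimal char-level tokenizer consistent with how
# IntCharTokenizer is used above (built from the corpus text, exposing
# encode/decode) might look like the sketch below. This is an assumption
# about the shape of the real class in tokenizer_utils, not its actual
# implementation; the name _CharTokenizerSketch is hypothetical.
class _CharTokenizerSketch:
    def __init__(self, text: str) -> None:
        # Vocabulary is the sorted set of unique characters in the corpus.
        chars = sorted(set(text))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}

    def encode(self, text: str) -> List[int]:
        return [self.stoi[ch] for ch in text]

    def decode(self, tokens: List[int]) -> str:
        return "".join(self.itos[t] for t in tokens)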
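

# eval_iters and eval_interval are defined above but unused in this file; in
# nanoGPT-style training loops they typically drive a periodic loss estimate
# like the sketch below. `model` is a hypothetical nn.Module whose forward
# returns (logits, loss) when targets are given; this is an illustrative
# sketch, not part of this module's actual training loop.
@torch.no_grad()
def estimate_loss_sketch(model: torch.nn.Module) -> dict:
    out = {}
    model.eval()
    for split in ("train", "val"):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_random_batch(split)
            _, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out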
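

if __name__ == "__main__":
    # Quick smoke test: each batch is a (BATCH_SIZE, BLOCK_SIZE) tensor of
    # inputs, with targets shifted one character to the right.
    xb, yb = get_random_batch("train")
    print(xb.shape, yb.shape)  # torch.Size([128, 256]) torch.Size([128, 256])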