import warnings

warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import math
from transformers import MarianTokenizer
from datasets import load_dataset
from typing import List
from torch import Tensor
from torch.nn import Transformer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from timeit import default_timer as timer
import urllib.request
import os
from torch.cuda.amp import GradScaler, autocast
import logging

logging.getLogger("datasets").setLevel(logging.ERROR)

print("CUDA available:", torch.cuda.is_available())
print("PyTorch version:", torch.__version__)
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)


DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", DEVICE)
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory allocated: {torch.cuda.memory_allocated(0)/1024**2:.2f} MB")


tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-de-en')

PAD_IDX = tokenizer.pad_token_id
# MarianTokenizer defines no BOS token (bos_token_id is None), so fall back to
# the pad token, which Marian models use as the decoder start symbol.
BOS_IDX = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else tokenizer.pad_token_id
EOS_IDX = tokenizer.eos_token_id
UNK_IDX = tokenizer.unk_token_id

SRC_VOCAB_SIZE = tokenizer.vocab_size
TGT_VOCAB_SIZE = tokenizer.vocab_size
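
# Illustrative check (assumes default MarianTokenizer behaviour): the tokenizer
# appends EOS ('</s>') to every sequence, which the greedy decoding loop below
# relies on to stop generation.
_demo_ids = tokenizer("Ein Haus am See.", return_tensors='pt')['input_ids']
assert _demo_ids[0, -1].item() == EOS_IDX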


class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])
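
# Illustrative check: positional encodings are added along the first (sequence)
# dimension, so a (seq_len, batch, emb_size) input keeps its shape.
_pe = PositionalEncoding(emb_size=8, dropout=0.0)
assert _pe(torch.zeros(10, 2, 8)).shape == (10, 2, 8)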


class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)
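
# Illustrative check: token embeddings are scaled by sqrt(emb_size), as in
# "Attention Is All You Need", and map (seq_len, batch) ids to (seq_len, batch, emb_size).
_te = TokenEmbedding(vocab_size=10, emb_size=4)
assert _te(torch.zeros(3, 2, dtype=torch.long)).shape == (3, 2, 4)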


class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int,
                 emb_size: int, nhead: int, src_vocab_size: int,
                 tgt_vocab_size: int, dim_feedforward: int = 512, dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor,
                tgt_mask: Tensor, src_padding_mask: Tensor,
                tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)
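
# Note: nn.Transformer defaults to batch_first=False, so every tensor in this
# script uses the (seq_len, batch, emb_size) layout; token-id batches are
# transposed to (seq_len, batch) before being fed to the model.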


def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
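
# For example, generate_square_subsequent_mask(3) yields (0 = may attend, -inf = blocked):
# [[0., -inf, -inf],
#  [0.,   0., -inf],
#  [0.,   0.,   0.]]
assert torch.isinf(generate_square_subsequent_mask(3)[0, 1])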


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask
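
# Illustrative shape check on a hypothetical toy batch (2 sequences, source
# length 5, shifted target length 4); padding masks come out as (batch, seq_len).
_src = torch.full((5, 2), PAD_IDX, dtype=torch.long, device=DEVICE)
_tgt = torch.full((4, 2), PAD_IDX, dtype=torch.long, device=DEVICE)
_sm, _tm, _spm, _tpm = create_mask(_src, _tgt)
assert _sm.shape == (5, 5) and _tm.shape == (4, 4)
assert _spm.shape == (2, 5) and _tpm.shape == (2, 4)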


# Note: this helper is kept for reference but is never called below;
# load_data() uses the WMT14 de-en dataset from Hugging Face instead.
def download_multi30k():
    base_url = "https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/"

    os.makedirs("multi30k", exist_ok=True)

    splits = ['train', 'val', 'test']
    languages = ['de', 'en']

    for split in splits:
        for lang in languages:
            filename = f"{split}.{lang}"
            url = f"{base_url}{filename}"
            path = f"multi30k/{filename}"

            if not os.path.exists(path):
                print(f"Downloading {filename}...")
                urllib.request.urlretrieve(url, path)


def load_data():
    dataset = load_dataset("wmt14", "de-en", cache_dir=".cache")

    # Keep a small, Multi30k-sized subset so training stays manageable.
    train_size = 29000
    val_size = 1000
    test_size = 1000

    # Slice the splits before reading the 'translation' column so only the rows
    # we need are decoded, instead of materialising the full WMT14 train set.
    train_pairs = dataset['train'][:train_size]['translation']
    val_pairs = dataset['validation'][:val_size]['translation']
    test_pairs = dataset['test'][:test_size]['translation']

    data = {
        'train': {
            'de': [item['de'] for item in train_pairs],
            'en': [item['en'] for item in train_pairs]
        },
        'val': {
            'de': [item['de'] for item in val_pairs],
            'en': [item['en'] for item in val_pairs]
        },
        'test': {
            'de': [item['de'] for item in test_pairs],
            'en': [item['en'] for item in test_pairs]
        }
    }

    return data


class TranslationDataset(Dataset):
    def __init__(self, de_texts, en_texts):
        self.de_texts = de_texts
        self.en_texts = en_texts

    def __len__(self):
        return len(self.de_texts)

    def __getitem__(self, idx):
        return {
            'de': self.de_texts[idx],
            'en': self.en_texts[idx]
        }

print("Loading dataset...")
_cached_data = load_data()


def get_dataloader(split='train', batch_size=32):
    data = _cached_data[split]

    dataset = TranslationDataset(data['de'], data['en'])

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(split == 'train')
    )
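
# Quick sanity check (illustrative only): the default collate turns a batch of
# {'de': str, 'en': str} items into a dict of string lists.
_probe = next(iter(get_dataloader('val', batch_size=2)))
assert isinstance(_probe['de'], list) and len(_probe['de']) == 2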


BATCH_SIZE = 32
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
NUM_EPOCHS = 18


transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer = transformer.to(DEVICE)


for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)


loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)


scaler = GradScaler()
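
# Note: autocast()/GradScaler from torch.cuda.amp only take effect on CUDA; on a
# CPU-only machine GradScaler disables itself (with a warning, suppressed above)
# and training simply runs in full precision.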


def train_epoch(model, optimizer):
    try:
        model.train()
        losses = 0
        train_dataloader = get_dataloader('train', BATCH_SIZE)

        for batch in train_dataloader:
            src_texts = batch['de']
            tgt_texts = batch['en']

            src_tokens = tokenizer(src_texts, padding=True, return_tensors='pt')
            tgt_tokens = tokenizer(tgt_texts, padding=True, return_tensors='pt')

            src = src_tokens['input_ids'].transpose(0, 1).to(DEVICE)
            tgt = tgt_tokens['input_ids'].transpose(0, 1).to(DEVICE)

            tgt_input = tgt[:-1, :]
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            # Only the forward pass and loss need to run under autocast.
            with autocast():
                logits = model(src, tgt_input, src_mask, tgt_mask,
                               src_padding_mask, tgt_padding_mask, src_padding_mask)

                tgt_out = tgt[1:, :]
                loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            losses += loss.item()

        return losses / len(train_dataloader)
    except KeyboardInterrupt:
        print("\nTraining interrupted manually! Saving current model state...")

        checkpoint = {
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            # These globals do not exist yet if the very first epoch is
            # interrupted, so look them up defensively.
            'epoch': globals().get('epoch'),
            'train_loss': globals().get('train_loss'),
            'val_loss': globals().get('val_loss'),
        }
        torch.save(checkpoint, 'transformer_translation.pth')
        print("Model checkpoint saved to transformer_translation.pth")
        raise


def evaluate(model):
    model.eval()
    losses = 0
    val_dataloader = get_dataloader('val', BATCH_SIZE)

    with torch.no_grad():
        for batch in val_dataloader:
            src_texts = batch['de']
            tgt_texts = batch['en']

            src_tokens = tokenizer(src_texts, padding=True, return_tensors='pt')
            tgt_tokens = tokenizer(tgt_texts, padding=True, return_tensors='pt')

            src = src_tokens['input_ids'].transpose(0, 1).to(DEVICE)
            tgt = tgt_tokens['input_ids'].transpose(0, 1).to(DEVICE)

            tgt_input = tgt[:-1, :]
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,
                           src_padding_mask, tgt_padding_mask, src_padding_mask)

            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    return losses / len(val_dataloader)


def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)

    for i in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    tokens = tokenizer(src_sentence, return_tensors='pt', padding=True)
    src = tokens['input_ids'].transpose(0, 1).to(DEVICE)
    src_mask = (torch.zeros(src.shape[0], src.shape[0])).type(torch.bool).to(DEVICE)

    with torch.no_grad():
        tgt_tokens = greedy_decode(model, src, src_mask, max_len=src.shape[0] + 5, start_symbol=BOS_IDX).flatten()
    return tokenizer.decode(tgt_tokens.tolist(), skip_special_tokens=True)


if torch.cuda.is_available():
    torch.cuda.empty_cache()


for epoch in range(1, NUM_EPOCHS + 1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "
          f"Epoch time = {(end_time - start_time):.3f}s")


path = 'transformer_translation.pth'
torch.save(transformer.state_dict(), path)
print("Model saved successfully!")


transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer.load_state_dict(torch.load(path, map_location=DEVICE))
transformer = transformer.to(DEVICE)
print("Model loaded successfully!")


print(translate(transformer, "Eine Gruppe von Freunden spielt Billard."))