# Imports: Hugging Face Transformers supplies the RoBERTa model classes and
# the training loop; the tokenizers library builds the byte-level BPE vocabulary.
from pathlib import Path
import os

import torch
from torchinfo import summary

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

from transformers import (
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

# Collect every training text file (el_*.txt) under the current directory.
paths = [str(x) for x in Path(".").glob("**/el_*.txt")]
print(paths)

# Train a byte-level BPE tokenizer from scratch on the corpus:
# a 52,000-entry vocabulary, keeping only merges seen at least twice,
# plus the five special tokens RoBERTa expects.
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
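
# Quick sanity check (an illustrative addition, not part of the original
# script): the trained vocabulary should contain at most 52,000 entries.
print(tokenizer.get_vocab_size())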

# Create the output directory and save the tokenizer's two files
# (vocab.json and merges.txt).
dir_path = os.getcwd()
token_dir = os.path.join(dir_path, 'QuijoBERT')

if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model('QuijoBERT')

# Reload the trained tokenizer from its saved files.
tokenizer = ByteLevelBPETokenizer(
    "./QuijoBERT/vocab.json",
    "./QuijoBERT/merges.txt",
)

# Post-process every encoding RoBERTa-style, wrapping sequences as
# <s> ... </s>. Note that BertProcessing takes the separator pair first,
# then the start-of-sequence pair.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Cap encodings at the model's 512-token limit.
tokenizer.enable_truncation(max_length=512)
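
# Illustrative check (not in the original script): the post-processor should
# now wrap any encoding with the <s> and </s> special tokens. The sample
# sentence is an arbitrary choice.
print(tokenizer.encode("En un lugar de la Mancha").tokens)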

# Define a small RoBERTa architecture trained from scratch.
# max_position_embeddings is 514 (512 usable positions plus RoBERTa's
# two-position padding offset); type_vocab_size is 1 because RoBERTa does
# not use token-type (segment) embeddings.
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

"""# Step 8: Re-creating the Tokenizer in Transformers"""

# Load the saved byte-level BPE files as a Transformers RobertaTokenizer.
tokenizer = RobertaTokenizer.from_pretrained("./QuijoBERT", max_length=512)
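
# Illustrative check (an addition, not in the original script): the
# Transformers tokenizer applies the same BPE merges plus RoBERTa's
# special-token handling.
print(tokenizer("En un lugar de la Mancha")["input_ids"])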

# Instantiate the model from the configuration: weights are randomly
# initialized, not pretrained.
model = RobertaForMaskedLM(config=config)

# Inspect the architecture and parameter counts.
print(model)

summary(model)

# Build the training set: each line of the corpus becomes one example,
# tokenized and truncated to block_size tokens. (LineByLineTextDataset is
# deprecated in recent Transformers releases but still works here.)
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./el_quijote.txt",
    block_size=128,
)
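
# Illustrative sanity check (not in the original script): report how many
# line-level examples were produced.
print(f"{len(dataset)} training examples")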

# Dynamic masked-language-modeling collator: in each batch, 15% of tokens
# are selected and (mostly) replaced with <mask>; the labels keep the
# original ids so the model learns to reconstruct them.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
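
# Illustrative inspection (not in the original script): collate two examples
# to see the dynamically masked inputs and their labels (-100 marks positions
# that were not masked and are therefore ignored by the loss).
batch = data_collator([dataset[i] for i in range(2)])
print(batch["input_ids"].shape, batch["labels"].shape)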

# One epoch over the corpus; checkpoints every 1,000 steps, keeping at most
# the two most recent.
training_args = TrainingArguments(
    output_dir="./QuijoBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Run training, then save the final model alongside the tokenizer files.
trainer.train()
trainer.save_model("./QuijoBERT")
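
# Illustrative follow-up (an assumption, not part of the original script):
# load the saved model into a fill-mask pipeline and test a masked prediction.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="./QuijoBERT", tokenizer="./QuijoBERT")
print(fill_mask("En un lugar de la <mask>."))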