# File size: 2,448 Bytes | 165560d
from transformers import AutoTokenizer, AutoModelForMaskedLM, RobertaConfig , RobertaTokenizer,RobertaForMaskedLM, DataCollatorForLanguageModeling, LineByLineTextDataset, Trainer, TrainingArguments
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from torchinfo import summary
import os
# Collect every training text file (el_*.txt) found anywhere below the
# current working directory.
paths = [str(x) for x in Path(".").glob("**/el_*.txt")]
print(paths)

# Initialize an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Train it on the collected files. The special tokens are listed first so
# that they are registered ahead of any learned merges.
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save vocab.json / merges.txt into ./QuijoBERT. exist_ok=True replaces the
# separate "exists?" check, so an already existing directory (or one created
# concurrently) is not an error.
token_dir = os.path.join(os.getcwd(), 'QuijoBERT')
os.makedirs(token_dir, exist_ok=True)
tokenizer.save_model(token_dir)
# Reload the byte-level BPE tokenizer from the vocabulary and merges files
# stored under ./QuijoBERT.
# NOTE(review): `tokenizer` is rebound to a RobertaTokenizer further down the
# script, so this instance is only used up to that point.
tokenizer = ByteLevelBPETokenizer("./QuijoBERT/vocab.json", "./QuijoBERT/merges.txt")

# BertProcessing takes the end/separator pair first and the start pair second;
# each pair is (token string, token id looked up in the loaded vocabulary).
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)
# Hyper-parameters for the RoBERTa-style model, gathered in one mapping and
# expanded into the config constructor.
_roberta_settings = {
    # Same size as the vocab_size requested when the tokenizer was trained.
    "vocab_size": 52_000,
    # presumably 512 tokens plus RoBERTa's two reserved positions - TODO confirm
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**_roberta_settings)
# Step 8: Re-creating the Tokenizer in Transformers
# Load the saved vocab/merges through the Transformers tokenizer class.
# `model_max_length` is the documented keyword for the tokenizer's maximum
# sequence length; a plain `max_length` keyword is not consumed by the
# tokenizer constructor and would leave the limit unset.
tokenizer = RobertaTokenizer.from_pretrained("./QuijoBERT", model_max_length=512)

# Initialize a model with fresh (untrained) weights from the configuration.
model = RobertaForMaskedLM(config=config)
# To resume after a crash, load a saved checkpoint instead:
# model = RobertaForMaskedLM.from_pretrained("./QuijoBERT/Checkpoint-xxxxx")

# Module-by-module view of the (PyTorch) model.
print(model)
# Layer/parameter summary produced by torchinfo.
summary(model)
# Build the training dataset from ./el_quijote.txt using the tokenizer above,
# with block_size=128.
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="./el_quijote.txt", block_size=128)

# Data collator for masked-language-model training with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)
# Training configuration: one epoch, batches of 64 per device, a checkpoint
# every 1000 steps, keeping at most the two most recent checkpoints.
training_args = TrainingArguments(
    output_dir="./QuijoBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=1000,
    save_total_limit=2,
)

# Wire the model, arguments, collator and dataset into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Train the model. The print is a progress marker emitted just before
# training starts.
print('aqui')
trainer.train()

# Save the final model to disk (once; a second identical call would only
# rewrite the same files). The Trainer is not given a tokenizer here, so the
# tokenizer files in ./QuijoBERT are the ones written by the tokenizer
# training step earlier in the script.
trainer.save_model("./QuijoBERT")
|