from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ghosh-r/bangla-gpt2")

train_path = 'train.txt'
test_path = 'test.txt'

def load_dataset(train_path, test_path, tokenizer):
    # Chunk the raw text files into fixed-length blocks of 128 tokens.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=128)

    test_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=test_path,
        block_size=128)

    # mlm=False selects causal language modeling (GPT-2 style),
    # not BERT-style masked language modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the current
# class for GPT-2-style models.
model = AutoModelForCausalLM.from_pretrained("ghosh-r/bangla-gpt2")

training_args = TrainingArguments(
    output_dir="./bn-poets",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",  # required for eval_steps to take effect
    eval_steps=400,
    save_steps=800,
    warmup_steps=500,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.save_model()
tokenizer.save_pretrained('./bn-poets')
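
# A minimal sketch of generating text with the fine-tuned model, assuming
# training above completed and the model and tokenizer were saved to
# ./bn-poets. The prompt string is a hypothetical placeholder; substitute
# any Bengali seed text.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./bn-poets",      # directory written by trainer.save_model()
    tokenizer="./bn-poets",  # tokenizer saved alongside the model above
)

result = generator("আমার সোনার বাংলা", max_length=60, num_return_sequences=1)
print(result[0]["generated_text"])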