# NOTE: the lines below originally contained Hugging Face file-viewer residue
# ("Upload 3 files", commit hash, raw/history/blame links, file size) pasted in
# with the download; they are commented out here because they are not Python
# and would raise a SyntaxError if executed.
# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")
# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
# return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])
# # Step 4: Set up training arguments
# training_args = TrainingArguments(
# output_dir="./flutter_codegen_model",
# evaluation_strategy="epoch",
# learning_rate=5e-5,
# per_device_train_batch_size=4, # Adjust based on GPU memory
# num_train_epochs=3,
# save_steps=500,
# save_total_limit=2,
# fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
# logging_dir="./logs",
# logging_steps=10,
# report_to="none"
# )
# # Step 5: Initialize the Trainer
# trainer = Trainer(
# model=model,
# args=training_args,
# train_dataset=tokenized_dataset["train"],
# eval_dataset=tokenized_dataset["validation"],
# tokenizer=tokenizer,
# )
# # Step 6: Train the model
# trainer.train()
# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")
# # # # # # # # # # # # # # # # #
# Train on multiple datasets #
# # # # # # # # # # # # # # # # #
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
# Pick the compute device: prefer CUDA, fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Step 1: Pull the three Flutter-related datasets from the Hugging Face Hub.
print("Loading datasets...")
dataset1 = load_dataset("wraps/codegen-flutter-v1")
dataset2 = load_dataset("limcheekin/flutter-website-3.7")
dataset3 = load_dataset("deepklarity/top-flutter-packages")
# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    """Map a codegen-flutter-v1 row onto the shared {"text": ...} schema.

    The source column is "content" (raw file contents).
    """
    source_code = example["content"]
    return {"text": source_code}
def preprocess_dataset2(example):
    """Map a flutter-website-3.7 row onto the shared {"text": ...} schema.

    The source already exposes a "text" column, so this is a pass-through
    that drops everything else via the caller's remove_columns.
    """
    return {"text": example["text"]}
def preprocess_dataset3(example):
    """Collapse a top-flutter-packages row into one "<title> - <description>" line."""
    title = example["title"]
    description = example["description"]
    return {"text": f"{title} - {description}"}
# Normalize every source into a single "text" column, then merge them.
print("Preprocessing datasets...")
code_split = dataset1["train"].map(
    preprocess_dataset1,
    remove_columns=["repo_id", "file_path", "content", "__index_level_0__"],
)
docs_split = dataset2["train"].map(
    preprocess_dataset2,
    remove_columns=["id", "source"],
)
packages_split = dataset3["train"].map(
    preprocess_dataset3,
    remove_columns=["title", "description", "likes", "dependencies"],
)

print("Combining datasets...")
combined_dataset = concatenate_datasets([code_split, docs_split, packages_split])

# Step 3: Hold out 10% for validation; seed fixed for a reproducible split.
print("Creating train-validation split...")
split = combined_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
validation_dataset = split["test"]
# Step 4: Restore tokenizer and model weights from the previous run's checkpoint.
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# CodeGen has no dedicated pad token; reuse EOS so padding is well-defined.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
model = model.to(device)
# Step 5: Tokenize the datasets
def tokenize_function(examples):
    """Tokenize a batch of "text" entries to a fixed 512-token window.

    For causal-LM fine-tuning the targets are the inputs themselves, so the
    token ids are duplicated into a "labels" field (uses the module-level
    ``tokenizer`` loaded from the checkpoint).
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Shallow copy so later mutation of labels cannot alias input_ids.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded
# Apply tokenization to both splits, dropping the now-redundant raw text column.
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_validation_dataset = validation_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
# Step 6: Configure the training run.
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./flutter_codegen_model",
    logging_dir="./logs",
    # Optimization schedule.
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # adjust to fit GPU memory
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    # Evaluation / checkpoint / logging cadence.
    evaluation_strategy="epoch",
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,
    # NOTE(review): Trainer.train() does not consume this field on its own —
    # the HF docs say it is intended for use by the calling script; the path
    # must also be passed to trainer.train() for state to actually resume.
    resume_from_checkpoint=checkpoint_path,
    report_to="none",
)
# Step 7: Initialize the Trainer with the resumed model and the new splits.
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # validation split carved out above
    tokenizer=tokenizer,
)

# Step 8: Train the model.
# BUG FIX: TrainingArguments.resume_from_checkpoint is not consumed by
# Trainer.train() automatically (per the HF docs it is "intended to be used by
# your training/evaluation scripts"). The checkpoint path must be passed
# explicitly here, otherwise optimizer/scheduler/RNG state and the global step
# are NOT restored and training silently starts from scratch.
print("Starting training from checkpoint...")
trainer.train(resume_from_checkpoint=checkpoint_path)

# Step 9: Persist the fine-tuned weights and tokenizer side by side so the
# directory can be re-loaded with from_pretrained().
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")
print("Training complete. Model saved to './flutter_codegen_model'.")