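# Earlier single-dataset fine-tuning run (Salesforce/codegen-350M-mono on
# wraps/codegen-flutter-v1), kept commented out for reference: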
# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")
# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
#     return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])
# # Step 4: Set up training arguments
# training_args = TrainingArguments(
#     output_dir="./flutter_codegen_model",
#     evaluation_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=4,  # Adjust based on GPU memory
#     num_train_epochs=3,
#     save_steps=500,
#     save_total_limit=2,
#     fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
#     logging_dir="./logs",
#     logging_steps=10,
#     report_to="none"
# )
# # Step 5: Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"],
#     tokenizer=tokenizer,
# )
# # Step 6: Train the model
# trainer.train()
# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")

# # # # # # # # # # # # # # # # #
#  Train on multiple datasets   #
# # # # # # # # # # # # # # # # #
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load the datasets
print("Loading datasets...")
dataset1 = load_dataset("wraps/codegen-flutter-v1")
dataset2 = load_dataset("limcheekin/flutter-website-3.7")
dataset3 = load_dataset("deepklarity/top-flutter-packages")

# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    return {"text": example["content"]}

def preprocess_dataset2(example):
    return {"text": example["text"]}

def preprocess_dataset3(example):
    # Combine title and description into one text entry
    return {"text": f"{example['title']} - {example['description']}"}

print("Preprocessing datasets...")
dataset1_train = dataset1["train"].map(preprocess_dataset1, remove_columns=["repo_id", "file_path", "content", "__index_level_0__"])
dataset2_train = dataset2["train"].map(preprocess_dataset2, remove_columns=["id", "source"])
dataset3_train = dataset3["train"].map(preprocess_dataset3, remove_columns=["title", "description", "likes", "dependencies"])

# Combine all datasets into a single dataset
print("Combining datasets...")
combined_dataset = concatenate_datasets([dataset1_train, dataset2_train, dataset3_train])
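# Optional sanity check (not in the original script): confirm the merge worked
# before spending time tokenizing the whole corpus.
print(f"Combined dataset size: {len(combined_dataset)} examples")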
# Step 3: Create train-validation split
print("Creating train-validation split...")
train_test_split = combined_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]

# Step 4: Load the tokenizer and model from the checkpoint
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
model = AutoModelForCausalLM.from_pretrained(checkpoint_path).to(device)

# Step 5: Tokenize the datasets
def tokenize_function(examples):
    # Tokenize the text and add labels
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # Duplicate input_ids as labels
    return tokenized
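# Note: copying input_ids verbatim also computes loss over padding positions
# (here the EOS token, since pad_token = eos_token). A common refinement,
# sketched below as an optional alternative, masks padded positions with -100
# so the Hugging Face loss ignores them:
#
# def tokenize_function(examples):
#     tokenized = tokenizer(examples["text"], truncation=True,
#                           padding="max_length", max_length=512)
#     tokenized["labels"] = [
#         [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
#         for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
#     ]
#     return tokenized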
print("Tokenizing datasets...") | |
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"]) | |
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"]) | |
# Step 6: Set up training arguments | |
print("Setting up training arguments...") | |
training_args = TrainingArguments( | |
output_dir="./flutter_codegen_model", | |
evaluation_strategy="epoch", | |
learning_rate=5e-5, | |
per_device_train_batch_size=4, # Adjust based on GPU memory | |
num_train_epochs=3, | |
save_steps=500, | |
save_total_limit=2, | |
fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available | |
logging_dir="./logs", | |
logging_steps=10, | |
resume_from_checkpoint=checkpoint_path, # Resume from the checkpoint | |
report_to="none" | |
) | |
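# If a batch of 4 exceeds GPU memory, a common workaround (a suggestion, not
# part of the original run) is to halve the batch and accumulate gradients,
# keeping the effective batch size at 4, by passing to TrainingArguments:
# per_device_train_batch_size=2, gradient_accumulation_steps=2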
# Step 7: Initialize the Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # Use the new validation dataset
    tokenizer=tokenizer,
)

# Step 8: Train the model
print("Starting training from checkpoint weights...")
trainer.train()
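# To resume an interrupted run instead (restoring optimizer and scheduler
# state, not just weights), Trainer expects the path at call time:
# trainer.train(resume_from_checkpoint=checkpoint_path)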
# Step 9: Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")
print("Training complete. Model saved to './flutter_codegen_model'.")