Spaces:

Coyoteranger
/

flutter-code-generator

Build error

File size: 5,892 Bytes

cfe5488

# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch

# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")

# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
#     return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)

# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])

# # Step 4: Set up training arguments
# training_args = TrainingArguments(
#     output_dir="./flutter_codegen_model",
#     evaluation_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=4,  # Adjust based on GPU memory
#     num_train_epochs=3,
#     save_steps=500,
#     save_total_limit=2,
#     fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
#     logging_dir="./logs",
#     logging_steps=10,
#     report_to="none"
# )

# # Step 5: Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"],
#     tokenizer=tokenizer,
# )

# # Step 6: Train the model
# trainer.train()

# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")

# # # # # # # # # # # # # # # # #
#   Train on multiple datasets  #
# # # # # # # # # # # # # # # # #

from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Select the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Step 1: Load the three source datasets by identifier, in a fixed order
print("Loading datasets...")
dataset1, dataset2, dataset3 = (
    load_dataset(dataset_id)
    for dataset_id in (
        "wraps/codegen-flutter-v1",
        "limcheekin/flutter-website-3.7",
        "deepklarity/top-flutter-packages",
    )
)

# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    """Project a dataset1 record onto the shared single-column schema.

    Args:
        example: Mapping that provides a "content" entry.

    Returns:
        A one-key dict whose "text" value is the record's "content" value.
    """
    content = example["content"]
    return dict(text=content)

def preprocess_dataset2(example):
    """Project a dataset2 record onto the shared single-column schema.

    Args:
        example: Mapping that provides a "text" entry.

    Returns:
        A one-key dict carrying the record's "text" value unchanged.
    """
    body = example["text"]
    return dict(text=body)

def preprocess_dataset3(example):
    """Combine a package record's title and description into one text entry.

    Fields that are missing (``None``) or empty are skipped, so the result
    never contains the literal word "None" or a dangling " - " separator.

    Args:
        example: Mapping that provides "title" and "description" entries.

    Returns:
        A one-key dict whose "text" value is "<title> - <description>" when
        both fields are present, the single present field when only one is,
        and an empty string when neither is.
    """
    fields = (example["title"], example["description"])
    # str() keeps the original f-string behaviour for non-string values.
    present = [str(value) for value in fields if value not in (None, "")]
    return {"text": " - ".join(present)}

print("Preprocessing datasets...")
# Each map() call adds the unified "text" column and drops the listed source
# columns. NOTE(review): this assumes the listed names are the complete set of
# non-"text" columns in each dataset; verify against the dataset schemas.
dataset1_train = dataset1["train"].map(
    function=preprocess_dataset1,
    remove_columns=["repo_id", "file_path", "content", "__index_level_0__"],
)
dataset2_train = dataset2["train"].map(
    function=preprocess_dataset2,
    remove_columns=["id", "source"],
)
dataset3_train = dataset3["train"].map(
    function=preprocess_dataset3,
    remove_columns=["title", "description", "likes", "dependencies"],
)

# Merge the three preprocessed train splits into one dataset
print("Combining datasets...")
combined_dataset = concatenate_datasets(
    [dataset1_train, dataset2_train, dataset3_train]
)

# Step 3: Hold out 10% of the combined rows for validation (fixed seed)
print("Creating train-validation split...")
train_test_split = combined_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, validation_dataset = (
    train_test_split["train"],
    train_test_split["test"],
)

# Step 4: Restore the tokenizer and the model weights from the saved checkpoint
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
# Tokenization below pads every example, so a pad token must be set; the
# end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(checkpoint_path)
model = model.to(device)

# Step 5: Tokenize the datasets
def _mask_padding(token_ids, attention_mask):
    """Return a copy of token_ids with padded positions replaced by -100."""
    return [
        token_id if is_real else -100
        for token_id, is_real in zip(token_ids, attention_mask)
    ]


def tokenize_function(examples):
    """Tokenize ``examples["text"]`` and attach causal-LM labels.

    Every sequence is truncated/padded to 512 tokens. Labels mirror
    ``input_ids`` except at padded positions (``attention_mask == 0``), which
    are set to -100 so the loss ignores them. Because the pad token is the EOS
    token in this script, copying ``input_ids`` verbatim would train the model
    to emit long runs of EOS.

    Args:
        examples: Mapping with a "text" entry; either a list of strings (the
            ``batched=True`` case used by this script) or a single string.

    Returns:
        The tokenizer output with an added "labels" entry shaped like
        "input_ids".
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # NOTE(review): relies on the tokenizer returning "attention_mask" — confirm
    # for the tokenizer stored in the checkpoint.
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]
    if isinstance(examples["text"], str):
        # Single example: the tokenizer returns flat lists.
        tokenized["labels"] = _mask_padding(input_ids, attention_mask)
    else:
        # Batched call: one list of ids / mask values per example.
        tokenized["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    return tokenized

print("Tokenizing datasets...")
# Both splits go through the same batched tokenization; the raw "text" column
# is dropped afterwards.
tokenized_train_dataset = train_dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)
tokenized_validation_dataset = validation_dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Step 6: Set up training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    # --- checkpointing ---
    output_dir="./flutter_codegen_model",
    save_steps=500,
    save_total_limit=2,
    # Stored on the arguments object only; per the transformers docs Trainer
    # does not act on this field unless the script forwards it to train().
    resume_from_checkpoint=checkpoint_path,
    # --- optimisation ---
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Tune to the available GPU memory
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    # --- evaluation and logging ---
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

# Step 7: Build the Trainer from the model, tokenizer, arguments and both splits
print("Initializing Trainer...")
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # 10% hold-out from the combined data
)

# Step 8: Train the model
print("Starting training from checkpoint...")
# TrainingArguments only stores resume_from_checkpoint; Trainer restores the
# optimizer, scheduler and step counter only when the path is passed to
# train(). Forward the stored value so the run really resumes.
# NOTE(review): a true resume continues from the checkpoint's global step, so
# the first batches of the new combined dataset are skipped. Pass None here
# instead if a fresh optimizer on the checkpoint weights is what is wanted.
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)

# Step 9: Save the fine-tuned model and its tokenizer side by side
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")

print("Training complete. Model saved to './flutter_codegen_model'.")