# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch
# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")
# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
#     return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)
# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])
# # Step 4: Set up training arguments
# training_args = TrainingArguments(
#     output_dir="./flutter_codegen_model",
#     evaluation_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=4,  # Adjust based on GPU memory
#     num_train_epochs=3,
#     save_steps=500,
#     save_total_limit=2,
#     fp16=torch.cuda.is_available(),  # Use mixed precision if a GPU is available
#     logging_dir="./logs",
#     logging_steps=10,
#     report_to="none",
# )
# # Step 5: Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"],
#     tokenizer=tokenizer,
# )
# # Step 6: Train the model
# trainer.train()
# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")
# # # # # # # # # # # # # # # # #
# Train on multiple datasets #
# # # # # # # # # # # # # # # # #
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Step 1: Load the datasets
print("Loading datasets...")
dataset1 = load_dataset("wraps/codegen-flutter-v1")
dataset2 = load_dataset("limcheekin/flutter-website-3.7")
dataset3 = load_dataset("deepklarity/top-flutter-packages")
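# Optional sanity check (a minimal sketch): the column names referenced in the
# remove_columns calls below are assumptions about these specific dataset versions,
# so printing the features first makes it easy to confirm they still match.
for name, ds in [("codegen-flutter-v1", dataset1),
                 ("flutter-website-3.7", dataset2),
                 ("top-flutter-packages", dataset3)]:
    print(f"{name} columns: {ds['train'].column_names}")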
# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    return {"text": example["content"]}
def preprocess_dataset2(example):
    return {"text": example["text"]}
def preprocess_dataset3(example):
    # Combine the package title and description into one text entry
    return {"text": f"{example['title']} - {example['description']}"}
print("Preprocessing datasets...")
dataset1_train = dataset1["train"].map(preprocess_dataset1, remove_columns=["repo_id", "file_path", "content", "__index_level_0__"])
dataset2_train = dataset2["train"].map(preprocess_dataset2, remove_columns=["id", "source"])
dataset3_train = dataset3["train"].map(preprocess_dataset3, remove_columns=["title", "description", "likes", "dependencies"])
# Combine all datasets into a single dataset
print("Combining datasets...")
combined_dataset = concatenate_datasets([dataset1_train, dataset2_train, dataset3_train])
# Step 3: Create train-validation split
print("Creating train-validation split...")
train_test_split = combined_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]
# Step 4: Load the tokenizer and model from the checkpoint
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"
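# Note (assumption): this path comes from the earlier single-dataset run that is kept
# commented out above; adjust the checkpoint number to whatever that run produced.
# A possible fallback if the checkpoint is missing is to start from the base model:
# import os
# if not os.path.isdir(checkpoint_path):
#     checkpoint_path = "Salesforce/codegen-350M-mono"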
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token # Set the padding token
model = AutoModelForCausalLM.from_pretrained(checkpoint_path).to(device)
# Step 5: Tokenize the datasets
def tokenize_function(examples):
    # Tokenize the text and build causal-LM labels
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # Duplicate input_ids as labels for the LM loss
    return tokenized
print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
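# Alternative sketch (not used here): instead of copying input_ids into "labels" inside
# tokenize_function, a data collator can build the causal-LM labels per batch, which also
# avoids padding every example to the full max_length:
# from transformers import DataCollatorForLanguageModeling
# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
# ...and then pass data_collator=data_collator to the Trainer below.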
# Step 6: Set up training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./flutter_codegen_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if a GPU is available
    logging_dir="./logs",
    logging_steps=10,
    resume_from_checkpoint=checkpoint_path,  # Informational only: Trainer does not read this field itself; the checkpoint weights were already loaded above
    report_to="none",
)
# Step 7: Initialize the Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # Use the validation split of the combined data
    tokenizer=tokenizer,
)
# Step 8: Train the model
print("Starting training from checkpoint...")
trainer.train()
# Step 9: Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")
print("Training complete. Model saved to './flutter_codegen_model'.")