Coyoteranger committed: Upload 3 files
README.md
CHANGED

# Flutter Code Generator - Hugging Face Space

This project leverages Hugging Face Transformers and Spaces to generate Flutter code from user prompts. It is designed to simplify Flutter development by providing intelligent suggestions for UI and functionality implementations.

## Features

- **Customizable Flutter Code Generation**: Enter prompts such as "Create a responsive login screen" to get Dart code snippets.
- **Streamlit Web App**: Interactive UI for generating Flutter code with adjustable parameters.
- **Fine-tuned Model**: Trained on multiple datasets for Flutter-specific code generation.

## Installation

To run the app locally, follow these steps:

### Clone the Repository

```bash
git clone https://github.com/cod-e-codes/flutter-code-generator.git
cd flutter-code-generator
```

### Install Dependencies

```bash
pip install -r requirements.txt
```

### Run the Streamlit App

```bash
streamlit run app.py
```

## Parameters

Adjust the following settings in the sidebar to control code generation; the sketch after this list shows how they map onto `model.generate()`:

- **Temperature**: Controls randomness (higher values give more varied output).
- **Top-p**: Cumulative probability threshold for nucleus sampling.
- **Max Length**: Maximum number of tokens in the output.
- **Repetition Penalty**: Penalizes repetitive text.
- **Top-k**: Limits sampling to the k most likely tokens.
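
These sliders map directly onto the sampling arguments that `app.py` passes to `model.generate()`. A minimal sketch using the app's default values (the checkpoint path assumes the fine-tuned model is available locally):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint path as used in app.py; assumes the fine-tuned model exists locally
checkpoint = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("Create a responsive login screen", return_tensors="pt")
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    do_sample=True,          # sampling must be enabled for the knobs below to matter
    temperature=0.7,         # Temperature: randomness of sampling
    top_p=0.9,               # Top-p: nucleus-sampling probability mass
    max_length=512,          # Max Length: output token budget
    repetition_penalty=1.2,  # Repetition Penalty: discourage repeated text
    top_k=50,                # Top-k: restrict sampling to the 50 most likely tokens
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```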

## Deploying to Hugging Face Spaces

1. Create a Hugging Face Space and select the "Streamlit" template.
2. Upload the code files from this repository.
3. Configure the environment by adding the required packages (a sample `requirements.txt` is sketched below).
4. Deploy and access the app via your Space's URL.
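
This commit does not include `requirements.txt` itself; judging from the imports in `app.py` and `train.py`, a minimal version would plausibly contain:

```text
streamlit
torch
transformers
datasets
```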

## Model Training

The model was fine-tuned using:

- Datasets from Hugging Face such as `wraps/codegen-flutter-v1`, `limcheekin/flutter-website-3.7`, and `deepklarity/top-flutter-packages`.
- A checkpoint of Salesforce's CodeGen model (`codegen-350M-mono`); the full training script is `train.py` in this repository, and a condensed sketch follows this list.
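
A condensed sketch of that pipeline, using one of the three datasets for brevity (the full multi-dataset version, including per-dataset preprocessing and checkpoint resumption, is `train.py`):

```python
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          Trainer, TrainingArguments)

# Base model as in train.py; the real script later resumes from a saved checkpoint
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-mono")

dataset = load_dataset("wraps/codegen-flutter-v1")["train"]

def tokenize(batch):
    out = tokenizer(batch["content"], truncation=True,
                    padding="max_length", max_length=512)
    out["labels"] = out["input_ids"].copy()  # causal LM: labels mirror inputs
    return out

tokenized = dataset.map(tokenize, batched=True,
                        remove_columns=dataset.column_names)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./flutter_codegen_model",
        learning_rate=5e-5,
        per_device_train_batch_size=4,
        num_train_epochs=3,
    ),
    train_dataset=tokenized,
)
trainer.train()
```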

## License

This project is open-source and available under the [MIT License](LICENSE).

---

Built with ❤️ by [Cod-e-Codes](https://github.com/cod-e-codes)
app.py
ADDED
```python
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer from the checkpoint
model_name = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Clean up repetitive lines in generated code
def clean_code_response(response):
    lines = response.splitlines()
    unique_lines = []
    for line in lines:
        if line.strip() not in unique_lines:  # Skip duplicate lines (also strips indentation)
            unique_lines.append(line.strip())
    return "\n".join(unique_lines)

# Generate Flutter code from a prompt
def generate_flutter_code(prompt, temperature, top_p, max_length, num_return_sequences, repetition_penalty, top_k):
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # pass the mask explicitly since pad == eos
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    code = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return [clean_code_response(c) for c in code]

# App title
st.title("Flutter Code Generator")

# Default parameter values
DEFAULT_TEMPERATURE = 0.7
DEFAULT_TOP_P = 0.9
DEFAULT_MAX_LENGTH = 512
DEFAULT_NUM_RETURN_SEQUENCES = 1
DEFAULT_REPETITION_PENALTY = 1.2
DEFAULT_TOP_K = 50

# Sidebar for settings; each slider is keyed so "Reset to Defaults" can restore it
st.sidebar.title("Generation Settings")

temperature = st.sidebar.slider(
    "Temperature (randomness)",
    0.1, 1.0, DEFAULT_TEMPERATURE, step=0.1, key="temperature",
)

top_p = st.sidebar.slider(
    "Top-p (cumulative probability)",
    0.1, 1.0, DEFAULT_TOP_P, step=0.1, key="top_p",
)

max_length = st.sidebar.slider(
    "Max Output Length (tokens)",
    128, 1024, DEFAULT_MAX_LENGTH, step=64, key="max_length",
)

num_return_sequences = st.sidebar.slider(
    "Number of Outputs",
    1, 5, DEFAULT_NUM_RETURN_SEQUENCES, key="num_return_sequences",
)

repetition_penalty = st.sidebar.slider(
    "Repetition Penalty",
    1.0, 2.0, DEFAULT_REPETITION_PENALTY, step=0.1, key="repetition_penalty",
)

top_k = st.sidebar.slider(
    "Top-k (limit sampling pool)",
    0, 100, DEFAULT_TOP_K, key="top_k",
)

# Reset widget state via an on_click callback (widget keys cannot be
# modified after the widgets are instantiated in the same run)
def reset_defaults():
    st.session_state.update(
        {
            "temperature": DEFAULT_TEMPERATURE,
            "top_p": DEFAULT_TOP_P,
            "max_length": DEFAULT_MAX_LENGTH,
            "num_return_sequences": DEFAULT_NUM_RETURN_SEQUENCES,
            "repetition_penalty": DEFAULT_REPETITION_PENALTY,
            "top_k": DEFAULT_TOP_K,
        }
    )

st.sidebar.button("Reset to Defaults", on_click=reset_defaults)

# Input section
user_input = st.text_area(
    "Enter your prompt (e.g., 'Create a responsive login screen'):",
    max_chars=200,
)

# Output section
if st.button("Generate Code"):
    if user_input.strip():
        generated_code = generate_flutter_code(
            user_input.strip(), temperature, top_p, max_length, num_return_sequences, repetition_penalty, top_k
        )
        for i, code in enumerate(generated_code, start=1):
            st.subheader(f"Output {i}")
            st.code(code, language="dart")
    else:
        st.error("Please enter a prompt before clicking 'Generate Code'.")
```
train.py
ADDED
```python
# --- Initial single-dataset fine-tuning run, kept commented out for reference ---

# from datasets import load_dataset
# from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
# import torch

# # Check for GPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Step 1: Load the dataset
# dataset = load_dataset("wraps/codegen-flutter-v1")

# # Step 2: Load the tokenizer and model
# model_name = "Salesforce/codegen-350M-mono"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# # Step 3: Tokenize the dataset
# def tokenize_function(examples):
#     return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=512)

# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["content"])

# # Step 4: Set up training arguments
# training_args = TrainingArguments(
#     output_dir="./flutter_codegen_model",
#     evaluation_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=4,  # Adjust based on GPU memory
#     num_train_epochs=3,
#     save_steps=500,
#     save_total_limit=2,
#     fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
#     logging_dir="./logs",
#     logging_steps=10,
#     report_to="none",
# )

# # Step 5: Initialize the Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"],
#     tokenizer=tokenizer,
# )

# # Step 6: Train the model
# trainer.train()

# # Step 7: Save the fine-tuned model
# model.save_pretrained("./flutter_codegen_model")
# tokenizer.save_pretrained("./flutter_codegen_model")

# # # # # # # # # # # # # # # #
#  Train on multiple datasets  #
# # # # # # # # # # # # # # # #

from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load the datasets
print("Loading datasets...")
dataset1 = load_dataset("wraps/codegen-flutter-v1")
dataset2 = load_dataset("limcheekin/flutter-website-3.7")
dataset3 = load_dataset("deepklarity/top-flutter-packages")

# Step 2: Preprocess datasets to extract relevant text
def preprocess_dataset1(example):
    return {"text": example["content"]}

def preprocess_dataset2(example):
    return {"text": example["text"]}

def preprocess_dataset3(example):
    # Combine title and description into one text entry
    return {"text": f"{example['title']} - {example['description']}"}

print("Preprocessing datasets...")
dataset1_train = dataset1["train"].map(preprocess_dataset1, remove_columns=["repo_id", "file_path", "content", "__index_level_0__"])
dataset2_train = dataset2["train"].map(preprocess_dataset2, remove_columns=["id", "source"])
dataset3_train = dataset3["train"].map(preprocess_dataset3, remove_columns=["title", "description", "likes", "dependencies"])

# Combine all datasets into a single dataset
print("Combining datasets...")
combined_dataset = concatenate_datasets([dataset1_train, dataset2_train, dataset3_train])

# Step 3: Create train-validation split
print("Creating train-validation split...")
train_test_split = combined_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]

# Step 4: Load the tokenizer and model from the checkpoint
print("Loading tokenizer and model from checkpoint...")
checkpoint_path = "./flutter_codegen_model/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
model = AutoModelForCausalLM.from_pretrained(checkpoint_path).to(device)

# Step 5: Tokenize the datasets
def tokenize_function(examples):
    # Tokenize the text and add labels
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # Duplicate input_ids as labels for causal LM loss
    return tokenized

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Step 6: Set up training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./flutter_codegen_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU is available
    logging_dir="./logs",
    logging_steps=10,
    resume_from_checkpoint=checkpoint_path,  # Resume from the checkpoint
    report_to="none",
)

# Step 7: Initialize the Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,  # Use the held-out validation split
    tokenizer=tokenizer,
)

# Step 8: Train the model
print("Starting training from checkpoint...")
trainer.train()

# Step 9: Save the fine-tuned model
print("Saving the model...")
model.save_pretrained("./flutter_codegen_model")
tokenizer.save_pretrained("./flutter_codegen_model")

print("Training complete. Model saved to './flutter_codegen_model'.")
```