Spaces:
Sleeping
Sleeping
Gabriel Okiri
committed on
Commit
·
4bb9d41
1
Parent(s):
614c0fa
Initial commit
Browse files- .github/workflows/cicd.yaml +17 -0
- .gitignore +10 -0
- Dockerfile +10 -0
- README.md +0 -14
- app/interface/gardio_app.py +28 -0
- app/model/config.py +14 -0
- app/model/model.py +38 -0
- app/model/tokenizer.py +14 -0
- app/utils/data_processing.py +13 -0
- app/utils/text_processing.py +10 -0
- configs/model_config.yaml +10 -0
- configs/training_config.yaml +10 -0
- requirements.txt +7 -0
- scripts/evaluate.py +13 -0
- scripts/train.py +38 -0
- setup.py +13 -0
- tests/test_model.py +9 -0
- tests/test_processing.py +7 -0
- tests/test_tokenizer.py +10 -0
.github/workflows/cicd.yaml
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# .github/workflows/cicd.yaml
|
2 |
+
name: Sync to Hugging Face Hub
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches: [main]
|
6 |
+
|
7 |
+
jobs:
|
8 |
+
sync:
|
9 |
+
runs-on: ubuntu-latest
|
10 |
+
steps:
|
11 |
+
- uses: actions/checkout@v3
|
12 |
+
- name: Push to hub
|
13 |
+
env:
|
14 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
15 |
+
run: |
|
16 |
+
git push https://YOUR_USERNAME:$HF_TOKEN@huggingface.co/spaces/YOUR_USERNAME/nigerian-language-generator-space main
|
17 |
+
|
.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.pyc
|
3 |
+
.env
|
4 |
+
venv/
|
5 |
+
*.pth
|
6 |
+
.DS_Store
|
7 |
+
outputs/
|
8 |
+
logs/
|
9 |
+
data/raw/
|
10 |
+
data/processed/
|
Dockerfile
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9-slim
|
2 |
+
|
3 |
+
WORKDIR /code
|
4 |
+
|
5 |
+
COPY requirements.txt .
|
6 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
7 |
+
|
8 |
+
COPY . .
|
9 |
+
|
10 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
@@ -1,14 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: Nigerian Languages
|
3 |
-
emoji: 📉
|
4 |
-
colorFrom: purple
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 5.11.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: apache-2.0
|
11 |
-
short_description: Nigerian_languages
|
12 |
-
---
|
13 |
-
|
14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/interface/gardio_app.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from app.model.model import NigerianLanguageModel
|
3 |
+
from app.model.config import ModelConfig
|
4 |
+
|
5 |
+
# Initialize model
|
6 |
+
config = ModelConfig()
|
7 |
+
model = NigerianLanguageModel(config)
|
8 |
+
|
9 |
+
# Create interface
|
10 |
+
def generate_text(prompt: str, language: str) -> str:
    """Generate text for ``prompt`` in the selected language.

    The prompt is prefixed with an upper-cased ``[LANGUAGE]`` tag and handed
    to the ``generate`` method of the module-level ``model``.

    Args:
        prompt: Free text entered by the user.
        language: Language name chosen in the dropdown (e.g. ``"YORUBA"``).

    Returns:
        Whatever ``model.generate`` returns for the tagged prompt.

    Raises:
        ValueError: If no language was selected (``None`` or empty string).
    """
    # A dropdown without a selection can deliver None; fail with a clear
    # message instead of an AttributeError from ``None.upper()``.
    if not language:
        raise ValueError("Please select a language before generating text.")

    tagged_prompt = f"[{language.upper()}] {prompt}"
    return model.generate(tagged_prompt)
|
13 |
+
|
14 |
+
# Define Gradio interface
|
15 |
+
interface = gr.Interface(
|
16 |
+
fn=generate_text,
|
17 |
+
inputs=[
|
18 |
+
gr.Textbox(label="Enter your prompt"),
|
19 |
+
gr.Dropdown(choices=["YORUBA", "IGBO", "HAUSA"], label="Select Language")
|
20 |
+
],
|
21 |
+
outputs=gr.Textbox(label="Generated Text"),
|
22 |
+
title="Nigerian Language Generator",
|
23 |
+
description="Generate text in Yoruba, Igbo, or Hausa using a fine-tuned GPT model."
|
24 |
+
)
|
25 |
+
|
26 |
+
# Start the interface
|
27 |
+
if __name__ == "__main__":
|
28 |
+
interface.launch()
|
app/model/config.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import List, Optional
|
3 |
+
import torch
|
4 |
+
|
5 |
+
@dataclass
class ModelConfig:
    """Settings shared by model loading, training and generation.

    All fields have defaults, so ``ModelConfig()`` yields a usable
    configuration.
    """

    # Identifier passed to ``from_pretrained`` when loading tokenizer/model.
    model_name: str = "gpt2"
    # presumably the maximum sequence length in tokens — TODO confirm usage
    max_length: int = 128
    # Used as the per-device training batch size by the training script.
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_train_epochs: int = 3
    # Language names; each becomes a "[NAME]" special token in the model.
    # The default is an immutable tuple (a list default is not allowed in a
    # dataclass), so the annotation is a tuple type rather than List[str].
    languages: tuple[str, ...] = ("YORUBA", "IGBO", "HAUSA")
    # Evaluated once, when this class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    output_dir: str = "outputs"
|
app/model/model.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 1. app/model/config.py
|
2 |
+
from dataclasses import dataclass
|
3 |
+
from typing import List, Optional
|
4 |
+
import torch
|
5 |
+
|
6 |
+
# NOTE(review): this definition duplicates app/model/config.py and is
# rebound by the ``from .config import ModelConfig`` import further down in
# this module — consider deleting the copy.
@dataclass
class ModelConfig:
    """Settings shared by model loading, training and generation.

    All fields have defaults, so ``ModelConfig()`` yields a usable
    configuration.
    """

    # Identifier passed to ``from_pretrained`` when loading tokenizer/model.
    model_name: str = "gpt2"
    # presumably the maximum sequence length in tokens — TODO confirm usage
    max_length: int = 128
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_train_epochs: int = 3
    # Language names; each becomes a "[NAME]" special token in the model.
    # The default is an immutable tuple (a list default is not allowed in a
    # dataclass), so the annotation is a tuple type rather than List[str].
    languages: tuple[str, ...] = ("YORUBA", "IGBO", "HAUSA")
    # Evaluated once, when this class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    output_dir: str = "outputs"
|
16 |
+
|
17 |
+
# app/model/model.py
|
18 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
19 |
+
import torch
|
20 |
+
from .config import ModelConfig
|
21 |
+
|
22 |
+
class NigerianLanguageModel:
    """Causal language model plus tokenizer, extended with language tags.

    On construction the tokenizer and model named by ``config.model_name``
    are loaded, one ``[LANG]`` special token per configured language is
    registered, and the model is moved to ``config.device``.
    """

    def __init__(self, config: ModelConfig):
        """Store ``config`` and load the tokenizer and model immediately."""
        self.config = config
        self.setup_model()

    def setup_model(self):
        """Load tokenizer and model, register language tags, move to device."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.config.model_name)
        self._setup_special_tokens()
        self.model.to(self.config.device)

    def _setup_special_tokens(self):
        """Add a ``[LANG]`` token per language and resize the embeddings."""
        special_tokens = {
            "additional_special_tokens": [f"[{lang}]" for lang in self.config.languages]
        }
        self.tokenizer.add_special_tokens(special_tokens)
        # The embedding matrix must grow to cover the newly added token ids.
        self.model.resize_token_embeddings(len(self.tokenizer))

    def generate(self, prompt: str, max_length: Optional[int] = None) -> str:
        """Generate a continuation of ``prompt`` and return it as text.

        Args:
            prompt: Input text, optionally starting with a ``[LANG]`` tag.
            max_length: Upper bound on total sequence length (prompt plus
                continuation) in tokens; defaults to ``config.max_length``.

        Returns:
            The decoded sequence (prompt included) with special tokens,
            including the language tag, removed.
        """
        limit = self.config.max_length if max_length is None else max_length

        encoded = self.tokenizer(prompt, return_tensors="pt")
        inputs = {name: tensor.to(self.config.device) for name, tensor in encoded.items()}

        # Some tokenizers (GPT-2 among them) define no pad token; fall back
        # to EOS so generation has a valid pad id.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id

        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_length=limit,
                pad_token_id=pad_id,
            )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
app/model/tokenizer.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PreTrainedTokenizerFast
|
2 |
+
from typing import List, Dict
|
3 |
+
|
4 |
+
class NigerianLanguageTokenizer:
    """Thin wrapper that batch-tokenizes text with a base tokenizer."""

    def __init__(self, base_tokenizer: PreTrainedTokenizerFast):
        """Wrap ``base_tokenizer``.

        If the tokenizer defines no pad token, its EOS token is assigned as
        the pad token. Note that this modifies the tokenizer passed in.
        """
        self.tokenizer = base_tokenizer
        # ``tokenize_batch`` pads; tokenizers without a pad token (GPT-2, for
        # example) raise on ``padding=True`` unless one is set.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def tokenize_batch(self, texts: List[str]) -> Dict:
        """Tokenize ``texts`` with padding and truncation.

        Args:
            texts: Strings to encode together as one batch.

        Returns:
            The tokenizer's output, requested as PyTorch tensors.
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
|
app/utils/data_processing.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from typing import List, Dict
|
3 |
+
import os
|
4 |
+
|
5 |
+
def load_language_data(data_dir: str, language: str) -> List[str]:
    """Read the raw text lines for one language.

    The file is expected at ``<data_dir>/<language lower-cased>/texts.txt``.

    Args:
        data_dir: Root directory holding one sub-directory per language.
        language: Language name; lower-cased to form the directory name.

    Returns:
        The file's lines, each still carrying its trailing newline.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Join each path component separately rather than embedding "/" in one.
    filepath = os.path.join(data_dir, language.lower(), "texts.txt")
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.readlines()
|
9 |
+
|
10 |
+
def preprocess_text(text: str) -> str:
    """Normalize whitespace in ``text``.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) becomes a single space.

    Args:
        text: Raw input string.

    Returns:
        The normalized string; empty if ``text`` holds only whitespace.
    """
    # ``split()`` with no arguments already discards leading/trailing
    # whitespace, so a separate ``strip()`` is unnecessary.
    return ' '.join(text.split())
|
app/utils/text_processing.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
def clean_text(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends.

    Args:
        text: Raw input string.

    Returns:
        ``text`` with each whitespace run replaced by one space and no
        leading or trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
8 |
+
|
9 |
+
def split_into_sentences(text: str) -> List[str]:
    """Split ``text`` on runs of ``.``, ``!`` or ``?``.

    Args:
        text: Input string.

    Returns:
        The trimmed, non-empty fragments in order; the terminating
        punctuation is not kept.
    """
    sentences = []
    for fragment in re.split(r'[.!?]+', text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
|
configs/model_config.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model:
|
2 |
+
name: "gpt2"
|
3 |
+
max_length: 128
|
4 |
+
batch_size: 16
|
5 |
+
learning_rate: 2e-5
|
6 |
+
num_train_epochs: 3
|
7 |
+
languages:
|
8 |
+
- YORUBA
|
9 |
+
- IGBO
|
10 |
+
- HAUSA
|
configs/training_config.yaml
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
training:
|
2 |
+
output_dir: "outputs"
|
3 |
+
evaluation_strategy: "steps"
|
4 |
+
eval_steps: 500
|
5 |
+
save_steps: 500
|
6 |
+
logging_steps: 100
|
7 |
+
learning_rate: 2e-5
|
8 |
+
num_train_epochs: 3
|
9 |
+
per_device_train_batch_size: 16
|
10 |
+
per_device_eval_batch_size: 16
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers>=4.30.0
|
2 |
+
torch>=2.0.0
|
3 |
+
gradio>=3.50.0
|
4 |
+
datasets>=2.14.0
|
5 |
+
pandas>=1.5.0
|
6 |
+
pytest>=7.0.0
|
7 |
+
pyyaml>=6.0.0
|
scripts/evaluate.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from app.model.model import NigerianLanguageModel
|
2 |
+
import torch
|
3 |
+
from typing import Dict
|
4 |
+
import json
|
5 |
+
|
6 |
+
def evaluate_model(model: NigerianLanguageModel, test_data) -> Dict:
    """Return the (currently empty) evaluation result structure.

    Placeholder: neither ``model`` nor ``test_data`` is used yet.

    Returns:
        A dict with the keys ``"perplexity"`` and ``"generation_samples"``,
        each mapped to an empty list.
    """
    results = dict(perplexity=[], generation_samples=[])

    # Add evaluation logic here
    return results
|
scripts/train.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import Trainer, TrainingArguments
|
3 |
+
from app.model.model import NigerianLanguageModel
|
4 |
+
from app.model.config import ModelConfig
|
5 |
+
|
6 |
+
def train_model(model: NigerianLanguageModel, train_dataset, eval_dataset=None):
    """Fine-tune the wrapped model with the Hugging Face ``Trainer``.

    Epoch count, per-device batch size, learning rate and output directory
    are all taken from ``model.config``.

    Args:
        model: Wrapper whose ``.model`` is trained and whose ``.config``
            supplies the hyperparameters.
        train_dataset: Dataset passed to ``Trainer`` for training.
        eval_dataset: Optional dataset passed to ``Trainer`` for evaluation.
    """
    training_args = TrainingArguments(
        # Honour the configured directory instead of a hard-coded "outputs";
        # the config default is "outputs", so default behaviour is unchanged.
        output_dir=model.config.output_dir,
        num_train_epochs=model.config.num_train_epochs,
        per_device_train_batch_size=model.config.batch_size,
        learning_rate=model.config.learning_rate,
        save_steps=500,
    )

    # NOTE(review): no evaluation schedule is configured here, so
    # ``eval_dataset`` is presumably only used if evaluate() is called
    # explicitly — confirm intent.
    trainer = Trainer(
        model=model.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()
|
23 |
+
|
24 |
+
# scripts/preprocess.py
|
25 |
+
# The helpers live in app/utils/data_processing.py; the module name
# "data_preprocessing" does not exist in this package.
from app.utils.data_processing import load_language_data, preprocess_text
import os


def main():
    """Normalize the raw text of every language and write it back out.

    For each of yoruba, igbo and hausa, lines are loaded from ``data/raw``,
    passed through ``preprocess_text`` and written one per line to
    ``data/processed/<lang>/processed_texts.txt``.
    """
    languages = ["yoruba", "igbo", "hausa"]
    for lang in languages:
        data = load_language_data("data/raw", lang)
        processed_data = [preprocess_text(text) for text in data]

        output_dir = f"data/processed/{lang}"
        os.makedirs(output_dir, exist_ok=True)

        with open(f"{output_dir}/processed_texts.txt", 'w', encoding='utf-8') as f:
            # preprocess_text strips the trailing newline, so re-add one per
            # record; otherwise every record is fused onto a single line.
            f.writelines(f"{line}\n" for line in processed_data)
|
setup.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup, find_packages
|
2 |
+
|
3 |
+
setup(
|
4 |
+
name="nigerian-language-generator",
|
5 |
+
version="0.1.0",
|
6 |
+
packages=find_packages(),
|
7 |
+
install_requires=[
|
8 |
+
"transformers>=4.30.0",
|
9 |
+
"torch>=2.0.0",
|
10 |
+
"gradio>=3.50.0",
|
11 |
+
"datasets>=2.14.0",
|
12 |
+
],
|
13 |
+
)
|
tests/test_model.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
from app.model.model import NigerianLanguageModel
|
3 |
+
from app.model.config import ModelConfig
|
4 |
+
|
5 |
+
def test_model_initialization():
|
6 |
+
config = ModelConfig()
|
7 |
+
model = NigerianLanguageModel(config)
|
8 |
+
assert model.tokenizer is not None
|
9 |
+
assert model.model is not None
|
tests/test_processing.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
# preprocess_text is defined in app/utils/data_processing.py; importing from
# "data_preprocessing" fails with ModuleNotFoundError at collection time.
from app.utils.data_processing import preprocess_text


def test_preprocess_text():
    """Surrounding whitespace is removed and the inner text is preserved."""
    text = " Sample text with spaces "
    processed = preprocess_text(text)
    assert processed == "Sample text with spaces"
|
tests/test_tokenizer.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
from app.model.tokenizer import NigerianLanguageTokenizer
|
3 |
+
from transformers import AutoTokenizer
|
4 |
+
|
5 |
+
def test_tokenizer():
|
6 |
+
base_tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
7 |
+
tokenizer = NigerianLanguageTokenizer(base_tokenizer)
|
8 |
+
text = "Sample text"
|
9 |
+
tokens = tokenizer.tokenize_batch([text])
|
10 |
+
assert tokens is not None
|