Gabriel Okiri committed on
Commit
4bb9d41
·
1 Parent(s): 614c0fa

Initial commit

Browse files
.github/workflows/cicd.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# .github/workflows/sync-to-hub.yml
name: Sync to Hugging Face Hub
on:
  push:
    branches: [main]

jobs:
  sync:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          # The default checkout is a shallow clone; pushing one to the Hub is
          # rejected, so fetch the complete history (and any LFS objects).
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          # Access token stored as a repository secret; expanded by the shell below.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          git push https://YOUR_USERNAME:$HF_TOKEN@huggingface.co/spaces/YOUR_USERNAME/nigerian-language-generator-space main
17
+
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .env
4
+ venv/
5
+ *.pth
6
+ .DS_Store
7
+ outputs/
8
+ logs/
9
+ data/raw/
10
+ data/processed/
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /code
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,14 +0,0 @@
1
- ---
2
- title: Nigerian Languages
3
- emoji: 📉
4
- colorFrom: purple
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.11.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Nigerian_languages
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/interface/gardio_app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from app.model.model import NigerianLanguageModel
3
+ from app.model.config import ModelConfig
4
+
5
# Initialize model
config = ModelConfig()
model = NigerianLanguageModel(config)

# Offer exactly the languages listed in the model configuration so the UI
# choices and the "[LANG]" tags built from the same list cannot drift apart.
LANGUAGE_CHOICES = list(config.languages)


# Create interface
def generate_text(prompt: str, language: str) -> str:
    """Generate text for *prompt* in the selected language.

    The language name is upper-cased and prepended to the prompt as a
    ``[LANGUAGE]`` tag before the result is handed to ``model.generate``.

    Args:
        prompt: Free text entered by the user.
        language: Language name chosen in the dropdown.

    Returns:
        Whatever ``model.generate`` returns for the tagged prompt.

    Raises:
        gr.Error: If no language was selected (a dropdown without a selection
            delivers ``None``, which would otherwise crash on ``.upper()``).
    """
    if not language:
        raise gr.Error("Please select a language.")
    tagged_prompt = f"[{language.upper()}] {prompt}"
    return model.generate(tagged_prompt)


# Define Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(label="Enter your prompt"),
        gr.Dropdown(
            choices=LANGUAGE_CHOICES,
            # Pre-select the first language so a submission always carries one.
            value=LANGUAGE_CHOICES[0] if LANGUAGE_CHOICES else None,
            label="Select Language",
        ),
    ],
    outputs=gr.Textbox(label="Generated Text"),
    title="Nigerian Language Generator",
    description="Generate text in Yoruba, Igbo, or Hausa using a fine-tuned GPT model.",
)

# Start the interface
if __name__ == "__main__":
    interface.launch()
app/model/config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, field
from typing import List, Optional

import torch
4
+
5
@dataclass
class ModelConfig:
    """Hyperparameters and runtime settings for the language model.

    All fields have defaults, so ``ModelConfig()`` yields a usable
    configuration; override individual fields by keyword.
    """

    # Checkpoint identifier handed to the ``from_pretrained`` loaders.
    model_name: str = "gpt2"
    # Sequence length limit, in tokens.
    max_length: int = 128
    # Per-device batch size used when training.
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_train_epochs: int = 3
    # Language names; each one becomes a "[NAME]" special token.
    # A default_factory gives every instance its own list, matching the
    # List[str] annotation (a bare list default is rejected by dataclasses).
    languages: List[str] = field(
        default_factory=lambda: ["YORUBA", "IGBO", "HAUSA"]
    )
    # Evaluated once, when the class is defined: prefer the GPU if one is visible.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    output_dir: str = "outputs"
app/model/model.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. app/model/config.py
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional
4
+ import torch
5
+
6
# NOTE(review): this class repeats the ModelConfig declared in
# app/model/config.py. The `from .config import ModelConfig` statement further
# down this module rebinds the name, so code below that import sees the
# imported class rather than this copy.
@dataclass
class ModelConfig:
    """Hyperparameters and runtime settings for the language model."""

    model_name: str = "gpt2"
    max_length: int = 128
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_train_epochs: int = 3
    # NOTE(review): annotated as List[str] but the default is a tuple.
    languages: List[str] = ("YORUBA", "IGBO", "HAUSA")
    # Evaluated once, when the class is defined.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    output_dir: str = "outputs"
16
+
17
+ # app/model/model.py
18
+ from transformers import AutoTokenizer, AutoModelForCausalLM
19
+ import torch
20
+ from .config import ModelConfig
21
+
22
class NigerianLanguageModel:
    """Causal language model plus tokenizer, extended with language tags.

    On construction the tokenizer and model named by ``config.model_name`` are
    loaded, one ``[LANGUAGE]`` special token is registered per entry in
    ``config.languages``, and the model is moved to ``config.device``.
    """

    def __init__(self, config: ModelConfig):
        """Store *config* and load the tokenizer and model it describes."""
        self.config = config
        self.setup_model()

    def setup_model(self):
        """Load tokenizer and weights, register language tags, move to device."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(self.config.model_name)
        # Tokens must be added before the model is used so that the embedding
        # matrix is resized to the new vocabulary.
        self._setup_special_tokens()
        self.model.to(self.config.device)

    def _setup_special_tokens(self):
        """Register one ``[LANG]`` token per language and resize the embeddings."""
        special_tokens = {
            "additional_special_tokens": [
                f"[{lang}]" for lang in self.config.languages
            ]
        }
        # Some checkpoints (GPT-2 among them) ship without a padding token;
        # reuse end-of-sequence so padded batches and generation work.
        if self.tokenizer.pad_token is None and self.tokenizer.eos_token is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.add_special_tokens(special_tokens)
        self.model.resize_token_embeddings(len(self.tokenizer))

    def generate(self, prompt: str, max_length: Optional[int] = None, **generate_kwargs) -> str:
        """Continue *prompt* and return the decoded text.

        Args:
            prompt: Input text, typically prefixed with a ``[LANGUAGE]`` tag.
            max_length: Upper bound on the total sequence length (prompt plus
                continuation) in tokens; defaults to ``config.max_length``.
            **generate_kwargs: Extra options forwarded to the underlying
                model's ``generate`` call (for example sampling settings).

        Returns:
            The decoded first output sequence with special tokens removed.
        """
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.config.device)
        limit = self.config.max_length if max_length is None else max_length
        # Let callers override the pad token, but default to the tokenizer's.
        generate_kwargs.setdefault("pad_token_id", self.tokenizer.pad_token_id)
        with torch.no_grad():
            output_ids = self.model.generate(
                **encoded,
                max_length=limit,
                **generate_kwargs,
            )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
app/model/tokenizer.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedTokenizerFast
2
+ from typing import List, Dict
3
+
4
class NigerianLanguageTokenizer:
    """Thin wrapper that batches texts through a Hugging Face tokenizer."""

    def __init__(self, base_tokenizer: PreTrainedTokenizerFast):
        """Wrap *base_tokenizer*.

        If the tokenizer has no padding token, its end-of-sequence token is
        assigned as the padding token on the passed-in object. Without one,
        the padded call in ``tokenize_batch`` raises.
        """
        self.tokenizer = base_tokenizer
        if self.tokenizer.pad_token is None and self.tokenizer.eos_token is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def tokenize_batch(self, texts: List[str]) -> Dict:
        """Tokenize *texts* with padding and truncation.

        Args:
            texts: Strings to encode together as one batch.

        Returns:
            The tokenizer's output for the batch, requested as PyTorch
            tensors (``return_tensors="pt"``).
        """
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
app/utils/data_processing.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import List, Dict
3
+ import os
4
+
5
def load_language_data(data_dir: str, language: str) -> List[str]:
    """Read every line of ``<data_dir>/<language, lower-cased>/texts.txt``.

    Lines are returned exactly as stored, including their line endings.
    """
    relative_path = f"{language.lower()}/texts.txt"
    source_path = os.path.join(data_dir, relative_path)
    with open(source_path, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()
    return lines
9
+
10
def preprocess_text(text: str) -> str:
    """Trim *text* and squeeze every run of whitespace into one space."""
    words = text.strip().split()
    return ' '.join(words)
app/utils/text_processing.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+
4
def clean_text(text: str) -> str:
    """Replace each whitespace run with a single space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
8
+
9
def split_into_sentences(text: str) -> List[str]:
    """Split *text* at runs of ``.``, ``!`` or ``?``.

    The terminators are dropped, each piece is trimmed, and pieces that are
    empty after trimming are left out.
    """
    sentences = []
    for fragment in re.split(r'[.!?]+', text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
configs/model_config.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ name: "gpt2"
3
+ max_length: 128
4
+ batch_size: 16
5
+ learning_rate: 2e-5
6
+ num_train_epochs: 3
7
+ languages:
8
+ - YORUBA
9
+ - IGBO
10
+ - HAUSA
configs/training_config.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ training:
2
+ output_dir: "outputs"
3
+ evaluation_strategy: "steps"
4
+ eval_steps: 500
5
+ save_steps: 500
6
+ logging_steps: 100
7
+ learning_rate: 2e-5
8
+ num_train_epochs: 3
9
+ per_device_train_batch_size: 16
10
+ per_device_eval_batch_size: 16
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers>=4.30.0
2
+ torch>=2.0.0
3
+ gradio>=3.50.0
4
+ datasets>=2.14.0
5
+ pandas>=1.5.0
6
+ pytest>=7.0.0
7
+ pyyaml>=6.0.0
scripts/evaluate.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.model.model import NigerianLanguageModel
2
+ import torch
3
+ from typing import Dict
4
+ import json
5
+
6
def evaluate_model(model: NigerianLanguageModel, test_data) -> Dict:
    """Return the evaluation report for *model* on *test_data*.

    No metrics are computed yet: the report is returned with both of its
    lists empty.
    """
    # Add evaluation logic here
    return {"perplexity": [], "generation_samples": []}
scripts/train.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import Trainer, TrainingArguments
3
+ from app.model.model import NigerianLanguageModel
4
+ from app.model.config import ModelConfig
5
+
6
def train_model(model: NigerianLanguageModel, train_dataset, eval_dataset=None):
    """Fine-tune ``model.model`` with the Hugging Face ``Trainer``.

    Epoch count, batch size, learning rate and output directory are all read
    from ``model.config``. After training, the final weights and the tokenizer
    (which carries the added language tokens) are written to the output
    directory so the result can be reloaded together.

    Args:
        model: Wrapper providing ``model``, ``tokenizer`` and ``config``.
        train_dataset: Dataset passed to the trainer for training.
        eval_dataset: Optional dataset passed to the trainer for evaluation.

    Returns:
        The ``Trainer`` instance after training has finished.
    """
    output_dir = model.config.output_dir
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=model.config.num_train_epochs,
        per_device_train_batch_size=model.config.batch_size,
        learning_rate=model.config.learning_rate,
        save_steps=500,
    )

    trainer = Trainer(
        model=model.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    # Periodic checkpoints only land every `save_steps`; persist the final
    # state explicitly, along with the tokenizer whose vocabulary was extended.
    trainer.save_model(output_dir)
    model.tokenizer.save_pretrained(output_dir)
    return trainer
23
+
24
+ # scripts/preprocess.py
25
+ from app.utils.data_preprocessing import load_language_data, preprocess_text
26
+ import os
27
+
28
+ def main():
29
+ languages = ["yoruba", "igbo", "hausa"]
30
+ for lang in languages:
31
+ data = load_language_data("data/raw", lang)
32
+ processed_data = [preprocess_text(text) for text in data]
33
+
34
+ output_dir = f"data/processed/{lang}"
35
+ os.makedirs(output_dir, exist_ok=True)
36
+
37
+ with open(f"{output_dir}/processed_texts.txt", 'w', encoding='utf-8') as f:
38
+ f.writelines(processed_data)
setup.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
setup(
    name="nigerian-language-generator",
    version="0.1.0",
    packages=find_packages(),
    # Runtime dependencies: every third-party package imported under app/
    # must appear here so an installed copy can be imported.
    install_requires=[
        "transformers>=4.30.0",
        "torch>=2.0.0",
        "gradio>=3.50.0",
        "datasets>=2.14.0",
        "pandas>=1.5.0",  # imported by app/utils/data_processing.py
    ],
)
tests/test_model.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from app.model.model import NigerianLanguageModel
3
+ from app.model.config import ModelConfig
4
+
5
def test_model_initialization():
    """A model built from the default config exposes a tokenizer and a model."""
    wrapper = NigerianLanguageModel(ModelConfig())
    for component in (wrapper.tokenizer, wrapper.model):
        assert component is not None
tests/test_processing.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import pytest
2
# The helper lives in app/utils/data_processing.py.
from app.utils.data_processing import preprocess_text


def test_preprocess_text():
    """Surrounding whitespace is trimmed and inner runs collapse to one space."""
    text = " Sample text with spaces "
    processed = preprocess_text(text)
    assert processed == "Sample text with spaces"

    # Also cover repeated and mixed whitespace between words.
    assert preprocess_text("  Sample \t text\n with   spaces  ") == "Sample text with spaces"
tests/test_tokenizer.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from app.model.tokenizer import NigerianLanguageTokenizer
3
+ from transformers import AutoTokenizer
4
+
5
def test_tokenizer():
    """Batch-tokenizing a single text through the wrapper yields a result."""
    wrapped = NigerianLanguageTokenizer(AutoTokenizer.from_pretrained("gpt2"))
    sample = "Sample text"
    batch = wrapped.tokenize_batch([sample])
    assert batch is not None