Upload 5 files

Browse files

Files changed (5) hide show

README.md +118 -0
app.py +136 -0
app_gradio.py +123 -0
requirements.txt +11 -0
tokenizer_config.json +14 -0

README.md ADDED Viewed

	@@ -0,0 +1,118 @@

+---
+language: hi
+tags:
+- hindi
+- tokenizer
+- bpe
+- subword
+- text-processing
+pipeline_tag: text2text-generation
+inference: true
+license: mit
+spaces:
+- aayushraina/bpe-hindi
+---
+# Hindi Byte Pair Encoding (BPE) Tokenizer
+A specialized BPE tokenizer for Hindi text that achieves efficient compression while maintaining linguistic coherence.
+## Online Demo
+Try the tokenizer in your browser: [Hindi BPE Tokenizer Demo](https://huggingface.co/spaces/aayushraina/bpe-hindi)
+## Project Overview
+This project implements a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text. It features:
+- Efficient trie-based tokenization
+- Visualization of training progress
+- Compression ratio optimization
+- Support for large Hindi text datasets
+- Hugging Face compatibility
+## Project Structure
+hindi-bpe/
+├── data/ # Dataset directory
+│ ├── train/ # Training data
+│ └── valid/ # Validation data
+├── tokenizer/ # Saved tokenizer files
+│ ├── encoder.json # Encoder state
+│ └── vocab_stats.json # Vocabulary statistics
+├── output/ # Visualization outputs
+├── byte_pair_encoder.py # Core BPE implementation
+├── hindi_bpe.py # Hindi-specific wrapper
+├── test_hindi_bpe.py # Test suite
+└── requirements.txt # Dependencies
+## Training stats
+    - Iteration 4500:
+    - Vocabulary size: 4,477
+    - Data size: 448,754
+    - Compression ratio: 3.66
+    - Max token length: 64
+## File Descriptions
+1. **byte_pair_encoder.py**
+   - Core BPE implementation
+   - Trie-based tokenization
+   - Training statistics tracking
+   - Visualization utilities
+2. **hindi_bpe.py**
+   - Hindi-specific tokenizer wrapper
+   - Text preprocessing
+   - Model saving/loading
+   - Compression ratio calculation
+3. **app.py**
+   - Interactive web interface
+   - Real-time tokenization
+   - Training visualization
+   - Model parameter tuning
+4. **test_hindi_bpe.py**
+   - Test suite for tokenizer
+   - Performance benchmarks
+   - Example usage
+## Installation
+    - bash
+    - Clone repository
+    - git clone https://github.com/yourusername/hindi-bpe.git
+    - cd hindi-bpe
+    - pip install -r requirements.txt
+## Download and prepare dataset
+    - python download_dataset.py
+### Web Interface
+    - python app.py
+### Tests
+    - python test_hindi_bpe.py
+    - The test suite includes:
+    - Training pipeline verification
+    - Compression ratio validation
+    - Token count requirements
+    - Encoding/decoding accuracy
+## Performance Metrics
+    The tokenizer aims to achieve:
+    - Vocabulary size < 5000 tokens
+    - Compression ratio ≥ 3.2
+    - Fast encoding/decoding
+    - Memory-efficient operation
+## Contributing
+1. Fork the repository
+2. Create feature branch
+3. Commit changes
+4. Push to branch
+5. Create Pull Request
+## License
+This project is licensed under the MIT License - see the LICENSE file for details.

app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import gradio as gr
+from huggingface_hub import snapshot_download
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import os
+# Download tokenizer if not exists
+# Runs at import time. The check is only for the existence of a local
+# "tokenizer" path; when it is missing, the *.json files of the
+# aayushraina/bpe-hindi repo are fetched into ./tokenizer.
+if not os.path.exists("tokenizer"):
+    snapshot_download(
+        repo_id="aayushraina/bpe-hindi",
+        local_dir="tokenizer",
+        allow_patterns=["*.json"]
+    )
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        if not text:
+            return "", None, "Please enter some text"
+        # Preprocess
+        text = preprocess_hindi_text(text)
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+        fig = px.scatter(token_df,
+                        x=range(len(tokens)),
+                        y='Length',
+                        hover_data=['Token'],
+                        title='Token Lengths in Sequence')
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                             for k, v in stats.items())
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,              # Visualization
+            stats_str        # Statistics
+        )
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        if not tokens_text:
+            return "Please tokenize some text first"
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+# Create Gradio interface
+# NOTE: everything below runs at import time, including interface.launch().
+demo = TokenizerDemo()
+interface = gr.Blocks(title="Hindi BPE Tokenizer")
+with interface:
+    gr.Markdown("""
+    # Hindi BPE Tokenizer Demo
+    This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+    Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
+    """)
+    # One row with two columns: text input + Tokenize button, and
+    # token output + Decode button.
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Hindi Text",
+                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                lines=5
+            )
+            tokenize_btn = gr.Button("Tokenize")
+        with gr.Column():
+            tokens_output = gr.Textbox(
+                label="Tokenized Output",
+                lines=5
+            )
+            decode_btn = gr.Button("Decode")
+    # Components created outside the row: decoded text, statistics, plot.
+    original_output = gr.Textbox(
+        label="Decoded Text",
+        lines=5
+    )
+    stats_output = gr.Textbox(
+        label="Tokenization Statistics",
+        lines=4
+    )
+    plot_output = gr.Plot(
+        label="Token Length Distribution"
+    )
+    # Set up event handlers
+    # The outputs list follows the order of the tuple returned by
+    # tokenize_text: (tokens text, figure, statistics text).
+    tokenize_btn.click(
+        fn=demo.tokenize_text,
+        inputs=input_text,
+        outputs=[tokens_output, plot_output, stats_output]
+    )
+    # Decode takes the current content of the "Tokenized Output" box as input.
+    decode_btn.click(
+        fn=demo.decode_tokens,
+        inputs=tokens_output,
+        outputs=original_output
+    )
+    # Add examples
+    gr.Examples(
+        examples=[
+            ["हिंदी भाषा बहुत सुंदर है।"],
+            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+        ],
+        inputs=input_text
+    )
+# Launch the interface
+interface.launch()

app_gradio.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import gradio as gr
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import json
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        # Preprocess
+        text = preprocess_hindi_text(text)
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+        fig = px.scatter(token_df,
+                        x=range(len(tokens)),
+                        y='Length',
+                        hover_data=['Token'],
+                        title='Token Lengths in Sequence')
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                             for k, v in stats.items())
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,              # Visualization
+            stats_str        # Statistics
+        )
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+def create_demo() -> gr.Interface:
+    """Create Gradio interface"""
+    demo = TokenizerDemo()
+    with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
+        gr.Markdown("""
+        # Hindi BPE Tokenizer Demo
+        This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+        Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+        """)
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input Hindi Text",
+                    placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                    lines=5
+                )
+                tokenize_btn = gr.Button("Tokenize")
+            with gr.Column():
+                tokens_output = gr.Textbox(
+                    label="Tokenized Output",
+                    lines=5
+                )
+                decode_btn = gr.Button("Decode")
+        original_output = gr.Textbox(
+            label="Decoded Text",
+            lines=5
+        )
+        stats_output = gr.Textbox(
+            label="Tokenization Statistics",
+            lines=4
+        )
+        plot_output = gr.Plot(
+            label="Token Length Distribution"
+        )
+        # Set up event handlers
+        tokenize_btn.click(
+            fn=demo.tokenize_text,
+            inputs=input_text,
+            outputs=[tokens_output, plot_output, stats_output]
+        )
+        decode_btn.click(
+            fn=demo.decode_tokens,
+            inputs=tokens_output,
+            outputs=original_output
+        )
+        # Add examples
+        gr.Examples(
+            examples=[
+                ["हिंदी भाषा बहुत सुंदर है।"],
+                ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+                ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+            ],
+            inputs=input_text
+        )
+    return interface
+# Create and launch the demo
+# Guarded so that importing this module does not start the server; the
+# interface is built and launched only when the file is run as a script.
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+numpy==1.23.5
+pandas==1.5.3
+plotly==5.13.0
+kagglehub
+streamlit
+beautifulsoup4
+huggingface-hub>=0.19.0
+tqdm
+matplotlib
+gitpython>=3.1.0
+gradio>=4.0.0

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+    "model_type": "hindi_bpe",
+    "vocab_size": 4477,
+    "max_token_length": 64,
+    "compression_ratio": 3.66,
+    "special_tokens": {
+        "pad_token": "",
+        "unk_token": "",
+        "mask_token": ""
+    },
+    "do_lower_case": false,
+    "strip_accents": false,
+    "tokenizer_class": "HindiBPE"
+}