aayushraina committed
Commit a0bc510 · verified · 1 Parent(s): 04cd73a

Upload 5 files

Files changed (5)
  1. README.md +118 -0
  2. app.py +136 -0
  3. app_gradio.py +123 -0
  4. requirements.txt +11 -0
  5. tokenizer_config.json +14 -0
README.md ADDED
@@ -0,0 +1,118 @@
---
language: hi
tags:
- hindi
- tokenizer
- bpe
- subword
- text-processing
pipeline_tag: text2text-generation
inference: true
license: mit
spaces:
- aayushraina/bpe-hindi
---

# Hindi Byte Pair Encoding (BPE) Tokenizer

A specialized BPE tokenizer for Hindi text that achieves efficient compression while maintaining linguistic coherence.

## Online Demo

Try the tokenizer in your browser: [Hindi BPE Tokenizer Demo](https://huggingface.co/spaces/aayushraina/bpe-hindi)

## Project Overview

This project implements a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text. It features:

- Efficient trie-based tokenization
- Visualization of training progress
- Compression ratio optimization
- Support for large Hindi text datasets
- Hugging Face compatibility

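A minimal usage sketch, based on the API the demo apps call (`HindiBPE.load_tokenizer`, `encode`, `decode`) and assuming a trained tokenizer has already been saved to the `tokenizer/` directory:

```python
from hindi_bpe import HindiBPE, preprocess_hindi_text

# Load a previously trained tokenizer from the tokenizer/ directory
tokenizer = HindiBPE.load_tokenizer("tokenizer")

# Normalize the input, then encode it into subword tokens
text = preprocess_hindi_text("हिंदी भाषा बहुत सुंदर है।")
tokens = tokenizer.encode(text)
print(tokens)

# Decode the token sequence back to text
print(tokenizer.decode(tokens))
```
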
## Project Structure

    hindi-bpe/
    ├── data/                  # Dataset directory
    │   ├── train/             # Training data
    │   └── valid/             # Validation data
    ├── tokenizer/             # Saved tokenizer files
    │   ├── encoder.json       # Encoder state
    │   └── vocab_stats.json   # Vocabulary statistics
    ├── output/                # Visualization outputs
    ├── byte_pair_encoder.py   # Core BPE implementation
    ├── hindi_bpe.py           # Hindi-specific wrapper
    ├── test_hindi_bpe.py      # Test suite
    └── requirements.txt       # Dependencies

## Training Statistics

At iteration 4,500:

- Vocabulary size: 4,477
- Data size: 448,754
- Compression ratio: 3.66
- Max token length: 64

## File Descriptions

1. **byte_pair_encoder.py**
   - Core BPE implementation
   - Trie-based tokenization
   - Training statistics tracking
   - Visualization utilities

2. **hindi_bpe.py**
   - Hindi-specific tokenizer wrapper
   - Text preprocessing
   - Model saving/loading
   - Compression ratio calculation

3. **app.py**
   - Interactive web interface
   - Real-time tokenization
   - Training visualization
   - Model parameter tuning

4. **test_hindi_bpe.py**
   - Test suite for the tokenizer
   - Performance benchmarks
   - Example usage

## Installation

Clone the repository and install the dependencies:

- `git clone https://github.com/yourusername/hindi-bpe.git`
- `cd hindi-bpe`
- `pip install -r requirements.txt`

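The pretrained tokenizer files can also be pulled directly from the Hugging Face Hub, exactly as `app.py` does on startup:

```python
from huggingface_hub import snapshot_download

# Download the tokenizer JSON files from the Hub into a local tokenizer/ directory
snapshot_download(
    repo_id="aayushraina/bpe-hindi",
    local_dir="tokenizer",
    allow_patterns=["*.json"],
)
```
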
## Download and prepare the dataset

- `python download_dataset.py`

### Web Interface

- `streamlit run app.py`

### Tests

- `python test_hindi_bpe.py`

The test suite includes:

- Training pipeline verification
- Compression ratio validation
- Token count requirements
- Encoding/decoding accuracy

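For illustration, the encode/decode round trip that the suite verifies can be written as a small pytest-style check; this is a sketch only (the authoritative tests live in `test_hindi_bpe.py`), and it assumes a saved tokenizer in `tokenizer/` and lossless decoding of preprocessed text:

```python
from hindi_bpe import HindiBPE, preprocess_hindi_text

def test_round_trip():
    tokenizer = HindiBPE.load_tokenizer("tokenizer")
    sample = preprocess_hindi_text("मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।")
    tokens = tokenizer.encode(sample)
    # Assumes decoding is lossless over preprocessed text
    assert tokenizer.decode(tokens) == sample
```
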
## Performance Metrics

The tokenizer aims to achieve:

- Vocabulary size < 5,000 tokens
- Compression ratio ≥ 3.2
- Fast encoding/decoding
- Memory-efficient operation

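For reference, the compression ratio shown in the demo app is computed in `tokenize_text` as input characters divided by the total characters across all output tokens:

```python
def compression_ratio(text: str, tokens: list[str]) -> float:
    """Compression ratio as reported in app.py's statistics panel."""
    return len(text) / sum(len(t) for t in tokens)
```
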
## Contributing

1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Open a Pull Request

## License

This project is licensed under the MIT License - see the LICENSE file for details.
app.py ADDED
@@ -0,0 +1,136 @@
import gradio as gr
from huggingface_hub import snapshot_download
from hindi_bpe import HindiBPE, preprocess_hindi_text
import pandas as pd
import plotly.express as px
import os

# Download tokenizer if not exists
if not os.path.exists("tokenizer"):
    snapshot_download(
        repo_id="aayushraina/bpe-hindi",
        local_dir="tokenizer",
        allow_patterns=["*.json"]
    )

class TokenizerDemo:
    def __init__(self):
        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

    def tokenize_text(self, text: str) -> tuple:
        """Tokenize text and return visualization"""
        if not text:
            return "", None, "Please enter some text"

        # Preprocess
        text = preprocess_hindi_text(text)

        # Tokenize
        tokens = self.tokenizer.encode(text)

        # Create visualization
        token_df = pd.DataFrame({
            'Token': tokens,
            'Length': [len(token) for token in tokens]
        })

        fig = px.scatter(token_df,
                         x=range(len(tokens)),
                         y='Length',
                         hover_data=['Token'],
                         title='Token Lengths in Sequence')

        # Calculate statistics
        stats = {
            'Total Tokens': len(tokens),
            'Unique Tokens': len(set(tokens)),
            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
        }

        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
                              for k, v in stats.items())

        return (
            " ".join(tokens),  # Tokenized text
            fig,               # Visualization
            stats_str          # Statistics
        )

    def decode_tokens(self, tokens_text: str) -> str:
        """Decode space-separated tokens back to text"""
        if not tokens_text:
            return "Please tokenize some text first"
        tokens = tokens_text.split()
        return self.tokenizer.decode(tokens)

# Create Gradio interface
demo = TokenizerDemo()

interface = gr.Blocks(title="Hindi BPE Tokenizer")

with interface:
    gr.Markdown("""
    # Hindi BPE Tokenizer Demo

    This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
    Enter Hindi text to see how it gets tokenized and analyze the token distribution.

    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
    """)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Hindi Text",
                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
                lines=5
            )
            tokenize_btn = gr.Button("Tokenize")

        with gr.Column():
            tokens_output = gr.Textbox(
                label="Tokenized Output",
                lines=5
            )
            decode_btn = gr.Button("Decode")

    original_output = gr.Textbox(
        label="Decoded Text",
        lines=5
    )

    stats_output = gr.Textbox(
        label="Tokenization Statistics",
        lines=4
    )

    plot_output = gr.Plot(
        label="Token Length Distribution"
    )

    # Set up event handlers
    tokenize_btn.click(
        fn=demo.tokenize_text,
        inputs=input_text,
        outputs=[tokens_output, plot_output, stats_output]
    )

    decode_btn.click(
        fn=demo.decode_tokens,
        inputs=tokens_output,
        outputs=original_output
    )

    # Add examples
    gr.Examples(
        examples=[
            ["हिंदी भाषा बहुत सुंदर है।"],
            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
        ],
        inputs=input_text
    )

# Launch the interface
interface.launch()
app_gradio.py ADDED
@@ -0,0 +1,123 @@
import gradio as gr
from hindi_bpe import HindiBPE, preprocess_hindi_text
import pandas as pd
import plotly.express as px
import json

class TokenizerDemo:
    def __init__(self):
        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

    def tokenize_text(self, text: str) -> tuple:
        """Tokenize text and return visualization"""
        # Preprocess
        text = preprocess_hindi_text(text)

        # Tokenize
        tokens = self.tokenizer.encode(text)

        # Create visualization
        token_df = pd.DataFrame({
            'Token': tokens,
            'Length': [len(token) for token in tokens]
        })

        fig = px.scatter(token_df,
                         x=range(len(tokens)),
                         y='Length',
                         hover_data=['Token'],
                         title='Token Lengths in Sequence')

        # Calculate statistics
        stats = {
            'Total Tokens': len(tokens),
            'Unique Tokens': len(set(tokens)),
            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
        }

        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
                              for k, v in stats.items())

        return (
            " ".join(tokens),  # Tokenized text
            fig,               # Visualization
            stats_str          # Statistics
        )

    def decode_tokens(self, tokens_text: str) -> str:
        """Decode space-separated tokens back to text"""
        tokens = tokens_text.split()
        return self.tokenizer.decode(tokens)

def create_demo() -> gr.Interface:
    """Create Gradio interface"""
    demo = TokenizerDemo()

    with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
        gr.Markdown("""
        # Hindi BPE Tokenizer Demo

        This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
        Enter Hindi text to see how it gets tokenized and analyze the token distribution.
        """)

        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Input Hindi Text",
                    placeholder="हिंदी में टेक्स्ट दर्ज करें...",
                    lines=5
                )
                tokenize_btn = gr.Button("Tokenize")

            with gr.Column():
                tokens_output = gr.Textbox(
                    label="Tokenized Output",
                    lines=5
                )
                decode_btn = gr.Button("Decode")

        original_output = gr.Textbox(
            label="Decoded Text",
            lines=5
        )

        stats_output = gr.Textbox(
            label="Tokenization Statistics",
            lines=4
        )

        plot_output = gr.Plot(
            label="Token Length Distribution"
        )

        # Set up event handlers
        tokenize_btn.click(
            fn=demo.tokenize_text,
            inputs=input_text,
            outputs=[tokens_output, plot_output, stats_output]
        )

        decode_btn.click(
            fn=demo.decode_tokens,
            inputs=tokens_output,
            outputs=original_output
        )

        # Add examples
        gr.Examples(
            examples=[
                ["हिंदी भाषा बहुत सुंदर है।"],
                ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
                ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
            ],
            inputs=input_text
        )

    return interface

# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,11 @@
numpy==1.23.5
pandas==1.5.3
plotly==5.13.0
kagglehub
streamlit
beautifulsoup4
huggingface-hub>=0.19.0
tqdm
matplotlib
gitpython>=3.1.0
gradio>=4.0.0
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
{
  "model_type": "hindi_bpe",
  "vocab_size": 4477,
  "max_token_length": 64,
  "compression_ratio": 3.66,
  "special_tokens": {
    "pad_token": "",
    "unk_token": "",
    "mask_token": ""
  },
  "do_lower_case": false,
  "strip_accents": false,
  "tokenizer_class": "HindiBPE"
}
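Since this is plain JSON, the configuration can be read with the standard library; a minimal sketch, assuming `tokenizer_config.json` sits in the current working directory:

```python
import json

with open("tokenizer_config.json", encoding="utf-8") as f:
    config = json.load(f)

# Prints 4477 and 64, matching the values stored above
print(config["vocab_size"], config["max_token_length"])
```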