Omarrran committed (verified)
Commit 3dd27dd · 1 parent: fef2e8d

Create app.py

Files changed (1): app.py (+247, -0)
app.py ADDED
@@ -0,0 +1,247 @@
import os
import json
import nltk
import gradio as gr
from datetime import datetime
from pathlib import Path
import shutil

# Download NLTK sentence tokenizer data
nltk.download('punkt')
nltk.download('punkt_tab')  # additionally required by sent_tokenize on newer NLTK releases


class TTSDatasetCollector:
    """Manages TTS dataset collection and organization"""

    def __init__(self, root_path: str = "dataset_root"):
        self.root_path = Path(root_path)
        self.sentences = []
        self.current_index = 0
        self.setup_directories()

    def setup_directories(self):
        """Create necessary directory structure"""
        for subdir in ['audio', 'transcriptions', 'metadata']:
            (self.root_path / subdir).mkdir(parents=True, exist_ok=True)

    def load_text_file(self, file):
        """Process and load text file"""
        try:
            with open(file.name, 'r', encoding='utf-8') as f:
                text = f.read()
            self.sentences = nltk.sent_tokenize(text)
            self.current_index = 0
            return True, f"Loaded {len(self.sentences)} sentences"
        except Exception as e:
            return False, f"Error loading file: {str(e)}"

    def generate_filenames(self, dataset_name: str, speaker_id: str) -> tuple:
        """Generate unique filenames for audio and text"""
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        sentence_id = f"{self.current_index + 1:04d}"
        base_name = f"{dataset_name}_{speaker_id}_{sentence_id}_{timestamp}"
        return f"{base_name}.wav", f"{base_name}.txt"

    def save_recording(self, audio_file, speaker_id: str, dataset_name: str):
        """Save recording and transcription"""
        if not audio_file or not speaker_id or not dataset_name:
            return False, "Missing required information"
        if not self.sentences:
            return False, "No sentences loaded"

        try:
            # Generate filenames
            audio_name, text_name = self.generate_filenames(dataset_name, speaker_id)

            # Create speaker directories
            audio_dir = self.root_path / 'audio' / speaker_id
            text_dir = self.root_path / 'transcriptions' / speaker_id
            audio_dir.mkdir(parents=True, exist_ok=True)
            text_dir.mkdir(parents=True, exist_ok=True)

            # Save audio file
            shutil.copy2(audio_file, audio_dir / audio_name)

            # Save transcription
            self.save_transcription(
                text_dir / text_name,
                self.sentences[self.current_index],
                {
                    'speaker_id': speaker_id,
                    'dataset_name': dataset_name,
                    'timestamp': datetime.now().isoformat(),
                    'audio_file': audio_name
                }
            )

            return True, "Recording saved successfully"
        except Exception as e:
            return False, f"Error saving recording: {str(e)}"

    def save_transcription(self, file_path: Path, text: str, metadata: dict):
        """Save transcription with metadata"""
        content = f"""[METADATA]
Recording_ID: {metadata['audio_file']}
Speaker_ID: {metadata['speaker_id']}
Dataset_Name: {metadata['dataset_name']}
Timestamp: {metadata['timestamp']}

[TEXT]
{text}
"""
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)


def create_interface():
    """Create Gradio interface for TTS data collection"""

    collector = TTSDatasetCollector()

    with gr.Blocks(title="TTS Dataset Collection Tool") as interface:
        gr.Markdown("# TTS Dataset Collection Tool")

        with gr.Row():
            # Left column - Configuration
            with gr.Column():
                file_input = gr.File(
                    label="Upload Text File (.txt)",
                    file_types=[".txt"]
                )
                speaker_id = gr.Textbox(
                    label="Speaker ID",
                    placeholder="Enter unique speaker identifier"
                )
                dataset_name = gr.Textbox(
                    label="Dataset Name",
                    placeholder="Enter dataset name"
                )

            # Right column - Recording
            with gr.Column():
                current_text = gr.Textbox(
                    label="Current Sentence",
                    interactive=False
                )
                audio_recorder = gr.Audio(
                    label="Record Audio",
                    type="filepath"
                )
                next_text = gr.Textbox(
                    label="Next Sentence",
                    interactive=False
                )

        # Controls
        with gr.Row():
            prev_btn = gr.Button("Previous")
            next_btn = gr.Button("Next")
            save_btn = gr.Button("Save Recording", variant="primary")

        # Status
        with gr.Row():
            progress = gr.Textbox(
                label="Progress",
                interactive=False
            )
            status = gr.Textbox(
                label="Status",
                interactive=False
            )

        # Event handlers
        def load_file(file):
            if not file:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: "No file selected"
                }

            success, msg = collector.load_text_file(file)
            if not success:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: msg
                }

            return {
                current_text: collector.sentences[0],
                next_text: collector.sentences[1] if len(collector.sentences) > 1 else "",
                progress: f"Sentence 1 of {len(collector.sentences)}",
                status: msg
            }

        def update_display():
            """Update interface display"""
            if not collector.sentences:
                return {
                    current_text: "",
                    next_text: "",
                    progress: "",
                    status: "No text loaded"
                }

            next_idx = collector.current_index + 1
            return {
                current_text: collector.sentences[collector.current_index],
                next_text: collector.sentences[next_idx] if next_idx < len(collector.sentences) else "",
                progress: f"Sentence {collector.current_index + 1} of {len(collector.sentences)}",
                status: "Ready for recording"
            }

        def next_sentence():
            """Move to next sentence"""
            if collector.sentences and collector.current_index < len(collector.sentences) - 1:
                collector.current_index += 1
            return update_display()

        def prev_sentence():
            """Move to previous sentence"""
            if collector.sentences and collector.current_index > 0:
                collector.current_index -= 1
            return update_display()

        def save_recording(audio, spk_id, ds_name):
            """Handle saving recording"""
            if not audio:
                return {status: "No audio recorded"}
            if not spk_id:
                return {status: "Speaker ID required"}
            if not ds_name:
                return {status: "Dataset name required"}

            success, msg = collector.save_recording(audio, spk_id, ds_name)
            return {status: msg}

        # Connect event handlers
        file_input.change(
            load_file,
            inputs=[file_input],
            outputs=[current_text, next_text, progress, status]
        )

        next_btn.click(
            next_sentence,
            outputs=[current_text, next_text, progress, status]
        )

        prev_btn.click(
            prev_sentence,
            outputs=[current_text, next_text, progress, status]
        )

        save_btn.click(
            save_recording,
            inputs=[audio_recorder, speaker_id, dataset_name],
            outputs=[status]
        )

    return interface


if __name__ == "__main__":
    interface = create_interface()
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
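For reference, a minimal sketch of exercising the collector without launching the Gradio UI. It assumes app.py is importable from the working directory and that a local file named sentences.txt exists; the file name, dataset name, and speaker ID below are placeholders, not part of the committed code.

# scratch_check.py - quick local smoke test of TTSDatasetCollector (hypothetical helper, not in the commit)
from types import SimpleNamespace

from app import TTSDatasetCollector

collector = TTSDatasetCollector(root_path="dataset_root")

# load_text_file reads file.name, matching what Gradio's File component passes,
# so wrapping a plain path in SimpleNamespace is enough for a local test.
ok, msg = collector.load_text_file(SimpleNamespace(name="sentences.txt"))
print(msg)

if ok:
    for i, sentence in enumerate(collector.sentences):
        collector.current_index = i
        wav_name, txt_name = collector.generate_filenames("demo_set", "spk01")
        print(f"{wav_name}: {sentence}")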