Update app.py
app.py
CHANGED
@@ -1,7 +1,6 @@
 """
 TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
 """
-
 import os
 import json
 import nltk
@@ -15,11 +14,13 @@ import traceback
 import soundfile as sf
 import re
 
-# Download NLTK data during initialization
+# Download required NLTK data during initialization
 try:
+    nltk.download('punkt')  # Download punkt tokenizer data
     nltk.data.find('tokenizers/punkt')
-except
-
+except Exception as e:
+    logger.warning(f"Error downloading NLTK data: {str(e)}")
+    logger.warning("NLTK tokenization might not work properly")
 
 # Configure logging
 logging.basicConfig(
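The hunk above calls nltk.download('punkt') on every launch and only afterwards checks for the tokenizer. A check-first variant (a minimal standalone sketch, not part of this commit; the helper name is illustrative) skips the network round-trip when the model is already cached:

import logging
import nltk

logger = logging.getLogger(__name__)

def ensure_punkt() -> bool:
    """Return True if the punkt tokenizer is available, downloading it only if missing."""
    try:
        nltk.data.find('tokenizers/punkt')  # raises LookupError when the model is absent
        return True
    except LookupError:
        try:
            return bool(nltk.download('punkt'))
        except Exception as e:
            logger.warning(f"Error downloading NLTK data: {e}")
            logger.warning("NLTK tokenization might not work properly")
            return False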
@@ -58,12 +59,20 @@ class TTSDatasetCollector:
 
     def __init__(self):
         """Initialize the TTS Dataset Collector"""
-
+        # Handle both script and notebook environments for root path
+        try:
+            # When running as a script
+            self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
+        except NameError:
+            # When running in Jupyter/IPython
+            self.root_path = Path.cwd() / "dataset"
+
         self.fonts_path = self.root_path / "fonts"
         self.sentences = []
         self.current_index = 0
         self.current_font = "english_serif"
         self.custom_fonts = {}
+        self.recordings = {}  # Store recordings by sentence index
         self.setup_directories()
 
         # Ensure NLTK data is downloaded
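The __init__ change relies on __file__ raising NameError in interactive IPython/Jupyter sessions, where that name is not defined, and falls back to the current working directory. The same idea as a small standalone helper (a sketch; the function name is illustrative, not from the commit):

import os
from pathlib import Path

def resolve_dataset_root() -> Path:
    """Pick a dataset root that works both as a script and inside a notebook."""
    try:
        # __file__ exists when this code runs from a .py file
        return Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
    except NameError:
        # Interactive sessions (Jupyter/IPython) define no __file__
        return Path.cwd() / "dataset"

Note that the fallback only triggers when the code itself is executed interactively; a module imported from a file always has __file__ defined.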
@@ -258,7 +267,7 @@ class TTSDatasetCollector:
         base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
         return f"{base_name}.wav", f"{base_name}.txt"
 
-    def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str]:
+    def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str, Dict]:
         """Save recording with enhanced error handling and logging"""
         if not all([audio_file, speaker_id, dataset_name]):
             missing = []
@@ -268,20 +277,20 @@ class TTSDatasetCollector:
                 missing.append("speaker ID")
             if not dataset_name:
                 missing.append("dataset name")
-            return False, f"Missing required information: {', '.join(missing)}"
+            return False, f"Missing required information: {', '.join(missing)}", {}
 
         # Check if sentences have been loaded
         if not self.sentences:
-            return False, "No sentences have been loaded. Please load text before saving recordings."
+            return False, "No sentences have been loaded. Please load text before saving recordings.", {}
         if self.current_index >= len(self.sentences):
-            return False, "Current sentence index is out of range."
+            return False, "Current sentence index is out of range.", {}
 
         try:
            # Validate inputs
            if not speaker_id.strip().isalnum():
-                return False, "Speaker ID must contain only letters and numbers"
+                return False, "Speaker ID must contain only letters and numbers", {}
            if not dataset_name.strip().isalnum():
-                return False, "Dataset name must contain only letters and numbers"
+                return False, "Dataset name must contain only letters and numbers", {}
 
            # Get current sentence text
            sentence_text = self.sentences[self.current_index]
@@ -321,19 +330,27 @@ class TTSDatasetCollector:
            # Update metadata
            self.update_metadata(speaker_id, dataset_name)
 
+           # Store the recording
+           self.recordings[self.current_index] = {
+               'audio_file': audio_file,
+               'speaker_id': speaker_id,
+               'dataset_name': dataset_name,
+               'sentence': self.sentences[self.current_index]
+           }
+
            # Log success
            self.log_operation(
                f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
                f"Audio={audio_name}, Text={text_name}"
            )
 
-           return True, f"Recording saved successfully as {audio_name}"
+           return True, f"Recording saved successfully as {audio_name}", self.recordings
 
        except Exception as e:
            error_msg = f"Error saving recording: {str(e)}"
            self.log_operation(error_msg, "error")
            logger.error(traceback.format_exc())
-           return False, error_msg
+           return False, error_msg, self.recordings
 
     def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
         """Save transcription with metadata"""
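With the changes above, save_recording now returns a three-element tuple (success, message, recordings), where recordings maps a sentence index to a dict with 'audio_file', 'speaker_id', 'dataset_name' and 'sentence'. A caller-side sketch (hypothetical file path and names, assuming a collector instance that already has sentences loaded):

collector = TTSDatasetCollector()

success, msg, recordings = collector.save_recording(
    "/tmp/take_001.wav",   # path produced by the recorder (illustrative)
    "speaker01",
    "demo"
)

if success:
    for idx, rec in sorted(recordings.items()):
        print(f"[{idx + 1}] {rec['speaker_id']} -> {rec['sentence'][:40]}...")
else:
    print(f"Save failed: {msg}")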
@@ -491,6 +508,34 @@ Font_Style: {metadata['font_style']}
         else:
             return None
 
+    def create_zip_archive(self, speaker_id: str) -> Optional[str]:
+        """Create a ZIP archive of all recordings and transcriptions for a speaker"""
+        try:
+            from zipfile import ZipFile
+            import tempfile
+
+            # Create temporary zip file
+            temp_dir = Path(tempfile.gettempdir())
+            zip_path = temp_dir / f"{speaker_id}_recordings.zip"
+
+            with ZipFile(zip_path, 'w') as zipf:
+                # Add audio files
+                audio_dir = self.root_path / 'audio' / speaker_id
+                if audio_dir.exists():
+                    for audio_file in audio_dir.glob('*.wav'):
+                        zipf.write(audio_file, f"audio/{audio_file.name}")
+
+                # Add transcription files
+                text_dir = self.root_path / 'transcriptions' / speaker_id
+                if text_dir.exists():
+                    for text_file in text_dir.glob('*.txt'):
+                        zipf.write(text_file, f"transcriptions/{text_file.name}")
+
+            return str(zip_path)
+        except Exception as e:
+            logger.error(f"Error creating zip archive: {str(e)}")
+            return None
+
 
 def create_interface():
     """Create Gradio interface with enhanced features"""
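create_zip_archive bundles one speaker's audio/ and transcriptions/ files into a single archive under the system temp directory. The same directory-to-archive pattern in isolation (a generic sketch with an illustrative helper name, not the method itself):

import tempfile
from pathlib import Path
from zipfile import ZipFile

def zip_speaker_files(root: Path, speaker_id: str) -> Path:
    """Bundle one speaker's .wav and .txt files into <tmp>/<speaker_id>_recordings.zip."""
    zip_path = Path(tempfile.gettempdir()) / f"{speaker_id}_recordings.zip"
    with ZipFile(zip_path, 'w') as zipf:
        for subdir, pattern in (('audio', '*.wav'), ('transcriptions', '*.txt')):
            src = root / subdir / speaker_id
            if src.exists():
                for f in src.glob(pattern):
                    # arcname keeps the archive layout: audio/..., transcriptions/...
                    zipf.write(f, f"{subdir}/{f.name}")
    return zip_path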
@@ -597,11 +642,16 @@ def create_interface():
            )
            progress = gr.HTML("")
 
-
-
-
-
-
+           with gr.Row():
+               audio_recorder = gr.Audio(
+                   label="Record Audio",
+                   type="filepath",
+                   elem_classes=["record-button"],
+                   interactive=True,
+                   streaming=False  # Disable streaming to prevent freezing
+               )
+               clear_btn = gr.Button("Clear Recording", variant="secondary")
+
            # Controls
            with gr.Row():
                prev_btn = gr.Button("Previous", variant="secondary")
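With type="filepath" the recorder hands its callback a path to a temporary audio file rather than a (sample_rate, array) tuple, and streaming=False delivers the clip only once recording stops. A minimal sketch of consuming that path with soundfile, which this app already imports (component names here are illustrative):

import gradio as gr
import soundfile as sf

def describe_clip(path):
    """Report duration and sample rate of the recorded file, if any."""
    if not path:
        return "No recording yet"
    info = sf.info(path)
    return f"{info.duration:.1f} s at {info.samplerate} Hz"

with gr.Blocks() as demo:
    recorder = gr.Audio(label="Record Audio", type="filepath", streaming=False)
    clip_info = gr.Textbox(label="Clip info")
    recorder.change(describe_clip, inputs=recorder, outputs=clip_info)

# demo.launch()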
@@ -610,8 +660,43 @@
 
            # Download Links
            with gr.Row():
-               download_audio = gr.File(label="Download Audio", interactive=False)
-               download_transcript = gr.File(label="Download Transcript", interactive=False)
+               download_audio = gr.File(label="Download Last Audio", interactive=False)
+               download_transcript = gr.File(label="Download Last Transcript", interactive=False)
+               download_all = gr.File(label="Download All Recordings", interactive=False)
+
+        def download_all_recordings(speaker_id_value):
+            """Handle downloading all recordings for a speaker"""
+            if not speaker_id_value:
+                return {
+                    status: "⚠️ Please enter a Speaker ID first",
+                    download_all: None
+                }
+
+            zip_path = collector.create_zip_archive(speaker_id_value)
+            if zip_path:
+                return {
+                    status: "✅ Archive created successfully",
+                    download_all: zip_path
+                }
+            return {
+                status: "❌ Failed to create archive",
+                download_all: None
+            }
+
+        # Add download all button and its event handler
+        download_all_btn = gr.Button("Download All Recordings", variant="secondary")
+        download_all_btn.click(
+            download_all_recordings,
+            inputs=[speaker_id],
+            outputs=[status, download_all]
+        )
+
+        # Add recordings display
+        with gr.Column(scale=2):
+            recordings_display = gr.HTML(
+                label="Saved Recordings",
+                value="<div id='recordings-list'></div>"
+            )
 
         def process_pasted_text(text):
             """Handle pasted text input"""
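download_all_btn.click passes only the speaker ID in and fans the result out to status plus the download_all file slot; the handler returns a dict keyed by output components, which recent Gradio releases accept in place of a positional tuple. A stripped-down sketch of that wiring (the collector is replaced by a stand-in, and all component names and the zip path are placeholders):

import gradio as gr

def make_archive(speaker):
    # stand-in for collector.create_zip_archive(speaker); the path is illustrative
    zip_path = f"/tmp/{speaker}_recordings.zip" if speaker else None
    if zip_path:
        return {status: "✅ Archive created successfully", download_all: zip_path}
    return {status: "⚠️ Please enter a Speaker ID first", download_all: None}

with gr.Blocks() as demo:
    speaker_id = gr.Textbox(label="Speaker ID")
    status = gr.Markdown("")
    download_all = gr.File(label="Download All Recordings", interactive=False)
    download_all_btn = gr.Button("Download All Recordings", variant="secondary")
    download_all_btn.click(make_archive, inputs=[speaker_id], outputs=[status, download_all])

# demo.launch()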
@@ -694,10 +779,13 @@ def create_interface():
                return {
                    status: "⚠️ Please record audio first",
                    download_audio: None,
-                   download_transcript: None
+                   download_transcript: None,
+                   download_all: None,
+                   recordings_display: "<div id='recordings-list'>No recordings yet</div>",
+                   audio_recorder: None  # Clear the recorder
                }
 
-            success, msg = collector.save_recording(
+            success, msg, recordings = collector.save_recording(
                audio_file, speaker_id_value, dataset_name_value
            )
 
@@ -706,25 +794,49 @@
                    status: f"❌ {msg}",
                    dataset_info: collector.get_dataset_statistics(),
                    download_audio: None,
-                   download_transcript: None
+                   download_transcript: None,
+                   download_all: None,
+                   recordings_display: "<div id='recordings-list'>No recordings yet</div>"
                }
 
            # Get paths to the saved files
            audio_path = collector.get_last_audio_path(speaker_id_value)
            transcript_path = collector.get_last_transcript_path(speaker_id_value)
+           zip_path = collector.create_zip_archive(speaker_id_value)
 
            # Auto-advance to next sentence after successful save
            nav_info = collector.navigate("next")
            progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
-
+
+           # Update recordings display
+           recordings_html = create_recordings_display(recordings)
+
+           result = {
                current_text: nav_info['current'],
                next_text: nav_info['next'],
                progress: progress_bar,
                status: f"✅ {msg}",
                dataset_info: collector.get_dataset_statistics(),
                download_audio: audio_path,
-               download_transcript: transcript_path
+               download_transcript: transcript_path,
+               download_all: zip_path,
+               recordings_display: recordings_html,
+               audio_recorder: None  # Clear the recorder after successful save
            }
+           return result
+
+        def create_recordings_display(recordings):
+            """Create HTML display for recordings"""
+            recordings_html = "<div id='recordings-list'><h3>Saved Recordings:</h3>"
+            for idx, rec in recordings.items():
+                recordings_html += f"""
+                <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
+                    <p><strong>Sentence {idx + 1}:</strong> {rec['sentence']}</p>
+                    <audio controls src='{rec['audio_file']}'></audio>
+                </div>
+                """
+            recordings_html += "</div>"
+            return recordings_html
 
         def navigate_sentences(direction):
             """Handle navigation between sentences"""
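create_recordings_display interpolates the raw sentence text and file path straight into HTML. One possible hardening (not part of this commit; the function name is illustrative) is to escape the interpolated values with the standard library before building the markup:

import html

def render_recordings(recordings):
    """Escaped variant of the recordings-list markup."""
    parts = ["<div id='recordings-list'><h3>Saved Recordings:</h3>"]
    for idx, rec in sorted(recordings.items()):
        sentence = html.escape(str(rec['sentence']))
        audio_src = html.escape(str(rec['audio_file']), quote=True)
        parts.append(
            f"<div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd;'>"
            f"<p><strong>Sentence {idx + 1}:</strong> {sentence}</p>"
            f"<audio controls src='{audio_src}'></audio>"
            f"</div>"
        )
    parts.append("</div>")
    return "".join(parts)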
@@ -758,6 +870,19 @@ def create_interface():
                status: f"✅ {msg}"
            }
 
+        def clear_recording():
+            """Clear the current recording"""
+            return {
+                audio_recorder: None,
+                status: "Recording cleared"
+            }
+
+        # Add clear button handler
+        clear_btn.click(
+            clear_recording,
+            outputs=[audio_recorder, status]
+        )
+
         # Event handlers
         text_input.change(
             process_pasted_text,
@@ -786,7 +911,9 @@ def create_interface():
        save_btn.click(
            save_current_recording,
            inputs=[audio_recorder, speaker_id, dataset_name],
-           outputs=[current_text, next_text, progress, status, dataset_info,
+           outputs=[current_text, next_text, progress, status, dataset_info,
+                    download_audio, download_transcript, download_all, recordings_display,
+                    audio_recorder]  # Add audio_recorder to outputs
        )
 
        prev_btn.click(