TTS_DATASET_MAKER

Sleeping

App Files Files Community

Pierizvi committed on Nov 25, 2024

Commit

f26107e

verified ·

1 Parent(s): 4e9e165

Update app.py

Browse files

Files changed (1) hide show

app.py +153 -26

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """
 TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
 """
 import os
 import json
 import nltk
@@ -15,11 +14,13 @@ import traceback
 import soundfile as sf
 import re
-# Download NLTK data during initialization
 try:
     nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt', quiet=True)
 # Configure logging
 logging.basicConfig(
@@ -58,12 +59,20 @@ class TTSDatasetCollector:
     def __init__(self):
         """Initialize the TTS Dataset Collector"""
-        self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
         self.fonts_path = self.root_path / "fonts"
         self.sentences = []
         self.current_index = 0
         self.current_font = "english_serif"
         self.custom_fonts = {}
         self.setup_directories()
         # Ensure NLTK data is downloaded
@@ -258,7 +267,7 @@ class TTSDatasetCollector:
         base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
         return f"{base_name}.wav", f"{base_name}.txt"
-    def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str]:
         """Save recording with enhanced error handling and logging"""
         if not all([audio_file, speaker_id, dataset_name]):
             missing = []
@@ -268,20 +277,20 @@ class TTSDatasetCollector:
                 missing.append("speaker ID")
             if not dataset_name:
                 missing.append("dataset name")
-            return False, f"Missing required information: {', '.join(missing)}"
         # Check if sentences have been loaded
         if not self.sentences:
-            return False, "No sentences have been loaded. Please load text before saving recordings."
         if self.current_index >= len(self.sentences):
-            return False, "Current sentence index is out of range."
         try:
             # Validate inputs
             if not speaker_id.strip().isalnum():
-                return False, "Speaker ID must contain only letters and numbers"
             if not dataset_name.strip().isalnum():
-                return False, "Dataset name must contain only letters and numbers"
             # Get current sentence text
             sentence_text = self.sentences[self.current_index]
@@ -321,19 +330,27 @@ class TTSDatasetCollector:
             # Update metadata
             self.update_metadata(speaker_id, dataset_name)
             # Log success
             self.log_operation(
                 f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
                 f"Audio={audio_name}, Text={text_name}"
             )
-            return True, f"Recording saved successfully as {audio_name}"
         except Exception as e:
             error_msg = f"Error saving recording: {str(e)}"
             self.log_operation(error_msg, "error")
             logger.error(traceback.format_exc())
-            return False, error_msg
     def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
         """Save transcription with metadata"""
@@ -491,6 +508,34 @@ Font_Style: {metadata['font_style']}
         else:
             return None
 def create_interface():
     """Create Gradio interface with enhanced features"""
@@ -597,11 +642,16 @@ def create_interface():
                 )
                 progress = gr.HTML("")
-                audio_recorder = gr.Audio(
-                    label="Record Audio",
-                    type="filepath",
-                    elem_classes=["record-button"]
-                )
                 # Controls
                 with gr.Row():
                     prev_btn = gr.Button("Previous", variant="secondary")
@@ -610,8 +660,43 @@ def create_interface():
                 # Download Links
                 with gr.Row():
-                    download_audio = gr.File(label="Download Audio", interactive=False)
-                    download_transcript = gr.File(label="Download Transcript", interactive=False)
         def process_pasted_text(text):
             """Handle pasted text input"""
@@ -694,10 +779,13 @@ def create_interface():
                 return {
                     status: "⚠️ Please record audio first",
                     download_audio: None,
-                    download_transcript: None
                 }
-            success, msg = collector.save_recording(
                 audio_file, speaker_id_value, dataset_name_value
             )
@@ -706,25 +794,49 @@ def create_interface():
                     status: f"❌ {msg}",
                     dataset_info: collector.get_dataset_statistics(),
                     download_audio: None,
-                    download_transcript: None
                 }
             # Get paths to the saved files
             audio_path = collector.get_last_audio_path(speaker_id_value)
             transcript_path = collector.get_last_transcript_path(speaker_id_value)
             # Auto-advance to next sentence after successful save
             nav_info = collector.navigate("next")
             progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
-            return {
                 current_text: nav_info['current'],
                 next_text: nav_info['next'],
                 progress: progress_bar,
                 status: f"✅ {msg}",
                 dataset_info: collector.get_dataset_statistics(),
                 download_audio: audio_path,
-                download_transcript: transcript_path
             }
         def navigate_sentences(direction):
             """Handle navigation between sentences"""
@@ -758,6 +870,19 @@ def create_interface():
                 status: f"✅ {msg}"
             }
         # Event handlers
         text_input.change(
             process_pasted_text,
@@ -786,7 +911,9 @@ def create_interface():
         save_btn.click(
             save_current_recording,
             inputs=[audio_recorder, speaker_id, dataset_name],
-            outputs=[current_text, next_text, progress, status, dataset_info, download_audio, download_transcript]
         )
         prev_btn.click(

 """
 TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
 """
 import os
 import json
 import nltk
 import soundfile as sf
 import re
+# Download required NLTK data during initialization
 try:
+    nltk.download('punkt')  # Download punkt tokenizer data
     nltk.data.find('tokenizers/punkt')
+except Exception as e:
+    logger.warning(f"Error downloading NLTK data: {str(e)}")
+    logger.warning("NLTK tokenization might not work properly")
 # Configure logging
 logging.basicConfig(
     def __init__(self):
         """Initialize the TTS Dataset Collector"""
+        # Handle both script and notebook environments for root path
+        try:
+            # When running as a script
+            self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
+        except NameError:
+            # When running in Jupyter/IPython
+            self.root_path = Path.cwd() / "dataset"
         self.fonts_path = self.root_path / "fonts"
         self.sentences = []
         self.current_index = 0
         self.current_font = "english_serif"
         self.custom_fonts = {}
+        self.recordings = {}  # Store recordings by sentence index
         self.setup_directories()
         # Ensure NLTK data is downloaded
         base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
         return f"{base_name}.wav", f"{base_name}.txt"
+    def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str, Dict]:
         """Save recording with enhanced error handling and logging"""
         if not all([audio_file, speaker_id, dataset_name]):
             missing = []
                 missing.append("speaker ID")
             if not dataset_name:
                 missing.append("dataset name")
+            return False, f"Missing required information: {', '.join(missing)}", {}
         # Check if sentences have been loaded
         if not self.sentences:
+            return False, "No sentences have been loaded. Please load text before saving recordings.", {}
         if self.current_index >= len(self.sentences):
+            return False, "Current sentence index is out of range.", {}
         try:
             # Validate inputs
             if not speaker_id.strip().isalnum():
+                return False, "Speaker ID must contain only letters and numbers", {}
             if not dataset_name.strip().isalnum():
+                return False, "Dataset name must contain only letters and numbers", {}
             # Get current sentence text
             sentence_text = self.sentences[self.current_index]
             # Update metadata
             self.update_metadata(speaker_id, dataset_name)
+            # Store the recording
+            self.recordings[self.current_index] = {
+                'audio_file': audio_file,
+                'speaker_id': speaker_id,
+                'dataset_name': dataset_name,
+                'sentence': self.sentences[self.current_index]
+            }
             # Log success
             self.log_operation(
                 f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
                 f"Audio={audio_name}, Text={text_name}"
             )
+            return True, f"Recording saved successfully as {audio_name}", self.recordings
         except Exception as e:
             error_msg = f"Error saving recording: {str(e)}"
             self.log_operation(error_msg, "error")
             logger.error(traceback.format_exc())
+            return False, error_msg, self.recordings
     def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
         """Save transcription with metadata"""
         else:
             return None
+    def create_zip_archive(self, speaker_id: str) -> Optional[str]:
+        """Create a ZIP archive of all recordings and transcriptions for a speaker"""
+        try:
+            from zipfile import ZipFile
+            import tempfile
+            # Create temporary zip file
+            temp_dir = Path(tempfile.gettempdir())
+            zip_path = temp_dir / f"{speaker_id}_recordings.zip"
+            with ZipFile(zip_path, 'w') as zipf:
+                # Add audio files
+                audio_dir = self.root_path / 'audio' / speaker_id
+                if audio_dir.exists():
+                    for audio_file in audio_dir.glob('*.wav'):
+                        zipf.write(audio_file, f"audio/{audio_file.name}")
+                # Add transcription files
+                text_dir = self.root_path / 'transcriptions' / speaker_id
+                if text_dir.exists():
+                    for text_file in text_dir.glob('*.txt'):
+                        zipf.write(text_file, f"transcriptions/{text_file.name}")
+            return str(zip_path)
+        except Exception as e:
+            logger.error(f"Error creating zip archive: {str(e)}")
+            return None
 def create_interface():
     """Create Gradio interface with enhanced features"""
                 )
                 progress = gr.HTML("")
+                with gr.Row():
+                    audio_recorder = gr.Audio(
+                        label="Record Audio",
+                        type="filepath",
+                        elem_classes=["record-button"],
+                        interactive=True,
+                        streaming=False  # Disable streaming to prevent freezing
+                    )
+                    clear_btn = gr.Button("Clear Recording", variant="secondary")
                 # Controls
                 with gr.Row():
                     prev_btn = gr.Button("Previous", variant="secondary")
                 # Download Links
                 with gr.Row():
+                    download_audio = gr.File(label="Download Last Audio", interactive=False)
+                    download_transcript = gr.File(label="Download Last Transcript", interactive=False)
+                    download_all = gr.File(label="Download All Recordings", interactive=False)
+                def download_all_recordings(speaker_id_value):
+                    """Handle downloading all recordings for a speaker"""
+                    if not speaker_id_value:
+                        return {
+                            status: "⚠️ Please enter a Speaker ID first",
+                            download_all: None
+                        }
+                    zip_path = collector.create_zip_archive(speaker_id_value)
+                    if zip_path:
+                        return {
+                            status: "✅ Archive created successfully",
+                            download_all: zip_path
+                        }
+                    return {
+                        status: "❌ Failed to create archive",
+                        download_all: None
+                    }
+                # Add download all button and its event handler
+                download_all_btn = gr.Button("Download All Recordings", variant="secondary")
+                download_all_btn.click(
+                    download_all_recordings,
+                    inputs=[speaker_id],
+                    outputs=[status, download_all]
+                )
+        # Add recordings display
+        with gr.Column(scale=2):
+            recordings_display = gr.HTML(
+                label="Saved Recordings",
+                value="<div id='recordings-list'></div>"
+            )
         def process_pasted_text(text):
             """Handle pasted text input"""
                 return {
                     status: "⚠️ Please record audio first",
                     download_audio: None,
+                    download_transcript: None,
+                    download_all: None,
+                    recordings_display: "<div id='recordings-list'>No recordings yet</div>",
+                    audio_recorder: None  # Clear the recorder
                 }
+            success, msg, recordings = collector.save_recording(
                 audio_file, speaker_id_value, dataset_name_value
             )
                     status: f"❌ {msg}",
                     dataset_info: collector.get_dataset_statistics(),
                     download_audio: None,
+                    download_transcript: None,
+                    download_all: None,
+                    recordings_display: "<div id='recordings-list'>No recordings yet</div>"
                 }
             # Get paths to the saved files
             audio_path = collector.get_last_audio_path(speaker_id_value)
             transcript_path = collector.get_last_transcript_path(speaker_id_value)
+            zip_path = collector.create_zip_archive(speaker_id_value)
             # Auto-advance to next sentence after successful save
             nav_info = collector.navigate("next")
             progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
+            # Update recordings display
+            recordings_html = create_recordings_display(recordings)
+            result = {
                 current_text: nav_info['current'],
                 next_text: nav_info['next'],
                 progress: progress_bar,
                 status: f"✅ {msg}",
                 dataset_info: collector.get_dataset_statistics(),
                 download_audio: audio_path,
+                download_transcript: transcript_path,
+                download_all: zip_path,
+                recordings_display: recordings_html,
+                audio_recorder: None  # Clear the recorder after successful save
             }
+            return result
+        def create_recordings_display(recordings):
+            """Create HTML display for recordings"""
+            recordings_html = "<div id='recordings-list'><h3>Saved Recordings:</h3>"
+            for idx, rec in recordings.items():
+                recordings_html += f"""
+                <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
+                    <p><strong>Sentence {idx + 1}:</strong> {rec['sentence']}</p>
+                    <audio controls src='{rec['audio_file']}'></audio>
+                </div>
+                """
+            recordings_html += "</div>"
+            return recordings_html
         def navigate_sentences(direction):
             """Handle navigation between sentences"""
                 status: f"✅ {msg}"
             }
+        def clear_recording():
+            """Clear the current recording"""
+            return {
+                audio_recorder: None,
+                status: "Recording cleared"
+            }
+        # Add clear button handler
+        clear_btn.click(
+            clear_recording,
+            outputs=[audio_recorder, status]
+        )
         # Event handlers
         text_input.change(
             process_pasted_text,
         save_btn.click(
             save_current_recording,
             inputs=[audio_recorder, speaker_id, dataset_name],
+            outputs=[current_text, next_text, progress, status, dataset_info,
+                    download_audio, download_transcript, download_all, recordings_display,
+                    audio_recorder]  # Add audio_recorder to outputs
         )
         prev_btn.click(