Pierizvi committed on
Commit
f26107e
·
verified ·
1 Parent(s): 4e9e165

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -26
app.py CHANGED
@@ -1,7 +1,6 @@
1
  """
2
  TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
3
  """
4
-
5
  import os
6
  import json
7
  import nltk
@@ -15,11 +14,13 @@ import traceback
15
  import soundfile as sf
16
  import re
17
 
18
- # Download NLTK data during initialization
19
  try:
 
20
  nltk.data.find('tokenizers/punkt')
21
- except LookupError:
22
- nltk.download('punkt', quiet=True)
 
23
 
24
  # Configure logging
25
  logging.basicConfig(
@@ -58,12 +59,20 @@ class TTSDatasetCollector:
58
 
59
  def __init__(self):
60
  """Initialize the TTS Dataset Collector"""
61
- self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
 
 
 
 
 
 
 
62
  self.fonts_path = self.root_path / "fonts"
63
  self.sentences = []
64
  self.current_index = 0
65
  self.current_font = "english_serif"
66
  self.custom_fonts = {}
 
67
  self.setup_directories()
68
 
69
  # Ensure NLTK data is downloaded
@@ -258,7 +267,7 @@ class TTSDatasetCollector:
258
  base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
259
  return f"{base_name}.wav", f"{base_name}.txt"
260
 
261
- def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str]:
262
  """Save recording with enhanced error handling and logging"""
263
  if not all([audio_file, speaker_id, dataset_name]):
264
  missing = []
@@ -268,20 +277,20 @@ class TTSDatasetCollector:
268
  missing.append("speaker ID")
269
  if not dataset_name:
270
  missing.append("dataset name")
271
- return False, f"Missing required information: {', '.join(missing)}"
272
 
273
  # Check if sentences have been loaded
274
  if not self.sentences:
275
- return False, "No sentences have been loaded. Please load text before saving recordings."
276
  if self.current_index >= len(self.sentences):
277
- return False, "Current sentence index is out of range."
278
 
279
  try:
280
  # Validate inputs
281
  if not speaker_id.strip().isalnum():
282
- return False, "Speaker ID must contain only letters and numbers"
283
  if not dataset_name.strip().isalnum():
284
- return False, "Dataset name must contain only letters and numbers"
285
 
286
  # Get current sentence text
287
  sentence_text = self.sentences[self.current_index]
@@ -321,19 +330,27 @@ class TTSDatasetCollector:
321
  # Update metadata
322
  self.update_metadata(speaker_id, dataset_name)
323
 
 
 
 
 
 
 
 
 
324
  # Log success
325
  self.log_operation(
326
  f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
327
  f"Audio={audio_name}, Text={text_name}"
328
  )
329
 
330
- return True, f"Recording saved successfully as {audio_name}"
331
 
332
  except Exception as e:
333
  error_msg = f"Error saving recording: {str(e)}"
334
  self.log_operation(error_msg, "error")
335
  logger.error(traceback.format_exc())
336
- return False, error_msg
337
 
338
  def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
339
  """Save transcription with metadata"""
@@ -491,6 +508,34 @@ Font_Style: {metadata['font_style']}
491
  else:
492
  return None
493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
 
495
  def create_interface():
496
  """Create Gradio interface with enhanced features"""
@@ -597,11 +642,16 @@ def create_interface():
597
  )
598
  progress = gr.HTML("")
599
 
600
- audio_recorder = gr.Audio(
601
- label="Record Audio",
602
- type="filepath",
603
- elem_classes=["record-button"]
604
- )
 
 
 
 
 
605
  # Controls
606
  with gr.Row():
607
  prev_btn = gr.Button("Previous", variant="secondary")
@@ -610,8 +660,43 @@ def create_interface():
610
 
611
  # Download Links
612
  with gr.Row():
613
- download_audio = gr.File(label="Download Audio", interactive=False)
614
- download_transcript = gr.File(label="Download Transcript", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
 
616
  def process_pasted_text(text):
617
  """Handle pasted text input"""
@@ -694,10 +779,13 @@ def create_interface():
694
  return {
695
  status: "⚠️ Please record audio first",
696
  download_audio: None,
697
- download_transcript: None
 
 
 
698
  }
699
 
700
- success, msg = collector.save_recording(
701
  audio_file, speaker_id_value, dataset_name_value
702
  )
703
 
@@ -706,25 +794,49 @@ def create_interface():
706
  status: f"❌ {msg}",
707
  dataset_info: collector.get_dataset_statistics(),
708
  download_audio: None,
709
- download_transcript: None
 
 
710
  }
711
 
712
  # Get paths to the saved files
713
  audio_path = collector.get_last_audio_path(speaker_id_value)
714
  transcript_path = collector.get_last_transcript_path(speaker_id_value)
 
715
 
716
  # Auto-advance to next sentence after successful save
717
  nav_info = collector.navigate("next")
718
  progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
719
- return {
 
 
 
 
720
  current_text: nav_info['current'],
721
  next_text: nav_info['next'],
722
  progress: progress_bar,
723
  status: f"✅ {msg}",
724
  dataset_info: collector.get_dataset_statistics(),
725
  download_audio: audio_path,
726
- download_transcript: transcript_path
 
 
 
727
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
 
729
  def navigate_sentences(direction):
730
  """Handle navigation between sentences"""
@@ -758,6 +870,19 @@ def create_interface():
758
  status: f"✅ {msg}"
759
  }
760
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
  # Event handlers
762
  text_input.change(
763
  process_pasted_text,
@@ -786,7 +911,9 @@ def create_interface():
786
  save_btn.click(
787
  save_current_recording,
788
  inputs=[audio_recorder, speaker_id, dataset_name],
789
- outputs=[current_text, next_text, progress, status, dataset_info, download_audio, download_transcript]
 
 
790
  )
791
 
792
  prev_btn.click(
 
1
  """
2
  TTS Dataset Collection Tool with Custom Fonts and Enhanced Features
3
  """
 
4
  import os
5
  import json
6
  import nltk
 
14
  import soundfile as sf
15
  import re
16
 
17
# Ensure the NLTK 'punkt' tokenizer data is available at import time.
# Look for it locally first so an already-provisioned environment does not
# contact the download server on every start; download only when missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    try:
        nltk.download('punkt')  # Download punkt tokenizer data
        # Re-check: a failed download leaves the data missing.
        nltk.data.find('tokenizers/punkt')
    except Exception as e:
        # NOTE(review): `logger` does not appear to be defined above this
        # point (logging is configured just below), so go through the
        # logging module directly instead of risking a NameError here.
        _init_logger = logging.getLogger(__name__)
        _init_logger.warning(f"Error downloading NLTK data: {str(e)}")
        _init_logger.warning("NLTK tokenization might not work properly")
24
 
25
  # Configure logging
26
  logging.basicConfig(
 
59
 
60
  def __init__(self):
61
  """Initialize the TTS Dataset Collector"""
62
+ # Handle both script and notebook environments for root path
63
+ try:
64
+ # When running as a script
65
+ self.root_path = Path(os.path.dirname(os.path.abspath(__file__))) / "dataset"
66
+ except NameError:
67
+ # When running in Jupyter/IPython
68
+ self.root_path = Path.cwd() / "dataset"
69
+
70
  self.fonts_path = self.root_path / "fonts"
71
  self.sentences = []
72
  self.current_index = 0
73
  self.current_font = "english_serif"
74
  self.custom_fonts = {}
75
+ self.recordings = {} # Store recordings by sentence index
76
  self.setup_directories()
77
 
78
  # Ensure NLTK data is downloaded
 
267
  base_name = f"{dataset_name_safe}_{speaker_id_safe}_line{line_number}_{sentence_excerpt}_{timestamp}"
268
  return f"{base_name}.wav", f"{base_name}.txt"
269
 
270
+ def save_recording(self, audio_file, speaker_id: str, dataset_name: str) -> Tuple[bool, str, Dict]:
271
  """Save recording with enhanced error handling and logging"""
272
  if not all([audio_file, speaker_id, dataset_name]):
273
  missing = []
 
277
  missing.append("speaker ID")
278
  if not dataset_name:
279
  missing.append("dataset name")
280
+ return False, f"Missing required information: {', '.join(missing)}", {}
281
 
282
  # Check if sentences have been loaded
283
  if not self.sentences:
284
+ return False, "No sentences have been loaded. Please load text before saving recordings.", {}
285
  if self.current_index >= len(self.sentences):
286
+ return False, "Current sentence index is out of range.", {}
287
 
288
  try:
289
  # Validate inputs
290
  if not speaker_id.strip().isalnum():
291
+ return False, "Speaker ID must contain only letters and numbers", {}
292
  if not dataset_name.strip().isalnum():
293
+ return False, "Dataset name must contain only letters and numbers", {}
294
 
295
  # Get current sentence text
296
  sentence_text = self.sentences[self.current_index]
 
330
  # Update metadata
331
  self.update_metadata(speaker_id, dataset_name)
332
 
333
+ # Store the recording
334
+ self.recordings[self.current_index] = {
335
+ 'audio_file': audio_file,
336
+ 'speaker_id': speaker_id,
337
+ 'dataset_name': dataset_name,
338
+ 'sentence': self.sentences[self.current_index]
339
+ }
340
+
341
  # Log success
342
  self.log_operation(
343
  f"Saved recording: Speaker={speaker_id}, Dataset={dataset_name}, "
344
  f"Audio={audio_name}, Text={text_name}"
345
  )
346
 
347
+ return True, f"Recording saved successfully as {audio_name}", self.recordings
348
 
349
  except Exception as e:
350
  error_msg = f"Error saving recording: {str(e)}"
351
  self.log_operation(error_msg, "error")
352
  logger.error(traceback.format_exc())
353
+ return False, error_msg, self.recordings
354
 
355
  def save_transcription(self, file_path: Path, text: str, metadata: Dict) -> None:
356
  """Save transcription with metadata"""
 
508
  else:
509
  return None
510
 
511
def create_zip_archive(self, speaker_id: str) -> Optional[str]:
    """Create a ZIP archive of all recordings and transcriptions for a speaker.

    Every ``*.wav`` file in ``<root_path>/audio/<speaker_id>`` is stored under
    ``audio/`` and every ``*.txt`` file in
    ``<root_path>/transcriptions/<speaker_id>`` is stored under
    ``transcriptions/`` inside ``<temp dir>/<speaker_id>_recordings.zip``.

    Args:
        speaker_id: Name of the speaker sub-directory to archive. It must be
            a plain directory name (no path separators, not ``.``/``..``).

    Returns:
        The archive path as a string, or ``None`` when the speaker ID is
        empty or unsafe, when the speaker has no files to archive, or when
        writing the archive fails.
    """
    from zipfile import ZIP_DEFLATED, ZipFile
    import tempfile

    # speaker_id is interpolated into filesystem paths below; refuse values
    # that could point outside the per-speaker directories.
    if (
        not speaker_id
        or speaker_id in (".", "..")
        or "/" in speaker_id
        or "\\" in speaker_id
    ):
        return None

    # Create temporary zip file
    zip_path = Path(tempfile.gettempdir()) / f"{speaker_id}_recordings.zip"

    # (folder inside the archive, source directory, file pattern)
    sources = (
        ("audio", self.root_path / 'audio' / speaker_id, '*.wav'),
        ("transcriptions", self.root_path / 'transcriptions' / speaker_id, '*.txt'),
    )

    try:
        added = 0
        with ZipFile(zip_path, 'w', compression=ZIP_DEFLATED) as zipf:
            for arc_dir, src_dir, pattern in sources:
                if not src_dir.exists():
                    continue
                # Sorted so the archive layout is deterministic.
                for src_file in sorted(src_dir.glob(pattern)):
                    zipf.write(src_file, f"{arc_dir}/{src_file.name}")
                    added += 1

        if not added:
            # Nothing recorded for this speaker: do not hand back an empty
            # archive as if it were a successful export.
            zip_path.unlink()
            return None

        return str(zip_path)
    except Exception as e:
        logger.error(f"Error creating zip archive: {str(e)}")
        # Do not leave a truncated archive behind in the temp directory.
        try:
            zip_path.unlink()
        except OSError:
            pass
        return None
538
+
539
 
540
  def create_interface():
541
  """Create Gradio interface with enhanced features"""
 
642
  )
643
  progress = gr.HTML("")
644
 
645
+ with gr.Row():
646
+ audio_recorder = gr.Audio(
647
+ label="Record Audio",
648
+ type="filepath",
649
+ elem_classes=["record-button"],
650
+ interactive=True,
651
+ streaming=False # Disable streaming to prevent freezing
652
+ )
653
+ clear_btn = gr.Button("Clear Recording", variant="secondary")
654
+
655
  # Controls
656
  with gr.Row():
657
  prev_btn = gr.Button("Previous", variant="secondary")
 
660
 
661
  # Download Links
662
  with gr.Row():
663
+ download_audio = gr.File(label="Download Last Audio", interactive=False)
664
+ download_transcript = gr.File(label="Download Last Transcript", interactive=False)
665
+ download_all = gr.File(label="Download All Recordings", interactive=False)
666
+
667
def download_all_recordings(speaker_id_value):
    """Build the speaker's ZIP archive and report the outcome.

    Returns a component-update mapping with a status message and the archive
    path for the download widget (``None`` when no archive is available).
    """
    archive = None
    message = "⚠️ Please enter a Speaker ID first"

    if speaker_id_value:
        archive = collector.create_zip_archive(speaker_id_value)
        if archive:
            message = "✅ Archive created successfully"
        else:
            message = "❌ Failed to create archive"

    return {
        status: message,
        download_all: archive or None
    }
685
+
686
+ # Add download all button and its event handler
687
+ download_all_btn = gr.Button("Download All Recordings", variant="secondary")
688
+ download_all_btn.click(
689
+ download_all_recordings,
690
+ inputs=[speaker_id],
691
+ outputs=[status, download_all]
692
+ )
693
+
694
+ # Add recordings display
695
+ with gr.Column(scale=2):
696
+ recordings_display = gr.HTML(
697
+ label="Saved Recordings",
698
+ value="<div id='recordings-list'></div>"
699
+ )
700
 
701
  def process_pasted_text(text):
702
  """Handle pasted text input"""
 
779
  return {
780
  status: "⚠️ Please record audio first",
781
  download_audio: None,
782
+ download_transcript: None,
783
+ download_all: None,
784
+ recordings_display: "<div id='recordings-list'>No recordings yet</div>",
785
+ audio_recorder: None # Clear the recorder
786
  }
787
 
788
+ success, msg, recordings = collector.save_recording(
789
  audio_file, speaker_id_value, dataset_name_value
790
  )
791
 
 
794
  status: f"❌ {msg}",
795
  dataset_info: collector.get_dataset_statistics(),
796
  download_audio: None,
797
+ download_transcript: None,
798
+ download_all: None,
799
+ recordings_display: "<div id='recordings-list'>No recordings yet</div>"
800
  }
801
 
802
  # Get paths to the saved files
803
  audio_path = collector.get_last_audio_path(speaker_id_value)
804
  transcript_path = collector.get_last_transcript_path(speaker_id_value)
805
+ zip_path = collector.create_zip_archive(speaker_id_value)
806
 
807
  # Auto-advance to next sentence after successful save
808
  nav_info = collector.navigate("next")
809
  progress_bar = f"<progress value='{collector.current_index + 1}' max='{len(collector.sentences)}'></progress> {nav_info['progress']}"
810
+
811
+ # Update recordings display
812
+ recordings_html = create_recordings_display(recordings)
813
+
814
+ result = {
815
  current_text: nav_info['current'],
816
  next_text: nav_info['next'],
817
  progress: progress_bar,
818
  status: f"✅ {msg}",
819
  dataset_info: collector.get_dataset_statistics(),
820
  download_audio: audio_path,
821
+ download_transcript: transcript_path,
822
+ download_all: zip_path,
823
+ recordings_display: recordings_html,
824
+ audio_recorder: None # Clear the recorder after successful save
825
  }
826
+ return result
827
+
828
def create_recordings_display(recordings):
    """Create HTML display for recordings.

    Args:
        recordings: Mapping of zero-based sentence index to a dict that
            provides the keys ``'sentence'`` and ``'audio_file'``.

    Returns:
        An HTML fragment listing each recording with its sentence text and
        an audio player. An empty mapping yields just the heading.
    """
    from html import escape

    parts = ["<div id='recordings-list'><h3>Saved Recordings:</h3>"]
    for idx, rec in recordings.items():
        # Sentence text originates from user-supplied input and the audio
        # path ends up inside a single-quoted attribute, so both are escaped
        # (quotes included) before being placed into the markup.
        sentence = escape(str(rec['sentence']), quote=True)
        audio_src = escape(str(rec['audio_file']), quote=True)
        parts.append(f"""
        <div style='margin: 10px 0; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>
        <p><strong>Sentence {idx + 1}:</strong> {sentence}</p>
        <audio controls src='{audio_src}'></audio>
        </div>
        """)
    parts.append("</div>")
    # Join once instead of growing a string inside the loop.
    return "".join(parts)
840
 
841
  def navigate_sentences(direction):
842
  """Handle navigation between sentences"""
 
870
  status: f"✅ {msg}"
871
  }
872
 
873
def clear_recording():
    """Reset the audio recorder and announce it in the status field."""
    updates = {audio_recorder: None}
    updates[status] = "Recording cleared"
    return updates
879
+
880
+ # Add clear button handler
881
+ clear_btn.click(
882
+ clear_recording,
883
+ outputs=[audio_recorder, status]
884
+ )
885
+
886
  # Event handlers
887
  text_input.change(
888
  process_pasted_text,
 
911
  save_btn.click(
912
  save_current_recording,
913
  inputs=[audio_recorder, speaker_id, dataset_name],
914
+ outputs=[current_text, next_text, progress, status, dataset_info,
915
+ download_audio, download_transcript, download_all, recordings_display,
916
+ audio_recorder] # Add audio_recorder to outputs
917
  )
918
 
919
  prev_btn.click(