OpenSound committed
Commit 788a499 · 1 Parent(s): 0b4cc89

Update app.py

In brief: the Whisper/WhisperX wrapper classes gain a language parameter, an OpenCC traditional-to-simplified helper is added, load_models now also returns a success message for the UI, transcribe and align are simplified (the seed arguments and the aeneas-based align_segments path are removed), and the smart-transcript and Long TTS demo code is dropped.
Files changed (1):
  1. app.py +36 -86
app.py CHANGED
@@ -20,6 +20,7 @@ import io
 import numpy as np
 import random
 import uuid
+import opencc
 import spaces
 import nltk
 nltk.download('punkt')
@@ -29,11 +30,20 @@ TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
 MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 whisper_model, align_model, ssrspeech_model = None, None, None
-_whitespace_re = re.compile(r"\s+")
 
 def get_random_string():
     return "".join(str(uuid.uuid4()).split("-"))
 
+def traditional_to_simplified(segments):
+    converter = opencc.OpenCC('t2s')
+    seg_num = len(segments)
+    for i in range(seg_num):
+        words = segments[i]['words']
+        for j in range(len(words)):
+            segments[i]['words'][j]['word'] = converter.convert(segments[i]['words'][j]['word'])
+        segments[i]['text'] = converter.convert(segments[i]['text'])
+    return segments
+
 @spaces.GPU
 def seed_everything(seed):
     if seed != -1:
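
For context, the new traditional_to_simplified helper rewrites both the per-word entries and the segment-level text in place. A minimal standalone sketch of the same OpenCC call, assuming whisperx-style segment dicts (the example strings are illustrative):

    # Sketch: traditional -> simplified Chinese with OpenCC's 't2s' config.
    import opencc

    converter = opencc.OpenCC('t2s')
    segments = [{
        "text": "漢語轉換",
        "words": [{"word": "漢語"}, {"word": "轉換"}],
    }]
    for seg in segments:
        seg["text"] = converter.convert(seg["text"])   # -> "汉语转换"
        for w in seg["words"]:
            w["word"] = converter.convert(w["word"])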
@@ -75,9 +85,9 @@ def get_mask_interval(transcribe_state, word_span):
 
 @spaces.GPU
 class WhisperxAlignModel:
-    def __init__(self):
+    def __init__(self, language):
         from whisperx import load_align_model
-        self.model, self.metadata = load_align_model(language_code="en", device=device)
+        self.model, self.metadata = load_align_model(language_code=language, device=device)
 
     def align(self, segments, audio_path):
         from whisperx import align, load_audio
@@ -86,12 +96,12 @@ class WhisperxAlignModel:
 
 @spaces.GPU
 class WhisperModel:
-    def __init__(self, model_name):
+    def __init__(self, model_name, language):
         from whisper import load_model
-        self.model = load_model(model_name, device)
+        self.model = load_model(model_name, device, language=language)
 
         from whisper.tokenizer import get_tokenizer
-        tokenizer = get_tokenizer(multilingual=False)
+        tokenizer = get_tokenizer(multilingual=False, language=language)
         self.supress_tokens = [-1] + [
             i
             for i in range(tokenizer.eot)
@@ -103,9 +113,9 @@ class WhisperModel:
 
 @spaces.GPU
 class WhisperxModel:
-    def __init__(self, model_name, align_model: WhisperxAlignModel):
+    def __init__(self, model_name, align_model, language):
         from whisperx import load_model
-        self.model = load_model(model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None})
+        self.model = load_model(model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
         self.align_model = align_model
 
     def transcribe(self, audio_path):
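
Taken together, the wrapper classes above no longer hard-code English: each now takes a language code at construction time. A hypothetical way the new signatures compose (model size, language, and audio path are illustrative, not from this commit):

    # Sketch: building a language-aware WhisperX pipeline with the new signatures.
    align_model = WhisperxAlignModel("zh")
    transcribe_model = WhisperxModel("base", align_model, "zh")
    segments = transcribe_model.transcribe("demo/example.wav")  # hypothetical path
    segments = traditional_to_simplified(segments)              # normalize hanzi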
@@ -158,71 +168,45 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
         "text_tokenizer": text_tokenizer,
         "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
     }
-    return gr.Accordion()
+    success_message = "<span style='color:green;'>Success: Models loading completed successfully!</span>"
+
+    return [
+        gr.Accordion(),
+        success_message
+    ]
 
 
 def get_transcribe_state(segments):
-    words_info = [word_info for segment in segments for word_info in segment["words"]]
     transcript = " ".join([segment["text"] for segment in segments])
     transcript = transcript[1:] if transcript[0] == " " else transcript
     return {
         "segments": segments,
         "transcript": transcript,
-        "words_info": words_info,
-        "transcript_with_start_time": " ".join([f"{word['start']} {word['word']}" for word in words_info]),
-        "transcript_with_end_time": " ".join([f"{word['word']} {word['end']}" for word in words_info]),
-        "word_bounds": [f"{word['start']} {word['word']} {word['end']}" for word in words_info]
     }
 
 @spaces.GPU
-def transcribe(seed, audio_path):
+def transcribe(audio_path):
+    global transcribe_model
+
     if transcribe_model is None:
         raise gr.Error("Transcription model not loaded")
-    seed_everything(seed)
 
     segments = transcribe_model.transcribe(audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
 
     return [
-        state["transcript"], state["transcript_with_start_time"], state["transcript_with_end_time"],
+        state["transcript"], state['segments'],
         state, success_message
     ]
 
-@spaces.GPU
-def align_segments(transcript, audio_path):
-    # from aeneas.executetask import ExecuteTask
-    # from aeneas.task import Task
-    # import json
-    # config_string = 'task_language=eng|os_task_file_format=json|is_text_type=plain'
-
-    # tmp_transcript_path = os.path.join(TMP_PATH, f"{get_random_string()}.txt")
-    # tmp_sync_map_path = os.path.join(TMP_PATH, f"{get_random_string()}.json")
-    # with open(tmp_transcript_path, "w") as f:
-    #     f.write(transcript)
-
-    # task = Task(config_string=config_string)
-    # task.audio_file_path_absolute = os.path.abspath(audio_path)
-    # task.text_file_path_absolute = os.path.abspath(tmp_transcript_path)
-    # task.sync_map_file_path_absolute = os.path.abspath(tmp_sync_map_path)
-    # ExecuteTask(task).execute()
-    # task.output_sync_map_file()
-
-    # with open(tmp_sync_map_path, "r") as f:
-    #     return json.load(f)
 
 @spaces.GPU
-def align(seed, transcript, audio_path):
+def align(segments, audio_path):
+    global align_model
     if align_model is None:
         raise gr.Error("Align model not loaded")
-    seed_everything(seed)
-    transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ")
-    fragments = align_segments(transcript, audio_path)
-    segments = [{
-        "start": float(fragment["begin"]),
-        "end": float(fragment["end"]),
-        "text": " ".join(fragment["lines"])
-    } for fragment in fragments["fragments"]]
+
     segments = align_model.align(segments, audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Alignment completed successfully!</span>"
@@ -255,7 +239,8 @@ def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_
         stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
         audio_path, transcribe_state, original_transcript, transcript,
         mode, selected_sentence, previous_audio_tensors):
-
+
+    global ssrspeech_model
     aug_text = True if aug_text == 1 else False
    if ssrspeech_model is None:
         raise gr.Error("ssrspeech model not loaded")
@@ -387,35 +372,16 @@ def load_sentence(selected_sentence, codec_audio_sr, audio_tensors):
     selected_sentence_idx = int(selected_sentence[:colon_position])
     return get_output_audio([audio_tensors[selected_sentence_idx]], codec_audio_sr)
 
-smart_transcript_info = """
-If enabled, the target transcript will be constructed for you:</br>
-- In TTS and Long TTS mode just write the text you want to synthesize.</br>
-- In Edit mode just write the text to replace selected editing segment.</br>
-If disabled, you should write the target transcript yourself:</br>
-- In TTS mode write prompt transcript followed by generation transcript.</br>
-- In Long TTS select split by newline (<b>SENTENCE SPLIT WON'T WORK</b>) and start each line with a prompt transcript.</br>
-- In Edit mode write full prompt</br>
-"""
 
 demo_original_transcript = "Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather."
 
 demo_text = {
     "TTS": {
-        "smart": "I cannot believe that the same model can also do text to speech synthesis too!",
         "regular": "Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!"
     },
     "Edit": {
-        "smart": "take over the stage for half an hour,",
         "regular": "Gwynplaine had, besides, for his work and for his feats of strength, take over the stage for half an hour, an esclavine of leather."
     },
-    "Long TTS": {
-        "smart": "You can run the model on a big text!\n"
-                 "Just write it line-by-line. Or sentence-by-sentence.\n"
-                 "If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!",
-        "regular": "Gwynplaine had, besides, for his work and for his feats of strength, You can run the model on a big text!\n"
-                   "Gwynplaine had, besides, for his work and for his feats of strength, Just write it line-by-line. Or sentence-by-sentence.\n"
-                   "Gwynplaine had, besides, for his work and for his feats of strength, If some sentences sound odd, just rerun the model on them, no need to generate the whole text again!"
-    }
 }
 
 all_demo_texts = {vv for k, v in demo_text.items() for kk, vv in v.items()}
@@ -425,22 +391,6 @@ demo_words = ['0.069 Gwynplain 0.611', '0.671 had, 0.912', '0.952 besides, 1.414
  demo_words_info = [{'word': 'Gwynplain', 'start': 0.069, 'end': 0.611, 'score': 0.833}, {'word': 'had,', 'start': 0.671, 'end': 0.912, 'score': 0.879}, {'word': 'besides,', 'start': 0.952, 'end': 1.414, 'score': 0.863}, {'word': 'for', 'start': 1.494, 'end': 1.634, 'score': 0.89}, {'word': 'his', 'start': 1.695, 'end': 1.835, 'score': 0.669}, {'word': 'work', 'start': 1.915, 'end': 2.136, 'score': 0.916}, {'word': 'and', 'start': 2.196, 'end': 2.297, 'score': 0.766}, {'word': 'for', 'start': 2.337, 'end': 2.517, 'score': 0.808}, {'word': 'his', 'start': 2.557, 'end': 2.678, 'score': 0.786}, {'word': 'feats', 'start': 2.758, 'end': 3.019, 'score': 0.97}, {'word': 'of', 'start': 3.079, 'end': 3.139, 'score': 0.752}, {'word': 'strength,', 'start': 3.2, 'end': 3.561, 'score': 0.742}, {'word': 'round', 'start': 4.022, 'end': 4.263, 'score': 0.916}, {'word': 'his', 'start': 4.303, 'end': 4.444, 'score': 0.666}, {'word': 'neck', 'start': 4.524, 'end': 4.705, 'score': 0.908}, {'word': 'and', 'start': 4.745, 'end': 4.825, 'score': 0.882}, {'word': 'over', 'start': 4.905, 'end': 5.086, 'score': 0.847}, {'word': 'his', 'start': 5.146, 'end': 5.266, 'score': 0.791}, {'word': 'shoulders,', 'start': 5.307, 'end': 5.768, 'score': 0.729}, {'word': 'an', 'start': 6.23, 'end': 6.33, 'score': 0.854}, {'word': 'esclavine', 'start': 6.531, 'end': 7.133, 'score': 0.803}, {'word': 'of', 'start': 7.213, 'end': 7.293, 'score': 0.772}, {'word': 'leather.', 'start': 7.353, 'end': 7.614, 'score': 0.896}]
 
 
-def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
-    if transcript not in all_demo_texts:
-        return transcript, edit_from_word, edit_to_word
-
-    replace_half = edit_word_mode == "Replace half"
-    change_edit_from_word = edit_from_word == demo_words[2] or edit_from_word == demo_words[3]
-    change_edit_to_word = edit_to_word == demo_words[11] or edit_to_word == demo_words[12]
-    demo_edit_from_word_value = demo_words[2] if replace_half else demo_words[3]
-    demo_edit_to_word_value = demo_words[12] if replace_half else demo_words[11]
-    return [
-        demo_text[mode]["smart" if smart_transcript else "regular"],
-        demo_edit_from_word_value if change_edit_from_word else edit_from_word,
-        demo_edit_to_word_value if change_edit_to_word else edit_to_word,
-    ]
-
-
 def get_app():
     with gr.Blocks() as app:
         with gr.Row():
@@ -472,7 +422,7 @@ def get_app():
 
         with gr.Column(scale=3):
             with gr.Group():
-                transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
+                transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["regular"])
 
             with gr.Row():
                 mode = gr.Radio(label="Mode", choices=["Edit", "TTS"], value="Edit")
@@ -515,7 +465,7 @@
 
     load_models_btn.click(fn=load_models,
                           inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
-                          outputs=[models_selector])
+                          outputs=[models_selector, success_output])
 
 
     transcribe_btn.click(fn=transcribe,
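
The extra success_output target matches load_models now returning two values ([gr.Accordion(), success_message]): in Gradio, a handler wired to n output components must return n update values. A condensed sketch of the pattern (component names besides models_selector/success_output are illustrative, and returning gr.Accordion(...) as an update assumes Gradio 4-style component updates):

    import gradio as gr

    def load_models_stub():
        # One update value per wired output component.
        return [gr.Accordion(open=False), "<span style='color:green;'>Success!</span>"]

    with gr.Blocks() as demo:
        with gr.Accordion("Model options") as models_selector:
            gr.Markdown("model choices go here")
        success_output = gr.HTML()
        load_btn = gr.Button("Load models")
        load_btn.click(fn=load_models_stub, outputs=[models_selector, success_output])

    demo.launch()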
@@ -573,4 +523,4 @@ if __name__ == "__main__":
     MODELS_PATH = args.models_path
 
     app = get_app()
-    app.queue().launch(share=args.share, server_port=args.port)
+    app.queue().launch(share=args.share, server_port=args.port)