OpenSound committed
Commit 3daaddd · 1 Parent(s): 41c3bad

Update app.py

Files changed (1)
  1. app.py +169 -178
app.py CHANGED
@@ -125,18 +125,21 @@ class WhisperxModel:
         return self.align_model.align(segments, audio_path)
 
 @spaces.GPU
-def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, ssrspeech_model_name):
+def load_models(whisper_backend_name, ssrspeech_model_name):
     global transcribe_model, align_model, ssrspeech_model
 
-
+    alignment_model_name = "whisperX"
     if ssrspeech_model_name == "English":
         ssrspeech_model_name = "English"
         text_tokenizer = TextTokenizer(backend="espeak")
         language = "en"
+        whisper_model_name = "base.en"
+
     elif ssrspeech_model_name == "Mandarin":
         ssrspeech_model_name = "Mandarin"
         text_tokenizer = TextTokenizer(backend="espeak", language='cmn')
         language = "zh"
+        whisper_model_name = "base"
 
     if alignment_model_name is not None:
         align_model = WhisperxAlignModel(language)
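The loader now derives the Whisper checkpoint and espeak language from the single model choice (the `whisper_model_choice` and `align_model_choice` radios are removed further down). A standalone sketch of that mapping, not the app's code (the commit implements it with the `if`/`elif` shown above):

```python
# Illustrative only: the choice -> Whisper settings now hard-wired inside load_models.
WHISPER_BY_MODEL_CHOICE = {
    "English": {"whisper_model_name": "base.en", "language": "en"},
    "Mandarin": {"whisper_model_name": "base", "language": "zh"},
}

def pick_whisper_settings(ssrspeech_model_name: str) -> dict:
    """Return the Whisper settings implied by the ssrspeech model choice."""
    return WHISPER_BY_MODEL_CHOICE[ssrspeech_model_name]

print(pick_whisper_settings("Mandarin"))  # {'whisper_model_name': 'base', 'language': 'zh'}
```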
@@ -212,12 +215,8 @@ def align(segments, audio_path):
 
     segments = align_model.align(segments, audio_path)
     state = get_transcribe_state(segments)
-    success_message = "<span style='color:green;'>Success: Alignment completed successfully!</span>"
 
-    return [
-        state["transcript_with_start_time"], state["transcript_with_end_time"],
-        state, success_message
-    ]
+    return state
 
 
 def get_output_audio(audio_tensors, codec_audio_sr):
@@ -239,141 +238,166 @@ def replace_numbers_with_words(sentence):
 
 @spaces.GPU
 def run(seed, sub_amount, ssrspeech_model_choice, codec_audio_sr, codec_sr, top_k, top_p, temperature,
-        stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef,
-        audio_path, transcribe_state, original_transcript, transcript,
-        mode, selected_sentence, previous_audio_tensors):
+        stop_repetition, kvcache, silence_tokens, aug_text, cfg_coef, prompt_length,
+        audio_path, original_transcript, transcript, mode):
 
-    global ssrspeech_model
+    global transcribe_model, align_model, ssrspeech_model
     aug_text = True if aug_text == 1 else False
     if ssrspeech_model is None:
         raise gr.Error("ssrspeech model not loaded")
 
+    seed_everything(seed)
+
+    if ssrspeech_model_choice == "English":
+        language = "en"
+    elif ssrspeech_model_choice == "Mandarin":
+        language = "zh"
+
     # resample audio
     audio, _ = librosa.load(audio_path, sr=16000)
     sf.write(audio_path, audio, 16000)
-
-    seed_everything(seed)
-    transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ") # replace numbers with words, so that the phonemizer can do a better job
-
-    if mode == "Rerun":
-        colon_position = selected_sentence.find(':')
-        selected_sentence_idx = int(selected_sentence[:colon_position])
-        sentences = [selected_sentence[colon_position + 1:]]
-    else:
-        sentences = [transcript.replace("\n", " ")]
-
-    audio_tensors = []
-    inference_transcript = ""
-    for sentence in sentences:
-        decode_config = {"top_k": top_k, "top_p": top_p, "temperature": temperature, "stop_repetition": stop_repetition,
-                         "kvcache": kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr}
-
-        # run the script to turn user input to the format that the model can take
-        if mode == "Edit":
-            operations, orig_spans = parse_edit_en(original_transcript, sentence) if ssrspeech_model_choice == 'English' else parse_edit_zh(original_transcript, sentence)
-            print(operations)
-            print("orig_spans: ", orig_spans)
-
-            if len(orig_spans) > 3:
-                raise gr.Error("Current model only supports maximum 3 editings")
-
-            starting_intervals = []
-            ending_intervals = []
-            for orig_span in orig_spans:
-                start, end = get_mask_interval(transcribe_state, orig_span)
-                starting_intervals.append(start)
-                ending_intervals.append(end)
-
-            print("intervals: ", starting_intervals, ending_intervals)
+
+    # text normalization
+    target_transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    orig_transcript = replace_numbers_with_words(original_transcript).replace("  ", " ").replace("  ", " ").replace("\n", " ")
+
+    orig_transcript, segments = transcribe(audio_path)
+    if language == 'zh':
+        converter = opencc.OpenCC('t2s')
+        orig_transcript = converter.convert(orig_transcript)
+        transcribe_state = align(traditional_to_simplified(segments), audio_path)
+        transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+    elif language == 'en':
+        orig_transcript = orig_transcript.lower()
+        target_transcript = target_transcript.lower()
+        transcribe_state = align(segments, audio_path)
+    print(orig_transcript)
+    print(target_transcript)
+
+    if mode == "TTS":
+        info = torchaudio.info(audio_path)
+        duration = info.num_frames / info.sample_rate
+        cut_length = duration
+        # Cut long audio for tts
+        if duration > prompt_length:
+            seg_num = len(transcribe_state['segments'])
+            for i in range(seg_num):
+                words = transcribe_state['segments'][i]['words']
+                for item in words:
+                    if item['end'] >= prompt_length:
+                        cut_length = min(item['end'], cut_length)
+
+        audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
+        sf.write(audio_path, audio, 16000)
+        orig_transcript, segments = transcribe(audio_path)
 
-            info = torchaudio.info(audio_path)
-            audio_dur = info.num_frames / info.sample_rate
-
-            def combine_spans(spans, threshold=0.2):
-                spans.sort(key=lambda x: x[0])
-                combined_spans = []
-                current_span = spans[0]
+        if language == 'zh':
+            converter = opencc.OpenCC('t2s')
+            orig_transcript = converter.convert(orig_transcript)
+            transcribe_state = align(traditional_to_simplified(segments), audio_path)
+            transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+        elif language == 'en':
+            orig_transcript = orig_transcript.lower()
+            target_transcript = target_transcript.lower()
+            transcribe_state = align(segments, audio_path)
+        print(orig_transcript)
+        target_transcript_copy = target_transcript # for tts cut out
+        if language == 'en':
+            target_transcript_copy = target_transcript_copy.split(' ')[0]
+        elif language == 'zh':
+            target_transcript_copy = target_transcript_copy[0]
+        target_transcript = orig_transcript + ' ' + target_transcript if language == 'en' else orig_transcript + target_transcript
+        print(target_transcript)
+
+    if mode == "Edit":
+        operations, orig_spans = parse_edit_en(orig_transcript, target_transcript) if language == 'en' else parse_edit_zh(orig_transcript, target_transcript)
+        print(operations)
+        print("orig_spans: ", orig_spans)
 
-            for i in range(1, len(spans)):
-                next_span = spans[i]
-                if current_span[1] >= next_span[0] - threshold:
-                    current_span[1] = max(current_span[1], next_span[1])
-                else:
-                    combined_spans.append(current_span)
-                    current_span = next_span
-            combined_spans.append(current_span)
-            return combined_spans
+        if len(orig_spans) > 3:
+            raise gr.Error("Current model only supports maximum 3 editings")
 
-            morphed_span = [[max(start - sub_amount, 0), min(end + sub_amount, audio_dur)]
-                            for start, end in zip(starting_intervals, ending_intervals)] # in seconds
-            morphed_span = combine_spans(morphed_span, threshold=0.2)
-            print("morphed_spans: ", morphed_span)
-            mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
-            mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
-
-            gen_audio = inference_one_sample(
-                ssrspeech_model["model"],
-                ssrspeech_model["config"],
-                ssrspeech_model["phn2num"],
-                ssrspeech_model["text_tokenizer"],
-                ssrspeech_model["audio_tokenizer"],
-                audio_path, original_transcript, sentence, mask_interval,
-                cfg_coef, aug_text, False, True, False,
-                device, decode_config
-            )
-        else:
-            orig_spans = parse_tts_en(original_transcript, sentence) if ssrspeech_model_choice == 'English' else parse_tts_zh(original_transcript, sentence)
-            print("orig_spans: ", orig_spans)
-
-            starting_intervals = []
-            ending_intervals = []
-            for orig_span in orig_spans:
-                start, end = get_mask_interval(transcribe_state, orig_span)
-                starting_intervals.append(start)
-                ending_intervals.append(end)
+        starting_intervals = []
+        ending_intervals = []
+        for orig_span in orig_spans:
+            start, end = get_mask_interval(transcribe_state, orig_span)
+            starting_intervals.append(start)
+            ending_intervals.append(end)
+
+        print("intervals: ", starting_intervals, ending_intervals)
+
+        info = torchaudio.info(audio_path)
+        audio_dur = info.num_frames / info.sample_rate
 
-            print("intervals: ", starting_intervals, ending_intervals)
+        def combine_spans(spans, threshold=0.2):
+            spans.sort(key=lambda x: x[0])
+            combined_spans = []
+            current_span = spans[0]
+
+            for i in range(1, len(spans)):
+                next_span = spans[i]
+                if current_span[1] >= next_span[0] - threshold:
+                    current_span[1] = max(current_span[1], next_span[1])
+                else:
+                    combined_spans.append(current_span)
+                    current_span = next_span
+            combined_spans.append(current_span)
+            return combined_spans
 
-            info = torchaudio.info(audio_path)
-            audio_dur = info.num_frames / info.sample_rate
-
-            morphed_span = [(max(start, 1/codec_sr), min(end, audio_dur))
-                            for start, end in zip(starting_intervals, ending_intervals)] # in seconds
-            mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
-            mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
-            print("mask_interval: ", mask_interval)
-            gen_audio = inference_one_sample(
-                ssrspeech_model["model"],
-                ssrspeech_model["config"],
-                ssrspeech_model["phn2num"],
-                ssrspeech_model["text_tokenizer"],
-                ssrspeech_model["audio_tokenizer"],
-                audio_path, original_transcript, sentence, mask_interval,
-                cfg_coef, aug_text, False, True, True,
-                device, decode_config
-            )
-
-        gen_audio = gen_audio[0].cpu()
-        audio_tensors.append(gen_audio)
-
-    if mode != "Rerun":
-        output_audio = get_output_audio(audio_tensors, codec_audio_sr)
-        sentences = [f"{idx}: {text}" for idx, text in enumerate(sentences)]
-        component = gr.Dropdown(choices=sentences, value=sentences[0])
-        return output_audio, inference_transcript, component, audio_tensors
+        morphed_span = [[max(start - sub_amount, 0), min(end + sub_amount, audio_dur)]
+                        for start, end in zip(starting_intervals, ending_intervals)] # in seconds
+        morphed_span = combine_spans(morphed_span, threshold=0.2)
+        print("morphed_spans: ", morphed_span)
+        mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
+        mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
     else:
-        previous_audio_tensors[selected_sentence_idx] = audio_tensors[0]
-        output_audio = get_output_audio(previous_audio_tensors, codec_audio_sr)
-        sentence_audio = get_output_audio(audio_tensors, codec_audio_sr)
-        return output_audio, inference_transcript, sentence_audio, previous_audio_tensors
-
+        info = torchaudio.info(audio_path)
+        audio_dur = info.num_frames / info.sample_rate
+
+        morphed_span = [(audio_dur, audio_dur)] # in seconds
+        mask_interval = [[round(span[0]*codec_sr), round(span[1]*codec_sr)] for span in morphed_span]
+        mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
+        print("mask_interval: ", mask_interval)
 
-def load_sentence(selected_sentence, codec_audio_sr, audio_tensors):
-    if selected_sentence is None:
-        return None
-    colon_position = selected_sentence.find(':')
-    selected_sentence_idx = int(selected_sentence[:colon_position])
-    return get_output_audio([audio_tensors[selected_sentence_idx]], codec_audio_sr)
+    decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr}
+
+    tts = True if mode == "TTS" else False
+    new_audio = inference_one_sample(
+        ssrspeech_model["model"],
+        ssrspeech_model["config"],
+        ssrspeech_model["phn2num"],
+        ssrspeech_model["text_tokenizer"],
+        ssrspeech_model["audio_tokenizer"],
+        audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, aug_text, False, True, tts,
+        device, decode_config
+    )
+    audio_tensors = []
+    # save segments for comparison
+    new_audio = new_audio[0].cpu()
+    torchaudio.save(audio_path, new_audio, codec_audio_sr)
+    if tts: # remove the start parts
+        new_transcript, new_segments = transcribe(audio_path)
+        if language == 'zh':
+            transcribe_state = align(traditional_to_simplified(new_segments), audio_path)
+            transcribe_state['segments'] = traditional_to_simplified(transcribe_state['segments'])
+            tmp1 = transcribe_state['segments'][0]['words'][0]['word']
+            tmp2 = target_transcript_copy
+        elif language == 'en':
+            transcribe_state = align(new_segments, audio_path)
+            tmp1 = transcribe_state['segments'][0]['words'][0]['word'].lower()
+            tmp2 = target_transcript_copy.lower()
+        if tmp1 == tmp2:
+            offset = transcribe_state['segments'][0]['words'][0]['start']
+        else:
+            offset = transcribe_state['segments'][0]['words'][1]['start']
+
+        new_audio, _ = torchaudio.load(audio_path, frame_offset=int(offset*codec_audio_sr))
+    audio_tensors.append(new_audio)
+    output_audio = get_output_audio(audio_tensors, codec_audio_sr)
+
+    success_message = "<span style='color:green;'>Success: Inference successfully!</span>"
+    return output_audio, success_message
 
 
 demo_original_transcript = "Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather."
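The new Edit branch pads each edited span by `sub_amount`, merges spans whose gap is at most 0.2 s with the `combine_spans` helper, and converts the result to codec-frame indices. A self-contained sketch of that behaviour, with made-up span values and the UI default `codec_sr=50`:

```python
import torch

def combine_spans(spans, threshold=0.2):
    # Same merging rule as the helper defined inside run(): spans whose gap
    # is at most `threshold` seconds are collapsed into a single span.
    spans.sort(key=lambda x: x[0])
    combined_spans = []
    current_span = spans[0]
    for next_span in spans[1:]:
        if current_span[1] >= next_span[0] - threshold:
            current_span[1] = max(current_span[1], next_span[1])
        else:
            combined_spans.append(current_span)
            current_span = next_span
    combined_spans.append(current_span)
    return combined_spans

codec_sr = 50  # codec frames per second (UI default)
spans = [[1.00, 1.40], [1.50, 2.10], [3.00, 3.20]]  # made-up edit spans, in seconds
merged = combine_spans(spans, threshold=0.2)        # -> [[1.0, 2.1], [3.0, 3.2]]
mask_interval = torch.LongTensor(
    [[round(s * codec_sr), round(e * codec_sr)] for s, e in merged]
)  # -> [[50, 105], [150, 160]] in codec frames
print(merged, mask_interval.tolist())
```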
@@ -405,23 +429,14 @@ def get_app():
                 ssrspeech_model_choice = gr.Radio(label="ssrspeech model", value="English",
                                                   choices=["English", "Mandarin"])
                 whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisperX", "whisper"])
-                whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
-                                                choices=[None, "base.en", "small.en", "medium.en", "large"])
-                align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=["whisperX", None])
 
         with gr.Row():
             with gr.Column(scale=2):
                 input_audio = gr.Audio(value=f"{DEMO_PATH}/5895_34622_000026_000002.wav", label="Input Audio", type="filepath", interactive=True)
                 with gr.Group():
                     original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-                                                     info="Use whisperx model to get the transcript. Fix and align it if necessary.")
-                    with gr.Accordion("Word start time", open=False):
-                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
-                    with gr.Accordion("Word end time", open=False):
-                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
-
+                                                     info="Use whisperx model to get the transcript.")
                 transcribe_btn = gr.Button(value="Transcribe")
-                align_btn = gr.Button(value="Align")
 
             with gr.Column(scale=3):
                 with gr.Group():
@@ -437,11 +452,6 @@ def get_app():
                     with gr.Accordion("Inference transcript", open=False):
                         inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
                                                           info="Inference was performed on this transcript.")
-                with gr.Group(visible=False) as long_tts_sentence_editor:
-                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
-                                                    info="Select sentence you want to regenerate")
-                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
-                    rerun_btn = gr.Button(value="Rerun")
 
         with gr.Row():
             with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
@@ -453,57 +463,38 @@ def get_app():
                 aug_text = gr.Radio(label="aug_text", choices=[0, 1], value=1,
                                     info="set to 1 to use cfg")
                 cfg_coef = gr.Number(label="cfg_coef", value=1.5,
-                                     info="cfg guidance scale, 1.5 is a good value")
-                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment")
+                                     info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                prompt_length = gr.Number(label="prompt_length", value=3,
+                                          info="used for tts prompt, will automatically cut the prompt audio to this length")
+                sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 top_p = gr.Number(label="top_p", value=0.8, info="0.9 is a good value, 0.8 is also good")
-                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not change")
                 top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
-                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, do not change')
                 silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
 
         success_output = gr.HTML()
-        audio_tensors = gr.State()
-        transcribe_state = gr.State(value={"words_info": demo_words_info})
 
         load_models_btn.click(fn=load_models,
-                              inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, ssrspeech_model_choice],
+                              inputs=[whisper_backend_choice, ssrspeech_model_choice],
                               outputs=[models_selector, success_output])
 
-
+        semgents = None # not used
         transcribe_btn.click(fn=transcribe,
-                             inputs=[seed, input_audio],
-                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-        align_btn.click(fn=align,
-                        inputs=[seed, original_transcript, input_audio],
-                        outputs=[transcript_with_start_time, transcript_with_end_time, transcribe_state, success_output])
-
+                             inputs=[input_audio],
+                             outputs=[original_transcript, semgents, success_output])
+
         run_btn.click(fn=run,
                       inputs=[
                           seed, sub_amount, ssrspeech_model_choice,
                           codec_audio_sr, codec_sr,
-                          top_k, top_p, temperature,
-                          stop_repetition,
-                          kvcache, silence_tokens, aug_text, cfg_coef,
-                          input_audio, transcribe_state, original_transcript, transcript,
-                          mode, sentence_selector, audio_tensors
+                          top_k, top_p, temperature, stop_repetition, kvcache, silence_tokens,
+                          aug_text, cfg_coef, prompt_length,
+                          input_audio, original_transcript, transcript,
+                          mode
                       ],
-                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
-
-        sentence_selector.change(fn=load_sentence,
-                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
-                                 outputs=[sentence_audio])
-        rerun_btn.click(fn=run,
-                        inputs=[
-                            seed, sub_amount, ssrspeech_model_choice,
-                            codec_audio_sr, codec_sr,
-                            top_k, top_p, temperature,
-                            stop_repetition,
-                            kvcache, silence_tokens, aug_text, cfg_coef,
-                            input_audio, transcribe_state, original_transcript, transcript,
-                            gr.State(value="Rerun"), sentence_selector, audio_tensors
-                        ],
-                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+                      outputs=[output_audio, success_output])
 
     return app
 
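In the new TTS path the prompt audio is trimmed at the end of the first aligned word that finishes at or after `prompt_length` seconds, falling back to the full clip. A self-contained sketch of that selection rule (the word timings below are made up):

```python
def pick_cut_length(segments, prompt_length, duration):
    # Mirrors the prompt-trimming rule in the TTS branch of run(): keep audio
    # up to the end of the earliest word that ends at or after prompt_length;
    # otherwise keep the whole clip.
    cut_length = duration
    if duration > prompt_length:
        for seg in segments:
            for word in seg["words"]:
                if word["end"] >= prompt_length:
                    cut_length = min(word["end"], cut_length)
    return cut_length

# Made-up word alignments for illustration.
segments = [{"words": [
    {"word": "hello",  "start": 0.0, "end": 0.4},
    {"word": "there",  "start": 0.5, "end": 0.9},
    {"word": "friend", "start": 3.1, "end": 3.6},
]}]
print(pick_cut_length(segments, prompt_length=3, duration=7.2))  # 3.6
```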