poc_call_transcription / app_gradio.py
pustozerov's picture
Created and tested an alternative gradio GUI for the app.
e053abc
import shutil
import gradio as gr
import random
import os
import numpy as np
from pydub import AudioSegment
from datasets import load_dataset
from scipy.io.wavfile import write
from modules.diarization.nemo_diarization import diarization
from modules.nlp.nemo_ner import detect_ner
from modules.nlp.nemo_punct_cap import punctuation_capitalization
FOLDER_WAV_DB = "data/database/"
FOLDER_USER_DATA = "data/user_data/"
FOLDER_USER_DATA_WAV = "data/user_data_wav/"
FOLDER_MANIFESTS = "info/configs/manifests/"
SAMPLE_RATE = 16000
dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
os.makedirs(FOLDER_WAV_DB, exist_ok=True)
os.makedirs(FOLDER_MANIFESTS, exist_ok=True)
def process_audio(uploaded_file=None):
if uploaded_file:
secondary_audio = False
folder_wav = FOLDER_USER_DATA_WAV
os.makedirs(folder_wav, exist_ok=True)
print(uploaded_file)
shutil.move(uploaded_file, os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file)))
uploaded_file = os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file))
print(uploaded_file)
if ".mp3" in uploaded_file:
sound = AudioSegment.from_mp3(uploaded_file)
elif ".ogg" in uploaded_file:
sound = AudioSegment.from_ogg(uploaded_file)
else:
sound = AudioSegment.from_wav(uploaded_file)
save_path = folder_wav + os.path.basename(uploaded_file)
os.makedirs(folder_wav, exist_ok=True)
sound.export(save_path, format="wav", parameters=["-ac", "1"])
file_name = os.path.basename(save_path).split(".")[0]
result = diarization(save_path)
else:
secondary_audio = True
folder_wav = FOLDER_WAV_DB
os.makedirs(folder_wav, exist_ok=True)
shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
file_name = str(shuffled_dataset["file"][0]).split(".")[0]
audio_bytes = np.array(shuffled_dataset["data"][0])
audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
write(os.path.join(folder_wav, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
result = diarization(os.path.join(folder_wav, file_name + '.wav'))
transcript_path = "info/transcripts/pred_rttms/" + file_name + ".txt"
with open(transcript_path) as f:
transcript = f.read()
sentences = result[file_name]["sentences"]
all_strings = ""
for sentence in sentences:
all_strings = all_strings + sentence["sentence"] + "\n"
all_strings = punctuation_capitalization([all_strings])[0]
tagged_string, tags_summary = detect_ner(all_strings)
transcript = transcript + '\n' + tagged_string
with open(transcript_path, 'w') as f:
f.write(transcript)
output = "<p>Number of speakers: %s" % result[file_name]["speaker_count"] + "<br>" \
+ "Sentences: %s" % len(result[file_name]["sentences"]) + "<br>" \
+ "Words: %s" % len(result[file_name]["words"]) + "<br>" \
+ "Found named entities: %s" % tags_summary + "</p>"
return [audio_output.update(os.path.join(folder_wav, file_name + '.wav'), visible=secondary_audio),
output, file_output.update(transcript_path, visible=True)]
with gr.Blocks() as demo:
gr.HTML('<br><h1><font size="+4">Call Transcription demo</font></h1>')
gr.HTML('<p><font size="+1">This simple demo shows the possibilities of ASR and NLP in the task of automatic '
'speech recognition '
'and diarization. It works with mp3, ogg, and wav files. You can randomly pick an audio file with the '
'dialogue from the built-in database or try uploading your files.</font></p>')
gr.Markdown('<p><font size="+1">Note: this demo shows up a reduced-performance model. To get a full-performance '
'neural network or '
'develop a system adapted to your task – contact <a '
'href="mailto:[email protected]?subject=Request for '
'information">[email protected]</a>.</font></p>')
audio_input = gr.Audio(source="upload", type="filepath")
second_btn = gr.Button('Try uploaded audiofile')
gr.Markdown('<center><p>or</p></center>')
first_btn = gr.Button('Try a random sample from the database')
# Output zone
audio_output = gr.Audio(visible=False, interactive=True)
text_output = gr.HTML()
file_output = gr.File(label="Download audio transcript", visible=False)
# noinspection PyTypeChecker
first_btn.click(fn=process_audio, inputs=None,
outputs=[audio_output, text_output, file_output])
# noinspection PyTypeChecker
second_btn.click(fn=process_audio, inputs=audio_input, outputs=[audio_output, text_output, file_output])
demo.launch(share=True)