pustozerov committed on
Commit
e053abc
·
1 Parent(s): cd64b5d

Created and tested an alternative gradio GUI for the app.

Browse files
.gitignore CHANGED
@@ -2,5 +2,4 @@
2
  /data/database/
3
  /info/configs/manifests/
4
  /info/transcripts/
5
- /data/user_data/
6
  /data/user_data_wav/
 
2
  /data/database/
3
  /info/configs/manifests/
4
  /info/transcripts/
 
5
  /data/user_data_wav/
.idea/PoCCallTranscription.iml CHANGED
@@ -8,4 +8,7 @@
8
  <orderEntry type="inheritedJdk" />
9
  <orderEntry type="sourceFolder" forTests="false" />
10
  </component>
 
 
 
11
  </module>
 
8
  <orderEntry type="inheritedJdk" />
9
  <orderEntry type="sourceFolder" forTests="false" />
10
  </component>
11
+ <component name="PackageRequirementsSettings">
12
+ <option name="versionSpecifier" value="Don't specify version" />
13
+ </component>
14
  </module>
app_gradio.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+
3
+ import gradio as gr
4
+ import random
5
+ import os
6
+ import numpy as np
7
+ from pydub import AudioSegment
8
+ from datasets import load_dataset
9
+ from scipy.io.wavfile import write
10
+
11
+ from modules.diarization.nemo_diarization import diarization
12
+ from modules.nlp.nemo_ner import detect_ner
13
+ from modules.nlp.nemo_punct_cap import punctuation_capitalization
14
+
15
# Storage locations; all paths are relative, so they resolve against the
# working directory the app is started from.
FOLDER_WAV_DB = "data/database/"
FOLDER_USER_DATA = "data/user_data/"
FOLDER_USER_DATA_WAV = "data/user_data_wav/"
FOLDER_MANIFESTS = "info/configs/manifests/"

# Sample rate (Hz) passed to scipy's WAV writer for built-in database samples.
SAMPLE_RATE = 16000

# Validation split of the demo dataset; loaded once at import time.
dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')

# Create the database and manifest folders up front (after the dataset load,
# matching the original ordering of side effects).
for _required_folder in (FOLDER_WAV_DB, FOLDER_MANIFESTS):
    os.makedirs(_required_folder, exist_ok=True)
23
+
24
+
25
def _convert_upload_to_wav(uploaded_file):
    """Move an uploaded file into FOLDER_USER_DATA and export it as mono WAV.

    :param uploaded_file: path of the uploaded audio file (mp3, ogg or wav).
    :return: ``(wav_path, file_name)`` where ``wav_path`` is the exported WAV
        inside FOLDER_USER_DATA_WAV and ``file_name`` is the part of the base
        name before the first dot.
    """
    # Both folders must exist: the move target was never created originally.
    os.makedirs(FOLDER_USER_DATA, exist_ok=True)
    os.makedirs(FOLDER_USER_DATA_WAV, exist_ok=True)

    stored_path = os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file))
    shutil.move(uploaded_file, stored_path)

    # Decide the decoder from the real extension (case-insensitive) rather
    # than from a substring match anywhere in the path.
    extension = os.path.splitext(stored_path)[1].lower()
    if extension == ".mp3":
        sound = AudioSegment.from_mp3(stored_path)
    elif extension == ".ogg":
        sound = AudioSegment.from_ogg(stored_path)
    else:
        sound = AudioSegment.from_wav(stored_path)

    # Always save under "<file_name>.wav" so the exported file really lives at
    # the path process_audio() hands back to the audio component.
    file_name = os.path.basename(stored_path).split(".")[0]
    wav_path = os.path.join(FOLDER_USER_DATA_WAV, file_name + ".wav")
    sound.export(wav_path, format="wav", parameters=["-ac", "1"])
    return wav_path, file_name


def _export_random_sample():
    """Write one randomly chosen dataset entry to FOLDER_WAV_DB as 16-bit WAV.

    :return: ``(wav_path, file_name)`` for the written sample.
    """
    os.makedirs(FOLDER_WAV_DB, exist_ok=True)
    shuffled = dataset.shuffle(seed=random.randint(0, 100))
    file_name = str(shuffled["file"][0]).split(".")[0]
    samples = np.array(shuffled["data"][0])

    # Peak-normalise to the int16 range; skip the division for an empty or
    # all-zero clip, which would otherwise divide by zero.
    peak = np.max(np.abs(samples)) if samples.size else 0
    if peak:
        samples = samples / peak
    pcm = np.int16(samples * 32767)

    wav_path = os.path.join(FOLDER_WAV_DB, file_name + '.wav')
    write(wav_path, rate=SAMPLE_RATE, data=pcm)
    return wav_path, file_name


def process_audio(uploaded_file=None):
    """Diarize and transcribe an audio file, then run punctuation and NER.

    :param uploaded_file: path to a user-uploaded audio file; when falsy, a
        random sample from the built-in dataset is used instead.
    :return: a three-item list for the Gradio outputs: an update for the audio
        player (only made visible for database samples), an HTML summary
        string, and an update for the transcript download component.
    """
    if uploaded_file:
        show_audio = False
        wav_path, file_name = _convert_upload_to_wav(uploaded_file)
    else:
        show_audio = True
        wav_path, file_name = _export_random_sample()
    result = diarization(wav_path)

    # NOTE(review): this transcript is presumably written by diarization();
    # it is read back here so the NER-tagged text can be appended to it.
    transcript_path = "info/transcripts/pred_rttms/" + file_name + ".txt"
    with open(transcript_path) as f:
        transcript = f.read()

    summary = result[file_name]
    all_strings = "".join(sentence["sentence"] + "\n" for sentence in summary["sentences"])
    all_strings = punctuation_capitalization([all_strings])[0]
    tagged_string, tags_summary = detect_ner(all_strings)

    with open(transcript_path, 'w') as f:
        f.write(transcript + '\n' + tagged_string)

    output = ("<p>Number of speakers: %s<br>"
              "Sentences: %s<br>"
              "Words: %s<br>"
              "Found named entities: %s</p>") % (summary["speaker_count"],
                                                 len(summary["sentences"]),
                                                 len(summary["words"]),
                                                 tags_summary)
    return [audio_output.update(wav_path, visible=show_audio),
            output, file_output.update(transcript_path, visible=True)]
73
+
74
+
75
# Page layout: header, description, input controls, then the output widgets
# that process_audio() updates. Components render in creation order.
with gr.Blocks() as demo:
    gr.HTML('<br><h1><font size="+4">Call Transcription demo</font></h1>')
    gr.HTML(
        '<p><font size="+1">This simple demo shows the possibilities of ASR and NLP in the task of '
        'automatic speech recognition and diarization. It works with mp3, ogg, and wav files. '
        'You can randomly pick an audio file with the dialogue from the built-in database or try '
        'uploading your files.</font></p>'
    )
    gr.Markdown(
        '<p><font size="+1">Note: this demo shows up a reduced-performance model. '
        'To get a full-performance neural network or develop a system adapted to your task – '
        'contact <a href="mailto:[email protected]?subject=Request for information">'
        '[email protected]</a>.</font></p>'
    )

    # Input zone
    audio_input = gr.Audio(source="upload", type="filepath")
    second_btn = gr.Button('Try uploaded audiofile')
    gr.Markdown('<center><p>or</p></center>')
    first_btn = gr.Button('Try a random sample from the database')

    # Output zone
    audio_output = gr.Audio(visible=False, interactive=True)
    text_output = gr.HTML()
    file_output = gr.File(label="Download audio transcript", visible=False)

    # Both buttons run the same handler; only the random-sample button calls
    # it without an input, which makes process_audio() pick a database clip.
    for _button, _handler_input in ((first_btn, None), (second_btn, audio_input)):
        # noinspection PyTypeChecker
        _button.click(fn=process_audio, inputs=_handler_input,
                      outputs=[audio_output, text_output, file_output])

demo.launch(share=True)
requirements.txt CHANGED
@@ -13,7 +13,7 @@ kenlm @ https://github.com/kpu/kenlm/archive/master.zip
13
  librosa==0.9.2
14
  mecab-python3==1.0.5
15
  nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
16
- omegaconf==2.1.2
17
  OpenCC
18
  pangu==4.0.6.1
19
  praat-parselmouth==0.4.1
@@ -25,21 +25,24 @@ pyannote.database==4.1.3
25
  pyannote.metrics==3.2
26
  pyannote.pipeline==2.3
27
  pyctcdecode==0.3.0
28
- pydub==0.25.1
29
  pynini
30
  pytorch-lightning==1.6.5
31
  sacrebleu==2.1.0
32
  sacremoses==0.0.53
33
  sentencepiece==0.1.96
34
- SoundFile==0.10.3.post1
35
  spacy==3.4.0
36
  speechbrain @ git+https://github.com/speechbrain/speechbrain.git
37
- streamlit~=1.11.1
38
  torch==1.12.0
39
  torchaudio==0.12.0
40
  transformers==4.20.0
41
  webdataset==0.1.62
42
  Cython==0.29.14
43
  youtokentome
44
- datasets~=2.4.0
45
- NEMO~=4.1.1
 
 
 
 
13
  librosa==0.9.2
14
  mecab-python3==1.0.5
15
  nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
16
+ omegaconf
17
  OpenCC
18
  pangu==4.0.6.1
19
  praat-parselmouth==0.4.1
 
25
  pyannote.metrics==3.2
26
  pyannote.pipeline==2.3
27
  pyctcdecode==0.3.0
28
+ pydub
29
  pynini
30
  pytorch-lightning==1.6.5
31
  sacrebleu==2.1.0
32
  sacremoses==0.0.53
33
  sentencepiece==0.1.96
34
+ SoundFile
35
  spacy==3.4.0
36
  speechbrain @ git+https://github.com/speechbrain/speechbrain.git
37
+ streamlit
38
  torch==1.12.0
39
  torchaudio==0.12.0
40
  transformers==4.20.0
41
  webdataset==0.1.62
42
  Cython==0.29.14
43
  youtokentome
44
+ datasets
45
+ NEMO
46
+ numpy
47
+ scipy
48
+ gradio