remzicam committed on
Commit
86e3069
·
1 Parent(s): 2be0e9c

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +37 -103
  2. requirements.txt +0 -0
  3. utils.py +157 -0
app.py CHANGED
@@ -1,122 +1,56 @@
1
- from transformers import (BlenderbotSmallTokenizer,
2
- logging)
3
- from mtranslate import translate
4
- from io import BytesIO
5
- from base64 import b64encode
6
- import gradio as gr
7
- from speech_recognition import Recognizer,AudioFile
8
- from gtts import gTTS
9
- from blender_model import blender_onnx_model
10
 
11
- #supress huggingface warnings
12
- logging.set_verbosity_error()
13
- bot_tokenizer_name="facebook/blenderbot_small-90M"
14
- max_answer_length=100
15
- bot_language="en"
16
- main_language = 'de'
17
- bot_tokenizer = BlenderbotSmallTokenizer.from_pretrained(bot_tokenizer_name)
18
- #load chatbot model
19
- bot_model=blender_onnx_model
20
 
21
- def app(audio):
22
- """
23
- It takes voice input from user then
24
- responds it both verbally and in text.
25
- """
26
- text=stt(audio)
27
- bot_response_en,bot_response_de=answer_generation(text)
28
- voice_bot=tts(bot_response_de)
29
- b64 = b64encode(voice_bot).decode()
30
- #html code that automatically play sounds
31
- html = f"""
32
- <audio controls autoplay>
33
- <source src="data:audio/wav;base64,{b64}" type="audio/wav">
34
- </audio>
35
- """
36
- return text,html,bot_response_de,bot_response_en
37
 
38
- def stt(audio):
39
- """
40
- speech to text converter
41
 
42
- Args:
43
- audio: record of user speech
44
 
45
- Returns:
46
- text (str): recognized speech of user
47
- """
48
- r = Recognizer()
49
- # open the file
50
- with AudioFile(audio) as source:
51
- # listen for the data (load audio to memory)
52
- audio_data = r.record(source)
53
- # recognize (convert from speech to text)
54
- text = r.recognize_google(audio_data,
55
- language=main_language)
56
- return text
57
-
58
- def answer_generation(user_input_de:str):
59
- """
60
- it takes user input as text in german language.
61
- Then it translates into English. Blenderbot works only in English.
62
- Then the model generates an answer w.r.t English version of the input.
63
- Finally, bot's response is translated into German.
64
 
65
  Args:
66
- user_input (str): text version of user's speech
67
 
68
  Returns:
69
- translated_bot_response (str): bot's response in german language
70
- """
71
- #de-en translation
72
- user_input_en=translate(user_input_de,
73
- bot_language,
74
- main_language)
75
- inputs = bot_tokenizer(user_input_en,
76
- return_tensors="pt")
77
- generation= bot_model.generate(**inputs,
78
- max_length=max_answer_length)
79
- bot_response_en=bot_tokenizer.decode(generation[0],
80
- skip_special_tokens = True)
81
- #en-de translation
82
- bot_response_de=translate(bot_response_en,
83
- main_language,
84
- bot_language)
85
-
86
- return bot_response_en,bot_response_de
87
 
88
- def tts(text:str):
89
- """converts text into audio bytes
90
-
91
- Args:
92
- text (str): generated answer of bot
93
-
94
- Returns:
95
- bytes_object(bytes): suitable format for html autoplay sound option
96
  """
97
- tts = gTTS(text=text,
98
- lang=main_language,
99
- slow=False)
100
- bytes_object = BytesIO()
101
- tts.write_to_fp(bytes_object)
102
- bytes_object.seek(0)
103
- return bytes_object.getvalue()
 
 
104
 
105
- logo_image_path="German_AI_Voicebot.png"
106
- logo = f"<center><img src='file/{logo_image_path}' width=180px></center>"
107
- gr.Interface(
108
- fn=app,
109
  inputs=[
110
- gr.Audio(source="microphone", type="filepath",
111
- ),
 
 
112
  ],
113
  outputs=[
114
- gr.Textbox(label="You said: ").style(css="{color: red}"),
 
 
115
  "html",
116
- gr.Textbox(label="AI said: "),
117
- gr.Textbox(label="AI said (English): "),
118
  ],
119
  live=True,
120
  allow_flagging="never",
121
- description=logo,
122
- ).launch()
 
1
+ """Deploying AI Voice Chatbot Gradio App."""
2
+ from gradio import Audio, Interface, Textbox
 
 
 
 
 
 
 
3
 
4
+ from utils import (TextGenerationPipeline, from_en_translation,
5
+ html_audio_autoplay, stt, to_en_translation, tts,
6
+ tts_to_bytesio)
 
 
 
 
 
 
7
 
8
# Upper bound on the length of each generated answer; it is handed to the
# text generator below as its ``max_length`` generation parameter.
max_answer_length = 100
# Language code used for speech recognition, translation and speech synthesis.
desired_language = "de"
# Callable that turns an English input text into the bot's English answer.
response_generator_pipe = TextGenerationPipeline(max_length=max_answer_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
 
 
 
12
 
13
def main(audio: object) -> tuple[str, str, str, object]:
    """Run one full voice-chat turn for the gradio app.

    The recorded speech is transcribed, translated to English for the
    chatbot, answered, translated back to the desired language and finally
    synthesised, so the answer is available both as text and as audio.

    Args:
        audio (object): recorded speech of user

    Returns:
        tuple containing

        - user_speech_text (str) : recognized speech
        - bot_response_de (str) : translated answer of bot
        - bot_response_en (str) : bot's original answer
        - html (object) : autoplayer for bot's speech
    """
    user_speech_text = stt(audio, desired_language)
    # The response generator is fed the English translation of the transcript.
    bot_response_en = response_generator_pipe(
        to_en_translation(user_speech_text, desired_language)
    )
    bot_response_de = from_en_translation(bot_response_en, desired_language)
    # Synthesise the translated answer and wrap it in an autoplaying player.
    spoken_answer = tts_to_bytesio(tts(bot_response_de, desired_language))
    html = html_audio_autoplay(spoken_answer)
    return user_speech_text, bot_response_de, bot_response_en, html
38
+
39
 
40
# Build the voice-chat UI (one microphone input; three text boxes and an HTML
# slot as outputs, in the order returned by ``main``) and start serving it.
speech_input = Audio(
    source="microphone",
    type="filepath",
)
demo = Interface(
    fn=main,
    inputs=[speech_input],
    outputs=[
        Textbox(label="You said: "),
        Textbox(label="AI said: "),
        Textbox(label="AI said (English): "),
        "html",
    ],
    live=True,
    allow_flagging="never",
)
demo.launch(debug=True)
 
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
utils.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Some utility functions for the app."""
2
+ from base64 import b64encode
3
+ from io import BytesIO
4
+
5
+ from gtts import gTTS
6
+ from mtranslate import translate
7
+ from speech_recognition import AudioFile, Recognizer
8
+ from transformers import (BlenderbotSmallForConditionalGeneration,
9
+ BlenderbotSmallTokenizer)
10
+
11
+
12
def stt(audio: object, language: str) -> str:
    """Transcribe recorded speech into text.

    Args:
        audio (object): record of user speech, in a form ``AudioFile`` accepts
        language (str): language code passed to the recognizer

    Returns:
        str: recognized speech of user
    """
    recognizer = Recognizer()
    with AudioFile(audio) as audio_source:
        # Load the recording into memory, then run Google speech recognition
        # on it.
        recording = recognizer.record(audio_source)
        return recognizer.recognize_google(recording, language=language)
29
+
30
+
31
def to_en_translation(text: str, language: str) -> str:
    """Translate text from the given language into English.

    Args:
        text (str): text to translate
        language (str): language code the text is written in

    Returns:
        str: English translation of the text
    """
    target_language = "en"
    return translate(text, target_language, language)
42
+
43
+
44
def from_en_translation(text: str, language: str) -> str:
    """Translate English text into the given language.

    Args:
        text (str): English text to translate
        language (str): language code to translate into

    Returns:
        str: translation of the text in the given language
    """
    source_language = "en"
    return translate(text, language, source_language)
55
+
56
+
57
class TextGenerationPipeline:
    """Pipeline for text generation of blenderbot model.

    Calling an instance with a text tokenizes it, lets the model generate a
    reply and returns that reply as a string. Keyword arguments given at
    construction time are forwarded to ``model.generate`` on every call.
    """

    # The tokenizer and the model are class attributes: they are loaded once,
    # when the class body is executed, and shared by every instance.
    model_name = "facebook/blenderbot_small-90M"
    tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_name)
    model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_name)

    def __init__(self, **kwargs):
        """Store the text generation parameters.

        For example: max_length=100 which generates text shorter than
        100 tokens. Visit:
        https://huggingface.co/docs/transformers/main_classes/text_generation
        for more parameters

        Args:
            **kwargs: generation parameters forwarded to ``model.generate``
        """
        # The instance attributes *are* the generation parameters: __call__
        # unpacks the whole instance dict into ``model.generate``, so nothing
        # else should be stored on the instance.
        self.__dict__.update(kwargs)

    def preprocess(self, text: str) -> object:
        """Tokenizes input text.

        Args:
            text (str): user specified text

        Returns:
            object: tokenizer output holding the text as PyTorch tensors,
                ready to be unpacked into ``model.generate``
        """
        # truncation=True asks the tokenizer to cut inputs that exceed its
        # maximum length instead of passing them on in full.
        return self.tokenizer(text, return_tensors="pt", truncation=True)

    def postprocess(self, outputs: object) -> str:
        """Converts tensors into text.

        Args:
            outputs (object): model text generation output; only the first
                generated sequence is decoded

        Returns:
            str: generated text without special tokens
        """
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def __call__(self, text: str) -> str:
        """Generates text from input text.

        Args:
            text (str): user specified text

        Returns:
            str: generated text
        """
        tokenized_text = self.preprocess(text)
        # Generation parameters come from the keyword arguments stored by
        # __init__.
        output = self.model.generate(**tokenized_text, **self.__dict__)
        return self.postprocess(output)
113
+
114
+
115
def tts(text: str, language: str) -> object:
    """Create a text-to-speech object for the given text.

    Args:
        text (str): generated answer of bot
        language (str): language code the text should be spoken in

    Returns:
        object: text to speech object
    """
    speech = gTTS(text=text, lang=language, slow=False)
    return speech
125
+
126
+
127
def tts_to_bytesio(tts_object: object) -> bytes:
    """Converts tts object to bytes.

    Args:
        tts_object (object): audio object obtained from gtts; it must provide
            a ``write_to_fp(file_like)`` method

    Returns:
        bytes: audio bytes written by the tts object
    """
    # The buffer is only needed for the duration of the call, so the context
    # manager releases it on exit. getvalue() returns the entire content
    # regardless of the stream position, hence no seek is required.
    with BytesIO() as buffer:
        tts_object.write_to_fp(buffer)
        return buffer.getvalue()
140
+
141
+
142
def html_audio_autoplay(bytes: bytes, mime_type: str = "audio/mpeg") -> str:
    """Creates html snippet for autoplaying audio at gradio app.

    The audio is embedded in the markup as a base64 data URI, so no file
    has to be served separately.

    Args:
        bytes (bytes): audio bytes
        mime_type (str): MIME type declared for the audio data. Defaults to
            "audio/mpeg" because the audio produced in this module comes
            from gTTS, which emits MP3 data; pass e.g. "audio/wav" for
            other formats.

    Returns:
        str: html markup of an audio player that starts playing automatically
    """
    b64 = b64encode(bytes).decode()
    return f"""
    <audio controls autoplay>
    <source src="data:{mime_type};base64,{b64}" type="{mime_type}">
    </audio>
    """