import tempfile

import openai
import soundfile

# import whisper
from gtts import gTTS

# Map display language names to gTTS language codes.
dic = {"Japanese": "ja", "English": "en"}


class CahtBOT:
    def __init__(self):
        self.messages = None

    def setup(
        self,
        role1,
        content1,
        role2,
        content2,
        role3,
        content3,
        role4,
        content4,
        role5,
        content5,
        api_key,
        language,
    ):
        openai.api_key = api_key
        self.language = dic[language]
        # Keep only the (role, content) pairs that were actually filled in.
        self.messages = [
            {"role": role, "content": content}
            for role, content in [
                [role1, content1],
                [role2, content2],
                [role3, content3],
                [role4, content4],
                [role5, content5],
            ]
            if role != "" and content != ""
        ]

    def transcribe(self, audio):
        # `audio` is a (sample_rate, data) tuple, e.g. from a Gradio-style
        # microphone input; dump it to a temporary WAV file for the API call.
        sample_rate, data = audio
        soundfile.write(file="tmp.wav", data=data, samplerate=sample_rate)
        with open("tmp.wav", "rb") as audio_file:
            transcript = openai.Audio.transcribe("whisper-1", audio_file)
        return transcript.text

    def answer_by_chat(self, history, question):
        # Add the user's question to the conversation and to the chat history.
        self.messages.append({"role": "user", "content": question})
        history += [(question, None)]
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=self.messages
        )
        response_text = response["choices"][0]["message"]["content"]
        response_role = response["choices"][0]["message"]["role"]
        # Synthesize the reply as speech, keep the text in the context for the
        # next turn, and show the reply in the history as an audio clip.
        response_audio = self.speech_synthesis(response_text)
        self.messages.append({"role": response_role, "content": response_text})
        # history += [(None, response_text)]
        history += [(None, (response_audio,))]
        return history

    def speech_synthesis(self, sentence):
        # Write each gTTS reply (MP3 data) to its own temporary file so
        # earlier replies in the chat history are not overwritten.
        tts = gTTS(sentence, lang=self.language)
        tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        tmp.close()
        tts.save(tmp.name)
        return tmp.name