speech-to-speech-translation

Sleeping

App Files Files Community

speech-to-speech-translation / app.py

yuvscherbatov

Upload app.py

8e253e0 about 1 year ago

raw

history blame

3.12 kB

	# -*- coding: utf-8 -*-
	"""app.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/16MxXQeF3O0htL9eQ61aa6ZxnApGg9TKN
	"""

	import gradio as gr
	import numpy as np
	import torch

	from transformers import pipeline, VitsModel, VitsTokenizer, FSMTForConditionalGeneration, FSMTTokenizer

	# Pick the first CUDA device when one is available, otherwise use the CPU.
	# Only the ASR pipeline below is given this device; the translation and TTS
	# models are loaded without a device argument.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Stage 1: English speech -> English text.
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model="asapp/sew-d-tiny-100k-ft-ls100h",
	    device=device,
	)

	# Stage 2: English text -> Russian text.
	mname = "facebook/wmt19-en-ru"
	tokenizer = FSMTTokenizer.from_pretrained(mname)
	model = FSMTForConditionalGeneration.from_pretrained(mname)

	# Disabled alternative for stage 2, kept for reference:
	#translation_pipe = pipeline("translation", model="facebook/wmt19-en-ru")

	# Stage 3: Russian text -> Russian speech. Model and tokenizer share one checkpoint.
	tts_checkpoint = "facebook/mms-tts-rus"
	vits_model = VitsModel.from_pretrained(tts_checkpoint)
	vits_tokenizer = VitsTokenizer.from_pretrained(tts_checkpoint)

	def transform_audio_to_speech_en(audio):
	    """Run the module-level ASR pipeline on ``audio`` and return its text.

	    Args:
	        audio: Whatever the caller hands over for recognition; it is passed
	            to ``asr_pipe`` unchanged.

	    Returns:
	        The value stored under the ``"text"`` key of the pipeline output.
	    """
	    # NOTE(review): ``max_new_tokens`` and the "translate" task look like
	    # options for a generative (Whisper-style) model -- confirm they have any
	    # effect with the checkpoint loaded into ``asr_pipe``.
	    generation_options = {"task": "translate"}
	    result = asr_pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs=generation_options,
	    )
	    return result["text"]

	def translator(text):
	    """Translate ``text`` with the module-level FSMT tokenizer and model.

	    Args:
	        text: Source text to translate.

	    Returns:
	        The first generated sequence, decoded with special tokens removed.
	    """
	    token_ids = tokenizer.encode(text, return_tensors="pt")
	    generated = model.generate(token_ids)
	    first_sequence = generated[0]
	    return tokenizer.decode(first_sequence, skip_special_tokens=True)
	    # Disabled pipeline-based alternative, kept for reference:
	    #translated_text = translation_pipe(text)
	    #return translated_text[0]['translation_text']

	def synthesise(translated_text):
	    """Translate the given text, then synthesise speech for the translation.

	    Despite the parameter name, the argument is first passed through
	    ``translator``; the translated result is what gets tokenised and fed to
	    the VITS model.

	    Args:
	        translated_text: Text to translate and then speak.

	    Returns:
	        The model's ``waveform`` output, moved to the CPU.
	    """
	    target_text = translator(translated_text)
	    tts_inputs = vits_tokenizer(target_text, return_tensors="pt")
	    # Inference only, so gradient tracking is switched off.
	    with torch.no_grad():
	        tts_output = vits_model(**tts_inputs)
	    return tts_output.waveform.cpu()

	def speech_to_speech_translation(audio):
	    """Run the full cascade: recognise speech, translate it, and re-synthesise.

	    Args:
	        audio: Input audio, forwarded unchanged to
	            ``transform_audio_to_speech_en``.

	    Returns:
	        A ``(sample_rate, samples)`` tuple where ``samples`` is the first row
	        of the synthesised waveform converted to ``np.int16``.
	    """
	    recognised_text = transform_audio_to_speech_en(audio)
	    waveform = synthesise(recognised_text).numpy()
	    # Clip to [-1, 1] before scaling: a float sample outside that range would
	    # otherwise exceed the int16 range and overflow in the cast instead of
	    # saturating, which is audible as loud clicks.
	    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
	    # NOTE(review): the 16000 Hz sample rate is hard-coded -- confirm it
	    # matches the sampling rate of the loaded TTS checkpoint.
	    return 16000, pcm[0]

	# Title and Markdown description shown at the top of each Gradio interface.
	# The description names the three models this file actually loads
	# (English ASR -> English-to-Russian translation -> Russian TTS).
	title = "Cascaded STST"
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in English to target speech in Russian. Demo uses ASAPP's [SEW-D tiny](https://huggingface.co/asapp/sew-d-tiny-100k-ft-ls100h) model for speech recognition, Facebook's [FSMT WMT19 en-ru](https://huggingface.co/facebook/wmt19-en-ru) model for text translation, and Facebook's
	[MMS-TTS Russian](https://huggingface.co/facebook/mms-tts-rus) model for text-to-speech:
	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	def _interface_kwargs():
	    """Return the keyword arguments shared by both tabs.

	    Called once per interface so each one gets its own fresh Audio components.
	    """
	    return dict(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        title=title,
	        description=description,
	    )


	demo = gr.Blocks()

	# NOTE(review): both tabs use the same input configuration; nothing here
	# restricts the first tab to the microphone -- confirm that is intended.
	mic_translate = gr.Interface(**_interface_kwargs())

	# NOTE(review): the example path looks like a Colab Google Drive mount --
	# confirm the file exists wherever this app is deployed.
	file_translate = gr.Interface(
	    examples=[["/content/drive/MyDrive/test_2.wav"]],
	    **_interface_kwargs(),
	)

	tab_names = ["Microphone", "Audio File"]
	with demo:
	    # Show the two interfaces as tabs inside the Blocks container.
	    gr.TabbedInterface([mic_translate, file_translate], tab_names)

	# Start the Gradio server at import/run time, as the original script does.
	demo.launch()