Spaces:

NouFuS
/

French_To_English_Speech

Running

App Files Community

NouFuS committed on Mar 16, 2024

Commit

494f8dc

verified ·

1 Parent(s): 6a11bad

added tab with voice input.

Browse files

Files changed (1) hide show

app.py +94 -37

app.py CHANGED Viewed

@@ -1,57 +1,114 @@
 import gradio as gr
-from transformers import pipeline
 import torch
 import numpy as np
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-device = 'cpu'
 print("Device:", device)
 pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
 pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device) # Better quality, way faster than bark
 def get_translation(text):
     return pipe_translate(text)[0]["translation_text"]
 def get_audio(text):
     speech = pipe_tts(text)
     return speech["sampling_rate"], (speech["audio"]* 32767).astype(np.int16).T
 with gr.Blocks() as demo:
-    input_text = gr.Textbox(
-                label="Input text",
-                info="Your text",
-                lines=3,
-                placeholder="Écrire le texte à traduire",
-            )
-    translation_button = gr.Button("Traduire !")
-    output_text = gr.Textbox(
-                label="Output text",
-                info="Your text",
-                lines=3,
-                placeholder="Votre traduction",
-            )
-    speech_button = gr.Button("Générer audio !")
-    translation_button.click(
-            get_translation,
-            inputs=[
-                input_text
-            ],
-            outputs=[
-                output_text
-            ],
-    )
-    speech_button.click(
-            get_audio,
-            inputs=[
-                output_text
-            ],
-            outputs=[
-                gr.Audio(label="Output")
-            ],
-    )
 demo.launch()

 import gradio as gr
+from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
 import torch
 import numpy as np
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+device = "cpu"
+torch_dtype = torch.float16 if device != "cpu" else torch.float32
 print("Device:", device)
+model_id = "openai/whisper-large-v3"
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+)
+model.to(device)
+processor = AutoProcessor.from_pretrained(model_id)
+pipe_transcription = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    chunk_length_s=30,
+    batch_size=16,
+    return_timestamps=True,
+    torch_dtype=torch_dtype,
+    device=device,
+)
 pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
 pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device) # Better quality, way faster than bark
 def get_translation(text):
     return pipe_translate(text)[0]["translation_text"]
+def get_transcript(voice):
+    return pipe_transcription(voice, generate_kwargs={"task": "translate", "language": "french"})["text"]
 def get_audio(text):
     speech = pipe_tts(text)
     return speech["sampling_rate"], (speech["audio"]* 32767).astype(np.int16).T
 with gr.Blocks() as demo:
+    with gr.Tab("Voix (plus lent)"):
+        voice = gr.Audio(sources=["microphone"], type="filepath")
+        translation_button = gr.Button("Traduire votre enregistrement !")
+        output_text = gr.Textbox(
+                    label="Texte traduit",
+                    info="Votre texte",
+                    lines=3,
+                    placeholder="Votre traduction",
+                )
+        speech_button = gr.Button("Générer audio !")
+        translation_button.click(
+                get_transcript,
+                inputs=[
+                    voice
+                ],
+                outputs=[
+                    output_text
+                ],
+        )
+        speech_button.click(
+                get_audio,
+                inputs=[
+                    output_text
+                ],
+                outputs=[
+                    gr.Audio(label="Output")
+                ],
+        )
+    with gr.Tab("Texte (rapide)"):
+        input_text = gr.Textbox(
+                    label="Input text",
+                    info="Your text",
+                    lines=3,
+                    placeholder="Écrire le texte à traduire",
+                )
+        translation_button = gr.Button("Traduire...")
+        output_text = gr.Textbox(
+                    label="Output text",
+                    info="Your text",
+                    lines=3,
+                    placeholder="Votre traduction",
+                )
+        speech_button = gr.Button("Générer audio...")
+        translation_button.click(
+                get_translation,
+                inputs=[
+                    input_text
+                ],
+                outputs=[
+                    output_text
+                ],
+        )
+        speech_button.click(
+                get_audio,
+                inputs=[
+                    output_text
+                ],
+                outputs=[
+                    gr.Audio(label="Output")
+                ],
+        )
 demo.launch()