Spaces:
Runtime error
Runtime error
""" | |
Imports | |
""" | |
from transformers import pipeline | |
from pytube import YouTube | |
import gradio as gr | |
import requests | |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
""" | |
Pipeline and models | |
""" | |
# Speech-recognition pipeline. Swap the model id for your own
# "your-username/the-name-you-picked" checkpoint to use a different model.
transcribe_pipe = pipeline(model="Silemo/whisper-it")

# Tokenizer and seq2seq model used for tag generation; both are loaded from
# the same checkpoint.
_TAGS_CHECKPOINT = "efederici/text2tags"
tags_tokenizer = AutoTokenizer.from_pretrained(_TAGS_CHECKPOINT)
tags_model = AutoModelForSeq2SeqLM.from_pretrained(_TAGS_CHECKPOINT)
""" | |
Methods | |
""" | |
def transcribe(audio):
    """Run the speech-recognition pipeline on ``audio`` and return its text.

    ``audio`` is passed to ``transcribe_pipe`` unchanged, and the ``"text"``
    entry of the pipeline result is returned.
    """
    return transcribe_pipe(audio)["text"]
def transcribe_video(url):
    """Download the audio track of a YouTube video, then transcribe and tag it.

    Args:
        url: Address of the YouTube video.

    Returns:
        The ``(transcription, tags)`` pair returned by ``transcribe_and_tag``.

    Raises:
        ValueError: If pytube reports no audio-only stream for the video.
    """
    # Function-scope import so the file-level import list stays untouched.
    import tempfile

    stream = YouTube(url).streams.get_audio_only()
    if stream is None:
        # Without this guard the failure surfaces as an opaque
        # AttributeError on None at the download call.
        raise ValueError(f"No audio-only stream available for {url!r}")

    # Download into a throw-away directory: the audio file is deleted as soon
    # as it has been transcribed instead of accumulating in the working
    # directory, and two videos with the same title cannot collide.
    with tempfile.TemporaryDirectory() as workdir:
        audio = stream.download(output_path=workdir)
        return transcribe_and_tag(audio)
def transcribe_and_tag(audio):
    """Transcribe ``audio`` and generate tags from the resulting text.

    Returns a ``(transcription, tags)`` tuple, where ``tags`` is the result
    of calling ``tag`` on the transcription.
    """
    transcription = transcribe(audio)
    return transcription, tag(text=transcription)
def download_audio(audio_url, filename, timeout=30):
    """Download the file at ``audio_url`` and save it as ``filename``.

    Args:
        audio_url: HTTP(S) address of the audio file to fetch.
        filename: Path of the local file to write; it is opened in binary
            mode and overwritten if it already exists.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For other request failures such as a
            connection error or a timeout.
    """
    response = requests.get(audio_url, timeout=timeout)
    # Fail loudly on an error status instead of saving the server's error
    # page under an audio file name.
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)
def tag(text: str):
    """
    Produce a list of tags for the given text.

    The text is stripped, its newline characters are removed and it is
    prefixed with 'summarize: ' before being encoded by ``tags_tokenizer``.
    ``tags_model`` generates one sequence with beam search, which is decoded
    and split on ', ' to form the returned list.
    """
    prompt = 'summarize: ' + text.strip().replace('\n', '')
    input_ids = tags_tokenizer.encode(prompt, return_tensors="pt")

    generation_options = dict(
        num_beams=4,
        no_repeat_ngram_size=2,
        max_length=20,
        early_stopping=True,
    )
    generated = tags_model.generate(input_ids, **generation_options)

    # Only the first generated sequence is used.
    decoded = tags_tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.split(', ')
""" | |
Downloading audio files | |
""" | |
# Example clips: each one is fetched at import time and written under its
# bare file name (relative to the current working directory).
audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3"
audio1_filename = "offer.mp3"
audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3"
audio2_filename = "fantozzi.mp3"

for _example_url, _example_filename in (
    (audio1_url, audio1_filename),
    (audio2_url, audio2_filename),
):
    download_audio(_example_url, _example_filename)
""" | |
Interfaces | |
""" | |
# Output components; each interface gets its own transcription/tags pair.
audio_transcription = gr.Textbox(label="Transcription")
audio_tags = gr.Textbox(label="Tags")
yt_transcription = gr.Textbox(label="Transcription")
yt_tags = gr.Textbox(label="Tags")

# Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450

# Interface 1: audio from an uploaded file or the microphone, handed to the
# callback as a file path.
io1 = gr.Interface(
    title="Whisper Small - Italian - Microphone or Audio file",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone as audio input. It outputs a transcription and the tags of the text.",
    fn=transcribe_and_tag,
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs=[audio_transcription, audio_tags],
    examples=[[audio1_filename], [audio2_filename]],
)

# Interface 2: audio taken from a YouTube video identified by its URL.
io2 = gr.Interface(
    title="Whisper Small - Italian - YouTube link",
    description="Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input. It outputs a transcription and the tags of the text.",
    fn=transcribe_video,
    inputs=gr.Textbox(
        label="YouTube URL",
        placeholder="https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y",
    ),
    outputs=[yt_transcription, yt_tags],
    examples=[
        ["https://www.youtube.com/watch?v=qMslwA7RCcc"],  # Meloni - Confindustria
        ["https://www.youtube.com/watch?v=WlT3dCAGjRo"],  # Montemagno - Ripartire da zero
    ],
)
# Serve both interfaces as tabs of a single app. The tab names are passed as
# a list, in the same order as the interfaces: a set literal has no defined
# order, so the labels could end up attached to the wrong interface.
gr.TabbedInterface(
    [io1, io2],
    ["Microphone or audio file", "YouTube"],
).launch()