import os

import gradio as gr
import numpy as np
import torch
from datasets import Audio, load_dataset
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

# load the Glaswegian audio dataset and resample everything to 16 kHz
dataset = load_dataset("divakaivan/glaswegian_audio")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))["train"]

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
tokenizer = processor.tokenizer


def extract_all_chars(batch):
    all_text = " ".join(batch["transcription"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = set(tokenizer.get_vocab())

# accented characters in the transcriptions that the SpeechT5 tokenizer
# does not cover, mapped to plain ASCII equivalents
replacements = [
    ("à", "a"),
    ("ç", "c"),
    ("è", "e"),
    ("ë", "e"),
    ("í", "i"),
    ("ï", "i"),
    ("ö", "o"),
    ("ü", "u"),
]


def cleanup_text(inputs):
    for src, dst in replacements:
        inputs["transcription"] = inputs["transcription"].replace(src, dst)
    return inputs


dataset = dataset.map(cleanup_text)

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def prepare_dataset(example):
    # load the audio data; if necessary, this resamples the audio to 16kHz
    audio = example["audio"]

    # feature extraction and tokenization
    example = processor(
        text=example["transcription"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # strip off the batch dimension
    example["labels"] = example["labels"][0]

    # use SpeechBrain to obtain x-vector
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return example


processed_example = prepare_dataset(dataset[0])

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# quick sanity check: run one spectrogram through the vocoder
spectrogram = torch.tensor(processed_example["labels"])
with torch.no_grad():
    speech = vocoder(spectrogram)

dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
dataset = dataset.train_test_split(test_size=0.1)


def predict(text):
    if len(text.strip()) == 0:
        return (16000, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # limit input length
    # input_ids = inputs["input_ids"]
    # input_ids = input_ids[..., :model.config.max_text_positions]

    # use a fixed speaker embedding from the held-out split
    example = dataset["test"][11]
    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
    with torch.no_grad():
        speech = vocoder(spectrogram)

    # convert float waveform to 16-bit PCM for Gradio
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)


title = "Glaswegian TTS"
article = (
    "Model fine-tuned and Gradio demo generated thanks to this notebook: "
    "https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ#scrollTo=wm7B3zxrumfF"
)

gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    article=article,
).launch()
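
# Usage note (a sketch, assuming a local run with default Gradio settings):
# launch() serves the demo at http://127.0.0.1:7860. For a quick smoke test
# without the UI, predict() can also be called directly; it returns a
# (sample_rate, int16_waveform) pair that e.g. soundfile could write to disk:
#
#   sample_rate, audio = predict("hullo, how ye daein the day?")
#   # audio is a 16 kHz int16 numpy array; empty input yields a zero-length array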