# whispertube_backend/audiobook.py
# Source revision 955f567 ("Removed Unneeded Files").
import os
from IPython.display import Audio
import nltk # we'll use this to split into sentences
import numpy as np
from bark.generation import (
generate_text_semantic,
preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE
import soundfile as sf
# Expose only GPU index 0 to CUDA for this process; this overwrites any
# CUDA_VISIBLE_DEVICES value that was already present in the environment.
# NOTE(review): this assignment runs after the bark imports above — confirm
# that CUDA has not been initialised by then, otherwise it may have no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Loads the Bark models as a side effect of importing this module; this
# should only be run one time.
preload_models()
class AudioBook:
    """Narrate text into a WAV file using Bark text-to-speech.

    The input text is split into sentences, each sentence is synthesised
    separately with ``generate_audio``, and the clips are joined with a short
    pause after every sentence before being written into ``output_folder``.
    """

    # Maps the public ``speaker`` argument onto the Bark history prompt.
    _SPEAKER_PRESETS = {
        "male": "v2/en_speaker_6",
        "female": "v2/en_speaker_9",
    }

    # Length, in seconds, of the silence appended after every sentence.
    _PAUSE_SECONDS = 0.25

    def __init__(self, output_folder="output"):
        """Remember the output folder and create it if it does not exist.

        Args:
            output_folder: Directory that receives the generated WAV files.
        """
        self.output_folder = output_folder
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test (e.g. two workers starting simultaneously).
        os.makedirs(output_folder, exist_ok=True)

    def generate_audio_from_text(self, text, speaker="male", filename="output_audio"):
        """Synthesise ``text`` sentence by sentence and save it as a WAV file.

        Args:
            text: Text to narrate. Newlines are treated as spaces.
            speaker: Voice selection, either ``"male"`` or ``"female"``.
            filename: Name of the output file, without the ``.wav`` suffix.

        Returns:
            Path of the written WAV file inside ``self.output_folder``.

        Raises:
            ValueError: If ``speaker`` is not ``"male"`` or ``"female"``, or
                if ``text`` contains nothing to narrate.
        """
        # Validate the cheap argument first so that a bad speaker fails
        # before any tokenisation or model work is done.
        history_prompt = (
            self._SPEAKER_PRESETS.get(speaker) if isinstance(speaker, str) else None
        )
        if history_prompt is None:
            raise ValueError("Invalid speaker selection. Use 'male' or 'female'.")

        # Preprocess text
        text = text.replace("\n", " ").strip()
        sentences = nltk.sent_tokenize(text) if text else []
        if not sentences:
            # Without this guard np.concatenate([]) fails with an opaque
            # "need at least one array to concatenate" error.
            raise ValueError("Cannot generate audio from empty text.")

        silence = np.zeros(int(self._PAUSE_SECONDS * SAMPLE_RATE))
        pieces = []
        for sentence in sentences:
            pieces.append(
                generate_audio(
                    sentence,
                    history_prompt=history_prompt,
                    text_temp=0.7,
                    waveform_temp=0.7,
                )
            )
            # np.concatenate copies its inputs, so one shared silence buffer
            # can be reused for every pause.
            pieces.append(silence)
        audio_data = np.concatenate(pieces)

        # Save the audio to a WAV file in the output folder
        output_path = os.path.join(self.output_folder, f"{filename}.wav")
        sf.write(output_path, audio_data, SAMPLE_RATE)
        return output_path