|
|
|
|
|
|
|
|
|
|
|
import subprocess
import sys
import urllib
from pathlib import Path

import audresample
import cv2
import docx
import numpy as np
import soundfile
from moviepy.editor import *
# Rate constant used further down to size the silence buffers
# (int(seconds * FS) samples).
FS = 24000

# Voices to render; the loop below produces one output folder per entry.
voices = [
    'en_US/vctk_low#p326',
]

# Root folder that receives one sub-folder per voice.
ROOT_DIR = './tts_audiobooks/voices/'
Path(ROOT_DIR).mkdir(exist_ok=True, parents=True)

# Manuscript whose paragraphs are read one by one and sent to the TTS script.
d = docx.Document('assets/audiobook_TTS.docx')

# Running state for the paragraph loop:
#   chapter_counter            - index used in the per-chapter .wav file name
#   last_paragraph_was_silence - True once a pause was appended for a skipped
#                                paragraph, so consecutive skipped paragraphs
#                                add only a single pause
chapter_counter = 0
last_paragraph_was_silence = False
OUT_FS = 16000  # sample rate (Hz) of every .wav file written below
STATIC_FRAME = '_tmp.png'  # scratch file: cover image for the video
SILENT_VIDEO = '_tmp.mp4'  # scratch file: cover video before audio is added


def _voice_slug(voice):
    """Return a file-system friendly name derived from a voice identifier."""
    return (voice.replace('/', '_')
                 .replace('#', '_')
                 .replace('cmu-arctic', 'cmu_arctic')
                 .replace('_low', '')
                 .replace('-', ''))


def _silence(seconds):
    """Return a float32 zero buffer holding int(seconds * FS) samples."""
    # NOTE(review): the buffer is sized with FS (24000) while the audio is
    # written at OUT_FS (16000), so the audible pause is 1.5x `seconds`.
    # Kept unchanged on purpose to preserve the existing pacing - confirm.
    return np.zeros(int(seconds * FS), dtype=np.float32)


def _synthesize(text, voice):
    """Run tts.py on `text` with `voice` and return mono audio at OUT_FS.

    The lower-cased text is handed over through the scratch file _tmp.txt
    and the result is read back from out/_tmp.wav.

    Raises:
        subprocess.CalledProcessError: if tts.py exits with a non-zero
            status. Without this check a failed run would silently re-use
            the out/_tmp.wav left behind by the previous paragraph.
    """
    with open('_tmp.txt', 'w') as f:
        f.write(text.lower())

    subprocess.run(
        [
            sys.executable,  # same interpreter / environment as this script
            'tts.py',
            '--text', '_tmp.txt',
            '--voice', voice,
            '--out_file', '_tmp',
        ],
        check=True)

    audio, file_fs = soundfile.read('out/_tmp.wav')
    # Resample from the rate stored in the file rather than an assumed one.
    return audresample.resample(audio.astype(np.float32), file_fs, OUT_FS)[0, :]


def _close_chapter(pieces, wav_path):
    """Append the end-of-chapter pause, write the chapter and return its audio."""
    pieces.append(_silence(.24))
    audio = np.concatenate(pieces)
    soundfile.write(wav_path, audio, OUT_FS)
    return audio


def _render_cover(voice, image_path):
    """Write the static cover frame (banner, title and voice name).

    Raises:
        FileNotFoundError: if assets/shift_banner.png cannot be read.
    """
    voice_pic = np.zeros((574, 1024, 3), dtype=np.uint8)

    shift_logo = cv2.imread('assets/shift_banner.png')
    if shift_logo is None:  # cv2.imread signals failure by returning None
        raise FileNotFoundError('assets/shift_banner.png')
    # Copy at most a 100 x 400 crop; a smaller banner is copied whole.
    rows = min(100, shift_logo.shape[0])
    cols = min(400, shift_logo.shape[1])
    voice_pic[:rows, :cols, :] = shift_logo[:rows, :cols, :]

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2
    font_color = (69, 74, 74)
    thickness = 4
    line_type = 2

    # Anchor the voice name just above the bottom edge. A fixed baseline of
    # y=640 lies below the 574-pixel-high canvas, so the text would be
    # clipped away entirely.
    (_, _), baseline = cv2.getTextSize(voice, font, font_scale, thickness)
    voice_origin = (0, voice_pic.shape[0] - baseline - 4)

    cv2.putText(voice_pic, voice, voice_origin,
                font, font_scale, font_color, thickness, line_type)
    cv2.putText(voice_pic, 'AUDIOBOOK', (170, 170),
                font, 4, font_color, thickness, line_type)
    cv2.putText(voice_pic, 'TTS voice =', (0, 500),
                font, font_scale, font_color, thickness, line_type)

    cv2.imwrite(image_path, voice_pic)


def _mux(video_in, wav_in, video_out):
    """Combine the video stream of `video_in` with the audio of `wav_in`."""
    status = subprocess.call(
        ['ffmpeg',
         '-y',
         '-i', video_in,
         '-i', wav_in,
         '-c:v', 'copy',
         '-map', '0:v:0',
         '-map', '1:a:0',  # no stray leading space in the stream specifier
         video_out,
         ])
    if status != 0:
        print(f'ffmpeg exited with status {status}; {video_out} may be missing or incomplete')


for vox in voices:

    vox_str = _voice_slug(vox)
    vox_dir = ROOT_DIR + vox_str + '/'
    Path(vox_dir).mkdir(parents=True, exist_ok=True)

    print(vox)

    # Per-voice state: every voice starts numbering its chapters from 0.
    chapter_counter = 0
    last_paragraph_was_silence = False
    total = []         # finished chapters, in order
    total_samples = 0  # samples in `total`, tracked to avoid re-concatenating
    chapter = []       # audio pieces of the chapter being assembled

    for para in d.paragraphs:
        t = para.text

        # A heading closes the chapter assembled so far. The heading itself
        # is not skipped: it falls through to the TTS branch below and
        # becomes the first piece of the next chapter.
        if t.startswith('CHAPTER:'):
            audio = _close_chapter(
                chapter,
                vox_dir + f'{vox_str}_chapter_{chapter_counter}.wav')
            total.append(audio)
            total_samples += audio.shape[0]

            chapter = []
            chapter_counter += 1

            elapsed = int(total_samples / OUT_FS)
            print(f'Start Chapter {chapter_counter}, timestamp:{elapsed // 60}:{elapsed % 60}')

        # Speak the paragraph unless it is (nearly) empty, starts with '{',
        # ends with '}', or mentions 'Figure'.
        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
            chapter.append(_synthesize(t, vox))
            last_paragraph_was_silence = False

        # Skipped paragraph: insert one short pause, not one per paragraph.
        elif not last_paragraph_was_silence:
            chapter.append(_silence(.1))
            last_paragraph_was_silence = True

    # Flush whatever follows the last heading; otherwise the final chapter
    # would be synthesized but never written or included in the audiobook.
    if chapter:
        total.append(_close_chapter(
            chapter,
            vox_dir + f'{vox_str}_chapter_{chapter_counter}.wav'))

    if not total:
        # Nothing to concatenate (document without paragraphs).
        print(f'No audio produced for {vox}; skipping audiobook and video')
        continue

    full_wav = vox_dir + f'{vox_str}_full_audiobook.wav'
    soundfile.write(full_wav, np.concatenate(total), OUT_FS)

    # Cover image -> 5 second silent clip at 24 fps -> add the audiobook audio.
    _render_cover(vox, STATIC_FRAME)

    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

    _mux(SILENT_VIDEO, full_wav, vox_dir + f'{vox_str}_full_audiobook.mp4')