tune voice for audiobook

9d6172b 2 days ago

6.66 kB

	# Creates one .wav file per chapter, a full-audiobook .wav and a full-audiobook .mp4 for
	# assets/audiobook_TTS.docx (a slightly edited copy of assets/INCLUSION_IN_MUSEUMS_audiobook.docx).
	# __________________________________________________________________________________________________
	# ROOT_DIR/<voice>/<voice>_chapter_0.wav, .., ROOT_DIR/<voice>/<voice>_chapter_10.wav
	# ROOT_DIR/<voice>/<voice>_full_audiobook.wav
	# ROOT_DIR/<voice>/<voice>_full_audiobook.mp4

	import cv2
	import subprocess
	import numpy as np
	import soundfile
	import docx # package = python-docx
	import audresample
	import urllib
	from pathlib import Path
	from moviepy.editor import *

	# Sample rate in Hz.
	FS = 24000

	# Root of the output tree; each voice gets its own sub-directory underneath.
	ROOT_DIR = './tts_audiobooks/voices/'
	Path(ROOT_DIR).mkdir(exist_ok=True, parents=True)

	# Voices to render - select any voice from https://audeering.github.io/shift/
	# Commented-out entries are alternatives kept for reference.
	voices = [
	    # 'en_US/vctk_low#p228',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
	    # 'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
	    'en_US/vctk_low#p326',  # Native voice
	    # 'jv_ID_google-gmu_06207',
	]

	# Optional one-off download of the source document (kept disabled):
	# urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")

	# The document read here was slightly changed from the original .docx so that it is
	# audible when read aloud (extra filler words such as 'by', 'them', 'from', 'this', 'of').
	d = docx.Document('assets/audiobook_TTS.docx')

	# Chapter index used in the per-chapter file names; the book is expected to open
	# with a 'CHAPTER:' heading (CHAPTER: ONCE UPON A TIME).
	chapter_counter = 0

	# True once a pause has been inserted for a run of skipped/empty paragraphs, so that
	# only the first paragraph of such a run contributes silence.
	last_paragraph_was_silence = False

	# Sample rate (Hz) of every .wav written below; TTS output is resampled to it.
	OUT_FS = 16000

	# Pause lengths in seconds. They are sized at OUT_FS, the rate the buffers are
	# actually written at; sizing them with the 24 kHz FS constant made every pause
	# 1.5x longer than the stated number of seconds.
	CHAPTER_END_PAUSE = .24
	PARAGRAPH_PAUSE = .1


	def _silence(seconds):
	    """Return `seconds` of float32 silence at OUT_FS."""
	    return np.zeros(int(seconds * OUT_FS), dtype=np.float32)


	def _close_chapter(chunks, wav_path):
	    """Join `chunks`, append the end-of-chapter pause, save to `wav_path`.

	    Returns the concatenated chapter audio so the caller can add it to the
	    full audiobook.
	    """
	    chapter_audio = np.concatenate(chunks + [_silence(CHAPTER_END_PAUSE)])
	    soundfile.write(wav_path, chapter_audio, OUT_FS)
	    return chapter_audio


	# For every voice: synthesise the document paragraph by paragraph, write one .wav
	# per chapter and one .wav for the whole book, then render a static cover frame
	# and mux it with the full audio into an .mp4.
	for vox in voices:

	    # Per-voice state. Resetting here keeps chapter numbering at 0 for every
	    # voice instead of continuing from the previous voice's last chapter.
	    chapter_counter = 0
	    last_paragraph_was_silence = False

	    # string cleanup: turn the voice id into a file-system friendly name
	    vox_str = (vox.replace('/', '_')
	                  .replace('#', '_')
	                  .replace('cmu-arctic', 'cmu_arctic')
	                  .replace('_low', '')
	                  .replace('-', ''))

	    # create dir for chapter_x.wav & audiobook.wav - for this voice vox
	    vox_dir = ROOT_DIR + vox_str + '/'
	    Path(vox_dir).mkdir(parents=True, exist_ok=True)
	    full_wav = vox_dir + f'{vox_str}_full_audiobook.wav'
	    full_mp4 = vox_dir + f'{vox_str}_full_audiobook.mp4'

	    print(vox)

	    total = []         # finished chapters, in order
	    total_samples = 0  # running length of `total`, used for the chapter timestamps
	    chapter = []       # audio pieces of the chapter currently being built

	    for para in d.paragraphs:
	        t = para.text

	        # A 'CHAPTER:' heading closes the chapter collected so far. The heading
	        # itself is not consumed here: it falls through to the TTS branch below
	        # and becomes the first spoken paragraph of the new chapter.
	        if t.startswith('CHAPTER:'):
	            audio = _close_chapter(
	                chapter,
	                vox_dir + f'{vox_str}_chapter_{chapter_counter}.wav')
	            total.append(audio)
	            total_samples += audio.shape[0]

	            chapter = []
	            chapter_counter += 1

	            elapsed = total_samples // OUT_FS  # whole seconds of audio so far
	            print(f'Start Chapter {chapter_counter}, '
	                  f'timestamp:{elapsed // 60}:{elapsed % 60:02d}')

	        # Speak the paragraph unless it is (nearly) empty, wrapped in {...} or
	        # mentions 'Figure'.
	        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:

	            # tts.py reads its input text from a file
	            with open('_tmp.txt', 'w') as f:
	                f.write(t.lower())  # WARNING! cast to lower otherwise accesibiliTy is pronounced accessibili..tay

	            # check=True: if tts.py fails, stop here rather than silently
	            # re-reading the previous paragraph's out/_tmp.wav.
	            subprocess.run(
	                [
	                    "python",
	                    "tts.py",
	                    "--text",
	                    "_tmp.txt",
	                    # "--affect",
	                    # '--image', '_tmp_banner.png',
	                    # '--scene', 'calm sounds of castle',
	                    '--voice', vox,
	                    '--out_file', '_tmp',  # tts.py output is then read from out/_tmp.wav
	                ],
	                check=True)

	            audio, _fs = soundfile.read('out/_tmp.wav')
	            # Resample from the rate actually stored in the file; keep the first
	            # channel of the resampled result.
	            audio = audresample.resample(audio.astype(np.float32), _fs, OUT_FS)[0, :]
	            chapter.append(audio)

	            last_paragraph_was_silence = False

	        # Skipped/empty paragraph (e.g. end of Section): insert one short pause,
	        # but only for the first paragraph of a run of skipped paragraphs.
	        elif not last_paragraph_was_silence:
	            chapter.append(_silence(PARAGRAPH_PAUSE))
	            last_paragraph_was_silence = True

	    # Whatever follows the last 'CHAPTER:' heading is still pending in `chapter`;
	    # without this flush the final chapter would be missing from both the chapter
	    # files and the full audiobook.
	    if chapter:
	        total.append(_close_chapter(
	            chapter,
	            vox_dir + f'{vox_str}_chapter_{chapter_counter}.wav'))

	    if not total:
	        # np.concatenate([]) raises; an empty document has nothing to render.
	        print(f'No audio produced for {vox}, skipping.')
	        continue

	    # save full .wav audiobook - for this voice
	    soundfile.write(full_wav, np.concatenate(total), OUT_FS)

	    # ---- cover frame: logo, 'AUDIOBOOK' title and the voice name ----

	    voice_pic = np.zeros((574, 1024, 3), dtype=np.uint8)

	    shift_logo = cv2.imread('assets/shift_banner.png')
	    if shift_logo is None:
	        # cv2.imread signals an unreadable/missing file by returning None
	        print('assets/shift_banner.png could not be read, cover is rendered without logo.')
	    else:
	        # top-left crop of at most 100 x 400 px; also works for a smaller logo
	        logo = shift_logo[:100, :400, :]
	        voice_pic[:logo.shape[0], :logo.shape[1], :] = logo

	    font = cv2.FONT_HERSHEY_SIMPLEX
	    fontScale = 2
	    fontColor = (69, 74, 74)
	    thickness = 4
	    lineType = 2

	    # Voice name. putText takes the bottom-left corner of the text; the former
	    # y=640 lies below the 574 px high frame, so the name was never visible.
	    bottomLeftCornerOfText = (0, 550)  # x, y
	    cv2.putText(voice_pic, vox,
	                bottomLeftCornerOfText,
	                font,
	                fontScale,
	                fontColor,
	                thickness,
	                lineType)
	    # = AUDIOBOOK
	    cv2.putText(voice_pic, 'AUDIOBOOK',
	                (170, 170),
	                font,
	                4,
	                fontColor,
	                thickness,
	                lineType)
	    # = VOICE
	    cv2.putText(voice_pic, 'TTS voice =',
	                (0, 500),
	                font,
	                fontScale,
	                fontColor,
	                thickness,
	                lineType)
	    STATIC_FRAME = '_tmp.png'
	    cv2.imwrite(STATIC_FRAME, voice_pic)

	    # ---- silent video of the cover frame (MoviePy) ----

	    SILENT_VIDEO = '_tmp.mp4'

	    # NOTE(review): the clip is a fixed 5 s and is stream-copied below, while the
	    # original comment says "as long as the audio". Confirm that the target
	    # player/platform handles a video stream shorter than the audio.
	    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
	    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

	    # ---- mux: video stream of SILENT_VIDEO + audio stream of the full .wav ----

	    rc = subprocess.call(
	        ["ffmpeg",
	         "-y",
	         "-i",
	         SILENT_VIDEO,
	         "-i",
	         full_wav,
	         "-c:v",
	         "copy",
	         "-map",
	         "0:v:0",
	         "-map",
	         "1:a:0",  # stray leading space removed from the stream specifier
	         full_mp4,  # OUT_FILE
	         ])
	    if rc != 0:
	        print(f'ffmpeg exited with code {rc}; {full_mp4} may be missing or incomplete.')