# Creates one .wav per chapter & a full audiobook .wav for
# assets/INCLUSION_IN_MUSEUMS_audiobook.docx
# __________________________________________________________________________________________________
# Outputs:
#   ROOT_DIR/voice/voice_CHAPTER_0.wav, .., ROOT_DIR/voice/voice_CHAPTER_10.wav
#   ROOT_DIR/voice/voice_full_book.wav
import cv2
import subprocess
import numpy as np
import soundfile
import docx  # package = python-docx
import audresample
import urllib
from pathlib import Path
from moviepy.editor import ImageClip  # only ImageClip is used; avoid wildcard import

# TTS synthesis sample rate: tts.py emits 24 kHz audio (it is resampled to 16 kHz
# before being written out).
FS = 24000

ROOT_DIR = './tts_audiobooks/voices/'
Path(ROOT_DIR).mkdir(parents=True, exist_ok=True)

voices = [
    # 'en_US/vctk_low#p228',    # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
    # 'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
    'en_US/vctk_low#p326',      # Native voice
    # 'jv_ID_google-gmu_06207',
]  # select any voice from - https://audeering.github.io/shift/

# urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")

# slightly changed from the original .docx to be audible, e.g. by adding extra 'by them from this of etc.'
d = docx.Document('assets/audiobook_TTS.docx')
last_paragraph_was_silence = False  # so silence is added only once, at the 1st empty paragraph of a run
chapter_counter = 0                 # chapters are assumed to start with 'CHAPTER: ONCE UPON A TIME'

# Everything appended to `chapter` / `total` is 16 kHz audio (the 24 kHz TTS
# output is resampled below), so silence gaps must also be generated at 16 kHz.
# BUG FIX: silence used to be sized with FS=24000 samples but written at 16 kHz,
# making every pause play 1.5x longer than intended.
OUT_FS = 16000

for vox in voices:

    # filesystem-safe voice name, e.g. 'en_US/vctk_low#p326' -> 'en_US_vctk_p326'
    vox_str = vox.replace(
        '/', '_').replace(
        '#', '_').replace(
        'cmu-arctic', 'cmu_arctic').replace(
        '_low', '').replace('-', '')

    # create dir for chapter_x.wav & audiobook.wav - for this voice vox
    Path(ROOT_DIR + vox_str + '/').mkdir(parents=True, exist_ok=True)
    print(vox)

    total = []    # audio of all finished chapters (for the complete audiobook)
    chapter = []  # audio tiles making up the current chapter

    for para in d.paragraphs:  # [:41]
        t = para.text

        # a 'CHAPTER:' heading closes the previous chapter and starts a new one
        # (the heading itself then also falls through to the TTS branch below,
        # so the chapter title is read aloud at the start of the new chapter)
        if t.startswith('CHAPTER:'):

            # silence closing the finished chapter
            chapter.append(np.zeros(int(.24 * OUT_FS), dtype=np.float32))

            # write chapter.wav
            audio = np.concatenate(chapter)
            soundfile.write(
                ROOT_DIR + vox_str + f'/{vox_str}_chapter_{chapter_counter}.wav',
                audio,
                OUT_FS)

            # keep this chapter's audio for the complete audiobook
            total.append(audio)

            # reset for the next chapter
            chapter = []
            chapter_counter += 1

            _elapsed_s = int(np.concatenate(total).shape[0] / OUT_FS)
            print(f'Start Chapter {chapter_counter}, timestamp:{_elapsed_s // 60}:{_elapsed_s % 60}')

        # non-empty paragraph that is not a {placeholder} or figure caption -> TTS
        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:

            # place paragraph text into a .txt for tts.py
            with open('_tmp.txt', 'w') as f:
                # WARNING! cast to lower otherwise accessibiliTy is pronounced accessibili..tay
                f.write(t.lower())

            # synthesise the paragraph; tts.py saves to out/_tmp.wav
            subprocess.run(
                ["python", "tts.py",
                 "--text", "_tmp.txt",
                 '--voice', vox,
                 '--out_file', '_tmp'])

            audio, _fs = soundfile.read('out/_tmp.wav')
            # 24 kHz TTS output -> 16 kHz mono
            audio = audresample.resample(audio.astype(np.float32), 24000, OUT_FS)[0, :]
            chapter.append(audio)

            last_paragraph_was_silence = False

        # empty paragraph (e.g. end of Section) -> a single short silence gap
        else:
            if not last_paragraph_was_silence:  # skip multiple empty paragraphs
                chapter.append(np.zeros(int(.1 * OUT_FS), dtype=np.float32))
                last_paragraph_was_silence = True

    # save full .wav audiobook - for this voice
    soundfile.write(
        ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
        np.concatenate(total),
        OUT_FS)

    # --- static banner picture shown while the audiobook plays ---
    voice_pic = np.zeros((574, 1024, 3), dtype=np.uint8)
    shift_logo = cv2.imread('assets/shift_banner.png')
    voice_pic[:100, :400, :] = shift_logo[:100, :400, :]

    font = cv2.FONT_HERSHEY_SIMPLEX
    # NOTE(review): y=640 is below the 574-px-tall image, so the voice name may
    # be drawn off-canvas and invisible — verify the intended position.
    bottomLeftCornerOfText = (0, 640)  # (w, h)
    fontScale = 2
    fontColor = (69, 74, 74)
    thickness = 4
    lineType = 2

    # voice name
    cv2.putText(voice_pic, vox,
                bottomLeftCornerOfText, font, fontScale, fontColor, thickness, lineType)
    # = AUDIOBOOK
    cv2.putText(voice_pic, 'AUDIOBOOK', (170, 170), font, 4, fontColor, thickness, lineType)
    # = VOICE
    cv2.putText(voice_pic, 'TTS voice =', (0, 500), font, fontScale, fontColor, thickness, lineType)

    STATIC_FRAME = '_tmp.png'
    cv2.imwrite(STATIC_FRAME, voice_pic)

    # silent clip of the banner; the real audio track is muxed in by ffmpeg below
    SILENT_VIDEO = '_tmp.mp4'
    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)  # as long as the audio - TTS first
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

    # fuse vox_full_audiobook.wav & SILENT_VIDEO into the final YouTube video
    subprocess.run(
        ["ffmpeg",
         "-y",
         "-i", SILENT_VIDEO,
         "-i", ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.wav',
         "-c:v", "copy",
         "-map", "0:v:0",
         # BUG FIX: was " 1:a:0" — the leading space makes ffmpeg reject the
         # stream specifier, so no audio was muxed into the output.
         "-map", "1:a:0",
         ROOT_DIR + vox_str + '/' + f'{vox_str}_full_audiobook.mp4',  # OUT_FILE
         ])