# File size: 6,360 Bytes

# Creates, for every selected voice, one .wav file per chapter, the full audiobook .wav and an .mp4
# of the full audiobook, from assets/INCLUSION_IN_MUSEUMS_audiobook.docx
# __________________________________________________________________________________________________
#   ROOT_DIR/<voice>/<voice>_chapter_0.wav, .., ROOT_DIR/<voice>/<voice>_chapter_10.wav
#   ROOT_DIR/<voice>/<voice>_full_audiobook.wav
#   ROOT_DIR/<voice>/<voice>_full_audiobook.mp4

import subprocess
import urllib
import urllib.request
from pathlib import Path

import audresample
import cv2
import docx  # package = python-docx
import numpy as np
import soundfile
from moviepy.editor import *

# Nominal sample rate (Hz) of the TTS output - presumably what tts.py writes; confirm.
FS = 24000

# Every voice gets its own sub-directory below ROOT_DIR.
ROOT_DIR = './tts_audiobooks/voices/'
Path(ROOT_DIR).mkdir(parents=True, exist_ok=True)

voices = [
    'en_US/vctk_low#p228',
    # 'en_US/vctk_low#p326',
]  # select any voice from - https://audeering.github.io/shift/

# Source text of the audiobook and the local file it is downloaded to.
DOCX_URL = ("https://github.com/audeering/shift/raw/refs/heads/main/"
            "assets/INCLUSION_IN_MUSEUMS_audiobook.docx")
DOCX_FILE = "tmp.docx"

# NOTE: needs the `urllib.request` submodule to be imported explicitly;
# a bare `import urllib` does not load it.
urllib.request.urlretrieve(DOCX_URL, DOCX_FILE)

# The hosted .docx was slightly changed from the original so that it is
# audible when read aloud (extra words such as 'by', 'them', 'from', 'this',
# 'of' were added).
d = docx.Document(DOCX_FILE)

# True while the most recent paragraph was rendered as silence, so that a run
# of empty paragraphs adds a pause only once (at the 1st empty paragraph).
last_paragraph_was_silence = False

# Index used in the chapter file names; it is incremented each time a
# paragraph starting with 'CHAPTER:' is met.
chapter_counter = 0

# Sample rate (Hz) of every .wav written below; TTS audio is resampled to it.
OUT_FS = 16000

# Pauses in seconds. They are converted to samples with OUT_FS, i.e. the rate
# of the stream they are inserted into, so the durations are exactly these.
CHAPTER_PAUSE_S = .24    # appended at the end of every chapter
PARAGRAPH_PAUSE_S = .1   # inserted once per run of skipped/empty paragraphs

STATIC_FRAME = '_tmp.png'  # cover picture, re-rendered for every voice
SILENT_VIDEO = '_tmp.mp4'  # soundless video made of the cover picture


def _silence(seconds):
    """Return `seconds` of silence as float32 samples at OUT_FS."""
    return np.zeros(int(seconds * OUT_FS), dtype=np.float32)


def _synthesize(text, vox):
    """Speak `text` with voice `vox` via tts.py; return float32 audio at OUT_FS.

    Raises subprocess.CalledProcessError if tts.py exits with an error, so a
    stale out/_tmp.wav from an earlier paragraph is never reused silently.
    """
    # tts.py receives the paragraph through a text file.
    with open('_tmp.txt', 'w') as f:
        # WARNING! lower-cased on purpose: per the original author mixed case
        # is mispronounced (e.g. 'accesibiliTy' -> 'accessibili..tay').
        f.write(text.lower())

    subprocess.run(
        [
            "python",
            "tts.py",
            "--text",
            "_tmp.txt",
            # "--affect",
            # '--image', '_tmp_banner.png',
            # '--scene', 'calm sounds of castle',
            '--voice', vox,
            '--out_file', '_tmp',  # the result is read back from out/_tmp.wav
        ],
        check=True)

    audio, fs = soundfile.read('out/_tmp.wav')
    # Resample from the rate stored in the file rather than a hard-coded one.
    audio = audresample.resample(audio.astype(np.float32), fs, OUT_FS)
    if audio.ndim > 1:
        # keep the first row of the 2-D result, as the original `[0, :]` did
        # (assumes tts.py writes mono audio - TODO confirm)
        audio = audio[0, :]
    return audio


def _write_chapter(pieces, path):
    """Join `pieces` plus a closing pause, save them to `path`, return the audio."""
    audio = np.concatenate(pieces + [_silence(CHAPTER_PAUSE_S)])
    soundfile.write(str(path), audio, OUT_FS)
    return audio


def _render_cover(vox, out_file):
    """Draw the cover picture (logo, 'AUDIOBOOK', voice name) into `out_file`.

    Raises FileNotFoundError if the logo image cannot be read.
    """
    frame = np.zeros((574, 1024, 3), dtype=np.uint8)

    logo = cv2.imread('assets/shift_banner.png')
    if logo is None:  # cv2.imread signals failure by returning None
        raise FileNotFoundError('cannot read assets/shift_banner.png')
    # Top-left 100 x 400 crop of the logo (or less, if the logo is smaller).
    rows = min(100, logo.shape[0])
    cols = min(400, logo.shape[1])
    frame[:rows, :cols, :] = logo[:rows, :cols, :]

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_color = (69, 74, 74)
    thickness = 4
    line_type = 2
    # (text, bottom-left corner of the text as (x, y), font scale)
    labels = (
        # The voice name used to be placed at y=640, below the 574 px high
        # frame, hence invisible; it now sits under the 'TTS voice =' line.
        (vox, (0, 560), 2),
        ('AUDIOBOOK', (170, 170), 4),
        ('TTS voice =', (0, 500), 2),
    )
    for text, origin, scale in labels:
        cv2.putText(frame, text, origin, font, scale,
                    font_color, thickness, line_type)

    cv2.imwrite(out_file, frame)


for vox in voices:

    # string cleanup: voice id -> name usable for directories and files
    vox_str = (vox.replace('/', '_')
                  .replace('#', '_')
                  .replace('cmu-arctic', 'cmu_arctic')
                  .replace('_low', '')
                  .replace('-', ''))

    # dir for <voice>_chapter_x.wav & <voice>_full_audiobook.* of this voice
    vox_dir = Path(ROOT_DIR) / vox_str
    vox_dir.mkdir(parents=True, exist_ok=True)

    print(vox)

    # Per-voice state. It is reset here so that a second voice starts again
    # at chapter 0 instead of continuing the numbering of the previous voice.
    total = []           # finished chapters, in order (the complete audiobook)
    chapter = []         # audio pieces of the chapter currently being built
    total_samples = 0    # running length of `total`, for the timestamps
    chapter_counter = 0
    last_paragraph_was_silence = False

    for para in d.paragraphs:
        t = para.text

        # A 'CHAPTER:' heading closes the chapter built so far. The heading
        # itself is still spoken below, as the first piece of the new chapter.
        if t.startswith('CHAPTER:'):
            audio = _write_chapter(
                chapter,
                vox_dir / f'{vox_str}_chapter_{chapter_counter}.wav')
            total.append(audio)
            total_samples += audio.shape[0]

            chapter = []
            chapter_counter += 1

            minutes, seconds = divmod(total_samples // OUT_FS, 60)
            print(f'Start Chapter {chapter_counter}, timestamp:{minutes}:{seconds}')

        # Speak real text; very short paragraphs, '{...}' markers and figure
        # captions become a single pause instead.
        if len(t) > 2 and t[0] != '{' and t[-1] != '}' and 'Figure' not in t:
            chapter.append(_synthesize(t, vox))
            last_paragraph_was_silence = False
        elif not last_paragraph_was_silence:
            # consecutive skipped paragraphs add the pause only once
            chapter.append(_silence(PARAGRAPH_PAUSE_S))
            last_paragraph_was_silence = True

    # No heading follows the last chapter, so it has to be closed here;
    # otherwise it would be missing from both its own file and the full book.
    if chapter:
        total.append(_write_chapter(
            chapter,
            vox_dir / f'{vox_str}_chapter_{chapter_counter}.wav'))

    # save full .wav audiobook - for this voice
    full_wav = vox_dir / f'{vox_str}_full_audiobook.wav'
    soundfile.write(str(full_wav), np.concatenate(total), OUT_FS)

    # picture naming the TTS voice
    _render_cover(vox, STATIC_FRAME)

    # MoviePy video of the static picture, without sound
    # NOTE(review): the duration is fixed to 5 s although the original comment
    # says the clip should be as long as the audio - confirm what is intended.
    clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

    # fuse <voice>_full_audiobook.wav & SILENT_VIDEO into the final .mp4:
    # video stream copied from input 0, audio stream taken from input 1
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-i", SILENT_VIDEO,
            "-i", str(full_wav),
            "-c:v", "copy",
            "-map", "0:v:0",
            "-map", "1:a:0",
            str(vox_dir / f'{vox_str}_full_audiobook.mp4'),  # OUT_FILE
        ],
        check=True)