Dionyssos committed on
Commit
25b87f7
·
1 Parent(s): 5ebd2bd

fx audiobook (TODO cleanup .docx)

Browse files
Files changed (2) hide show
  1. README.md +8 -6
  2. audiobook.py +260 -0
README.md CHANGED
@@ -94,7 +94,7 @@ For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
94
 
95
 
96
 
97
- # Live Demo - Paplay
98
 
99
  Special Flask API for playing sounds live
100
 
@@ -108,17 +108,19 @@ Client - Describe any sound with words and it will be played back to you.
108
  python live_demo.py # will ask text input & play soundscape
109
  ```
110
 
111
- # Simple Demo
112
 
113
  ```python
114
  CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python demo.py
115
  ```
116
 
117
- # AudioBook
118
 
119
- Convert your `.docx` to audio `.wav`. Via multiple voices, then concatenate all `audiobooks.wav` made with each voice to a full one
120
- `concatenate audiobook has noisy speech, the individual single-voice audiobooks are clean, some issue with ffmpeg`. Therefore, for now, SHIFT repo only produces
121
- single-voice audiobook. Archiving the multiple-voice `audiobook.py` here.
 
 
122
 
123
  ```python
124
  # uses Flask api.py
 
94
 
95
 
96
 
97
+ # SoundScape Live (iterative) Demo - Paplay
98
 
99
  Special Flask API for playing sounds live
100
 
 
108
  python live_demo.py # will ask text input & play soundscape
109
  ```
110
 
111
+ # SoundScape (basic) Demo
112
 
113
  ```python
114
  CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python demo.py
115
  ```
116
 
117
+ ##
118
 
119
+ # Audiobook
120
+
121
+ Convert a `.docx` to audio `.wav` & video `.mp4` using multiple voices, then concatenate the `audiobook.wav` made with each voice into one full `.mp4`.
122
+ Known issue: the concatenated `.mp4` has noisy speech, whereas the individual single-voice `.mp4`s are noiseless (TODO: debug the ffmpeg args). Therefore, for now, the SHIFT repo only produces
123
+ single-voice audiobooks. The multiple-voice `audiobook.py` is archived here.
124
 
125
  ```python
126
  # uses Flask api.py
audiobook.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FOR EACH VOICE -> create .wav file per chapter & full audiobook.wav from assets/INCLUSION_IN_MUSEUMS_audiobook.docx
2
+ #
3
+ # Chapters
4
+ #
5
+ # ROOT_DIR/voice/voxstr_chapter_0.wav
6
+ # ..
7
+ # ROOT_DIR/voice/voxstr_chapter_10.wav
8
+ # ROOT_DIR/voice/voxstr_full_audiobook.wav (+ voxstr_full_audiobook.mp4)
9
+ #
10
+ # Full AudioBook
11
+ #
12
+ # ./audiobook_shift_youtube.mp4 (concatenation of every voice's full_audiobook.mp4)
13
+
14
+ import cv2
15
+ import subprocess
16
+ import numpy as np
17
+ import soundfile
18
+ import docx # pip install python-docx
19
+
20
+ from pathlib import Path
21
+ from moviepy.editor import *
22
+
23
FS = 24000  # sample rate (Hz) passed to every soundfile.write and used to size silences
ROOT_DIR = './tts_audiobooks/voices/'
voices = [
    # 'en_US/hifi-tts_low#9017' ,
    'en_US/m-ailabs_low#mary_ann',
    'en_US/cmu-arctic_low#jmk',
    # 'en_US/cmu-arctic_low#eey',
    'en_UK/apope_low'
]  # select any voice from - https://audeering.github.io/shift/

# Slightly changed from the original .docx to be audible
# (extra 'by them from this of etc.' were added).
DOCX_PATH = '../shift/assets/INCLUSION_IN_MUSEUMS_audiobook.docx'
MAX_PARAGRAPHS = 41            # only the first 41 paragraphs of the .docx are narrated
PAUSE_SECONDS = .1             # length of the silence inserted at section / chapter ends
CHAPTER_PREFIX = 'CHAPTER:'    # a paragraph starting with this opens a new chapter

TMP_TEXT = '_tmp.txt'          # paragraph text handed to tts.py
TMP_TTS_NAME = '_tmp'          # value passed to tts.py --out_file
TMP_TTS_WAV = 'out/_tmp.wav'   # presumably where tts.py stores --out_file _tmp; verify against tts.py
LOGO_PATH = 'assets/shift_banner.png'
STATIC_FRAME = '_tmp.png'      # still picture shown for a voice
SILENT_VIDEO = '_tmp.mp4'      # video made from STATIC_FRAME before the audio is muxed in
YOUTUBE_PARTS_LIST = '_youtube_video_parts.txt'
YOUTUBE_VIDEO_FILE = 'audiobook_shift_youtube.mp4'


def voice_dir_name(vox):
    """Return a filesystem-friendly name for voice id *vox*.

    '/' and '#' become '_', the '_low' quality tag is dropped, 'cmu-arctic'
    becomes 'cmu_arctic' and any remaining '-' is removed, e.g.
    'en_US/m-ailabs_low#mary_ann' -> 'en_US_mailabs_mary_ann'.
    """
    name = vox.replace('/', '_').replace('#', '_')
    name = name.replace('cmu-arctic', 'cmu_arctic')
    return name.replace('_low', '').replace('-', '')


def is_speakable(text):
    """Return True if paragraph *text* should be sent to TTS.

    Skipped: paragraphs of 2 characters or fewer, paragraphs that start with
    '{' or end with '}', and paragraphs that mention 'Figure'.
    """
    return (len(text) > 2
            and text[0] != '{'
            and text[-1] != '}'
            and 'Figure' not in text)


def silence(seconds=PAUSE_SECONDS):
    """Return *seconds* of float32 zeros at sample rate FS."""
    return np.zeros(int(seconds * FS), dtype=np.float32)


def synthesize_paragraph(text, vox):
    """Run tts.py on *text* with voice *vox* and return the audio samples.

    Raises subprocess.CalledProcessError when tts.py fails, so a stale
    TMP_TTS_WAV from a previous paragraph is never read by mistake.
    """
    with open(TMP_TEXT, 'w') as f:
        # WARNING! cast to lower, otherwise "accesibiliTy" is pronounced "accessibili..tay"
        f.write(text.lower())

    print(text, '\n_____________________________\n')

    subprocess.run(
        ['python', 'tts.py',
         '--text', TMP_TEXT,
         '--voice', vox,
         '--out_file', TMP_TTS_NAME],
        check=True)

    audio, fs = soundfile.read(TMP_TTS_WAV)
    if fs != FS:
        # Everything is written at FS; a different TTS rate would change speed/pitch.
        print(f'WARNING: tts.py produced {fs} Hz audio but FS={FS}')
    print('CHAPTER\n\n\n\n____', audio.shape, '____\n')
    return audio


def iter_chapters(paragraphs, vox, synthesize=None):
    """Yield one audio array per chapter for the paragraph texts *paragraphs*.

    A paragraph starting with CHAPTER_PREFIX closes the running chapter (a
    short silence is appended first) and is itself spoken as the first
    paragraph of the next chapter. Non-speakable paragraphs add a single
    silence; consecutive ones add nothing more. The chapter still open after
    the last paragraph is yielded as well, so the end of the book is kept.

    *synthesize* is a callable ``(text, vox) -> samples``; it defaults to
    synthesize_paragraph.
    """
    if synthesize is None:
        synthesize = synthesize_paragraph

    current = []
    last_paragraph_was_silence = False  # state is local, so it restarts for every voice

    for text in paragraphs:
        if text.startswith(CHAPTER_PREFIX):
            current.append(silence())  # silence for end of chapter
            yield np.concatenate(current)
            current = []

        if is_speakable(text):
            current.append(synthesize(text, vox))
            last_paragraph_was_silence = False
        elif not last_paragraph_was_silence:
            # empty paragraph (e.g. end of Section) - silence is added only once
            current.append(silence())
            last_paragraph_was_silence = True

    if current:
        # flush the chapter that no later CHAPTER heading closed
        current.append(silence())
        yield np.concatenate(current)


def render_voice_audio(paragraphs, vox, out_dir, vox_str):
    """Write every chapter .wav and the full audiobook .wav for one voice.

    Chapter files are written as soon as they are complete and are numbered
    from 0 for every voice. Returns ``(wav_path, duration_seconds)`` of the
    full audiobook. Raises ValueError when there is nothing to narrate.
    """
    total = []
    for chapter_counter, audio in enumerate(iter_chapters(paragraphs, vox)):
        soundfile.write(
            str(out_dir / f'{vox_str}_chapter_{chapter_counter}.wav'),
            audio,
            FS)
        total.append(audio)

    if not total:
        raise ValueError(f'no paragraphs to narrate for voice {vox}')

    full = np.concatenate(total)
    wav_path = str(out_dir / f'{vox_str}_full_audiobook.wav')
    soundfile.write(wav_path, full, FS)
    return wav_path, len(full) / FS


def write_voice_frame(vox, frame_path=STATIC_FRAME):
    """Draw the still frame (logo + 'TTS voice =' + voice id) and save it."""
    voice_pic = np.zeros((768, 1024, 3), dtype=np.uint8)

    shift_logo = cv2.imread(LOGO_PATH)
    if shift_logo is None:  # cv2.imread returns None instead of raising
        raise FileNotFoundError(f'cannot read logo image {LOGO_PATH}')
    # Copy at most the top-left 100x400 region; also works for a smaller logo.
    h = min(100, shift_logo.shape[0])
    w = min(400, shift_logo.shape[1])
    voice_pic[:h, :w, :] = shift_logo[:h, :w, :]

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2
    font_color = (69, 74, 74)
    thickness = 4
    line_type = 2
    # origin is the bottom-left corner of the text, given as (x, y)
    cv2.putText(voice_pic, 'TTS voice =', (0, 500),
                font, font_scale, font_color, thickness, line_type)
    cv2.putText(voice_pic, vox, (0, 640),
                font, font_scale, font_color, thickness, line_type)
    cv2.imwrite(frame_path, voice_pic)


def make_voice_video(vox, wav_path, duration_seconds, out_path):
    """Create *out_path* (.mp4): the still frame of *vox* with *wav_path* as audio.

    The still video lasts *duration_seconds* (the narration length) instead
    of a fixed 5 seconds, so the picture covers the whole audiobook.
    """
    write_voice_frame(vox, STATIC_FRAME)

    clip_silent = ImageClip(STATIC_FRAME).set_duration(duration_seconds)
    clip_silent.write_videofile(SILENT_VIDEO, fps=24)

    # fuse the full audiobook .wav & SILENT_VIDEO; the video stream is copied as is
    subprocess.run(
        ['ffmpeg',
         '-y',
         '-i', SILENT_VIDEO,
         '-i', wav_path,
         '-c:v', 'copy',
         '-map', '0:v:0',
         '-map', '1:a:0',
         out_path],
        check=True)


def concat_list_text(parts):
    """Return the ffmpeg concat-demuxer list for the files in *parts*.

    One ``file '<path>'`` line per entry; a single quote inside a path is
    escaped the ffmpeg way ('\\'') so unusual file names stay intact.
    """
    lines = ["file '" + str(p).replace("'", "'\\''") + "'" for p in parts]
    return '\n'.join(lines) + '\n'


def concatenate_videos(parts, out_file=YOUTUBE_VIDEO_FILE):
    """Concatenate the .mp4 files *parts* into *out_file* without re-encoding."""
    with open(YOUTUBE_PARTS_LIST, 'w') as f:
        f.write(concat_list_text(parts))

    # ffmpeg -f concat -i video_parts.txt -c copy output.mp4
    subprocess.run(
        ['ffmpeg',
         '-y',  # overwrite the output file if it exists
         '-f', 'concat',
         '-safe', '0',  # accept any file name in the list
         '-i', YOUTUBE_PARTS_LIST,
         '-c', 'copy',
         out_file],
        check=True)


def main():
    """For each voice build chapter .wavs, a full .wav and .mp4, then join all .mp4s."""
    Path(ROOT_DIR).mkdir(parents=True, exist_ok=True)

    d = docx.Document(DOCX_PATH)
    paragraphs = [para.text for para in d.paragraphs[:MAX_PARAGRAPHS]]

    youtube_video_parts = []  # audiobook .mp4 from each voice

    for vox in voices:
        print(vox)
        vox_str = voice_dir_name(vox)

        # dir for chapter_x.wav & full audiobook - for this voice
        out_dir = Path(ROOT_DIR) / vox_str
        out_dir.mkdir(parents=True, exist_ok=True)

        wav_path, duration_seconds = render_voice_audio(
            paragraphs, vox, out_dir, vox_str)

        mp4_path = str(out_dir / f'{vox_str}_full_audiobook.mp4')
        make_voice_video(vox, wav_path, duration_seconds, mp4_path)
        youtube_video_parts.append(mp4_path)

    # Final vid for YouTube
    concatenate_videos(youtube_video_parts, YOUTUBE_VIDEO_FILE)


if __name__ == '__main__':
    main()