Dionyssos committed on
Commit
966f861
·
1 Parent(s): 2502403

scripts for landscape2soundscape

Browse files
Files changed (4) hide show
  1. api.py +25 -20
  2. landscape2soundscape.py +272 -0
  3. msinference.py +1 -1
  4. tts.py +3 -2
api.py CHANGED
@@ -38,18 +38,18 @@ def _shift(x):
38
  # x = x * fade_in
39
  return x
40
 
41
- def _background(x, sound_background=None):
42
  if sound_background is not None:
43
- sound_background = sound_background[0, :]
44
  len_speech = len(x)
45
- if len_speech < len(sound_background):
46
  n_repeat = len_speech // len(sound_background) + 1
47
  replica = [sound_background] * n_repeat
48
  replica = [_shift(_) for _ in replica]
49
  sound_background = np.concatenate(replica)
50
 
51
 
52
- print(f'\nSOUND\nBACKGROUND\nSHAPE\n{sound_background=}\n{x.shape=}\n- - - -')
53
  x = .74 * x + .26 * sound_background[:len_speech]
54
  return x
55
 
@@ -90,7 +90,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
90
  embedding_scale=1))
91
  x = np.concatenate(x)
92
 
93
- return _background(x, sound_background)
94
 
95
  # Fallback - Mimic-3
96
  text_utils.store_ssml(text=text, voice=voice) # Text has to be list of single sentences
@@ -99,7 +99,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
99
  x, fs = soundfile.read('_tmp.wav')
100
  x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :] # reshapes (64,) -> (1,64)
101
 
102
- return _background(x, sound_background)
103
 
104
 
105
 
@@ -131,14 +131,14 @@ def serve_wav():
131
 
132
  print('Saved all files on Server Side\n\n')
133
 
134
- args = SimpleNamespace(text=None if r.get('text') is None else 'flask_cache/' + r.get('text')[0],
135
- video=None if r.get('video') is None else 'flask_cache/' + r.get('video')[0],
136
- image=None if r.get('image') is None else 'flask_cache/' + r.get('image')[0],
137
- voice=r.get('voice')[0],
138
- native=None if r.get('native') is None else 'flask_cache/' + r.get('native')[0],
139
- affective = r.get('affective')[0],
140
- scene=r.get('scene')[0]
141
- )
142
  # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
143
 
144
 
@@ -202,7 +202,7 @@ def serve_wav():
202
  '#', '_').replace(
203
  'cmu-arctic', 'cmu_arctic').replace(
204
  '_low', '') + '.wav')
205
- print('\n STYLE VECTOR \n', precomputed_style_vector)
206
  # ====SILENT VIDEO====
207
 
208
  if args.video is not None:
@@ -369,9 +369,9 @@ def serve_wav():
369
 
370
  # Fallback: No image nor video provided - do only tts
371
  x = tts_multi_sentence(text=text,
372
- precomputed_style_vector=precomputed_style_vector,
373
- voice=args.voice,
374
- scene=args.scene)
375
  OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
376
  soundfile.write(OUT_FILE, x, 24000)
377
 
@@ -388,8 +388,13 @@ def serve_wav():
388
  # response.headers["Content-Type"] = "audio/wav"
389
  # https://stackoverflow.com/questions/67591467/
390
  # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
391
- response = send_from_directory('flask_cache/', path=OUT_FILE.split('/')[-1])
392
- response.headers['suffix-file-type'] = OUT_FILE.split('/')[-1]
 
 
 
 
 
393
  return response
394
 
395
 
 
38
  # x = x * fade_in
39
  return x
40
 
41
+ def overlay(x, sound_background=None):
42
  if sound_background is not None:
43
+ sound_background = sound_background.detach().cpu().numpy()[0, :]
44
  len_speech = len(x)
45
+ if len_speech > len(sound_background):
46
  n_repeat = len_speech // len(sound_background) + 1
47
  replica = [sound_background] * n_repeat
48
  replica = [_shift(_) for _ in replica]
49
  sound_background = np.concatenate(replica)
50
 
51
 
52
+ print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
53
  x = .74 * x + .26 * sound_background[:len_speech]
54
  return x
55
 
 
90
  embedding_scale=1))
91
  x = np.concatenate(x)
92
 
93
+ return overlay(x, sound_background)
94
 
95
  # Fallback - Mimic-3
96
  text_utils.store_ssml(text=text, voice=voice) # Text has to be list of single sentences
 
99
  x, fs = soundfile.read('_tmp.wav')
100
  x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :] # reshapes (64,) -> (1,64)
101
 
102
+ return overlay(x, sound_background)
103
 
104
 
105
 
 
131
 
132
  print('Saved all files on Server Side\n\n')
133
 
134
+ args = SimpleNamespace(text=None if r.get('text') is None else 'flask_cache/' + r.get('text')[0].replace("/",""),
135
+ video=None if r.get('video') is None else 'flask_cache/' + r.get('video')[0].replace("/",""),
136
+ image=None if r.get('image') is None else 'flask_cache/' + r.get('image')[0].replace("/",""),
137
+ voice=r.get('voice')[0],
138
+ native=None if r.get('native') is None else 'flask_cache/' + r.get('native')[0].replace("/",""),
139
+ affective = r.get('affective')[0],
140
+ scene=r.get('scene')[0]
141
+ )
142
  # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
143
 
144
 
 
202
  '#', '_').replace(
203
  'cmu-arctic', 'cmu_arctic').replace(
204
  '_low', '') + '.wav')
205
+ print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
206
  # ====SILENT VIDEO====
207
 
208
  if args.video is not None:
 
369
 
370
  # Fallback: No image nor video provided - do only tts
371
  x = tts_multi_sentence(text=text,
372
+ precomputed_style_vector=precomputed_style_vector,
373
+ voice=args.voice,
374
+ scene=args.scene)
375
  OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
376
  soundfile.write(OUT_FILE, x, 24000)
377
 
 
388
  # response.headers["Content-Type"] = "audio/wav"
389
  # https://stackoverflow.com/questions/67591467/
390
  # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
391
+
392
+
393
+
394
+ # send server's output as default file -> srv_result.xx
395
+ print(f'\n=SERVER saved as {OUT_FILE=}\n')
396
+ response = send_from_directory('flask_cache/', path=OUT_FILE)
397
+ response.headers['suffix-file-type'] = OUT_FILE
398
  return response
399
 
400
 
landscape2soundscape.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import subprocess
3
+ import cv2
4
+
5
+ # with subprocess and an extra argument 'scene' and a 'resized image saved as png' we can call the server
6
+
7
+ # yt-dlp is installed in .d4
8
+ # Download Part of Video
9
+ # yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
10
+ # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
11
+
12
+ # https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
13
+
14
+ def _shift(x):
15
+ n = x.shape[0]
16
+ i = np.random.randint(.24 * n, .74 * n)
17
+ return np.roll(x, i)
18
+
19
+ #___________________________________________________________________________________________________
20
+ # VIDEO FROM IMAGE with CAPTIONS
21
+ #
22
+ # UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
23
+ # __________________________________________________________________________________________________
24
+
25
+ # TO DOWNLOAD SRT for YouTube
26
+ # yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
27
+
28
+ # _voice = 'en_US/vctk_low#p330'
29
+ # _voice = 'en_US/cmu-arctic_low#lnh' #en_US/vctk_low#p249' # 'en_US/vctk_low#p282'
30
+ # _voice = ''en_US/vctk_low#p351''
31
+ # _voice = 'en_US/vctk_low#p351' # avoid 318 it does the ghhhhhh
32
+ # _voice = 'en_US/m-ailabs_low#judy_bieber' # Nice voice for ('Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].mkv' 'Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].en-GB.srt'),
33
+ # _voice = 'en_UK/apope_low'
34
+ # _voice = 'en_US/m-ailabs_low#mary_ann'
35
+ # _voice = 'en_US/vctk_low#p351'
36
+ # _voice = 'en_US/hifi-tts_low#92'
37
+ # voice_str = f'_{_voice.replace("/", "")}'
38
+
39
+
40
+
41
+
42
+
43
+ # image/descriptions provided by other SHIFT tool or Human curator
44
+
45
+ # https://simaviro.sharepoint.com/sites/SHIFT/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=JNK8dQ&cid=363c253d%2D4d61%2D4db1%2D8ffd%2Ddedda749da2d&RootFolder=%2Fsites%2FSHIFT%2FShared%20Documents%2FGENERAL%2FWORK%20PACKAGES%2FWP1%2FContent%20Repository%2Fshift%5FSPK%5Fuse%5Fcases%5Fshare%2F02%5Fuc%5Fspk%5FLandscape2Soundscape%2FLandscape2Soundscape%5F12%5FMasterpieces&FolderCTID=0x01200058F5037C0101524B82F6F0788C02A563
46
+ # STATIC_FRAME = 'uc_spk_Landscape2Soundscape_Masterpieces_pics/01_Schick_AII840_001.jpg' #'assets/image_from_T31.jpg'
47
+
48
+
49
+
50
+
51
+ PIC_DIR = 'uc_spk_Landscape2Soundscape_Masterpieces_pics/'
52
+
53
+ DESCRIPTIONS = [
54
+ # 1
55
+ [
56
+ '01_Schick_AII840_001.jpg', # image
57
+ '01_Schick_AII840_001.txt', # text
58
+ 'Statue in shire hill on autumn beach.', # audiocraft
59
+ 'Gottlieb Chick - Bildnis der Heinrike Dannecker - 1802', # cv2 puttext title
60
+ 'en_US/m-ailabs_low#mary_ann',
61
+ ],
62
+ # 2
63
+ [
64
+ '02_Constable_AI555_001.jpg',
65
+ '02_Constable_AI555_001.txt',
66
+ 'Meadows country farm village in sight',
67
+ 'John Constable - Dorf an dem Flusse Stour - 1804',
68
+ 'en_US/m-ailabs_low#mary_ann',
69
+ ],
70
+ # 3
71
+ [
72
+ '03_Schinkel_WS200-002.jpg',
73
+ '03_Schinkel_WS200-002.txt',
74
+ 'Arriving at the shore on horses',
75
+ 'Karl Friedrich Schinkel - Gotische Kirche auf einem Felsen am Meer - 1815',
76
+ 'en_US/m-ailabs_low#mary_ann',
77
+ ],
78
+ #
79
+ [
80
+ '04_Friedrich_FV317_001.jpg',
81
+ '04_Friedrich_FV317_001.txt',
82
+ 'Land steppes',
83
+ 'Friedrich Caspar David - Der Watzmann - 1824/1825',
84
+ 'en_US/m-ailabs_low#mary_ann',
85
+ ],
86
+ #
87
+ [
88
+ '05_Blechen_FV40_001.jpg',
89
+ '05_Blechen_FV40_001.txt',
90
+ 'fjords',
91
+ 'Blechen - Carl Unwetter in der römischen Campagna - 1829',
92
+ 'en_US/m-ailabs_low#mary_ann',
93
+ ],
94
+ # 6
95
+ [
96
+ '06_Menzel_AI900_001.jpg',
97
+ '06_Menzel_AI900_001.txt',
98
+ 'Olive trees in Seville',
99
+ 'Adolph Menzel - Bauplatz mit Weiden - 1846',
100
+ 'en_US/m-ailabs_low#mary_ann',
101
+ ],
102
+ # 7
103
+ [
104
+ '07_Courbet_AI967_001.jpg',
105
+ '07_Courbet_AI967_001.txt',
106
+ 'Storm at the strand of waves Tsunami',
107
+ 'Gustave Courbet - Die Welle - 1869/1870',
108
+ 'en_US/m-ailabs_low#mary_ann',
109
+ ],
110
+ # 8
111
+ [
112
+ '08_Monet_AI1013_001.jpg',
113
+ '08_Monet_AI1013_001.txt',
114
+ 'Mai flowers blossom picnic',
115
+ 'Claude Monet - Sommertag - 1874',
116
+ 'en_US/m-ailabs_low#mary_ann',
117
+ ],
118
+ # 9
119
+ [
120
+ '09_Blechen_AII823_001.jpg',
121
+ '09_Blechen_AII823_001.txt',
122
+ 'Cascade in Africa',
123
+ 'Carl Blechen - Wasserfälle bei Tivoli - 1832',
124
+ 'en_US/m-ailabs_low#mary_ann',
125
+ ],
126
+ # 10
127
+ [
128
+ '10_Boecklin_967648_NG2-80_001_rsz.jpg',
129
+ '10_Boecklin_967648_NG2-80_001.txt',
130
+ 'Hades ades at it sisland',
131
+ 'Arnold Böcklin - Toteninsel - 1883',
132
+ 'en_US/m-ailabs_low#mary_ann',
133
+ ],
134
+ # 11
135
+ [
136
+ '11_Liebermann_NG4-94_001.jpg',
137
+ '11_Liebermann_NG4-94_001.txt',
138
+ 'Tavern at the waterfront',
139
+ 'Max Tiebermann - Gartenlokal an der Havel. Nikolskoe - 1916',
140
+ 'en_US/m-ailabs_low#mary_ann',
141
+ ],
142
+ # 12
143
+ [
144
+ '12_Slevogt_AII1022_001.jpg',
145
+ '12_Slevogt_AII1022_001.txt',
146
+ 'toy sailing yachts pool',
147
+ 'Max Slevogt - Segelboote auf der Alster am Abend -1905',
148
+ 'en_US/m-ailabs_low#mary_ann',
149
+ ],
150
+ ]
151
+
152
+
153
+ SILENT_VIDEO = '_silent_video.mp4'
154
+
155
+
156
+ # SILENT CLIP
157
+
158
+
159
+ for _img_, _text_, soundscape_text, _title_, _voice_ in DESCRIPTIONS[:1]:
160
+
161
+ # cv2put txt
162
+ im = cv2.imread(PIC_DIR + _img_) # IMG must have EVEN shape
163
+ h, w, _ = im.shape
164
+ im = im[(h%2):, (w%2):, :] # assure even image
165
+ print(im.shape, "GLOBAL IM\n\n\n\n")
166
+ fram = np.zeros((94, im.shape[1], 3), dtype=np.uint8)
167
+ h, w, _ = fram.shape
168
+ font = cv2.FONT_HERSHEY_SIMPLEX
169
+ bottomLeftCornerOfText = (240, 74) # w,h
170
+ fontScale = 2
171
+ fontColor = (255, 255, 255)
172
+ thickness = 4
173
+ lineType = 2
174
+ cv2.putText(fram, _title_, #'LandScape 2 SoundScape',
175
+ bottomLeftCornerOfText,
176
+ font,
177
+ fontScale,
178
+ fontColor,
179
+ thickness,
180
+ lineType)
181
+ offset_h = 24
182
+ im[offset_h:h+offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :] + .6 * fram).astype(np.uint8)
183
+ # cv2.imshow('i', im); cv2.waitKey(); cv2.destroyAllWindows()
184
+
185
+ # logo aud
186
+
187
+ logo = cv2.imread('assets/audeering_logo.jpg')[:740, :, :]
188
+ logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
189
+ h, w, _ = logo.shape
190
+ offset_h = im.shape[0] - h
191
+ im[offset_h:h+offset_h, :w, :] = (.23 * im[offset_h:h+offset_h, :w, :] + .77 * logo).astype(np.uint8)
192
+
193
+ # logo SMB
194
+
195
+ logo = cv2.imread('assets/SMB_logo.png')#[:740, :, :]
196
+ logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
197
+ h, w, _ = logo.shape
198
+ offset_h = im.shape[0] - h
199
+ # fill logo SMB with the pixels of im - where SMB is empty
200
+ ptc = im[offset_h:h+offset_h, :w, :]
201
+ logo[logo == 0] = ptc[logo == 0] # fill empty
202
+ im[offset_h:h+offset_h, :w, :] = (.13 * im[offset_h:h+offset_h, :w, :] + .86 * logo).astype(np.uint8)
203
+
204
+ # # logo shift
205
+
206
+ # logo = cv2.imread('assets/shift_logo.png')#[:740, :, :]
207
+ # logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
208
+ # h, w, _ = logo.shape
209
+ # offset_h = im.shape[0] - h #-274
210
+ # offset_w = im.shape[1] - w #400
211
+ # # # fill logo SMB with the pixels of im - where SMB is empty
212
+ # ptc = im[offset_h:h+offset_h, :w, :]
213
+ # # msk = np.tile(logo[:, :,0:1] > 252, [1,1,3])
214
+ # # logo[msk] = ptc[msk] # fill empty
215
+ # im[offset_h:h+offset_h, offset_w:w+offset_w, :] = (.0 * im[offset_h:h+offset_h, offset_w:w+offset_w, :] + 1 * logo).astype(np.uint8)
216
+
217
+ # silent video - img
218
+ # im = cv2.resize(im, (700, 700))
219
+ cv2.imwrite('pic_logo_emb.png', im)
220
+
221
+
222
+
223
+
224
+ # raw, _ = soundfile.read(soundscape_file) # 12345, 2
225
+
226
+ # # fill
227
+ # soundscape = []
228
+ # for _replica in range(math.ceil(len(total) / raw.shape[0])+1):
229
+ # soundscape.append(raw) # _shift non defined for stereo
230
+ # soundscape = np.concatenate(soundscape, 0)
231
+
232
+ # total = .36 * np.concatenate([total[:, None],
233
+ # total[:, None]], 1) + .64 * soundscape[:len(total), :]
234
+
235
+ # outfile
236
+
237
+ OUT_FILE = _img_.split('/')[-1].replace('.','__') + '.mp4' # assets / -1
238
+ print(f'{OUT_FILE=}\n')
239
+ # call API passing img
240
+
241
+ subprocess.run(
242
+ [
243
+ "python",
244
+ "tts.py",
245
+ "--text", PIC_DIR + _text_,
246
+ '--image', 'pic_logo_emb.png',
247
+ # "--title", _title_,
248
+ # '--soundscape_text', soundscape_text,
249
+ '--voice', _voice_,
250
+ '--out_file', OUT_FILE,
251
+ ])
252
+
253
+ # soundfile.write(AUDIO_TRACK, total, 22050)
254
+ # subprocess.call(
255
+ # ["ffmpeg",
256
+ # "-y",
257
+ # "-i",
258
+ # SILENT_VIDEO,
259
+ # "-i",
260
+ # AUDIO_TRACK,
261
+ # #"-c:v",
262
+ # #"copy",
263
+ # "-map",
264
+ # "0:v:0",
265
+ # "-map",
266
+ # " 1:a:0",
267
+ # "-vf",
268
+ # "pad",
269
+ # OUT_FILE])
270
+
271
+
272
+
msinference.py CHANGED
@@ -183,7 +183,7 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding
183
  # print(f'TEXTCLEAN: {ps=}\n\n') #TEXTCLEAN: ps='ɐbˈɛbæbləm'
184
  tokens.insert(0, 0)
185
  tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
186
- print(f'TOKENSFINAL: {ps=}\n\n')
187
 
188
  with torch.no_grad():
189
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
 
183
  # print(f'TEXTCLEAN: {ps=}\n\n') #TEXTCLEAN: ps='ɐbˈɛbæbləm'
184
  tokens.insert(0, 0)
185
  tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
186
+ # print(f'TOKENSFINAL: {ps=}\n\n')
187
 
188
  with torch.no_grad():
189
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
tts.py CHANGED
@@ -65,7 +65,7 @@ def command_line_args():
65
  '--out_file',
66
  help="Output file name.",
67
  type=str,
68
- default='out'
69
  )
70
  parser.add_argument(
71
  '--scene',
@@ -86,7 +86,7 @@ def send_to_server(args):
86
  'image': args.image,
87
  'video': args.video,
88
  'scene': args.scene,
89
- 'out_file': args.out_file
90
  }
91
 
92
  # In data= we can write args
@@ -147,6 +147,7 @@ def cli():
147
  response = send_to_server(args)
148
 
149
  with open(
 
150
  args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1],
151
  'wb'
152
  ) as f:
 
65
  '--out_file',
66
  help="Output file name.",
67
  type=str,
68
+ default='b6'
69
  )
70
  parser.add_argument(
71
  '--scene',
 
86
  'image': args.image,
87
  'video': args.video,
88
  'scene': args.scene,
89
+ # 'out_file': args.out_file # let server save as temp
90
  }
91
 
92
  # In data= we can write args
 
147
  response = send_to_server(args)
148
 
149
  with open(
150
+ # args.out_file is not sent to the server - server writes a tmp file - copied by client
151
  args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1],
152
  'wb'
153
  ) as f: