Dionyssos committed · Commit a596c10 · 1 Parent(s): d3c797d
Modules/vits/data_utils.py DELETED
@@ -1,392 +0,0 @@
- import time
- import os
- import random
- import numpy as np
- import torch
- import torch.utils.data
-
- import commons
- from mel_processing import spectrogram_torch
- from utils import load_wav_to_torch, load_filepaths_and_text
- from text import text_to_sequence, cleaned_text_to_sequence
-
-
- class TextAudioLoader(torch.utils.data.Dataset):
-     """
-     1) loads audio, text pairs
-     2) normalizes text and converts them to sequences of integers
-     3) computes spectrograms from audio files.
-     """
-     def __init__(self, audiopaths_and_text, hparams):
-         self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
-         self.text_cleaners = hparams.text_cleaners
-         self.max_wav_value = hparams.max_wav_value
-         self.sampling_rate = hparams.sampling_rate
-         self.filter_length = hparams.filter_length
-         self.hop_length = hparams.hop_length
-         self.win_length = hparams.win_length
-
-         self.cleaned_text = getattr(hparams, "cleaned_text", False)
-
-         self.add_blank = hparams.add_blank
-         self.min_text_len = getattr(hparams, "min_text_len", 1)
-         self.max_text_len = getattr(hparams, "max_text_len", 190)
-
-         random.seed(1234)
-         random.shuffle(self.audiopaths_and_text)
-         self._filter()
-
-     def _filter(self):
-         """
-         Filter text & store spec lengths
-         """
-         # Store spectrogram lengths for Bucketing
-         # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
-         # spec_length = wav_length // hop_length
-
-         audiopaths_and_text_new = []
-         lengths = []
-         for audiopath, text in self.audiopaths_and_text:
-             if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
-                 audiopaths_and_text_new.append([audiopath, text])
-                 lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
-         self.audiopaths_and_text = audiopaths_and_text_new
-         self.lengths = lengths
-
-     def get_audio_text_pair(self, audiopath_and_text):
-         # separate filename and text
-         audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
-         text = self.get_text(text)
-         spec, wav = self.get_audio(audiopath)
-         return (text, spec, wav)
-
-     def get_audio(self, filename):
-         audio, sampling_rate = load_wav_to_torch(filename)
-         if sampling_rate != self.sampling_rate:
-             raise ValueError("{} SR doesn't match target {} SR".format(
-                 sampling_rate, self.sampling_rate))
-         audio_norm = audio / self.max_wav_value
-         audio_norm = audio_norm.unsqueeze(0)
-         spec_filename = filename.replace(".wav", ".spec.pt")
-         if os.path.exists(spec_filename):
-             spec = torch.load(spec_filename)
-         else:
-             spec = spectrogram_torch(audio_norm, self.filter_length,
-                 self.sampling_rate, self.hop_length, self.win_length,
-                 center=False)
-             spec = torch.squeeze(spec, 0)
-             torch.save(spec, spec_filename)
-         return spec, audio_norm
-
-     def get_text(self, text):
-         if self.cleaned_text:
-             text_norm = cleaned_text_to_sequence(text)
-         else:
-             text_norm = text_to_sequence(text, self.text_cleaners)
-         if self.add_blank:
-             text_norm = commons.intersperse(text_norm, 0)
-         text_norm = torch.LongTensor(text_norm)
-         return text_norm
-
-     def __getitem__(self, index):
-         return self.get_audio_text_pair(self.audiopaths_and_text[index])
-
-     def __len__(self):
-         return len(self.audiopaths_and_text)
-
-
- class TextAudioCollate():
-     """ Zero-pads model inputs and targets
-     """
-     def __init__(self, return_ids=False):
-         self.return_ids = return_ids
-
-     def __call__(self, batch):
-         """Collates training batch from normalized text and audio
-         PARAMS
-         ------
-         batch: [text_normalized, spec_normalized, wav_normalized]
-         """
-         # Right zero-pad all one-hot text sequences to max input length
-         _, ids_sorted_decreasing = torch.sort(
-             torch.LongTensor([x[1].size(1) for x in batch]),
-             dim=0, descending=True)
-
-         max_text_len = max([len(x[0]) for x in batch])
-         max_spec_len = max([x[1].size(1) for x in batch])
-         max_wav_len = max([x[2].size(1) for x in batch])
-
-         text_lengths = torch.LongTensor(len(batch))
-         spec_lengths = torch.LongTensor(len(batch))
-         wav_lengths = torch.LongTensor(len(batch))
-
-         text_padded = torch.LongTensor(len(batch), max_text_len)
-         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
-         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
-         text_padded.zero_()
-         spec_padded.zero_()
-         wav_padded.zero_()
-         for i in range(len(ids_sorted_decreasing)):
-             row = batch[ids_sorted_decreasing[i]]
-
-             text = row[0]
-             text_padded[i, :text.size(0)] = text
-             text_lengths[i] = text.size(0)
-
-             spec = row[1]
-             spec_padded[i, :, :spec.size(1)] = spec
-             spec_lengths[i] = spec.size(1)
-
-             wav = row[2]
-             wav_padded[i, :, :wav.size(1)] = wav
-             wav_lengths[i] = wav.size(1)
-
-         if self.return_ids:
-             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
-         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths
-
-
- """Multi speaker version"""
- class TextAudioSpeakerLoader(torch.utils.data.Dataset):
-     """
-     1) loads audio, speaker_id, text pairs
-     2) normalizes text and converts them to sequences of integers
-     3) computes spectrograms from audio files.
-     """
-     def __init__(self, audiopaths_sid_text, hparams):
-         self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
-         self.text_cleaners = hparams.text_cleaners
-         self.max_wav_value = hparams.max_wav_value
-         self.sampling_rate = hparams.sampling_rate
-         self.filter_length = hparams.filter_length
-         self.hop_length = hparams.hop_length
-         self.win_length = hparams.win_length
-
-         self.cleaned_text = getattr(hparams, "cleaned_text", False)
-
-         self.add_blank = hparams.add_blank
-         self.min_text_len = getattr(hparams, "min_text_len", 1)
-         self.max_text_len = getattr(hparams, "max_text_len", 190)
-
-         random.seed(1234)
-         random.shuffle(self.audiopaths_sid_text)
-         self._filter()
-
-     def _filter(self):
-         """
-         Filter text & store spec lengths
-         """
-         # Store spectrogram lengths for Bucketing
-         # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
-         # spec_length = wav_length // hop_length
-
-         audiopaths_sid_text_new = []
-         lengths = []
-         for audiopath, sid, text in self.audiopaths_sid_text:
-             if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
-                 audiopaths_sid_text_new.append([audiopath, sid, text])
-                 lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
-         self.audiopaths_sid_text = audiopaths_sid_text_new
-         self.lengths = lengths
-
-     def get_audio_text_speaker_pair(self, audiopath_sid_text):
-         # separate filename, speaker_id and text
-         audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
-         text = self.get_text(text)
-         spec, wav = self.get_audio(audiopath)
-         sid = self.get_sid(sid)
-         return (text, spec, wav, sid)
-
-     def get_audio(self, filename):
-         audio, sampling_rate = load_wav_to_torch(filename)
-         if sampling_rate != self.sampling_rate:
-             raise ValueError("{} SR doesn't match target {} SR".format(
-                 sampling_rate, self.sampling_rate))
-         audio_norm = audio / self.max_wav_value
-         audio_norm = audio_norm.unsqueeze(0)
-         spec_filename = filename.replace(".wav", ".spec.pt")
-         if os.path.exists(spec_filename):
-             spec = torch.load(spec_filename)
-         else:
-             spec = spectrogram_torch(audio_norm, self.filter_length,
-                 self.sampling_rate, self.hop_length, self.win_length,
-                 center=False)
-             spec = torch.squeeze(spec, 0)
-             torch.save(spec, spec_filename)
-         return spec, audio_norm
-
-     def get_text(self, text):
-         if self.cleaned_text:
-             text_norm = cleaned_text_to_sequence(text)
-         else:
-             text_norm = text_to_sequence(text, self.text_cleaners)
-         if self.add_blank:
-             text_norm = commons.intersperse(text_norm, 0)
-         text_norm = torch.LongTensor(text_norm)
-         return text_norm
-
-     def get_sid(self, sid):
-         sid = torch.LongTensor([int(sid)])
-         return sid
-
-     def __getitem__(self, index):
-         return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
-
-     def __len__(self):
-         return len(self.audiopaths_sid_text)
-
-
- class TextAudioSpeakerCollate():
-     """ Zero-pads model inputs and targets
-     """
-     def __init__(self, return_ids=False):
-         self.return_ids = return_ids
-
-     def __call__(self, batch):
-         """Collates training batch from normalized text, audio and speaker identities
-         PARAMS
-         ------
-         batch: [text_normalized, spec_normalized, wav_normalized, sid]
-         """
-         # Right zero-pad all one-hot text sequences to max input length
-         _, ids_sorted_decreasing = torch.sort(
-             torch.LongTensor([x[1].size(1) for x in batch]),
-             dim=0, descending=True)
-
-         max_text_len = max([len(x[0]) for x in batch])
-         max_spec_len = max([x[1].size(1) for x in batch])
-         max_wav_len = max([x[2].size(1) for x in batch])
-
-         text_lengths = torch.LongTensor(len(batch))
-         spec_lengths = torch.LongTensor(len(batch))
-         wav_lengths = torch.LongTensor(len(batch))
-         sid = torch.LongTensor(len(batch))
-
-         text_padded = torch.LongTensor(len(batch), max_text_len)
-         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
-         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
-         text_padded.zero_()
-         spec_padded.zero_()
-         wav_padded.zero_()
-         for i in range(len(ids_sorted_decreasing)):
-             row = batch[ids_sorted_decreasing[i]]
-
-             text = row[0]
-             text_padded[i, :text.size(0)] = text
-             text_lengths[i] = text.size(0)
-
-             spec = row[1]
-             spec_padded[i, :, :spec.size(1)] = spec
-             spec_lengths[i] = spec.size(1)
-
-             wav = row[2]
-             wav_padded[i, :, :wav.size(1)] = wav
-             wav_lengths[i] = wav.size(1)
-
-             sid[i] = row[3]
-
-         if self.return_ids:
-             return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
-         return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
-
-
- class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
-     """
-     Maintain similar input lengths in a batch.
-     Length groups are specified by boundaries.
-     Ex) boundaries = [b1, b2, b3] -> any batch is included in either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
-
-     It removes samples which are not included in the boundaries.
-     Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
-     """
-     def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
-         super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
-         self.lengths = dataset.lengths
-         self.batch_size = batch_size
-         self.boundaries = boundaries
-
-         self.buckets, self.num_samples_per_bucket = self._create_buckets()
-         self.total_size = sum(self.num_samples_per_bucket)
-         self.num_samples = self.total_size // self.num_replicas
-
-     def _create_buckets(self):
-         buckets = [[] for _ in range(len(self.boundaries) - 1)]
-         for i in range(len(self.lengths)):
-             length = self.lengths[i]
-             idx_bucket = self._bisect(length)
-             if idx_bucket != -1:
-                 buckets[idx_bucket].append(i)
-
-         for i in range(len(buckets) - 1, 0, -1):
-             if len(buckets[i]) == 0:
-                 buckets.pop(i)
-                 self.boundaries.pop(i+1)
-
-         num_samples_per_bucket = []
-         for i in range(len(buckets)):
-             len_bucket = len(buckets[i])
-             total_batch_size = self.num_replicas * self.batch_size
-             rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
-             num_samples_per_bucket.append(len_bucket + rem)
-         return buckets, num_samples_per_bucket
-
-     def __iter__(self):
-         # deterministically shuffle based on epoch
-         g = torch.Generator()
-         g.manual_seed(self.epoch)
-
-         indices = []
-         if self.shuffle:
-             for bucket in self.buckets:
-                 indices.append(torch.randperm(len(bucket), generator=g).tolist())
-         else:
-             for bucket in self.buckets:
-                 indices.append(list(range(len(bucket))))
-
-         batches = []
-         for i in range(len(self.buckets)):
-             bucket = self.buckets[i]
-             len_bucket = len(bucket)
-             ids_bucket = indices[i]
-             num_samples_bucket = self.num_samples_per_bucket[i]
-
-             # add extra samples to make it evenly divisible
-             rem = num_samples_bucket - len_bucket
-             ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
-
-             # subsample
-             ids_bucket = ids_bucket[self.rank::self.num_replicas]
-
-             # batching
-             for j in range(len(ids_bucket) // self.batch_size):
-                 batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
-                 batches.append(batch)
-
-         if self.shuffle:
-             batch_ids = torch.randperm(len(batches), generator=g).tolist()
-             batches = [batches[i] for i in batch_ids]
-         self.batches = batches
-
-         assert len(self.batches) * self.batch_size == self.num_samples
-         return iter(self.batches)
-
-     def _bisect(self, x, lo=0, hi=None):
-         if hi is None:
-             hi = len(self.boundaries) - 1
-
-         if hi > lo:
-             mid = (hi + lo) // 2
-             if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
-                 return mid
-             elif x <= self.boundaries[mid]:
-                 return self._bisect(x, lo, mid)
-             else:
-                 return self._bisect(x, mid + 1, hi)
-         else:
-             return -1
-
-     def __len__(self):
-         return self.num_samples // self.batch_size
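For context, a minimal sketch of how these classes are wired together for VITS-style training. The filelist path and `hparams` values below are assumptions (common 22.05 kHz LJSpeech-style settings), not values read from this repo's config, and `Modules/vits` is assumed to be on `sys.path`:

```python
# Hypothetical wiring of the pipeline above; assumes "filelists/train.txt"
# holds "wav_path|text" rows (an assumption, not a file in this repo).
from types import SimpleNamespace

import torch
from data_utils import TextAudioLoader, TextAudioCollate, DistributedBucketSampler

hps = SimpleNamespace(
    text_cleaners=["english_cleaners2"],
    max_wav_value=32768.0,
    sampling_rate=22050,
    filter_length=1024,
    hop_length=256,
    win_length=1024,
    add_blank=True,  # intersperse a blank token between symbols
)

dataset = TextAudioLoader("filelists/train.txt", hps)
sampler = DistributedBucketSampler(
    dataset, batch_size=16,
    boundaries=[32, 300, 400, 500, 600, 700, 800, 900, 1000],  # spec-frame buckets
    num_replicas=1, rank=0, shuffle=True)  # single process; DDP would pass world_size/rank
loader = torch.utils.data.DataLoader(
    dataset, num_workers=2, pin_memory=True,
    collate_fn=TextAudioCollate(), batch_sampler=sampler)

for text, text_len, spec, spec_len, wav, wav_len in loader:
    pass  # a training step consumes the padded, length-sorted tensors
```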
Modules/vits/mel_processing.py DELETED
@@ -1,112 +0,0 @@
- import math
- import os
- import random
- import torch
- from torch import nn
- import torch.nn.functional as F
- import torch.utils.data
- import numpy as np
- import librosa
- import librosa.util as librosa_util
- from librosa.util import normalize, pad_center, tiny
- from scipy.signal import get_window
- from scipy.io.wavfile import read
- from librosa.filters import mel as librosa_mel_fn
-
- MAX_WAV_VALUE = 32768.0
-
-
- def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
-     """
-     PARAMS
-     ------
-     C: compression factor
-     """
-     return torch.log(torch.clamp(x, min=clip_val) * C)
-
-
- def dynamic_range_decompression_torch(x, C=1):
-     """
-     PARAMS
-     ------
-     C: compression factor used to compress
-     """
-     return torch.exp(x) / C
-
-
- def spectral_normalize_torch(magnitudes):
-     output = dynamic_range_compression_torch(magnitudes)
-     return output
-
-
- def spectral_de_normalize_torch(magnitudes):
-     output = dynamic_range_decompression_torch(magnitudes)
-     return output
-
-
- mel_basis = {}
- hann_window = {}
-
-
- def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
-     if torch.min(y) < -1.:
-         print('min value is ', torch.min(y))
-     if torch.max(y) > 1.:
-         print('max value is ', torch.max(y))
-
-     global hann_window
-     dtype_device = str(y.dtype) + '_' + str(y.device)
-     wnsize_dtype_device = str(win_size) + '_' + dtype_device
-     if wnsize_dtype_device not in hann_window:
-         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
-
-     y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
-     y = y.squeeze(1)
-
-     spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                       center=center, pad_mode='reflect', normalized=False, onesided=True)
-
-     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-     return spec
-
-
- def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
-     global mel_basis
-     dtype_device = str(spec.dtype) + '_' + str(spec.device)
-     fmax_dtype_device = str(fmax) + '_' + dtype_device
-     if fmax_dtype_device not in mel_basis:
-         mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
-         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
-     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
-     spec = spectral_normalize_torch(spec)
-     return spec
-
-
- def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
-     if torch.min(y) < -1.:
-         print('min value is ', torch.min(y))
-     if torch.max(y) > 1.:
-         print('max value is ', torch.max(y))
-
-     global mel_basis, hann_window
-     dtype_device = str(y.dtype) + '_' + str(y.device)
-     fmax_dtype_device = str(fmax) + '_' + dtype_device
-     wnsize_dtype_device = str(win_size) + '_' + dtype_device
-     if fmax_dtype_device not in mel_basis:
-         mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
-         mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
-     if wnsize_dtype_device not in hann_window:
-         hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
-
-     y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
-     y = y.squeeze(1)
-
-     spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                       center=center, pad_mode='reflect', normalized=False, onesided=True)
-
-     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
-
-     spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
-     spec = spectral_normalize_torch(spec)
-
-     return spec
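A short sketch of the intended call pattern. The parameter values are assumed 22.05 kHz defaults rather than this repo's config; note the module as written relies on older APIs (`torch.stft` without `return_complex`, positional `librosa.filters.mel`), so it targets torch < 2.0 and librosa < 0.10:

```python
import torch
from mel_processing import spectrogram_torch, spec_to_mel_torch

y = torch.rand(1, 22050) * 2 - 1   # one second of toy audio in [-1, 1]
spec = spectrogram_torch(y, n_fft=1024, sampling_rate=22050,
                         hop_size=256, win_size=1024, center=False)
# spec: [1, n_fft // 2 + 1, frames] linear-magnitude spectrogram
mel = spec_to_mel_torch(spec, n_fft=1024, num_mels=80,
                        sampling_rate=22050, fmin=0.0, fmax=None)
# mel: [1, 80, frames], log-compressed via spectral_normalize_torch
```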
Modules/vits/text/LICENSE DELETED
@@ -1,19 +0,0 @@
- Copyright (c) 2017 Keith Ito
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
Modules/vits/text/__init__.py DELETED
@@ -1,54 +0,0 @@
- """ from https://github.com/keithito/tacotron """
- from text import cleaners
- from text.symbols import symbols
-
-
- # Mappings from symbol to numeric ID and vice versa:
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-
- def text_to_sequence(text, cleaner_names):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-         text: string to convert to a sequence
-         cleaner_names: names of the cleaner functions to run the text through
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     sequence = []
-
-     clean_text = _clean_text(text, cleaner_names)
-     for symbol in clean_text:
-         symbol_id = _symbol_to_id[symbol]
-         sequence += [symbol_id]
-     return sequence
-
-
- def cleaned_text_to_sequence(cleaned_text):
-     '''Converts a string of already-cleaned text to a sequence of symbol IDs.
-     Args:
-         cleaned_text: string to convert to a sequence
-     Returns:
-         List of integers corresponding to the symbols in the text
-     '''
-     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
-     return sequence
-
-
- def sequence_to_text(sequence):
-     '''Converts a sequence of IDs back to a string'''
-     result = ''
-     for symbol_id in sequence:
-         s = _id_to_symbol[symbol_id]
-         result += s
-     return result
-
-
- def _clean_text(text, cleaner_names):
-     for name in cleaner_names:
-         cleaner = getattr(cleaners, name)
-         if not cleaner:
-             raise Exception('Unknown cleaner: %s' % name)
-         text = cleaner(text)
-     return text
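A minimal round-trip sketch, assuming the package is importable as `text` and that phonemizer's espeak backend (used by `english_cleaners2`) is installed:

```python
from text import text_to_sequence, sequence_to_text

seq = text_to_sequence("Dr. Smith arrived.", ["english_cleaners2"])
print(seq)                    # a list of integer symbol IDs
print(sequence_to_text(seq))  # the phoneme string rebuilt from the IDs
```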
Modules/vits/text/cleaners.py DELETED
@@ -1,100 +0,0 @@
- """ from https://github.com/keithito/tacotron """
-
- '''
- Cleaners are transformations that run over the input text at both training and eval time.
-
- Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
- hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-     1. "english_cleaners" for English text
-     2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-        the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-     3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-        the symbols in symbols.py to match your data).
- '''
-
- import re
- from unidecode import unidecode
- from phonemizer import phonemize
-
-
- # Regular expression matching whitespace:
- _whitespace_re = re.compile(r'\s+')
-
- # List of (regular expression, replacement) pairs for abbreviations:
- _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
-     ('mrs', 'misess'),
-     ('mr', 'mister'),
-     ('dr', 'doctor'),
-     ('st', 'saint'),
-     ('co', 'company'),
-     ('jr', 'junior'),
-     ('maj', 'major'),
-     ('gen', 'general'),
-     ('drs', 'doctors'),
-     ('rev', 'reverend'),
-     ('lt', 'lieutenant'),
-     ('hon', 'honorable'),
-     ('sgt', 'sergeant'),
-     ('capt', 'captain'),
-     ('esq', 'esquire'),
-     ('ltd', 'limited'),
-     ('col', 'colonel'),
-     ('ft', 'fort'),
- ]]
-
-
- def expand_abbreviations(text):
-     for regex, replacement in _abbreviations:
-         text = re.sub(regex, replacement, text)
-     return text
-
-
- def expand_numbers(text):
-     # NOTE: normalize_numbers is neither defined nor imported in this module
-     # (it comes from keithito's numbers.py); this helper is unused here and
-     # would raise NameError if called.
-     return normalize_numbers(text)
-
-
- def lowercase(text):
-     return text.lower()
-
-
- def collapse_whitespace(text):
-     return re.sub(_whitespace_re, ' ', text)
-
-
- def convert_to_ascii(text):
-     return unidecode(text)
-
-
- def basic_cleaners(text):
-     '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
-     text = lowercase(text)
-     text = collapse_whitespace(text)
-     return text
-
-
- def transliteration_cleaners(text):
-     '''Pipeline for non-English text that transliterates to ASCII.'''
-     text = convert_to_ascii(text)
-     text = lowercase(text)
-     text = collapse_whitespace(text)
-     return text
-
-
- def english_cleaners(text):
-     '''Pipeline for English text, including abbreviation expansion.'''
-     text = convert_to_ascii(text)
-     text = lowercase(text)
-     text = expand_abbreviations(text)
-     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
-     phonemes = collapse_whitespace(phonemes)
-     return phonemes
-
-
- def english_cleaners2(text):
-     '''Pipeline for English text, including abbreviation expansion, plus punctuation and stress.'''
-     text = convert_to_ascii(text)
-     text = lowercase(text)
-     text = expand_abbreviations(text)
-     phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
-     phonemes = collapse_whitespace(phonemes)
-     return phonemes
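The pipelines differ mainly in whether they phonemize. A sketch (the `english_cleaners2` output shown is indicative only and depends on the installed espeak voice):

```python
from text.cleaners import basic_cleaners, english_cleaners2

print(basic_cleaners("DR. Smith,   HELLO!"))
# -> "dr. smith, hello!"  (lowercased, whitespace collapsed, no phonemization)

print(english_cleaners2("Dr. Smith, hello!"))
# -> IPA with stress and punctuation, roughly "dˈɑːktɚ smˈɪθ, həlˈoʊ!"
```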
Modules/vits/text/symbols.py DELETED
@@ -1,16 +0,0 @@
- """ from https://github.com/keithito/tacotron """
-
- '''
- Defines the set of symbols used in text input to the model.
- '''
- _pad = '_'
- _punctuation = ';:,.!?¡¿—…"«»“” '
- _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
- _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
-
-
- # Export all symbols:
- symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
-
- # Special symbol ids
- SPACE_ID = symbols.index(" ")
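The module only defines the symbol inventory; downstream code indexes into it, e.g.:

```python
from text.symbols import symbols, SPACE_ID

print(len(symbols))   # pad + punctuation + letters + IPA letters
print(symbols[0])     # '_'  (the padding symbol, ID 0)
print(SPACE_ID)       # ID of ' ' in the inventory
```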
README.md CHANGED
@@ -25,11 +25,9 @@ Expansion of [SHIFT TTS tool](https://github.com/audeering/shift) with [AudioGen
  - [Analysis of emotions of TTS](https://huggingface.co/dkounadis/artificial-styletts2/discussions/2)
  - [Listen also foreign languages](https://huggingface.co/dkounadis/artificial-styletts2/discussions/4)
 
- ## Available Voices
+ ## Available TTS Voices
 
- <a href="https://audeering.github.io/shift/">Listen to available voices!</a>
-
- <a href="https://github.com/audeering/shift/blob/main/Utils/all_langs.csv">Foreign languages</a>
+ <a href="https://audeering.github.io/shift/">Listen to available voices!</a> & <a href="https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv">Foreign languages</a>
 
  ## Flask