Pratye committed on
Commit 3a554b6 · 1 Parent(s): 7fc8252

Files Added

Retrieval_based_Voice_Conversion_WebUI.ipynb ADDED
@@ -0,0 +1,403 @@
+ {
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) Training notebook"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZFFCx5J80SGa"
+ },
+ "source": [
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GmFP6bN9dvOq"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Check the GPU\n",
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jwu07JgqoFON"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Mount Google Drive\n",
+ "\n",
+ "from google.colab import drive\n",
+ "\n",
+ "drive.mount(\"/content/drive\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wjddIFr1oS3W"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Install dependencies\n",
+ "!apt-get -y install build-essential python3-dev ffmpeg\n",
+ "!pip3 install --upgrade setuptools wheel\n",
+ "!pip3 install --upgrade pip\n",
+ "!pip3 install faiss-cpu==1.7.2 fairseq gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ge_97mfpgqTm"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Clone the repository\n",
+ "\n",
+ "!git clone --depth=1 -b stable https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "!mkdir -p pretrained uvr5_weights"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BLDEZADkvlw1"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Update the repository (usually not needed)\n",
+ "!git pull"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "pqE0PrnuRqI2"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Install aria2\n",
+ "!apt -y install -qq aria2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UG3XpUwEomUz"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download the pretrained base models\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G48k.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "HugjmZqZRuiF"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download the vocal-separation models\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2RCaT9FTR0ej"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download hubert_base\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content/Retrieval-based-Voice-Conversion-WebUI -o hubert_base.pt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# @title Download the rmvpe model\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d /content/Retrieval-based-Voice-Conversion-WebUI -o rmvpe.pt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Mwk7Q0Loqzjx"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Load the packaged dataset from Google Drive into /content/dataset\n",
+ "\n",
+ "# @markdown Dataset location\n",
+ "DATASET = (\n",
+ "    \"/content/drive/MyDrive/dataset/lulu20230327_32k.zip\" # @param {type:\"string\"}\n",
+ ")\n",
+ "\n",
+ "!mkdir -p /content/dataset\n",
+ "!unzip -d /content/dataset -B {DATASET}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PDlFxWHWEynD"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Rename duplicate files in the dataset\n",
+ "!ls -a /content/dataset/\n",
+ "!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7vh6vphDwO0b"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Launch the web UI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "# %load_ext tensorboard\n",
+ "# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
+ "!python3 infer-web.py --colab --pycmd python3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FgJuNeAwx5Y_"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manually back up trained model files to Google Drive\n",
+ "# @markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below accordingly\n",
+ "\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Model epoch\n",
+ "MODELEPOCH = 9600 # @param {type:\"integer\"}\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OVQoLQJXS7WX"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Restore pth files from Google Drive\n",
+ "# @markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below accordingly\n",
+ "\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Model epoch\n",
+ "MODELEPOCH = 7500 # @param {type:\"integer\"}\n",
+ "\n",
+ "!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/*.index /content/\n",
+ "!cp /content/drive/MyDrive/*.npy /content/\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZKAyuKb9J6dz"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manual preprocessing (not recommended)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Sample rate\n",
+ "BITRATE = 48000 # @param {type:\"integer\"}\n",
+ "# @markdown Number of processes\n",
+ "THREADCOUNT = 8 # @param {type:\"integer\"}\n",
+ "\n",
+ "!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CrxJqzAUKmPJ"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manual feature extraction (not recommended)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Number of processes\n",
+ "THREADCOUNT = 8 # @param {type:\"integer\"}\n",
+ "# @markdown Pitch extraction algorithm\n",
+ "ALGO = \"harvest\" # @param {type:\"string\"}\n",
+ "\n",
+ "!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}\n",
+ "\n",
+ "!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IMLPLKOaKj58"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manual training (not recommended)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown GPU(s) to use\n",
+ "USEGPU = \"0\" # @param {type:\"string\"}\n",
+ "# @markdown Batch size\n",
+ "BATCHSIZE = 32 # @param {type:\"integer\"}\n",
+ "# @markdown Epoch to stop at\n",
+ "MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
+ "# @markdown Epoch save interval\n",
+ "EPOCHSAVE = 100 # @param {type:\"integer\"}\n",
+ "# @markdown Sample rate\n",
+ "MODELSAMPLE = \"48k\" # @param {type:\"string\"}\n",
+ "# @markdown Whether to cache the training set\n",
+ "CACHEDATA = 1 # @param {type:\"integer\"}\n",
+ "# @markdown Whether to keep only the latest ckpt file\n",
+ "ONLYLATEST = 0 # @param {type:\"integer\"}\n",
+ "\n",
+ "!python3 train_nsf_sim_cache_sid_load_pretrain.py -e {MODELNAME} -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "haYA81hySuDl"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Delete all other pth files and keep only the selected one (click with care; read the code first)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 # @param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QhSiPTVPoIRh"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Remove all files under the project and keep only the selected model (click with care; read the code first)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 # @param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "private_outputs": true,
+ "provenance": []
+ },
+ "gpuClass": "standard",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
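
Note: the dataset cell above unzips the archive straight into /content/dataset (`unzip -B` keeps colliding files as `name.ext~N` backups, which the rename cell then normalizes), so the audio files should sit at the top level of the zip. A minimal sketch of packaging such an archive; the local wav directory is a hypothetical example:

    # zip the wav files without a parent folder so they extract directly into /content/dataset
    cd /path/to/your/wavs
    zip lulu20230327_32k.zip *.wav
    # then upload the zip to Google Drive under MyDrive/dataset/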
Retrieval_based_Voice_Conversion_WebUI_v2.ipynb ADDED
@@ -0,0 +1,422 @@
+ {
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# [Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) Training notebook"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ZFFCx5J80SGa"
+ },
+ "source": [
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/Retrieval_based_Voice_Conversion_WebUI_v2.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GmFP6bN9dvOq"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Check the GPU\n",
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jwu07JgqoFON"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Mount Google Drive\n",
+ "\n",
+ "from google.colab import drive\n",
+ "\n",
+ "drive.mount(\"/content/drive\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wjddIFr1oS3W"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Install dependencies\n",
+ "!apt-get -y install build-essential python3-dev ffmpeg\n",
+ "!pip3 install --upgrade setuptools wheel\n",
+ "!pip3 install --upgrade pip\n",
+ "!pip3 install faiss-cpu==1.7.2 fairseq gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ge_97mfpgqTm"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Clone the repository\n",
+ "\n",
+ "!mkdir -p Retrieval-based-Voice-Conversion-WebUI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "!git init\n",
+ "!git remote add origin https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI.git\n",
+ "!git fetch origin cfd984812804ddc9247d65b14c82cd32e56c1133 --depth=1\n",
+ "!git reset --hard FETCH_HEAD"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BLDEZADkvlw1"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Update the repository (usually not needed)\n",
+ "!git pull"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "pqE0PrnuRqI2"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Install aria2\n",
+ "!apt -y install -qq aria2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UG3XpUwEomUz"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download the pretrained base models\n",
+ "\n",
+ "# v1\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o G48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0D48k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G40k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained -o f0G48k.pth\n",
+ "\n",
+ "# v2\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o D40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o D48k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o G40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o G48k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0D32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0D40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0D48k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G32k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0G32k.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0G40k.pth\n",
+ "# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G48k.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/pretrained_v2 -o f0G48k.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "HugjmZqZRuiF"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download the vocal-separation models\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d /content/Retrieval-based-Voice-Conversion-WebUI/uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "2RCaT9FTR0ej"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Download hubert_base\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /content/Retrieval-based-Voice-Conversion-WebUI -o hubert_base.pt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# @title Download the rmvpe model\n",
+ "!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d /content/Retrieval-based-Voice-Conversion-WebUI -o rmvpe.pt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Mwk7Q0Loqzjx"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Load the packaged dataset from Google Drive into /content/dataset\n",
+ "\n",
+ "# @markdown Dataset location\n",
+ "DATASET = (\n",
+ "    \"/content/drive/MyDrive/dataset/lulu20230327_32k.zip\" # @param {type:\"string\"}\n",
+ ")\n",
+ "\n",
+ "!mkdir -p /content/dataset\n",
+ "!unzip -d /content/dataset -B {DATASET}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PDlFxWHWEynD"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Rename duplicate files in the dataset\n",
+ "!ls -a /content/dataset/\n",
+ "!rename 's/(\\w+)\\.(\\w+)~(\\d*)/$1_$3.$2/' /content/dataset/*.*~*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7vh6vphDwO0b"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Launch the web UI\n",
+ "%cd /content/Retrieval-based-Voice-Conversion-WebUI\n",
+ "# %load_ext tensorboard\n",
+ "# %tensorboard --logdir /content/Retrieval-based-Voice-Conversion-WebUI/logs\n",
+ "!python3 infer-web.py --colab --pycmd python3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "FgJuNeAwx5Y_"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manually back up trained model files to Google Drive\n",
+ "# @markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below accordingly\n",
+ "\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Model epoch\n",
+ "MODELEPOCH = 9600 # @param {type:\"integer\"}\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/added_*.index /content/drive/MyDrive/\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/total_*.npy /content/drive/MyDrive/\n",
+ "\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "OVQoLQJXS7WX"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Restore pth files from Google Drive\n",
+ "# @markdown Check the model filenames under the logs folder yourself and edit the filenames at the end of the commands below accordingly\n",
+ "\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Model epoch\n",
+ "MODELEPOCH = 7500 # @param {type:\"integer\"}\n",
+ "\n",
+ "!mkdir -p /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "!cp /content/drive/MyDrive/*.index /content/\n",
+ "!cp /content/drive/MyDrive/*.npy /content/\n",
+ "!cp /content/drive/MyDrive/{MODELNAME}{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/weights/{MODELNAME}.pth"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZKAyuKb9J6dz"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manual preprocessing (not recommended)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Sample rate\n",
+ "BITRATE = 48000 # @param {type:\"integer\"}\n",
+ "# @markdown Number of processes\n",
+ "THREADCOUNT = 8 # @param {type:\"integer\"}\n",
+ "\n",
+ "!python3 trainset_preprocess_pipeline_print.py /content/dataset {BITRATE} {THREADCOUNT} logs/{MODELNAME} True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "CrxJqzAUKmPJ"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manual feature extraction (not recommended)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Number of processes\n",
+ "THREADCOUNT = 8 # @param {type:\"integer\"}\n",
+ "# @markdown Pitch extraction algorithm\n",
+ "ALGO = \"harvest\" # @param {type:\"string\"}\n",
+ "\n",
+ "!python3 extract_f0_print.py logs/{MODELNAME} {THREADCOUNT} {ALGO}\n",
+ "\n",
+ "!python3 extract_feature_print.py cpu 1 0 0 logs/{MODELNAME}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IMLPLKOaKj58"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Manual training (not recommended)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown GPU(s) to use\n",
+ "USEGPU = \"0\" # @param {type:\"string\"}\n",
+ "# @markdown Batch size\n",
+ "BATCHSIZE = 32 # @param {type:\"integer\"}\n",
+ "# @markdown Epoch to stop at\n",
+ "MODELEPOCH = 3200 # @param {type:\"integer\"}\n",
+ "# @markdown Epoch save interval\n",
+ "EPOCHSAVE = 100 # @param {type:\"integer\"}\n",
+ "# @markdown Sample rate\n",
+ "MODELSAMPLE = \"48k\" # @param {type:\"string\"}\n",
+ "# @markdown Whether to cache the training set\n",
+ "CACHEDATA = 1 # @param {type:\"integer\"}\n",
+ "# @markdown Whether to keep only the latest ckpt file\n",
+ "ONLYLATEST = 0 # @param {type:\"integer\"}\n",
+ "\n",
+ "!python3 train_nsf_sim_cache_sid_load_pretrain.py -e {MODELNAME} -sr {MODELSAMPLE} -f0 1 -bs {BATCHSIZE} -g {USEGPU} -te {MODELEPOCH} -se {EPOCHSAVE} -pg pretrained/f0G{MODELSAMPLE}.pth -pd pretrained/f0D{MODELSAMPLE}.pth -l {ONLYLATEST} -c {CACHEDATA}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "haYA81hySuDl"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Delete all other pth files and keep only the selected one (click with care; read the code first)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 # @param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*.pth\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QhSiPTVPoIRh"
+ },
+ "outputs": [],
+ "source": [
+ "# @title Remove all files under the project and keep only the selected model (click with care; read the code first)\n",
+ "# @markdown Model name\n",
+ "MODELNAME = \"lulu\" # @param {type:\"string\"}\n",
+ "# @markdown Selected model epoch\n",
+ "MODELEPOCH = 9600 # @param {type:\"integer\"}\n",
+ "\n",
+ "!echo \"Backing up the selected model...\"\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth /content/{MODELNAME}_G_{MODELEPOCH}.pth\n",
+ "!cp /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth /content/{MODELNAME}_D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deleting...\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}\n",
+ "!rm -rf /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/*\n",
+ "\n",
+ "!echo \"Restoring the selected model...\"\n",
+ "!mv /content/{MODELNAME}_G_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/G_{MODELEPOCH}.pth\n",
+ "!mv /content/{MODELNAME}_D_{MODELEPOCH}.pth /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}/D_{MODELEPOCH}.pth\n",
+ "\n",
+ "!echo \"Deletion finished\"\n",
+ "!ls /content/Retrieval-based-Voice-Conversion-WebUI/logs/{MODELNAME}"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "private_outputs": true,
+ "provenance": []
+ },
+ "gpuClass": "standard",
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+ }
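
Note: unlike the v1 notebook, which clones the stable branch of the fumiama fork, the v2 notebook pins the upstream repository to a fixed commit via git fetch + git reset. A sketch of the equivalent checkout outside Colab:

    git clone https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
    cd Retrieval-based-Voice-Conversion-WebUI
    git checkout cfd984812804ddc9247d65b14c82cd32e56c1133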
environment_dml.yaml ADDED
@@ -0,0 +1,186 @@
+ name: pydml
+ channels:
+ - pytorch
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
+ - defaults
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/fastai/
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/
+ - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/
+ dependencies:
+ - abseil-cpp=20211102.0=hd77b12b_0
+ - absl-py=1.3.0=py310haa95532_0
+ - aiohttp=3.8.3=py310h2bbff1b_0
+ - aiosignal=1.2.0=pyhd3eb1b0_0
+ - async-timeout=4.0.2=py310haa95532_0
+ - attrs=22.1.0=py310haa95532_0
+ - blas=1.0=mkl
+ - blinker=1.4=py310haa95532_0
+ - bottleneck=1.3.5=py310h9128911_0
+ - brotli=1.0.9=h2bbff1b_7
+ - brotli-bin=1.0.9=h2bbff1b_7
+ - brotlipy=0.7.0=py310h2bbff1b_1002
+ - bzip2=1.0.8=he774522_0
+ - c-ares=1.19.0=h2bbff1b_0
+ - ca-certificates=2023.05.30=haa95532_0
+ - cachetools=4.2.2=pyhd3eb1b0_0
+ - certifi=2023.5.7=py310haa95532_0
+ - cffi=1.15.1=py310h2bbff1b_3
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
+ - click=8.0.4=py310haa95532_0
+ - colorama=0.4.6=py310haa95532_0
+ - contourpy=1.0.5=py310h59b6b97_0
+ - cryptography=39.0.1=py310h21b164f_0
+ - cycler=0.11.0=pyhd3eb1b0_0
+ - fonttools=4.25.0=pyhd3eb1b0_0
+ - freetype=2.12.1=ha860e81_0
+ - frozenlist=1.3.3=py310h2bbff1b_0
+ - giflib=5.2.1=h8cc25b3_3
+ - glib=2.69.1=h5dc1a3c_2
+ - google-auth=2.6.0=pyhd3eb1b0_0
+ - google-auth-oauthlib=0.4.4=pyhd3eb1b0_0
+ - grpc-cpp=1.48.2=hf108199_0
+ - grpcio=1.48.2=py310hf108199_0
+ - gst-plugins-base=1.18.5=h9e645db_0
+ - gstreamer=1.18.5=hd78058f_0
+ - icu=58.2=ha925a31_3
+ - idna=3.4=py310haa95532_0
+ - intel-openmp=2023.1.0=h59b6b97_46319
+ - jpeg=9e=h2bbff1b_1
+ - kiwisolver=1.4.4=py310hd77b12b_0
+ - krb5=1.19.4=h5b6d351_0
+ - lerc=3.0=hd77b12b_0
+ - libbrotlicommon=1.0.9=h2bbff1b_7
+ - libbrotlidec=1.0.9=h2bbff1b_7
+ - libbrotlienc=1.0.9=h2bbff1b_7
+ - libclang=14.0.6=default_hb5a9fac_1
+ - libclang13=14.0.6=default_h8e68704_1
+ - libdeflate=1.17=h2bbff1b_0
+ - libffi=3.4.4=hd77b12b_0
+ - libiconv=1.16=h2bbff1b_2
+ - libogg=1.3.5=h2bbff1b_1
+ - libpng=1.6.39=h8cc25b3_0
+ - libprotobuf=3.20.3=h23ce68f_0
+ - libtiff=4.5.0=h6c2663c_2
+ - libuv=1.44.2=h2bbff1b_0
+ - libvorbis=1.3.7=he774522_0
+ - libwebp=1.2.4=hbc33d0d_1
+ - libwebp-base=1.2.4=h2bbff1b_1
+ - libxml2=2.10.3=h0ad7f3c_0
+ - libxslt=1.1.37=h2bbff1b_0
+ - lz4-c=1.9.4=h2bbff1b_0
+ - markdown=3.4.1=py310haa95532_0
+ - markupsafe=2.1.1=py310h2bbff1b_0
+ - matplotlib=3.7.1=py310haa95532_1
+ - matplotlib-base=3.7.1=py310h4ed8f06_1
+ - mkl=2023.1.0=h8bd8f75_46356
+ - mkl-service=2.4.0=py310h2bbff1b_1
+ - mkl_fft=1.3.6=py310h4ed8f06_1
+ - mkl_random=1.2.2=py310h4ed8f06_1
+ - multidict=6.0.2=py310h2bbff1b_0
+ - munkres=1.1.4=py_0
+ - numexpr=2.8.4=py310h2cd9be0_1
+ - numpy=1.24.3=py310h055cbcc_1
+ - numpy-base=1.24.3=py310h65a83cf_1
+ - oauthlib=3.2.2=py310haa95532_0
+ - openssl=1.1.1t=h2bbff1b_0
+ - packaging=23.0=py310haa95532_0
+ - pandas=1.5.3=py310h4ed8f06_0
+ - pcre=8.45=hd77b12b_0
+ - pillow=9.4.0=py310hd77b12b_0
+ - pip=23.0.1=py310haa95532_0
+ - ply=3.11=py310haa95532_0
+ - protobuf=3.20.3=py310hd77b12b_0
+ - pyasn1=0.4.8=pyhd3eb1b0_0
+ - pyasn1-modules=0.2.8=py_0
+ - pycparser=2.21=pyhd3eb1b0_0
+ - pyjwt=2.4.0=py310haa95532_0
+ - pyopenssl=23.0.0=py310haa95532_0
+ - pyparsing=3.0.9=py310haa95532_0
+ - pyqt=5.15.7=py310hd77b12b_0
+ - pyqt5-sip=12.11.0=py310hd77b12b_0
+ - pysocks=1.7.1=py310haa95532_0
+ - python=3.10.11=h966fe2a_2
+ - python-dateutil=2.8.2=pyhd3eb1b0_0
+ - pytorch-mutex=1.0=cpu
+ - pytz=2022.7=py310haa95532_0
+ - pyyaml=6.0=py310h2bbff1b_1
+ - qt-main=5.15.2=he8e5bd7_8
+ - qt-webengine=5.15.9=hb9a9bb5_5
+ - qtwebkit=5.212=h2bbfb41_5
+ - re2=2022.04.01=hd77b12b_0
+ - requests=2.29.0=py310haa95532_0
+ - requests-oauthlib=1.3.0=py_0
+ - rsa=4.7.2=pyhd3eb1b0_1
+ - setuptools=67.8.0=py310haa95532_0
+ - sip=6.6.2=py310hd77b12b_0
+ - six=1.16.0=pyhd3eb1b0_1
+ - sqlite=3.41.2=h2bbff1b_0
+ - tbb=2021.8.0=h59b6b97_0
+ - tensorboard=2.10.0=py310haa95532_0
+ - tensorboard-data-server=0.6.1=py310haa95532_0
+ - tensorboard-plugin-wit=1.8.1=py310haa95532_0
+ - tk=8.6.12=h2bbff1b_0
+ - toml=0.10.2=pyhd3eb1b0_0
+ - tornado=6.2=py310h2bbff1b_0
+ - tqdm=4.65.0=py310h9909e9c_0
+ - typing_extensions=4.5.0=py310haa95532_0
+ - tzdata=2023c=h04d1e81_0
+ - urllib3=1.26.16=py310haa95532_0
+ - vc=14.2=h21ff451_1
+ - vs2015_runtime=14.27.29016=h5e58377_2
+ - werkzeug=2.2.3=py310haa95532_0
+ - wheel=0.38.4=py310haa95532_0
+ - win_inet_pton=1.1.0=py310haa95532_0
+ - xz=5.4.2=h8cc25b3_0
+ - yaml=0.2.5=he774522_0
+ - yarl=1.8.1=py310h2bbff1b_0
+ - zlib=1.2.13=h8cc25b3_0
+ - zstd=1.5.5=hd43e919_0
+ - pip:
+   - antlr4-python3-runtime==4.8
+   - appdirs==1.4.4
+   - audioread==3.0.0
+   - bitarray==2.7.4
+   - cython==0.29.35
+   - decorator==5.1.1
+   - fairseq==0.12.2
+   - faiss-cpu==1.7.4
+   - filelock==3.12.0
+   - hydra-core==1.0.7
+   - jinja2==3.1.2
+   - joblib==1.2.0
+   - lazy-loader==0.2
+   - librosa==0.10.0.post2
+   - llvmlite==0.40.0
+   - lxml==4.9.2
+   - mpmath==1.3.0
+   - msgpack==1.0.5
+   - networkx==3.1
+   - noisereduce==2.0.1
+   - numba==0.57.0
+   - omegaconf==2.0.6
+   - opencv-python==4.7.0.72
+   - pooch==1.6.0
+   - portalocker==2.7.0
+   - pysimplegui==4.60.5
+   - pywin32==306
+   - pyworld==0.3.3
+   - regex==2023.5.5
+   - sacrebleu==2.3.1
+   - scikit-learn==1.2.2
+   - scipy==1.10.1
+   - sounddevice==0.4.6
+   - soundfile==0.12.1
+   - soxr==0.3.5
+   - sympy==1.12
+   - tabulate==0.9.0
+   - threadpoolctl==3.1.0
+   - torch==2.0.0
+   - torch-directml==0.2.0.dev230426
+   - torchaudio==2.0.1
+   - torchvision==0.15.1
+   - wget==3.2
+ prefix: D:\ProgramData\anaconda3_\envs\pydml
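
Note: environment_dml.yaml describes a Windows DirectML setup (see the torch-directml pip dependency and the Windows prefix path). A minimal sketch of recreating the environment with conda; the machine-specific prefix: line can usually be removed first:

    conda env create -f environment_dml.yaml
    conda activate pydml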
pyproject.toml ADDED
@@ -0,0 +1,64 @@
+ [tool.poetry]
+ name = "rvc-beta"
+ version = "0.1.0"
+ description = ""
+ authors = ["lj1995"]
+ license = "MIT"
+
+ [tool.poetry.dependencies]
+ python = "^3.8"
+ torch = "^2.0.0"
+ torchaudio = "^2.0.1"
+ Cython = "^0.29.34"
+ gradio = "^3.34.0"
+ future = "^0.18.3"
+ pydub = "^0.25.1"
+ soundfile = "^0.12.1"
+ ffmpeg-python = "^0.2.0"
+ tensorboardX = "^2.6"
+ functorch = "^2.0.0"
+ fairseq = "^0.12.2"
+ faiss-cpu = "^1.7.2"
+ Jinja2 = "^3.1.2"
+ json5 = "^0.9.11"
+ librosa = "0.9.1"
+ llvmlite = "0.39.0"
+ Markdown = "^3.4.3"
+ matplotlib = "^3.7.1"
+ matplotlib-inline = "^0.1.6"
+ numba = "0.56.4"
+ numpy = "1.23.5"
+ scipy = "1.9.3"
+ praat-parselmouth = "^0.4.3"
+ Pillow = "9.3.0"
+ pyworld = "^0.3.2"
+ resampy = "^0.4.2"
+ scikit-learn = "^1.2.2"
+ starlette = "^0.27.0"
+ tensorboard = "^2.12.1"
+ tensorboard-data-server = "^0.7.0"
+ tensorboard-plugin-wit = "^1.8.1"
+ torchgen = "^0.0.1"
+ tqdm = "^4.65.0"
+ tornado = "^6.3"
+ Werkzeug = "^2.2.3"
+ uc-micro-py = "^1.0.1"
+ sympy = "^1.11.1"
+ tabulate = "^0.9.0"
+ PyYAML = "^6.0"
+ pyasn1 = "^0.4.8"
+ pyasn1-modules = "^0.2.8"
+ fsspec = "^2023.3.0"
+ absl-py = "^1.4.0"
+ audioread = "^3.0.0"
+ uvicorn = "^0.21.1"
+ colorama = "^0.4.6"
+ torchcrepe = "0.0.20"
+ python-dotenv = "^1.0.0"
+ av = "^10.0.0"
+
+ [tool.poetry.dev-dependencies]
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
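
Note: pyproject.toml mirrors the requirements files for Poetry users. A hedged install sketch, assuming Poetry is available on PATH:

    pip install poetry
    poetry install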
requirements-amd.txt ADDED
@@ -0,0 +1,48 @@
+ tensorflow-rocm
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.34.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+ Pillow>=9.1.1
+ resampy>=0.4.2
+ scikit-learn
+ tensorboard
+ tqdm>=4.63.1
+ tornado>=6.1
+ Werkzeug>=2.2.3
+ uc-micro-py>=1.0.1
+ sympy>=1.11.1
+ tabulate>=0.8.10
+ PyYAML>=6.0
+ pyasn1>=0.4.8
+ pyasn1-modules>=0.2.8
+ fsspec>=2022.11.0
+ absl-py>=1.2.0
+ audioread
+ uvicorn>=0.21.1
+ colorama>=0.4.5
+ pyworld==0.3.2
+ httpx
+ onnxruntime
+ onnxruntime-gpu
+ torchcrepe==0.0.20
+ fastapi==0.88
+ ffmpy==0.3.1
+ python-dotenv>=1.0.0
+ av
requirements-dml.txt ADDED
@@ -0,0 +1,46 @@
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.34.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+ Pillow>=9.1.1
+ resampy>=0.4.2
+ scikit-learn
+ tensorboard
+ tqdm>=4.63.1
+ tornado>=6.1
+ Werkzeug>=2.2.3
+ uc-micro-py>=1.0.1
+ sympy>=1.11.1
+ tabulate>=0.8.10
+ PyYAML>=6.0
+ pyasn1>=0.4.8
+ pyasn1-modules>=0.2.8
+ fsspec>=2022.11.0
+ absl-py>=1.2.0
+ audioread
+ uvicorn>=0.21.1
+ colorama>=0.4.5
+ pyworld==0.3.2
+ httpx
+ onnxruntime-directml
+ torchcrepe==0.0.20
+ fastapi==0.88
+ ffmpy==0.3.1
+ python-dotenv>=1.0.0
+ av
requirements-ipex.txt ADDED
@@ -0,0 +1,54 @@
+ torch==2.0.1a0
+ intel_extension_for_pytorch==2.0.110+xpu
+ torchvision==0.15.2a0
+ https://github.com/Disty0/Retrieval-based-Voice-Conversion-WebUI/releases/download/torchaudio_wheels_for_ipex/torchaudio-2.0.2+31de77d-cp310-cp310-linux_x86_64.whl
+ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.34.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+ Pillow>=9.1.1
+ resampy>=0.4.2
+ scikit-learn
+ tensorboard
+ tqdm>=4.63.1
+ tornado>=6.1
+ Werkzeug>=2.2.3
+ uc-micro-py>=1.0.1
+ sympy>=1.11.1
+ tabulate>=0.8.10
+ PyYAML>=6.0
+ pyasn1>=0.4.8
+ pyasn1-modules>=0.2.8
+ fsspec>=2022.11.0
+ absl-py>=1.2.0
+ audioread
+ uvicorn>=0.21.1
+ colorama>=0.4.5
+ pyworld==0.3.2
+ httpx
+ onnxruntime; sys_platform == 'darwin'
+ onnxruntime-gpu; sys_platform != 'darwin'
+ torchcrepe==0.0.20
+ fastapi==0.88
+ ffmpy==0.3.1
+ python-dotenv>=1.0.0
+ av
+ PySimpleGUI
+ sounddevice
requirements-win-for-realtime_vc_gui-dml.txt ADDED
@@ -0,0 +1,29 @@
+ # 1. Install torch from pytorch.org:
+ #    torch 2.0 with CUDA 11.8:
+ #    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+ #    torch 1.11.0 with CUDA 11.3:
+ #    pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
+ einops
+ fairseq
+ flask
+ flask_cors
+ gin
+ gin_config
+ librosa
+ local_attention
+ matplotlib
+ praat-parselmouth
+ pyworld
+ PyYAML
+ resampy
+ scikit_learn
+ scipy
+ SoundFile
+ tensorboard
+ tqdm
+ wave
+ PySimpleGUI
+ sounddevice
+ gradio
+ noisereduce
+ onnxruntime-directml
requirements-win-for-realtime_vc_gui.txt ADDED
@@ -0,0 +1,28 @@
+ # 1. Install torch from pytorch.org:
+ #    torch 2.0 with CUDA 11.8:
+ #    pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+ #    torch 1.11.0 with CUDA 11.3:
+ #    pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
+ einops
+ fairseq
+ flask
+ flask_cors
+ gin
+ gin_config
+ librosa
+ local_attention
+ matplotlib
+ praat-parselmouth
+ pyworld
+ PyYAML
+ resampy
+ scikit_learn
+ scipy
+ SoundFile
+ tensorboard
+ tqdm
+ wave
+ PySimpleGUI
+ sounddevice
+ gradio
+ noisereduce
requirements.txt ADDED
@@ -0,0 +1,47 @@
+ joblib>=1.1.0
+ numba==0.56.4
+ numpy==1.23.5
+ scipy
+ librosa==0.9.1
+ llvmlite==0.39.0
+ fairseq==0.12.2
+ faiss-cpu==1.7.3
+ gradio==3.34.0
+ Cython
+ pydub>=0.25.1
+ soundfile>=0.12.1
+ ffmpeg-python>=0.2.0
+ tensorboardX
+ Jinja2>=3.1.2
+ json5
+ Markdown
+ matplotlib>=3.7.0
+ matplotlib-inline>=0.1.3
+ praat-parselmouth>=0.4.2
+ Pillow>=9.1.1
+ resampy>=0.4.2
+ scikit-learn
+ tensorboard
+ tqdm>=4.63.1
+ tornado>=6.1
+ Werkzeug>=2.2.3
+ uc-micro-py>=1.0.1
+ sympy>=1.11.1
+ tabulate>=0.8.10
+ PyYAML>=6.0
+ pyasn1>=0.4.8
+ pyasn1-modules>=0.2.8
+ fsspec>=2022.11.0
+ absl-py>=1.2.0
+ audioread
+ uvicorn>=0.21.1
+ colorama>=0.4.5
+ pyworld==0.3.2
+ httpx
+ onnxruntime; sys_platform == 'darwin'
+ onnxruntime-gpu; sys_platform != 'darwin'
+ torchcrepe==0.0.20
+ fastapi==0.88
+ ffmpy==0.3.1
+ python-dotenv>=1.0.0
+ av
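
Note: each requirements-*.txt above targets a specific backend (ROCm, DirectML, Intel IPEX, or the realtime GUI on Windows), while requirements.txt is the default set consumed by run.sh. A minimal install sketch:

    python3 -m pip install -r requirements.txt   # or requirements-dml.txt, requirements-amd.txt, ...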
run.sh ADDED
@@ -0,0 +1,61 @@
+ #!/bin/bash
+
+ if [[ "$(uname)" == "Darwin" ]]; then
+     # macOS specific env:
+     export PYTORCH_ENABLE_MPS_FALLBACK=1
+     export PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
+ elif [[ "$(uname)" != "Linux" ]]; then
+     echo "Unsupported operating system."
+     exit 1
+ fi
+
+ if [ -d ".venv" ]; then
+     echo "Activate venv..."
+     source .venv/bin/activate
+ else
+     echo "Create venv..."
+     requirements_file="requirements.txt"
+
+     # Check whether python3 is available; try to install Python 3.8 if it is not
+     if ! command -v python3 &> /dev/null; then
+         echo "Python 3 not found. Attempting to install 3.8..."
+         if [[ "$(uname)" == "Darwin" ]] && command -v brew &> /dev/null; then
+             brew install python@3.8
+         elif [[ "$(uname)" == "Linux" ]] && command -v apt-get &> /dev/null; then
+             sudo apt-get update
+             sudo apt-get install python3.8
+         else
+             echo "Please install Python 3.8 manually."
+             exit 1
+         fi
+     fi
+
+     python3 -m venv .venv
+     source .venv/bin/activate
+
+     # Check if required packages are installed and install them if not
+     if [ -f "${requirements_file}" ]; then
+         installed_packages=$(python3 -m pip freeze)
+         while IFS= read -r package; do
+             [[ "${package}" =~ ^#.* ]] && continue
+             package_name=$(echo "${package}" | sed 's/[<>=!;].*//')
+             if ! echo "${installed_packages}" | grep -qi "^${package_name}=="; then
+                 echo "${package_name} not found. Attempting to install..."
+                 python3 -m pip install --upgrade "${package}"
+             fi
+         done < "${requirements_file}"
+     else
+         echo "${requirements_file} not found. Please ensure the requirements file with required packages exists."
+         exit 1
+     fi
+ fi
+
+ # Download models
+ ./tools/dlmodels.sh
+
+ if [[ $? -ne 0 ]]; then
+     exit 1
+ fi
+
+ # Run the main script
+ python3 infer-web.py --pycmd python3
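
Note: run.sh assumes it is executed from the repository root so that requirements.txt, tools/dlmodels.sh, and infer-web.py all resolve. A minimal usage sketch:

    chmod +x run.sh
    ./run.sh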
tools/app.py ADDED
@@ -0,0 +1,147 @@
+ import logging
+ import os
+
+ # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt")
+ import gradio as gr
+ from dotenv import load_dotenv
+
+ from configs.config import Config
+ from i18n.i18n import I18nAuto
+ from infer.modules.vc.modules import VC
+
+ logging.getLogger("numba").setLevel(logging.WARNING)
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+ logging.getLogger("matplotlib").setLevel(logging.WARNING)
+ logger = logging.getLogger(__name__)
+
+ i18n = I18nAuto()
+ logger.info(i18n)
+
+ load_dotenv()
+ config = Config()
+ vc = VC(config)
+
+ weight_root = os.getenv("weight_root")
+ weight_uvr5_root = os.getenv("weight_uvr5_root")
+ index_root = os.getenv("index_root")
+ names = []
+ hubert_model = None
+ for name in os.listdir(weight_root):
+ if name.endswith(".pth"):
+ names.append(name)
+ index_paths = []
+ for root, dirs, files in os.walk(index_root, topdown=False):
+ for name in files:
+ if name.endswith(".index") and "trained" not in name:
+ index_paths.append("%s/%s" % (root, name))
+
+
+ app = gr.Blocks()
+ with app:
+ with gr.Tabs():
+ with gr.TabItem("在线demo"):
+ gr.Markdown(
+ value="""
+ RVC 在线demo
+ """
+ )
+ sid = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names))
+ with gr.Column():
+ spk_item = gr.Slider(
+ minimum=0,
+ maximum=2333,
+ step=1,
+ label=i18n("请选择说话人id"),
+ value=0,
+ visible=False,
+ interactive=True,
+ )
+ sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item])
+ gr.Markdown(
+ value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
+ )
+ vc_input3 = gr.Audio(label="上传音频(长度小于90秒)")
+ vc_transform0 = gr.Number(label=i18n("变调(整数, 半音数量, 升八度12降八度-12)"), value=0)
+ f0method0 = gr.Radio(
+ label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU"),
+ choices=["pm", "harvest", "crepe", "rmvpe"],
+ value="pm",
+ interactive=True,
+ )
+ filter_radius0 = gr.Slider(
+ minimum=0,
+ maximum=7,
+ label=i18n(">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音"),
+ value=3,
+ step=1,
+ interactive=True,
+ )
+ with gr.Column():
+ file_index1 = gr.Textbox(
+ label=i18n("特征检索库文件路径,为空则使用下拉的选择结果"),
+ value="",
+ interactive=False,
+ visible=False,
+ )
+ file_index2 = gr.Dropdown(
+ label=i18n("自动检测index路径,下拉式选择(dropdown)"),
+ choices=sorted(index_paths),
+ interactive=True,
+ )
+ index_rate1 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("检索特征占比"),
+ value=0.88,
+ interactive=True,
+ )
+ resample_sr0 = gr.Slider(
+ minimum=0,
+ maximum=48000,
+ label=i18n("后处理重采样至最终采样率,0为不进行重采样"),
+ value=0,
+ step=1,
+ interactive=True,
+ )
+ rms_mix_rate0 = gr.Slider(
+ minimum=0,
+ maximum=1,
+ label=i18n("输入源音量包络替换输出音量包络融合比例,越靠近1越使用输出包络"),
+ value=1,
+ interactive=True,
+ )
+ protect0 = gr.Slider(
+ minimum=0,
+ maximum=0.5,
+ label=i18n("保护清辅音和呼吸声,防止电音撕裂等artifact,拉满0.5不开启,调低加大保护力度但可能降低索引效果"),
+ value=0.33,
+ step=0.01,
+ interactive=True,
+ )
+ f0_file = gr.File(label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"))
+ but0 = gr.Button(i18n("转换"), variant="primary")
+ vc_output1 = gr.Textbox(label=i18n("输出信息"))
+ vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)"))
+ but0.click(
+ vc.vc_single,
+ [
+ spk_item,
+ vc_input3,
+ vc_transform0,
+ f0_file,
+ f0method0,
+ file_index1,
+ file_index2,
+ # file_big_npy1,
+ index_rate1,
+ filter_radius0,
+ resample_sr0,
+ rms_mix_rate0,
+ protect0,
+ ],
+ [vc_output1, vc_output2],
+ )
+
+
+ app.launch()
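
Note on configuration: tools/app.py resolves its model folders from the environment (via load_dotenv), so weight_root, weight_uvr5_root and index_root must be defined before launch. A minimal sketch of setting them from Python; the directory names below are illustrative assumptions, not paths mandated by this commit:

import os

# Hypothetical layout; point these at wherever the .pth models and .index files live.
os.environ.setdefault("weight_root", "assets/weights")            # voice models (*.pth)
os.environ.setdefault("weight_uvr5_root", "assets/uvr5_weights")  # vocal-separation weights
os.environ.setdefault("index_root", "logs")                       # feature indexes (*.index)

With these set, launching `python tools/app.py` fills the model dropdown from every .pth under weight_root and the index dropdown from every non-"trained" .index file under index_root.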
tools/calc_rvc_model_similarity.py ADDED
@@ -0,0 +1,96 @@
+ # This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py
+ # Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models.
+ import os
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+
+ def cal_cross_attn(to_q, to_k, to_v, rand_input):
+ hidden_dim, embed_dim = to_q.shape
+ attn_to_q = nn.Linear(hidden_dim, embed_dim, bias=False)
+ attn_to_k = nn.Linear(hidden_dim, embed_dim, bias=False)
+ attn_to_v = nn.Linear(hidden_dim, embed_dim, bias=False)
+ attn_to_q.load_state_dict({"weight": to_q})
+ attn_to_k.load_state_dict({"weight": to_k})
+ attn_to_v.load_state_dict({"weight": to_v})
+
+ return torch.einsum(
+ "ik, jk -> ik",
+ F.softmax(
+ torch.einsum("ij, kj -> ik", attn_to_q(rand_input), attn_to_k(rand_input)),
+ dim=-1,
+ ),
+ attn_to_v(rand_input),
+ )
+
+
+ def model_hash(filename):
+ try:
+ with open(filename, "rb") as file:
+ import hashlib
+
+ m = hashlib.sha256()
+
+ file.seek(0x100000)
+ m.update(file.read(0x10000))
+ return m.hexdigest()[0:8]
+ except FileNotFoundError:
+ return "NOFILE"
+
+
+ def eval(model, n, input):
+ qk = f"enc_p.encoder.attn_layers.{n}.conv_q.weight"
+ uk = f"enc_p.encoder.attn_layers.{n}.conv_k.weight"
+ vk = f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
+ atoq, atok, atov = model[qk][:, :, 0], model[uk][:, :, 0], model[vk][:, :, 0]
+
+ attn = cal_cross_attn(atoq, atok, atov, input)
+ return attn
+
+
+ def main(path, root):
+ torch.manual_seed(114514)
+ model_a = torch.load(path, map_location="cpu")["weight"]
+
+ logger.info("Query:\t\t%s\t%s" % (path, model_hash(path)))
+
+ map_attn_a = {}
+ map_rand_input = {}
+ for n in range(6):
+ hidden_dim, embed_dim, _ = model_a[
+ f"enc_p.encoder.attn_layers.{n}.conv_v.weight"
+ ].shape
+ rand_input = torch.randn([embed_dim, hidden_dim])
+
+ map_attn_a[n] = eval(model_a, n, rand_input)
+ map_rand_input[n] = rand_input
+
+ del model_a
+
+ for name in sorted(list(os.listdir(root))):
+ path = "%s/%s" % (root, name)
+ model_b = torch.load(path, map_location="cpu")["weight"]
+
+ sims = []
+ for n in range(6):
+ attn_a = map_attn_a[n]
+ attn_b = eval(model_b, n, map_rand_input[n])
+
+ sim = torch.mean(torch.cosine_similarity(attn_a, attn_b))
+ sims.append(sim)
+
+ logger.info(
+ "Reference:\t%s\t%s\t%s"
+ % (path, model_hash(path), f"{torch.mean(torch.stack(sims)) * 1e2:.2f}%")
+ )
+
+
+ if __name__ == "__main__":
+ query_path = r"assets\weights\mi v3.pth"
+ reference_root = r"assets\weights"
+ main(query_path, reference_root)
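
A possible invocation sketch for this script, assuming the repo root is on sys.path; the checkpoint paths below are placeholders, not files shipped by this commit:

import logging

from tools.calc_rvc_model_similarity import main

logging.basicConfig(level=logging.INFO)  # main() reports results through logger.info
main("assets/weights/query.pth", "assets/weights")  # compare one model against a folder

Each reference model is scored by the mean cosine similarity of cross-attention outputs computed from the first six encoder attention layers of the query and reference checkpoints.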
tools/dlmodels.bat ADDED
@@ -0,0 +1,348 @@
+ @echo off && chcp 65001
+
+ echo working dir is %cd%
+ echo downloading requirement aria2 check.
+ echo=
+ dir /a:d/b | findstr "aria2" > flag.txt
+ findstr "aria2" flag.txt >nul
+ if %errorlevel% ==0 (
+ echo aria2 checked.
+ echo=
+ ) else (
+ echo failed. please download aria2 from the webpage!
+ echo unzip it and put it in this directory!
+ timeout /T 5
+ start https://github.com/aria2/aria2/releases/tag/release-1.36.0
+ echo=
+ goto end
+ )
+
+ echo envfiles checking start.
+ echo=
+
+ for /f %%x in ('findstr /i /c:"aria2" "flag.txt"') do (set aria2=%%x)&goto endSch
+ :endSch
+
+ set d32=f0D32k.pth
+ set d40=f0D40k.pth
+ set d48=f0D48k.pth
+ set g32=f0G32k.pth
+ set g40=f0G40k.pth
+ set g48=f0G48k.pth
+
+ set d40v2=f0D40k.pth
+ set g40v2=f0G40k.pth
+
+ set dld32=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth
+ set dld40=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth
+ set dld48=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth
+ set dlg32=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth
+ set dlg40=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth
+ set dlg48=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth
+
+ set dld40v2=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth
+ set dlg40v2=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth
+
+ set hp2_all=HP2_all_vocals.pth
+ set hp3_all=HP3_all_vocals.pth
+ set hp5_only=HP5_only_main_vocal.pth
+ set VR_DeEchoAggressive=VR-DeEchoAggressive.pth
+ set VR_DeEchoDeReverb=VR-DeEchoDeReverb.pth
+ set VR_DeEchoNormal=VR-DeEchoNormal.pth
+ set onnx_dereverb=vocals.onnx
+
+ set dlhp2_all=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth
+ set dlhp3_all=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth
+ set dlhp5_only=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth
+ set dlVR_DeEchoAggressive=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth
+ set dlVR_DeEchoDeReverb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth
+ set dlVR_DeEchoNormal=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth
+ set dlonnx_dereverb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx
+
+ set hb=hubert_base.pt
+
+ set dlhb=https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt
+
+ echo dir check start.
+ echo=
+
+ if exist "%~dp0assets\pretrained" (
+ echo dir .\assets\pretrained checked.
+ ) else (
+ echo failed. generating dir .\assets\pretrained.
+ mkdir "%~dp0assets\pretrained"
+ )
+ if exist "%~dp0assets\pretrained_v2" (
+ echo dir .\assets\pretrained_v2 checked.
+ ) else (
+ echo failed. generating dir .\assets\pretrained_v2.
+ mkdir "%~dp0assets\pretrained_v2"
+ )
+ if exist "%~dp0assets\uvr5_weights" (
+ echo dir .\assets\uvr5_weights checked.
+ ) else (
+ echo failed. generating dir .\assets\uvr5_weights.
+ mkdir "%~dp0assets\uvr5_weights"
+ )
+ if exist "%~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy" (
+ echo dir .\assets\uvr5_weights\onnx_dereverb_By_FoxJoy checked.
+ ) else (
+ echo failed. generating dir .\assets\uvr5_weights\onnx_dereverb_By_FoxJoy.
+ mkdir "%~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy"
+ )
+
+ echo=
+ echo dir check finished.
+
+ echo=
+ echo required files check start.
+
+ echo checking D32k.pth
+ if exist "%~dp0assets\pretrained\D32k.pth" (
+ echo D32k.pth in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d %~dp0assets\pretrained -o D32k.pth
+ if exist "%~dp0assets\pretrained\D32k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking D40k.pth
+ if exist "%~dp0assets\pretrained\D40k.pth" (
+ echo D40k.pth in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d %~dp0assets\pretrained -o D40k.pth
+ if exist "%~dp0assets\pretrained\D40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking D40k.pth
+ if exist "%~dp0assets\pretrained_v2\D40k.pth" (
+ echo D40k.pth in .\assets\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d %~dp0assets\pretrained_v2 -o D40k.pth
+ if exist "%~dp0assets\pretrained_v2\D40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking D48k.pth
+ if exist "%~dp0assets\pretrained\D48k.pth" (
+ echo D48k.pth in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d %~dp0assets\pretrained -o D48k.pth
+ if exist "%~dp0assets\pretrained\D48k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G32k.pth
+ if exist "%~dp0assets\pretrained\G32k.pth" (
+ echo G32k.pth in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d %~dp0assets\pretrained -o G32k.pth
+ if exist "%~dp0assets\pretrained\G32k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G40k.pth
+ if exist "%~dp0assets\pretrained\G40k.pth" (
+ echo G40k.pth in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d %~dp0assets\pretrained -o G40k.pth
+ if exist "%~dp0assets\pretrained\G40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G40k.pth
+ if exist "%~dp0assets\pretrained_v2\G40k.pth" (
+ echo G40k.pth in .\assets\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d %~dp0assets\pretrained_v2 -o G40k.pth
+ if exist "%~dp0assets\pretrained_v2\G40k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking G48k.pth
+ if exist "%~dp0assets\pretrained\G48k.pth" (
+ echo G48k.pth in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d %~dp0assets\pretrained -o G48k.pth
+ if exist "%~dp0assets\pretrained\G48k.pth" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+
+ echo checking %d32%
+ if exist "%~dp0assets\pretrained\%d32%" (
+ echo %d32% in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld32% -d %~dp0assets\pretrained -o %d32%
+ if exist "%~dp0assets\pretrained\%d32%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %d40%
+ if exist "%~dp0assets\pretrained\%d40%" (
+ echo %d40% in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld40% -d %~dp0assets\pretrained -o %d40%
+ if exist "%~dp0assets\pretrained\%d40%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %d40v2%
+ if exist "%~dp0assets\pretrained_v2\%d40v2%" (
+ echo %d40v2% in .\assets\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld40v2% -d %~dp0assets\pretrained_v2 -o %d40v2%
+ if exist "%~dp0assets\pretrained_v2\%d40v2%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %d48%
+ if exist "%~dp0assets\pretrained\%d48%" (
+ echo %d48% in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dld48% -d %~dp0assets\pretrained -o %d48%
+ if exist "%~dp0assets\pretrained\%d48%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g32%
+ if exist "%~dp0assets\pretrained\%g32%" (
+ echo %g32% in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg32% -d %~dp0assets\pretrained -o %g32%
+ if exist "%~dp0assets\pretrained\%g32%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g40%
+ if exist "%~dp0assets\pretrained\%g40%" (
+ echo %g40% in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg40% -d %~dp0assets\pretrained -o %g40%
+ if exist "%~dp0assets\pretrained\%g40%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g40v2%
+ if exist "%~dp0assets\pretrained_v2\%g40v2%" (
+ echo %g40v2% in .\assets\pretrained_v2 checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg40v2% -d %~dp0assets\pretrained_v2 -o %g40v2%
+ if exist "%~dp0assets\pretrained_v2\%g40v2%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %g48%
+ if exist "%~dp0assets\pretrained\%g48%" (
+ echo %g48% in .\assets\pretrained checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlg48% -d %~dp0assets\pretrained -o %g48%
+ if exist "%~dp0assets\pretrained\%g48%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+
+ echo checking %hp2_all%
+ if exist "%~dp0assets\uvr5_weights\%hp2_all%" (
+ echo %hp2_all% in .\assets\uvr5_weights checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhp2_all% -d %~dp0assets\uvr5_weights -o %hp2_all%
+ if exist "%~dp0assets\uvr5_weights\%hp2_all%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %hp3_all%
+ if exist "%~dp0assets\uvr5_weights\%hp3_all%" (
+ echo %hp3_all% in .\assets\uvr5_weights checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhp3_all% -d %~dp0assets\uvr5_weights -o %hp3_all%
+ if exist "%~dp0assets\uvr5_weights\%hp3_all%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %hp5_only%
+ if exist "%~dp0assets\uvr5_weights\%hp5_only%" (
+ echo %hp5_only% in .\assets\uvr5_weights checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhp5_only% -d %~dp0assets\uvr5_weights -o %hp5_only%
+ if exist "%~dp0assets\uvr5_weights\%hp5_only%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %VR_DeEchoAggressive%
+ if exist "%~dp0assets\uvr5_weights\%VR_DeEchoAggressive%" (
+ echo %VR_DeEchoAggressive% in .\assets\uvr5_weights checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlVR_DeEchoAggressive% -d %~dp0assets\uvr5_weights -o %VR_DeEchoAggressive%
+ if exist "%~dp0assets\uvr5_weights\%VR_DeEchoAggressive%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %VR_DeEchoDeReverb%
+ if exist "%~dp0assets\uvr5_weights\%VR_DeEchoDeReverb%" (
+ echo %VR_DeEchoDeReverb% in .\assets\uvr5_weights checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlVR_DeEchoDeReverb% -d %~dp0assets\uvr5_weights -o %VR_DeEchoDeReverb%
+ if exist "%~dp0assets\uvr5_weights\%VR_DeEchoDeReverb%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %VR_DeEchoNormal%
+ if exist "%~dp0assets\uvr5_weights\%VR_DeEchoNormal%" (
+ echo %VR_DeEchoNormal% in .\assets\uvr5_weights checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlVR_DeEchoNormal% -d %~dp0assets\uvr5_weights -o %VR_DeEchoNormal%
+ if exist "%~dp0assets\uvr5_weights\%VR_DeEchoNormal%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+ echo checking %onnx_dereverb%
+ if exist "%~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy\%onnx_dereverb%" (
+ echo %onnx_dereverb% in .\assets\uvr5_weights\onnx_dereverb_By_FoxJoy checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlonnx_dereverb% -d %~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy -o %onnx_dereverb%
+ if exist "%~dp0assets\uvr5_weights\onnx_dereverb_By_FoxJoy\%onnx_dereverb%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+
+ echo checking %hb%
+ if exist "%~dp0assets\hubert\%hb%" (
+ echo %hb% in .\assets\hubert checked.
+ echo=
+ ) else (
+ echo failed. starting download from huggingface.
+ %~dp0%aria2%\aria2c --console-log-level=error -c -x 16 -s 16 -k 1M %dlhb% -d %~dp0assets\hubert\ -o %hb%
+ if exist "%~dp0assets\hubert\%hb%" (echo download successful.) else (echo please try again!
+ echo=)
+ )
+
+ echo required files check finished.
+ echo envfiles check complete.
+ pause
+ :end
+ del flag.txt
tools/dlmodels.sh ADDED
@@ -0,0 +1,566 @@
+ #!/bin/bash
+
+ echo working dir is $(pwd)
+ echo downloading requirement aria2 check.
+
+ if command -v aria2c &> /dev/null
+ then
+ echo "aria2c command found"
+ else
+ echo failed. please install aria2
+ sleep 5
+ exit 1
+ fi
+
+ d32="f0D32k.pth"
+ d40="f0D40k.pth"
+ d48="f0D48k.pth"
+ g32="f0G32k.pth"
+ g40="f0G40k.pth"
+ g48="f0G48k.pth"
+
+ d40v2="f0D40k.pth"
+ g40v2="f0G40k.pth"
+
+ dld32="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D32k.pth"
+ dld40="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D40k.pth"
+ dld48="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0D48k.pth"
+ dlg32="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G32k.pth"
+ dlg40="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G40k.pth"
+ dlg48="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/f0G48k.pth"
+
+ dld40v2="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth"
+ dlg40v2="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth"
+
+ hp2_all="HP2_all_vocals.pth"
+ hp3_all="HP3_all_vocals.pth"
+ hp5_only="HP5_only_main_vocal.pth"
+ VR_DeEchoAggressive="VR-DeEchoAggressive.pth"
+ VR_DeEchoDeReverb="VR-DeEchoDeReverb.pth"
+ VR_DeEchoNormal="VR-DeEchoNormal.pth"
+ onnx_dereverb="vocals.onnx"
+ rmvpe="rmvpe.pt"
+
+ dlhp2_all="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2_all_vocals.pth"
+ dlhp3_all="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP3_all_vocals.pth"
+ dlhp5_only="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5_only_main_vocal.pth"
+ dlVR_DeEchoAggressive="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoAggressive.pth"
+ dlVR_DeEchoDeReverb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoDeReverb.pth"
+ dlVR_DeEchoNormal="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/VR-DeEchoNormal.pth"
+ dlonnx_dereverb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/onnx_dereverb_By_FoxJoy/vocals.onnx"
+ dlrmvpe="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt"
+
+ hb="hubert_base.pt"
+
+ dlhb="https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt"
+
+ echo dir check start.
+
+ if [ -d "./assets/pretrained" ]; then
+ echo dir ./assets/pretrained checked.
+ else
+ echo failed. generating dir ./assets/pretrained.
+ mkdir -p ./assets/pretrained
+ fi
+
+ if [ -d "./assets/pretrained_v2" ]; then
+ echo dir ./assets/pretrained_v2 checked.
+ else
+ echo failed. generating dir ./assets/pretrained_v2.
+ mkdir -p ./assets/pretrained_v2
+ fi
+
+ if [ -d "./assets/uvr5_weights" ]; then
+ echo dir ./assets/uvr5_weights checked.
+ else
+ echo failed. generating dir ./assets/uvr5_weights.
+ mkdir -p ./assets/uvr5_weights
+ fi
+
+ if [ -d "./assets/uvr5_weights/onnx_dereverb_By_FoxJoy" ]; then
+ echo dir ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy checked.
+ else
+ echo failed. generating dir ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy.
+ mkdir -p ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy
+ fi
+
+ echo dir check finished.
+
+ echo required files check start.
+
+ echo checking D32k.pth
+ if [ -f "./assets/pretrained/D32k.pth" ]; then
+ echo D32k.pth in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D32k.pth -d ./assets/pretrained -o D32k.pth
+ if [ -f "./assets/pretrained/D32k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking D40k.pth
+ if [ -f "./assets/pretrained/D40k.pth" ]; then
+ echo D40k.pth in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D40k.pth -d ./assets/pretrained -o D40k.pth
+ if [ -f "./assets/pretrained/D40k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking D40k.pth
+ if [ -f "./assets/pretrained_v2/D40k.pth" ]; then
+ echo D40k.pth in ./assets/pretrained_v2 checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d ./assets/pretrained_v2 -o D40k.pth
+ if [ -f "./assets/pretrained_v2/D40k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking D48k.pth
+ if [ -f "./assets/pretrained/D48k.pth" ]; then
+ echo D48k.pth in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/D48k.pth -d ./assets/pretrained -o D48k.pth
+ if [ -f "./assets/pretrained/D48k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking G32k.pth
+ if [ -f "./assets/pretrained/G32k.pth" ]; then
+ echo G32k.pth in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G32k.pth -d ./assets/pretrained -o G32k.pth
+ if [ -f "./assets/pretrained/G32k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking G40k.pth
+ if [ -f "./assets/pretrained/G40k.pth" ]; then
+ echo G40k.pth in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G40k.pth -d ./assets/pretrained -o G40k.pth
+ if [ -f "./assets/pretrained/G40k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking G40k.pth
+ if [ -f "./assets/pretrained_v2/G40k.pth" ]; then
+ echo G40k.pth in ./assets/pretrained_v2 checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d ./assets/pretrained_v2 -o G40k.pth
+ if [ -f "./assets/pretrained_v2/G40k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking G48k.pth
+ if [ -f "./assets/pretrained/G48k.pth" ]; then
+ echo G48k.pth in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained/G48k.pth -d ./assets/pretrained -o G48k.pth
+ if [ -f "./assets/pretrained/G48k.pth" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $d32
+ if [ -f "./assets/pretrained/$d32" ]; then
+ echo $d32 in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld32 -d ./assets/pretrained -o $d32
+ if [ -f "./assets/pretrained/$d32" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $d40
+ if [ -f "./assets/pretrained/$d40" ]; then
+ echo $d40 in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld40 -d ./assets/pretrained -o $d40
+ if [ -f "./assets/pretrained/$d40" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $d40v2
+ if [ -f "./assets/pretrained_v2/$d40v2" ]; then
+ echo $d40v2 in ./assets/pretrained_v2 checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld40v2 -d ./assets/pretrained_v2 -o $d40v2
+ if [ -f "./assets/pretrained_v2/$d40v2" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $d48
+ if [ -f "./assets/pretrained/$d48" ]; then
+ echo $d48 in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dld48 -d ./assets/pretrained -o $d48
+ if [ -f "./assets/pretrained/$d48" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $g32
+ if [ -f "./assets/pretrained/$g32" ]; then
+ echo $g32 in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg32 -d ./assets/pretrained -o $g32
+ if [ -f "./assets/pretrained/$g32" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $g40
+ if [ -f "./assets/pretrained/$g40" ]; then
+ echo $g40 in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg40 -d ./assets/pretrained -o $g40
+ if [ -f "./assets/pretrained/$g40" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $g40v2
+ if [ -f "./assets/pretrained_v2/$g40v2" ]; then
+ echo $g40v2 in ./assets/pretrained_v2 checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg40v2 -d ./assets/pretrained_v2 -o $g40v2
+ if [ -f "./assets/pretrained_v2/$g40v2" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $g48
+ if [ -f "./assets/pretrained/$g48" ]; then
+ echo $g48 in ./assets/pretrained checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlg48 -d ./assets/pretrained -o $g48
+ if [ -f "./assets/pretrained/$g48" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $hp2_all
+ if [ -f "./assets/uvr5_weights/$hp2_all" ]; then
+ echo $hp2_all in ./assets/uvr5_weights checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp2_all -d ./assets/uvr5_weights -o $hp2_all
+ if [ -f "./assets/uvr5_weights/$hp2_all" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $hp3_all
+ if [ -f "./assets/uvr5_weights/$hp3_all" ]; then
+ echo $hp3_all in ./assets/uvr5_weights checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp3_all -d ./assets/uvr5_weights -o $hp3_all
+ if [ -f "./assets/uvr5_weights/$hp3_all" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $hp5_only
+ if [ -f "./assets/uvr5_weights/$hp5_only" ]; then
+ echo $hp5_only in ./assets/uvr5_weights checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhp5_only -d ./assets/uvr5_weights -o $hp5_only
+ if [ -f "./assets/uvr5_weights/$hp5_only" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $VR_DeEchoAggressive
+ if [ -f "./assets/uvr5_weights/$VR_DeEchoAggressive" ]; then
+ echo $VR_DeEchoAggressive in ./assets/uvr5_weights checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoAggressive -d ./assets/uvr5_weights -o $VR_DeEchoAggressive
+ if [ -f "./assets/uvr5_weights/$VR_DeEchoAggressive" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $VR_DeEchoDeReverb
+ if [ -f "./assets/uvr5_weights/$VR_DeEchoDeReverb" ]; then
+ echo $VR_DeEchoDeReverb in ./assets/uvr5_weights checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoDeReverb -d ./assets/uvr5_weights -o $VR_DeEchoDeReverb
+ if [ -f "./assets/uvr5_weights/$VR_DeEchoDeReverb" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $VR_DeEchoNormal
+ if [ -f "./assets/uvr5_weights/$VR_DeEchoNormal" ]; then
+ echo $VR_DeEchoNormal in ./assets/uvr5_weights checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlVR_DeEchoNormal -d ./assets/uvr5_weights -o $VR_DeEchoNormal
+ if [ -f "./assets/uvr5_weights/$VR_DeEchoNormal" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $onnx_dereverb
+ if [ -f "./assets/uvr5_weights/onnx_dereverb_By_FoxJoy/$onnx_dereverb" ]; then
+ echo $onnx_dereverb in ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlonnx_dereverb -d ./assets/uvr5_weights/onnx_dereverb_By_FoxJoy -o $onnx_dereverb
+ if [ -f "./assets/uvr5_weights/onnx_dereverb_By_FoxJoy/$onnx_dereverb" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $rmvpe
+ if [ -f "./assets/rmvpe/$rmvpe" ]; then
+ echo $rmvpe in ./assets/rmvpe checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlrmvpe -d ./assets/rmvpe -o $rmvpe
+ if [ -f "./assets/rmvpe/$rmvpe" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo checking $hb
+ if [ -f "./assets/hubert/$hb" ]; then
+ echo $hb in ./assets/hubert checked.
+ else
+ echo failed. starting download from huggingface.
+ if command -v aria2c &> /dev/null; then
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M $dlhb -d ./assets/hubert/ -o $hb
+ if [ -f "./assets/hubert/$hb" ]; then
+ echo download successful.
+ else
+ echo please try again!
+ exit 1
+ fi
+ else
+ echo aria2c command not found. Please install aria2c and try again.
+ exit 1
+ fi
+ fi
+
+ echo required files check finished.
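
After dlmodels.sh (or its .bat counterpart) has run, a quick way to confirm nothing was skipped is to re-test the same paths the script checks. A small sketch, assuming it is executed from the repo root; the list mirrors the core checkpoints above and can be extended with the uvr5 weights:

from pathlib import Path

required = [
    "assets/hubert/hubert_base.pt",
    "assets/rmvpe/rmvpe.pt",
]
required += [
    f"assets/pretrained/{prefix}{sr}.pth"
    for prefix in ("D", "G", "f0D", "f0G")
    for sr in ("32k", "40k", "48k")
]
required += [
    "assets/pretrained_v2/D40k.pth",
    "assets/pretrained_v2/G40k.pth",
    "assets/pretrained_v2/f0D40k.pth",
    "assets/pretrained_v2/f0G40k.pth",
]
missing = [p for p in required if not Path(p).is_file()]
print("missing:", missing or "none")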
tools/download_models.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ from pathlib import Path
+ import requests
+
+ RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"
+
+ BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+ def dl_model(link, model_name, dir_name):
+ with requests.get(f"{link}{model_name}") as r:
+ r.raise_for_status()
+ os.makedirs(os.path.dirname(dir_name / model_name), exist_ok=True)
+ with open(dir_name / model_name, "wb") as f:
+ for chunk in r.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+
+ if __name__ == "__main__":
+ print("Downloading hubert_base.pt...")
+ dl_model(RVC_DOWNLOAD_LINK, "hubert_base.pt", BASE_DIR / "assets/hubert")
+ print("Downloading rmvpe.pt...")
+ dl_model(RVC_DOWNLOAD_LINK, "rmvpe.pt", BASE_DIR / "assets/rmvpe")
+ print("Downloading vocals.onnx...")
+ dl_model(
+ RVC_DOWNLOAD_LINK + "uvr5_weights/onnx_dereverb_By_FoxJoy/",
+ "vocals.onnx",
+ BASE_DIR / "assets/uvr5_weights/onnx_dereverb_By_FoxJoy",
+ )
+
+ rvc_models_dir = BASE_DIR / "assets/pretrained"
+
+ print("Downloading pretrained models:")
+
+ model_names = [
+ "D32k.pth",
+ "D40k.pth",
+ "D48k.pth",
+ "G32k.pth",
+ "G40k.pth",
+ "G48k.pth",
+ "f0D32k.pth",
+ "f0D40k.pth",
+ "f0D48k.pth",
+ "f0G32k.pth",
+ "f0G40k.pth",
+ "f0G48k.pth",
+ ]
+ for model in model_names:
+ print(f"Downloading {model}...")
+ dl_model(RVC_DOWNLOAD_LINK + "pretrained/", model, rvc_models_dir)
+
+ rvc_models_dir = BASE_DIR / "assets/pretrained_v2"
+
+ print("Downloading pretrained models v2:")
+
+ for model in model_names:
+ print(f"Downloading {model}...")
+ dl_model(RVC_DOWNLOAD_LINK + "pretrained_v2/", model, rvc_models_dir)
+
+ print("Downloading uvr5_weights:")
+
+ rvc_models_dir = BASE_DIR / "assets/uvr5_weights"
+
+ model_names = [
+ "HP2-%E4%BA%BA%E5%A3%B0vocals%2B%E9%9D%9E%E4%BA%BA%E5%A3%B0instrumentals.pth",
+ "HP2_all_vocals.pth",
+ "HP3_all_vocals.pth",
+ "HP5-%E4%B8%BB%E6%97%8B%E5%BE%8B%E4%BA%BA%E5%A3%B0vocals%2B%E5%85%B6%E4%BB%96instrumentals.pth",
+ "HP5_only_main_vocal.pth",
+ "VR-DeEchoAggressive.pth",
+ "VR-DeEchoDeReverb.pth",
+ "VR-DeEchoNormal.pth",
+ ]
+ for model in model_names:
+ print(f"Downloading {model}...")
+ dl_model(RVC_DOWNLOAD_LINK + "uvr5_weights/", model, rvc_models_dir)
+
+ print("All models downloaded!")
tools/export_onnx.py ADDED
@@ -0,0 +1,54 @@
+ import torch
+ from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
+
+ if __name__ == "__main__":
+ MoeVS = True # whether the model is for MoeVoiceStudio (formerly MoeSS)
+
+ ModelPath = "Shiroha/shiroha.pth" # model path
+ ExportedPath = "model.onnx" # output path
+ hidden_channels = 256 # hidden_channels, in preparation for 768-dim vec features
+ cpt = torch.load(ModelPath, map_location="cpu")
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
+ print(*cpt["config"])
+
+ test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
+ test_phone_lengths = torch.tensor([200]).long() # hidden unit length (seems unused)
+ test_pitch = torch.randint(size=(1, 200), low=5, high=255) # fundamental frequency (in Hz)
+ test_pitchf = torch.rand(1, 200) # NSF fundamental frequency
+ test_ds = torch.LongTensor([0]) # speaker ID
+ test_rnd = torch.rand(1, 192, 200) # noise (adds a random factor)
+
+ device = "cpu" # device used for export (does not affect using the model)
+
+ net_g = SynthesizerTrnMsNSFsidM(
+ *cpt["config"], is_half=False
+ ) # fp32 export (supporting fp16 in C++ would require manually rearranging memory, so fp16 is not used for now)
+ net_g.load_state_dict(cpt["weight"], strict=False)
+ input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
+ output_names = [
+ "audio",
+ ]
+ # net_g.construct_spkmixmap(n_speaker)  # export with a multi-speaker mix track
+ torch.onnx.export(
+ net_g,
+ (
+ test_phone.to(device),
+ test_phone_lengths.to(device),
+ test_pitch.to(device),
+ test_pitchf.to(device),
+ test_ds.to(device),
+ test_rnd.to(device),
+ ),
+ ExportedPath,
+ dynamic_axes={
+ "phone": [1],
+ "pitch": [1],
+ "pitchf": [1],
+ "rnd": [2],
+ },
+ do_constant_folding=False,
+ opset_version=16,
+ verbose=False,
+ input_names=input_names,
+ output_names=output_names,
+ )
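
A sketch of exercising the exported graph with onnxruntime. The input names, shapes and dtypes mirror the dummy tensors used above (float32 features, int64 lengths/pitch/speaker id); the random inputs are stand-ins for real HuBERT features and F0 curves, so the output is noise-like audio:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
feed = {
    "phone": np.random.rand(1, 200, 256).astype(np.float32),        # hidden units
    "phone_lengths": np.array([200], dtype=np.int64),
    "pitch": np.random.randint(5, 255, (1, 200)).astype(np.int64),  # coarse F0
    "pitchf": np.random.rand(1, 200).astype(np.float32),            # NSF F0
    "ds": np.array([0], dtype=np.int64),                            # speaker id
    "rnd": np.random.rand(1, 192, 200).astype(np.float32),          # noise
}
(audio,) = sess.run(["audio"], feed)
print(audio.shape)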
tools/infer/infer-pm-index256.py ADDED
@@ -0,0 +1,202 @@
+ """
+
+ Retrieval over the source features
+ """
+ import os
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ import parselmouth
+ import torch
+
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+ # import torchcrepe
+ from time import time as ttime
+
+ # import pyworld
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import torch.nn.functional as F
+ from fairseq import checkpoint_utils
+
+ # from models import SynthesizerTrn256#hifigan_nonsf
+ # from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
+ from infer.lib.infer_pack.models import (
+ SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
+ ) # hifigan_nsf
+ from scipy.io import wavfile
+
+ # from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
+ # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
+ # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_path = r"E:\codes\py39\vits_vc_gpu_train\assets\hubert\hubert_base.pt" #
+ logger.info("Load model(s) from {}".format(model_path))
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+ [model_path],
+ suffix="",
+ )
+ model = models[0]
+ model = model.to(device)
+ model = model.half()
+ model.eval()
+
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
+ net_g = SynthesizerTrn256(
+ 1025,
+ 32,
+ 192,
+ 192,
+ 768,
+ 2,
+ 6,
+ 3,
+ 0,
+ "1",
+ [3, 7, 11],
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+ [10, 10, 2, 2],
+ 512,
+ [16, 16, 4, 4],
+ 183,
+ 256,
+ is_half=True,
+ ) # hifigan#512#256#no_dropout
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
+ # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
+ #
+ # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
+ # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
+
+ # weights=torch.load("infer/ft-mi_1k-noD.pt")
+ # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
+ # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
+ # weights=torch.load("infer/ft-mi-sim1k.pt")
+ weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
+ logger.debug(net_g.load_state_dict(weights, strict=True))
+
+ net_g.eval().to(device)
+ net_g.half()
+
+
+ def get_f0(x, p_len, f0_up_key=0):
+ time_step = 160 / 16000 * 1000
+ f0_min = 50
+ f0_max = 1100
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+ f0 = (
+ parselmouth.Sound(x, 16000)
+ .to_pitch_ac(
+ time_step=time_step / 1000,
+ voicing_threshold=0.6,
+ pitch_floor=f0_min,
+ pitch_ceiling=f0_max,
+ )
+ .selected_array["frequency"]
+ )
+
+ pad_size = (p_len - len(f0) + 1) // 2
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
+ f0 *= pow(2, f0_up_key / 12)
+ f0bak = f0.copy()
+
+ f0_mel = 1127 * np.log(1 + f0 / 700)
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+ f0_mel_max - f0_mel_min
+ ) + 1
+ f0_mel[f0_mel <= 1] = 1
+ f0_mel[f0_mel > 255] = 255
+ # f0_mel[f0_mel > 188] = 188
+ f0_coarse = np.rint(f0_mel).astype(np.int32)
+ return f0_coarse, f0bak
+
+
+ import faiss
+
+ index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
+ big_npy = np.load("infer/big_src_feature_mi.npy")
+ ta0 = ta1 = ta2 = 0
+ for idx, name in enumerate(
+ [
+ "冬之花clip1.wav",
+ ]
+ ): ##
+ wav_path = "todo-songs/%s" % name #
+ f0_up_key = -2 #
+ audio, sampling_rate = sf.read(wav_path)
+ if len(audio.shape) > 1:
+ audio = librosa.to_mono(audio.transpose(1, 0))
+ if sampling_rate != 16000:
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
+
+ feats = torch.from_numpy(audio).float()
+ if feats.dim() == 2: # double channels
+ feats = feats.mean(-1)
+ assert feats.dim() == 1, feats.dim()
+ feats = feats.view(1, -1)
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+ inputs = {
+ "source": feats.half().to(device),
+ "padding_mask": padding_mask.to(device),
+ "output_layer": 9, # layer 9
+ }
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ t0 = ttime()
+ with torch.no_grad():
+ logits = model.extract_features(**inputs)
+ feats = model.final_proj(logits[0])
+
+ #### index optimization
+ npy = feats[0].cpu().numpy().astype("float32")
+ D, I = index.search(npy, 1)
+ feats = (
+ torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
+ )
+
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ t1 = ttime()
+ # p_len = min(feats.shape[1],10000,pitch.shape[0])  # too large -> out of GPU memory
+ p_len = min(feats.shape[1], 10000) #
+ pitch, pitchf = get_f0(audio, p_len, f0_up_key)
+ p_len = min(feats.shape[1], 10000, pitch.shape[0]) # too large -> out of GPU memory
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ t2 = ttime()
+ feats = feats[:, :p_len, :]
+ pitch = pitch[:p_len]
+ pitchf = pitchf[:p_len]
+ p_len = torch.LongTensor([p_len]).to(device)
+ pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
+ sid = torch.LongTensor([0]).to(device)
+ pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
+ with torch.no_grad():
+ audio = (
+ net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
+ .data.cpu()
+ .float()
+ .numpy()
+ ) # nsf
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
+ t3 = ttime()
+ ta0 += t1 - t0
+ ta1 += t2 - t1
+ ta2 += t3 - t2
+ # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
+ # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
+ # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
+ wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ##
+
+
+ logger.debug("%.2fs %.2fs %.2fs", ta0, ta1, ta2) #
tools/infer/train-index-v2.py ADDED
@@ -0,0 +1,79 @@
1
+ """
2
+ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个
3
+ """
4
+ import os
5
+ import traceback
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ from multiprocessing import cpu_count
11
+
12
+ import faiss
13
+ import numpy as np
14
+ from sklearn.cluster import MiniBatchKMeans
15
+
16
+ # ###########如果是原始特征要先写save
17
+ n_cpu = 0
18
+ if n_cpu == 0:
19
+ n_cpu = cpu_count()
20
+ inp_root = r"./logs/anz/3_feature768"
21
+ npys = []
22
+ listdir_res = list(os.listdir(inp_root))
23
+ for name in sorted(listdir_res):
24
+ phone = np.load("%s/%s" % (inp_root, name))
25
+ npys.append(phone)
26
+ big_npy = np.concatenate(npys, 0)
27
+ big_npy_idx = np.arange(big_npy.shape[0])
28
+ np.random.shuffle(big_npy_idx)
29
+ big_npy = big_npy[big_npy_idx]
30
+ logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G
31
+ if big_npy.shape[0] > 2e5:
32
+ # if(1):
33
+ info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0]
34
+ logger.info(info)
35
+ try:
36
+ big_npy = (
37
+ MiniBatchKMeans(
38
+ n_clusters=10000,
39
+ verbose=True,
40
+ batch_size=256 * n_cpu,
41
+ compute_labels=False,
42
+ init="random",
43
+ )
44
+ .fit(big_npy)
45
+ .cluster_centers_
46
+ )
47
+ except:
48
+ info = traceback.format_exc()
49
+ logger.warning(info)
50
+
51
+ np.save("tools/infer/big_src_feature_mi.npy", big_npy)
52
+
53
+ ##################train+add
54
+ # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
55
+ n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
56
+ index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf) # mi
57
+ logger.info("Training...")
58
+ index_ivf = faiss.extract_index_ivf(index) #
59
+ index_ivf.nprobe = 1
60
+ index.train(big_npy)
61
+ faiss.write_index(
62
+ index, "tools/infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
63
+ )
64
+ logger.info("Adding...")
65
+ batch_size_add = 8192
66
+ for i in range(0, big_npy.shape[0], batch_size_add):
67
+ index.add(big_npy[i : i + batch_size_add])
68
+ faiss.write_index(
69
+ index, "tools/infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf)
70
+ )
71
+ """
72
+ Sizes (all FP32)
73
+ big_src_feature 2.95G
74
+ (3098036, 256)
75
+ big_emb 4.43G
76
+ (6196072, 192)
77
+ big_emb is twice the size because computing the features requires repeating them and then appending pitch
78
+
79
+ """
tools/infer/train-index.py ADDED
@@ -0,0 +1,42 @@
1
+ """
2
+ Format: cid is used directly as the index's built-in position; aid does not fit, so it is looked up via a dict (there are only ~50k of them anyway)
3
+ """
4
+ import os
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ import faiss
10
+ import numpy as np
11
+
12
+ # ########### if starting from raw features, save them first
13
+ inp_root = r"E:\codes\py39\dataset\mi\2-co256"
14
+ npys = []
15
+ for name in sorted(list(os.listdir(inp_root))):
16
+ phone = np.load("%s/%s" % (inp_root, name))
17
+ npys.append(phone)
18
+ big_npy = np.concatenate(npys, 0)
19
+ logger.debug(big_npy.shape) # (6196072, 192)#fp32#4.43G
20
+ np.save("infer/big_src_feature_mi.npy", big_npy)
21
+
22
+ ##################train+add
23
+ # big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
24
+ logger.debug(big_npy.shape)
25
+ index = faiss.index_factory(256, "IVF512,Flat") # mi
26
+ logger.info("Training...")
27
+ index_ivf = faiss.extract_index_ivf(index) #
28
+ index_ivf.nprobe = 9
29
+ index.train(big_npy)
30
+ faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
31
+ logger.info("Adding...")
32
+ index.add(big_npy)
33
+ faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
34
+ """
35
+ Sizes (all FP32)
36
+ big_src_feature 2.95G
37
+ (3098036, 256)
38
+ big_emb 4.43G
39
+ (6196072, 192)
40
+ big_emb is twice the size because computing the features requires repeating them and then appending pitch
41
+
42
+ """
tools/infer/trans_weights.py ADDED
@@ -0,0 +1,18 @@
1
+ import pdb
2
+
3
+ import torch
4
+
5
+ # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
6
+ # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
7
+ # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
8
+ # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
9
+ a = torch.load(
10
+ r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth"
11
+ )[
12
+ "model"
13
+ ] # sim_nsf#
14
+ for key in a.keys():
15
+ a[key] = a[key].half()
16
+ # torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
17
+ # torch.save(a,"ft-mi-sim1k.pt")#
18
+ torch.save(a, "ft-mi-no_opt-no_dropout.pt") #
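
A quick sanity check one might run after the conversion above; the file name matches what the script saves, but the check itself is not part of the commit:

    import torch

    state = torch.load("ft-mi-no_opt-no_dropout.pt", map_location="cpu")
    assert all(v.dtype == torch.float16 for v in state.values())  # every tensor was halved
    print("%d tensors converted to fp16" % len(state))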
tools/infer_batch_rvc.py ADDED
@@ -0,0 +1,72 @@
1
+ import argparse
2
+ import os
3
+ import sys
4
+
5
+ print("Command-line arguments:", sys.argv)
6
+
7
+ now_dir = os.getcwd()
8
+ sys.path.append(now_dir)
9
+
10
+
11
+ import tqdm as tq
12
+ from dotenv import load_dotenv
13
+ from scipy.io import wavfile
14
+
15
+ from configs.config import Config
16
+ from infer.modules.vc.modules import VC
17
+
18
+
19
+ def arg_parse() -> argparse.Namespace:
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--f0up_key", type=int, default=0)
22
+ parser.add_argument("--input_path", type=str, help="input path")
23
+ parser.add_argument("--index_path", type=str, help="index path")
24
+ parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
25
+ parser.add_argument("--opt_path", type=str, help="opt path")
26
+ parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
27
+ parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
28
+ parser.add_argument("--device", type=str, help="device")
29
+ parser.add_argument("--is_half", type=bool, help="use half -> True")
30
+ parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
31
+ parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
32
+ parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
33
+ parser.add_argument("--protect", type=float, default=0.33, help="protect")
34
+
35
+ args = parser.parse_args()
36
+ sys.argv = sys.argv[:1]
37
+
38
+ return args
39
+
40
+
41
+ def main():
42
+ load_dotenv()
43
+ args = arg_parse()
44
+ config = Config()
45
+ config.device = args.device if args.device else config.device
46
+ config.is_half = args.is_half if args.is_half else config.is_half
47
+ vc = VC(config)
48
+ vc.get_vc(args.model_name)
49
+ audios = os.listdir(args.input_path)
50
+ for file in tq.tqdm(audios):
51
+ if file.endswith(".wav"):
52
+ file_path = os.path.join(args.input_path, file)
53
+ _, wav_opt = vc.vc_single(
54
+ 0,
55
+ file_path,
56
+ args.f0up_key,
57
+ None,
58
+ args.f0method,
59
+ args.index_path,
60
+ None,
61
+ args.index_rate,
62
+ args.filter_radius,
63
+ args.resample_sr,
64
+ args.rms_mix_rate,
65
+ args.protect,
66
+ )
67
+ out_path = os.path.join(args.opt_path, file)
68
+ wavfile.write(out_path, wav_opt[0], wav_opt[1])
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
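
An illustrative invocation, built from the flags defined in arg_parse() above; the model and path names are placeholders:

    python tools/infer_batch_rvc.py \
        --model_name my_model.pth \
        --input_path ./input_wavs \
        --opt_path ./output_wavs \
        --index_path ./logs/my_model/added.index \
        --f0method harvest --f0up_key 0 --index_rate 0.66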
tools/infer_cli.py ADDED
@@ -0,0 +1,67 @@
1
+ import argparse
2
+ import os
3
+ import sys
4
+
5
+ now_dir = os.getcwd()
6
+ sys.path.append(now_dir)
7
+ from dotenv import load_dotenv
8
+ from scipy.io import wavfile
9
+
10
+ from configs.config import Config
11
+ from infer.modules.vc.modules import VC
12
+
13
+ ####
14
+ # USAGE
15
+ #
16
+ # In your terminal, e.g. (model and paths illustrative): python tools/infer_cli.py --model_name my_model.pth --input_path input.wav --opt_path output.wav --index_path logs/my_model/added.index
17
+
18
+
19
+ def arg_parse() -> argparse.Namespace:
20
+ parser = argparse.ArgumentParser()
21
+ parser.add_argument("--f0up_key", type=int, default=0)
22
+ parser.add_argument("--input_path", type=str, help="input path")
23
+ parser.add_argument("--index_path", type=str, help="index path")
24
+ parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm")
25
+ parser.add_argument("--opt_path", type=str, help="opt path")
26
+ parser.add_argument("--model_name", type=str, help="store in assets/weight_root")
27
+ parser.add_argument("--index_rate", type=float, default=0.66, help="index rate")
28
+ parser.add_argument("--device", type=str, help="device")
29
+ parser.add_argument("--is_half", type=bool, help="use half -> True")
30
+ parser.add_argument("--filter_radius", type=int, default=3, help="filter radius")
31
+ parser.add_argument("--resample_sr", type=int, default=0, help="resample sr")
32
+ parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate")
33
+ parser.add_argument("--protect", type=float, default=0.33, help="protect")
34
+
35
+ args = parser.parse_args()
36
+ sys.argv = sys.argv[:1]
37
+
38
+ return args
39
+
40
+
41
+ def main():
42
+ load_dotenv()
43
+ args = arg_parse()
44
+ config = Config()
45
+ config.device = args.device if args.device else config.device
46
+ config.is_half = args.is_half if args.is_half else config.is_half
47
+ vc = VC(config)
48
+ vc.get_vc(args.model_name)
49
+ _, wav_opt = vc.vc_single(
50
+ 0,
51
+ args.input_path,
52
+ args.f0up_key,
53
+ None,
54
+ args.f0method,
55
+ args.index_path,
56
+ None,
57
+ args.index_rate,
58
+ args.filter_radius,
59
+ args.resample_sr,
60
+ args.rms_mix_rate,
61
+ args.protect,
62
+ )
63
+ wavfile.write(args.opt_path, wav_opt[0], wav_opt[1])
64
+
65
+
66
+ if __name__ == "__main__":
67
+ main()
tools/onnx_inference_demo.py ADDED
@@ -0,0 +1,21 @@
1
+ import soundfile
2
+
3
+ from infer.lib.infer_pack.onnx_inference import OnnxRVC  # absolute import so the demo can run as a script from the repo root
4
+
5
+ hop_size = 512
6
+ sampling_rate = 40000  # sampling rate
7
+ f0_up_key = 0  # pitch shift in semitones
8
+ sid = 0  # speaker/character ID
9
+ f0_method = "dio"  # F0 extraction algorithm
10
+ model_path = "ShirohaRVC.onnx"  # full path to the model
11
+ vec_name = "vec-256-layer-9"  # automatically expanded internally to f"pretrained/{vec_name}.onnx"; an ONNX vec model is required
12
+ wav_path = "123.wav"  # input path or BytesIO instance
13
+ out_path = "out.wav"  # output path or BytesIO instance
14
+
15
+ model = OnnxRVC(
16
+ model_path, vec_path=vec_name, sr=sampling_rate, hop_size=hop_size, device="cuda"
17
+ )
18
+
19
+ audio = model.inference(wav_path, sid, f0_method=f0_method, f0_up_key=f0_up_key)
20
+
21
+ soundfile.write(out_path, audio, sampling_rate)
tools/rvc_for_realtime.py ADDED
@@ -0,0 +1,425 @@
1
+ from io import BytesIO
2
+ import os
3
+ import pickle
4
+ import sys
5
+ import traceback
6
+ from infer.lib import jit
7
+ from infer.lib.jit.get_synthesizer import get_synthesizer
8
+ from time import time as ttime
9
+ import fairseq
10
+ import faiss
11
+ import numpy as np
12
+ import parselmouth
13
+ import pyworld
14
+ import scipy.signal as signal
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import torchcrepe
19
+
20
+ from infer.lib.infer_pack.models import (
21
+ SynthesizerTrnMs256NSFsid,
22
+ SynthesizerTrnMs256NSFsid_nono,
23
+ SynthesizerTrnMs768NSFsid,
24
+ SynthesizerTrnMs768NSFsid_nono,
25
+ )
26
+
27
+ now_dir = os.getcwd()
28
+ sys.path.append(now_dir)
29
+ from multiprocessing import Manager as M
30
+
31
+ from configs.config import Config
32
+
33
+ # config = Config()
34
+
35
+ mm = M()
36
+
37
+
38
+ def printt(strr, *args):
39
+ if len(args) == 0:
40
+ print(strr)
41
+ else:
42
+ print(strr % args)
43
+
44
+
45
+ # config.device=torch.device("cpu")  ######## force CPU for testing
46
+ # config.is_half=False  ######## force CPU for testing
47
+ class RVC:
48
+ def __init__(
49
+ self,
50
+ key,
51
+ pth_path,
52
+ index_path,
53
+ index_rate,
54
+ n_cpu,
55
+ inp_q,
56
+ opt_q,
57
+ config: Config,
58
+ last_rvc=None,
59
+ ) -> None:
60
+ """
61
+ Initialize
62
+ """
63
+ try:
64
+ if config.dml:
65
+
66
+ def forward_dml(ctx, x, scale):
67
+ ctx.scale = scale
68
+ res = x.clone().detach()
69
+ return res
70
+
71
+ fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml
72
+ # global config
73
+ self.config = config
74
+ self.inp_q = inp_q
75
+ self.opt_q = opt_q
76
+ # device="cpu"########强制cpu测试
77
+ self.device = config.device
78
+ self.f0_up_key = key
79
+ self.time_step = 160 / 16000 * 1000
80
+ self.f0_min = 50
81
+ self.f0_max = 1100
82
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
83
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
84
+ self.sr = 16000
85
+ self.window = 160
86
+ self.n_cpu = n_cpu
87
+ self.use_jit = self.config.use_jit
88
+ self.is_half = config.is_half
89
+
90
+ if index_rate != 0:
91
+ self.index = faiss.read_index(index_path)
92
+ self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
93
+ printt("Index search enabled")
94
+ self.pth_path: str = pth_path
95
+ self.index_path = index_path
96
+ self.index_rate = index_rate
97
+
98
+ if last_rvc is None:
99
+ models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
100
+ ["assets/hubert/hubert_base.pt"],
101
+ suffix="",
102
+ )
103
+ hubert_model = models[0]
104
+ hubert_model = hubert_model.to(self.device)
105
+ if self.is_half:
106
+ hubert_model = hubert_model.half()
107
+ else:
108
+ hubert_model = hubert_model.float()
109
+ hubert_model.eval()
110
+ self.model = hubert_model
111
+ else:
112
+ self.model = last_rvc.model
113
+
114
+ self.net_g: nn.Module = None
115
+
116
+ def set_default_model():
117
+ self.net_g, cpt = get_synthesizer(self.pth_path, self.device)
118
+ self.tgt_sr = cpt["config"][-1]
119
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
120
+ self.if_f0 = cpt.get("f0", 1)
121
+ self.version = cpt.get("version", "v1")
122
+ if self.is_half:
123
+ self.net_g = self.net_g.half()
124
+ else:
125
+ self.net_g = self.net_g.float()
126
+
127
+ def set_jit_model():
128
+ jit_pth_path = self.pth_path[: -len(".pth")]  # strip the ".pth" suffix; rstrip(".pth") would strip characters, not the suffix
129
+ jit_pth_path += ".half.jit" if self.is_half else ".jit"
130
+ reload = False
131
+ if str(self.device) == "cuda":
132
+ self.device = torch.device("cuda:0")
133
+ if os.path.exists(jit_pth_path):
134
+ cpt = jit.load(jit_pth_path)
135
+ model_device = cpt["device"]
136
+ if model_device != str(self.device):
137
+ reload = True
138
+ else:
139
+ reload = True
140
+
141
+ if reload:
142
+ cpt = jit.synthesizer_jit_export(
143
+ self.pth_path,
144
+ "script",
145
+ None,
146
+ device=self.device,
147
+ is_half=self.is_half,
148
+ )
149
+
150
+ self.tgt_sr = cpt["config"][-1]
151
+ self.if_f0 = cpt.get("f0", 1)
152
+ self.version = cpt.get("version", "v1")
153
+ self.net_g = torch.jit.load(
154
+ BytesIO(cpt["model"]), map_location=self.device
155
+ )
156
+ self.net_g.infer = self.net_g.forward
157
+ self.net_g.eval().to(self.device)
158
+
159
+ def set_synthesizer():
160
+ if self.use_jit and not config.dml:
161
+ if self.is_half and "cpu" in str(self.device):
162
+ printt(
163
+ "Use default Synthesizer model. \
164
+ Jit is not supported on the CPU with half-precision floats"
165
+ )
166
+ set_default_model()
167
+ else:
168
+ set_jit_model()
169
+ else:
170
+ set_default_model()
171
+
172
+ if last_rvc is None or last_rvc.pth_path != self.pth_path:
173
+ set_synthesizer()
174
+ else:
175
+ self.tgt_sr = last_rvc.tgt_sr
176
+ self.if_f0 = last_rvc.if_f0
177
+ self.version = last_rvc.version
178
+ self.is_half = last_rvc.is_half
179
+ if last_rvc.use_jit != self.use_jit:
180
+ set_synthesizer()
181
+ else:
182
+ self.net_g = last_rvc.net_g
183
+
184
+ if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"):
185
+ self.model_rmvpe = last_rvc.model_rmvpe
186
+ except Exception:
187
+ printt(traceback.format_exc())
188
+
189
+ def change_key(self, new_key):
190
+ self.f0_up_key = new_key
191
+
192
+ def change_index_rate(self, new_index_rate):
193
+ if new_index_rate != 0 and self.index_rate == 0:
194
+ self.index = faiss.read_index(self.index_path)
195
+ self.big_npy = self.index.reconstruct_n(0, self.index.ntotal)
196
+ printt("Index search enabled")
197
+ self.index_rate = new_index_rate
198
+
199
+ def get_f0_post(self, f0):
200
+ f0_min = self.f0_min
201
+ f0_max = self.f0_max
202
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
203
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
204
+ f0bak = f0.copy()
205
+ f0_mel = 1127 * np.log(1 + f0 / 700)
206
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
207
+ f0_mel_max - f0_mel_min
208
+ ) + 1
209
+ f0_mel[f0_mel <= 1] = 1
210
+ f0_mel[f0_mel > 255] = 255
211
+ f0_coarse = np.rint(f0_mel).astype(np.int32)
212
+ return f0_coarse, f0bak
213
+
214
+ def get_f0(self, x, f0_up_key, n_cpu, method="harvest"):
215
+ n_cpu = int(n_cpu)
216
+ if method == "crepe":
217
+ return self.get_f0_crepe(x, f0_up_key)
218
+ if method == "rmvpe":
219
+ return self.get_f0_rmvpe(x, f0_up_key)
220
+ if method == "pm":
221
+ p_len = x.shape[0] // 160 + 1
222
+ f0 = (
223
+ parselmouth.Sound(x, 16000)
224
+ .to_pitch_ac(
225
+ time_step=0.01,
226
+ voicing_threshold=0.6,
227
+ pitch_floor=50,
228
+ pitch_ceiling=1100,
229
+ )
230
+ .selected_array["frequency"]
231
+ )
232
+
233
+ pad_size = (p_len - len(f0) + 1) // 2
234
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
235
+ # printt(pad_size, p_len - len(f0) - pad_size)
236
+ f0 = np.pad(
237
+ f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
238
+ )
239
+
240
+ f0 *= pow(2, f0_up_key / 12)
241
+ return self.get_f0_post(f0)
242
+ if n_cpu == 1:
243
+ f0, t = pyworld.harvest(
244
+ x.astype(np.double),
245
+ fs=16000,
246
+ f0_ceil=1100,
247
+ f0_floor=50,
248
+ frame_period=10,
249
+ )
250
+ f0 = signal.medfilt(f0, 3)
251
+ f0 *= pow(2, f0_up_key / 12)
252
+ return self.get_f0_post(f0)
253
+ f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64)
254
+ length = len(x)
255
+ part_length = 160 * ((length // 160 - 1) // n_cpu + 1)
256
+ n_cpu = (length // 160 - 1) // (part_length // 160) + 1
257
+ ts = ttime()
258
+ res_f0 = mm.dict()
259
+ for idx in range(n_cpu):
260
+ tail = part_length * (idx + 1) + 320
261
+ if idx == 0:
262
+ self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts))
263
+ else:
264
+ self.inp_q.put(
265
+ (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts)
266
+ )
267
+ while 1:
268
+ res_ts = self.opt_q.get()
269
+ if res_ts == ts:
270
+ break
271
+ f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])]
272
+ for idx, f0 in enumerate(f0s):
273
+ if idx == 0:
274
+ f0 = f0[:-3]
275
+ elif idx != n_cpu - 1:
276
+ f0 = f0[2:-3]
277
+ else:
278
+ f0 = f0[2:]
279
+ f0bak[
280
+ part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]
281
+ ] = f0
282
+ f0bak = signal.medfilt(f0bak, 3)
283
+ f0bak *= pow(2, f0_up_key / 12)
284
+ return self.get_f0_post(f0bak)
285
+
286
+ def get_f0_crepe(self, x, f0_up_key):
287
+ if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替
288
+ return self.get_f0(x, f0_up_key, 1, "pm")
289
+ audio = torch.tensor(np.copy(x))[None].float()
290
+ # printt("using crepe,device:%s"%self.device)
291
+ f0, pd = torchcrepe.predict(
292
+ audio,
293
+ self.sr,
294
+ 160,
295
+ self.f0_min,
296
+ self.f0_max,
297
+ "full",
298
+ batch_size=512,
299
+ # device=self.device if self.device.type!="privateuseone" else "cpu",  ### crepe runs fully in fp32 (no half precision), so no concern there  ### CPU latency is too high to be usable
300
+ device=self.device,
301
+ return_periodicity=True,
302
+ )
303
+ pd = torchcrepe.filter.median(pd, 3)
304
+ f0 = torchcrepe.filter.mean(f0, 3)
305
+ f0[pd < 0.1] = 0
306
+ f0 = f0[0].cpu().numpy()
307
+ f0 *= pow(2, f0_up_key / 12)
308
+ return self.get_f0_post(f0)
309
+
310
+ def get_f0_rmvpe(self, x, f0_up_key):
311
+ if hasattr(self, "model_rmvpe") == False:
312
+ from infer.lib.rmvpe import RMVPE
313
+
314
+ printt("Loading rmvpe model")
315
+ self.model_rmvpe = RMVPE(
316
+ # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑
317
+ # "rmvpe.pt", is_half=False, device=self.device####dml配置
318
+ # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置
319
+ "assets/rmvpe/rmvpe.pt",
320
+ is_half=self.is_half,
321
+ device=self.device, ####正常逻辑
322
+ use_jit=self.config.use_jit,
323
+ )
324
+ # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device)
325
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
326
+ f0 *= pow(2, f0_up_key / 12)
327
+ return self.get_f0_post(f0)
328
+
329
+ def infer(
330
+ self,
331
+ feats: torch.Tensor,
332
+ indata: np.ndarray,
333
+ block_frame_16k,
334
+ rate,
335
+ cache_pitch,
336
+ cache_pitchf,
337
+ f0method,
338
+ ) -> np.ndarray:
339
+ feats = feats.view(1, -1)
340
+ if self.config.is_half:
341
+ feats = feats.half()
342
+ else:
343
+ feats = feats.float()
344
+ feats = feats.to(self.device)
345
+ t1 = ttime()
346
+ with torch.no_grad():
347
+ padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
348
+ inputs = {
349
+ "source": feats,
350
+ "padding_mask": padding_mask,
351
+ "output_layer": 9 if self.version == "v1" else 12,
352
+ }
353
+ logits = self.model.extract_features(**inputs)
354
+ feats = (
355
+ self.model.final_proj(logits[0]) if self.version == "v1" else logits[0]
356
+ )
357
+ feats = F.pad(feats, (0, 0, 1, 0))
358
+ t2 = ttime()
359
+ try:
360
+ if hasattr(self, "index") and self.index_rate != 0:
361
+ leng_replace_head = int(rate * feats[0].shape[0])
362
+ npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32")
363
+ score, ix = self.index.search(npy, k=8)
364
+ weight = np.square(1 / score)
365
+ weight /= weight.sum(axis=1, keepdims=True)
366
+ npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
367
+ if self.config.is_half:
368
+ npy = npy.astype("float16")
369
+ feats[0][-leng_replace_head:] = (
370
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate
371
+ + (1 - self.index_rate) * feats[0][-leng_replace_head:]
372
+ )
373
+ else:
374
+ printt("Index search FAILED or disabled")
375
+ except Exception:
376
+ traceback.print_exc()
377
+ printt("Index search FAILED")
378
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
379
+ t3 = ttime()
380
+ if self.if_f0 == 1:
381
+ pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method)
382
+ start_frame = block_frame_16k // 160
383
+ end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame
384
+ cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1])
385
+ cache_pitchf[:] = np.append(
386
+ cache_pitchf[start_frame:end_frame], pitchf[3:-1]
387
+ )
388
+ p_len = min(feats.shape[1], 13000, cache_pitch.shape[0])
389
+ else:
390
+ cache_pitch, cache_pitchf = None, None
391
+ p_len = min(feats.shape[1], 13000)
392
+ t4 = ttime()
393
+ feats = feats[:, :p_len, :]
394
+ if self.if_f0 == 1:
395
+ cache_pitch = cache_pitch[:p_len]
396
+ cache_pitchf = cache_pitchf[:p_len]
397
+ cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device)
398
+ cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device)
399
+ p_len = torch.LongTensor([p_len]).to(self.device)
400
+ ii = 0 # sid
401
+ sid = torch.LongTensor([ii]).to(self.device)
402
+ with torch.no_grad():
403
+ if self.if_f0 == 1:
404
+ # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2)
405
+ infered_audio = self.net_g.infer(
406
+ feats,
407
+ p_len,
408
+ cache_pitch,
409
+ cache_pitchf,
410
+ sid,
411
+ torch.FloatTensor([rate]),
412
+ )[0][0, 0].data.float()
413
+ else:
414
+ infered_audio = self.net_g.infer(
415
+ feats, p_len, sid, torch.FloatTensor([rate])
416
+ )[0][0, 0].data.float()
417
+ t5 = ttime()
418
+ printt(
419
+ "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs",
420
+ t2 - t1,
421
+ t3 - t2,
422
+ t4 - t3,
423
+ t5 - t4,
424
+ )
425
+ return infered_audio
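
A minimal construction sketch for this class outside the realtime GUI, assuming the rmvpe f0 method and index_rate=0 so the harvest worker queues are never drained (the queue objects are still required by the constructor; model and index paths are placeholders):

    import multiprocessing as mp

    from configs.config import Config
    from tools.rvc_for_realtime import RVC

    config = Config()
    inp_q, opt_q = mp.Queue(), mp.Queue()  # consumed only by the harvest worker processes
    rvc = RVC(
        key=0,                                   # pitch shift in semitones
        pth_path="assets/weights/my_model.pth",  # placeholder
        index_path="logs/my_model/added.index",  # placeholder
        index_rate=0.0,                          # 0 skips faiss.read_index entirely
        n_cpu=1,
        inp_q=inp_q,
        opt_q=opt_q,
        config=config,
    )
    rvc.change_key(2)  # shift pitch up two semitones on the fly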
tools/torchgate/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """
2
+ TorchGating is a PyTorch-based implementation of Spectral Gating
3
+ ================================================
4
+ Author: Asaf Zorea
5
+
6
+ Contents
7
+ --------
8
+ torchgate imports all the functions from PyTorch, and in addition provides:
9
+ TorchGating --- A PyTorch module that applies a spectral gate to an input signal
10
+
11
+ """
12
+ from .torchgate import TorchGate
tools/torchgate/torchgate.py ADDED
@@ -0,0 +1,280 @@
1
+ import torch
2
+ from infer.lib.rmvpe import STFT
3
+ from torch.nn.functional import conv1d, conv2d
4
+ from typing import Union, Optional
5
+ from .utils import linspace, temperature_sigmoid, amp_to_db
6
+
7
+
8
+ class TorchGate(torch.nn.Module):
9
+ """
10
+ A PyTorch module that applies a spectral gate to an input signal.
11
+
12
+ Arguments:
13
+ sr {int} -- Sample rate of the input signal.
14
+ nonstationary {bool} -- Whether to use non-stationary or stationary masking (default: {False}).
15
+ n_std_thresh_stationary {float} -- Number of standard deviations above mean to threshold noise for
16
+ stationary masking (default: {1.5}).
17
+ n_thresh_nonstationary {float} -- Multiplier above the smoothed magnitude spectrogram used as the threshold for
18
+ non-stationary masking (default: {1.3}).
19
+ temp_coeff_nonstationary {float} -- Temperature coefficient for non-stationary masking (default: {0.1}).
20
+ n_movemean_nonstationary {int} -- Number of samples for moving average smoothing in non-stationary masking
21
+ (default: {20}).
22
+ prop_decrease {float} -- Proportion to decrease signal by where the mask is zero (default: {1.0}).
23
+ n_fft {int} -- Size of FFT for STFT (default: {1024}).
24
+ win_length {[int]} -- Window length for STFT. If None, defaults to `n_fft` (default: {None}).
25
+ hop_length {[int]} -- Hop length for STFT. If None, defaults to `win_length` // 4 (default: {None}).
26
+ freq_mask_smooth_hz {float} -- Frequency smoothing width for mask (in Hz). If None, no smoothing is applied
27
+ (default: {500}).
28
+ time_mask_smooth_ms {float} -- Time smoothing width for mask (in ms). If None, no smoothing is applied
29
+ (default: {50}).
30
+ """
31
+
32
+ @torch.no_grad()
33
+ def __init__(
34
+ self,
35
+ sr: int,
36
+ nonstationary: bool = False,
37
+ n_std_thresh_stationary: float = 1.5,
38
+ n_thresh_nonstationary: float = 1.3,
39
+ temp_coeff_nonstationary: float = 0.1,
40
+ n_movemean_nonstationary: int = 20,
41
+ prop_decrease: float = 1.0,
42
+ n_fft: int = 1024,
43
+ win_length: Optional[int] = None,
44
+ hop_length: Optional[int] = None,
45
+ freq_mask_smooth_hz: float = 500,
46
+ time_mask_smooth_ms: float = 50,
47
+ ):
48
+ super().__init__()
49
+
50
+ # General Params
51
+ self.sr = sr
52
+ self.nonstationary = nonstationary
53
+ assert 0.0 <= prop_decrease <= 1.0
54
+ self.prop_decrease = prop_decrease
55
+
56
+ # STFT Params
57
+ self.n_fft = n_fft
58
+ self.win_length = self.n_fft if win_length is None else win_length
59
+ self.hop_length = self.win_length // 4 if hop_length is None else hop_length
60
+
61
+ # Stationary Params
62
+ self.n_std_thresh_stationary = n_std_thresh_stationary
63
+
64
+ # Non-Stationary Params
65
+ self.temp_coeff_nonstationary = temp_coeff_nonstationary
66
+ self.n_movemean_nonstationary = n_movemean_nonstationary
67
+ self.n_thresh_nonstationary = n_thresh_nonstationary
68
+
69
+ # Smooth Mask Params
70
+ self.freq_mask_smooth_hz = freq_mask_smooth_hz
71
+ self.time_mask_smooth_ms = time_mask_smooth_ms
72
+ self.register_buffer("smoothing_filter", self._generate_mask_smoothing_filter())
73
+
74
+ @torch.no_grad()
75
+ def _generate_mask_smoothing_filter(self) -> Union[torch.Tensor, None]:
76
+ """
77
+ Generates the 2D filter used to smooth the spectral-gate mask.
78
+
79
+ Returns:
80
+ smoothing_filter (torch.Tensor): a 2D tensor representing the smoothing filter,
81
+ with shape (n_grad_freq, n_grad_time), where n_grad_freq is the number of frequency
82
+ bins to smooth and n_grad_time is the number of time frames to smooth.
83
+ If both self.freq_mask_smooth_hz and self.time_mask_smooth_ms are None, returns None.
84
+ """
85
+ if self.freq_mask_smooth_hz is None and self.time_mask_smooth_ms is None:
86
+ return None
87
+
88
+ n_grad_freq = (
89
+ 1
90
+ if self.freq_mask_smooth_hz is None
91
+ else int(self.freq_mask_smooth_hz / (self.sr / (self.n_fft / 2)))
92
+ )
93
+ if n_grad_freq < 1:
94
+ raise ValueError(
95
+ f"freq_mask_smooth_hz needs to be at least {int((self.sr / (self._n_fft / 2)))} Hz"
96
+ )
97
+
98
+ n_grad_time = (
99
+ 1
100
+ if self.time_mask_smooth_ms is None
101
+ else int(self.time_mask_smooth_ms / ((self.hop_length / self.sr) * 1000))
102
+ )
103
+ if n_grad_time < 1:
104
+ raise ValueError(
105
+ f"time_mask_smooth_ms needs to be at least {int((self.hop_length / self.sr) * 1000)} ms"
106
+ )
107
+
108
+ if n_grad_time == 1 and n_grad_freq == 1:
109
+ return None
110
+
111
+ v_f = torch.cat(
112
+ [
113
+ linspace(0, 1, n_grad_freq + 1, endpoint=False),
114
+ linspace(1, 0, n_grad_freq + 2),
115
+ ]
116
+ )[1:-1]
117
+ v_t = torch.cat(
118
+ [
119
+ linspace(0, 1, n_grad_time + 1, endpoint=False),
120
+ linspace(1, 0, n_grad_time + 2),
121
+ ]
122
+ )[1:-1]
123
+ smoothing_filter = torch.outer(v_f, v_t).unsqueeze(0).unsqueeze(0)
124
+
125
+ return smoothing_filter / smoothing_filter.sum()
126
+
127
+ @torch.no_grad()
128
+ def _stationary_mask(
129
+ self, X_db: torch.Tensor, xn: Optional[torch.Tensor] = None
130
+ ) -> torch.Tensor:
131
+ """
132
+ Computes a stationary binary mask to filter out noise in a log-magnitude spectrogram.
133
+
134
+ Arguments:
135
+ X_db (torch.Tensor): 2D tensor of shape (frames, freq_bins) containing the log-magnitude spectrogram.
136
+ xn (torch.Tensor): 1D tensor containing the audio signal corresponding to X_db.
137
+
138
+ Returns:
139
+ sig_mask (torch.Tensor): Binary mask of the same shape as X_db, where values greater than the threshold
140
+ are set to 1, and the rest are set to 0.
141
+ """
142
+ if xn is not None:
143
+ if "privateuseone" in str(xn.device):
144
+ if not hasattr(self, "stft"):
145
+ self.stft = STFT(
146
+ filter_length=self.n_fft,
147
+ hop_length=self.hop_length,
148
+ win_length=self.win_length,
149
+ window="hann",
150
+ ).to(xn.device)
151
+ XN = self.stft.transform(xn)
152
+ else:
153
+ XN = torch.stft(
154
+ xn,
155
+ n_fft=self.n_fft,
156
+ hop_length=self.hop_length,
157
+ win_length=self.win_length,
158
+ return_complex=True,
159
+ pad_mode="constant",
160
+ center=True,
161
+ window=torch.hann_window(self.win_length).to(xn.device),
162
+ )
163
+ XN_db = amp_to_db(XN).to(dtype=X_db.dtype)
164
+ else:
165
+ XN_db = X_db
166
+
167
+ # calculate mean and standard deviation along the frequency axis
168
+ std_freq_noise, mean_freq_noise = torch.std_mean(XN_db, dim=-1)
169
+
170
+ # compute noise threshold
171
+ noise_thresh = mean_freq_noise + std_freq_noise * self.n_std_thresh_stationary
172
+
173
+ # create binary mask by thresholding the spectrogram
174
+ sig_mask = X_db > noise_thresh.unsqueeze(2)
175
+ return sig_mask
176
+
177
+ @torch.no_grad()
178
+ def _nonstationary_mask(self, X_abs: torch.Tensor) -> torch.Tensor:
179
+ """
180
+ Computes a non-stationary binary mask to filter out noise in a log-magnitude spectrogram.
181
+
182
+ Arguments:
183
+ X_abs (torch.Tensor): 2D tensor of shape (frames, freq_bins) containing the magnitude spectrogram.
184
+
185
+ Returns:
186
+ sig_mask (torch.Tensor): Binary mask of the same shape as X_abs, where values greater than the threshold
187
+ are set to 1, and the rest are set to 0.
188
+ """
189
+ X_smoothed = (
190
+ conv1d(
191
+ X_abs.reshape(-1, 1, X_abs.shape[-1]),
192
+ torch.ones(
193
+ self.n_movemean_nonstationary,
194
+ dtype=X_abs.dtype,
195
+ device=X_abs.device,
196
+ ).view(1, 1, -1),
197
+ padding="same",
198
+ ).view(X_abs.shape)
199
+ / self.n_movemean_nonstationary
200
+ )
201
+
202
+ # Compute slowness ratio and apply temperature sigmoid
203
+ slowness_ratio = (X_abs - X_smoothed) / (X_smoothed + 1e-6)
204
+ sig_mask = temperature_sigmoid(
205
+ slowness_ratio, self.n_thresh_nonstationary, self.temp_coeff_nonstationary
206
+ )
207
+
208
+ return sig_mask
209
+
210
+ def forward(
211
+ self, x: torch.Tensor, xn: Optional[torch.Tensor] = None
212
+ ) -> torch.Tensor:
213
+ """
214
+ Apply the proposed algorithm to the input signal.
215
+
216
+ Arguments:
217
+ x (torch.Tensor): The input audio signal, with shape (batch_size, signal_length).
218
+ xn (Optional[torch.Tensor]): The noise signal used for stationary noise reduction. If `None`, the input
219
+ signal is used as the noise signal. Default: `None`.
220
+
221
+ Returns:
222
+ torch.Tensor: The denoised audio signal, with the same shape as the input signal.
223
+ """
224
+
225
+ # Compute short-time Fourier transform (STFT)
226
+ if "privateuseone" in str(x.device):
227
+ if not hasattr(self, "stft"):
228
+ self.stft = STFT(
229
+ filter_length=self.n_fft,
230
+ hop_length=self.hop_length,
231
+ win_length=self.win_length,
232
+ window="hann",
233
+ ).to(x.device)
234
+ X, phase = self.stft.transform(x, return_phase=True)
235
+ else:
236
+ X = torch.stft(
237
+ x,
238
+ n_fft=self.n_fft,
239
+ hop_length=self.hop_length,
240
+ win_length=self.win_length,
241
+ return_complex=True,
242
+ pad_mode="constant",
243
+ center=True,
244
+ window=torch.hann_window(self.win_length).to(x.device),
245
+ )
246
+
247
+ # Compute signal mask based on stationary or nonstationary assumptions
248
+ if self.nonstationary:
249
+ sig_mask = self._nonstationary_mask(X.abs())
250
+ else:
251
+ sig_mask = self._stationary_mask(amp_to_db(X), xn)
252
+
253
+ # Propagate decrease in signal power
254
+ sig_mask = self.prop_decrease * (sig_mask.float() - 1.0) + 1.0
255
+
256
+ # Smooth signal mask with 2D convolution
257
+ if self.smoothing_filter is not None:
258
+ sig_mask = conv2d(
259
+ sig_mask.unsqueeze(1),
260
+ self.smoothing_filter.to(sig_mask.dtype),
261
+ padding="same",
262
+ )
263
+
264
+ # Apply signal mask to STFT magnitude and phase components
265
+ Y = X * sig_mask.squeeze(1)
266
+
267
+ # Inverse STFT to obtain time-domain signal
268
+ if "privateuseone" in str(Y.device):
269
+ y = self.stft.inverse(Y, phase)
270
+ else:
271
+ y = torch.istft(
272
+ Y,
273
+ n_fft=self.n_fft,
274
+ hop_length=self.hop_length,
275
+ win_length=self.win_length,
276
+ center=True,
277
+ window=torch.hann_window(self.win_length).to(Y.device),
278
+ )
279
+
280
+ return y.to(dtype=x.dtype)
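
A minimal usage sketch, following the shapes documented in forward() above:

    import torch

    from tools.torchgate import TorchGate

    tg = TorchGate(sr=16000, nonstationary=False, prop_decrease=1.0)
    x = torch.randn(2, 16000)  # (batch_size, signal_length): one second per item
    y = tg(x)                  # stationary gating; x itself serves as the noise estimate
    # y keeps the batch size; its length can differ from x by a fraction of one window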
tools/torchgate/utils.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+ from torch.types import Number
3
+
4
+
5
+ @torch.no_grad()
6
+ def amp_to_db(
7
+ x: torch.Tensor, eps=torch.finfo(torch.float64).eps, top_db=40
8
+ ) -> torch.Tensor:
9
+ """
10
+ Convert the input tensor from amplitude to decibel scale.
11
+
12
+ Arguments:
13
+ x {[torch.Tensor]} -- [Input tensor.]
14
+
15
+ Keyword Arguments:
16
+ eps {[float]} -- [Small value to avoid numerical instability.]
17
+ (default: {torch.finfo(torch.float64).eps})
18
+ top_db {[float]} -- [threshold the output at ``top_db`` below the peak]
19
+ (default: {40})
20
+
21
+ Returns:
22
+ [torch.Tensor] -- [Output tensor in decibel scale.]
23
+ """
24
+ x_db = 20 * torch.log10(x.abs() + eps)
25
+ return torch.max(x_db, (x_db.max(-1).values - top_db).unsqueeze(-1))
26
+
27
+
28
+ @torch.no_grad()
29
+ def temperature_sigmoid(x: torch.Tensor, x0: float, temp_coeff: float) -> torch.Tensor:
30
+ """
31
+ Apply a sigmoid function with temperature scaling.
32
+
33
+ Arguments:
34
+ x {[torch.Tensor]} -- [Input tensor.]
35
+ x0 {[float]} -- [Parameter that controls the threshold of the sigmoid.]
36
+ temp_coeff {[float]} -- [Parameter that controls the slope of the sigmoid.]
37
+
38
+ Returns:
39
+ [torch.Tensor] -- [Output tensor after applying the sigmoid with temperature scaling.]
40
+ """
41
+ return torch.sigmoid((x - x0) / temp_coeff)
42
+
43
+
44
+ @torch.no_grad()
45
+ def linspace(
46
+ start: Number, stop: Number, num: int = 50, endpoint: bool = True, **kwargs
47
+ ) -> torch.Tensor:
48
+ """
49
+ Generate a linearly spaced 1-D tensor.
50
+
51
+ Arguments:
52
+ start {[Number]} -- [The starting value of the sequence.]
53
+ stop {[Number]} -- [The end value of the sequence, unless `endpoint` is set to False.
54
+ In that case, the sequence consists of all but the last of ``num + 1``
55
+ evenly spaced samples, so that `stop` is excluded. Note that the step
56
+ size changes when `endpoint` is False.]
57
+
58
+ Keyword Arguments:
59
+ num {[int]} -- [Number of samples to generate. Default is 50. Must be non-negative.]
60
+ endpoint {[bool]} -- [If True, `stop` is the last sample. Otherwise, it is not included.
61
+ Default is True.]
62
+ **kwargs -- [Additional arguments to be passed to the underlying PyTorch `linspace` function.]
63
+
64
+ Returns:
65
+ [torch.Tensor] -- [1-D tensor of `num` equally spaced samples from `start` to `stop`.]
66
+ """
67
+ if endpoint:
68
+ return torch.linspace(start, stop, num, **kwargs)
69
+ else:
70
+ return torch.linspace(start, stop, num + 1, **kwargs)[:-1]
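
Worked one-liners for the helpers above, using the signatures as defined:

    import torch

    from tools.torchgate.utils import amp_to_db, linspace, temperature_sigmoid

    amp_to_db(torch.tensor([[1.0, 0.1, 0.01]]))  # ~[[0, -20, -40]] dB, floored 40 dB below the peak
    temperature_sigmoid(torch.tensor([1.3]), x0=1.3, temp_coeff=0.1)  # exactly 0.5 at the threshold
    linspace(0.0, 1.0, 5, endpoint=False)  # tensor([0.0, 0.2, 0.4, 0.6, 0.8])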
venv.sh ADDED
@@ -0,0 +1 @@
1
+ python3.8 -m venv .venv