ArkanDash committed on
Commit
883d44b
·
1 Parent(s): 31a8225

feat: major update

Browse files
app.py CHANGED
@@ -17,7 +17,7 @@ import io
17
  import wave
18
  from datetime import datetime
19
  from fairseq import checkpoint_utils
20
- from infer_pack.models import (
21
  SynthesizerTrnMs256NSFsid,
22
  SynthesizerTrnMs256NSFsid_nono,
23
  SynthesizerTrnMs768NSFsid,
@@ -29,14 +29,25 @@ config = Config()
29
  logging.getLogger("numba").setLevel(logging.WARNING)
30
  limitation = os.getenv("SYSTEM") == "spaces"
31
 
32
- def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
 
 
 
 
 
 
 
 
 
 
 
 
33
  def vc_fn(
34
  vc_audio_mode,
35
  vc_input,
36
  vc_upload,
37
  tts_text,
38
  tts_voice,
39
- spk_item,
40
  f0_up_key,
41
  f0_method,
42
  index_rate,
@@ -73,13 +84,14 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
73
  audio_opt = vc.pipeline(
74
  hubert_model,
75
  net_g,
76
- spk_item,
77
  audio,
78
  vc_input,
79
  times,
80
  f0_up_key,
81
  f0_method,
82
  file_index,
 
83
  index_rate,
84
  if_f0,
85
  filter_radius,
@@ -91,7 +103,7 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
91
  f0_file=None,
92
  )
93
  info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
94
- print(info)
95
  return info, (tgt_sr, audio_opt)
96
  except:
97
  info = traceback.format_exc()
@@ -99,6 +111,57 @@ def create_vc_fn(tgt_sr, net_g, vc, if_f0, file_index):
99
  return info, (None, None)
100
  return vc_fn
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def cut_vocal_and_inst(url, audio_provider, split_model):
103
  if url != "":
104
  if not os.path.exists("dl_audio"):
@@ -275,61 +338,15 @@ def change_audio_mode(vc_audio_mode):
275
 
276
  if __name__ == '__main__':
277
  load_hubert()
278
- categories = []
279
  tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
280
  voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
281
- with open("weights/folder_info.json", "r", encoding="utf-8") as f:
282
- folder_info = json.load(f)
283
- for category_name, category_info in folder_info.items():
284
- if not category_info['enable']:
285
- continue
286
- category_title = category_info['title']
287
- category_folder = category_info['folder_path']
288
- description = category_info['description']
289
- models = []
290
- with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
291
- models_info = json.load(f)
292
- for model_name, info in models_info.items():
293
- if not info['enable']:
294
- continue
295
- model_title = info['title']
296
- model_author = info.get("author", None)
297
- model_cover = f"weights/{category_folder}/{model_name}/{info['cover']}"
298
- model_index = f"weights/{category_folder}/{model_name}/{info['feature_retrieval_library']}"
299
- cpt = torch.load(f"weights/{category_folder}/{model_name}/{model_name}.pth", map_location="cpu")
300
- tgt_sr = cpt["config"][-1]
301
- cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
302
- if_f0 = cpt.get("f0", 1)
303
- version = cpt.get("version", "v1")
304
- if version == "v1":
305
- if if_f0 == 1:
306
- net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
307
- else:
308
- net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
309
- nodel_version = "V1"
310
- elif version == "v2":
311
- if if_f0 == 1:
312
- net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
313
- else:
314
- net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
315
- nodel_version = "V2"
316
- del net_g.enc_q
317
- print(net_g.load_state_dict(cpt["weight"], strict=False))
318
- net_g.eval().to(config.device)
319
- if config.is_half:
320
- net_g = net_g.half()
321
- else:
322
- net_g = net_g.float()
323
- vc = VC(tgt_sr, config)
324
- print(f"Model loaded: {model_name}")
325
- models.append((model_name, model_title, model_author, model_cover, nodel_version, create_vc_fn(tgt_sr, net_g, vc, if_f0, model_index)))
326
- categories.append([category_title, category_folder, description, models])
327
  with gr.Blocks() as app:
328
  gr.Markdown(
329
  "# <center> RVC Genshin Impact Inference\n"
330
  "### <center> [Recommended to use Google Colab to use more character & more feature](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n"
331
  "#### From [Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)\n"
332
- "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
333
  )
334
  for (folder_title, folder, description, models) in categories:
335
  with gr.TabItem(folder_title):
@@ -353,7 +370,7 @@ if __name__ == '__main__':
353
  )
354
  with gr.Row():
355
  with gr.Column():
356
- vc_audio_mode = gr.Dropdown(label="Input voice", choices=["Upload audio", "TTS Audio"], allow_custom_value=False, value="Upload audio")
357
  # Input and Upload
358
  vc_input = gr.Textbox(label="Input audio path", visible=False)
359
  vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
@@ -369,22 +386,13 @@ if __name__ == '__main__':
369
  tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
370
  tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
371
  with gr.Column():
372
- spk_item = gr.Slider(
373
- minimum=0,
374
- maximum=2333,
375
- step=1,
376
- label="Speaker ID",
377
- info="(Default: 0)",
378
- value=0,
379
- interactive=True,
380
- )
381
  vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
382
  f0method0 = gr.Radio(
383
  label="Pitch extraction algorithm",
384
- info="PM is fast, Harvest is good but extremely slow (Default: PM)",
385
- choices=["pm", "harvest"],
386
  value="pm",
387
- interactive=True,
388
  )
389
  index_rate1 = gr.Slider(
390
  minimum=0,
@@ -425,7 +433,16 @@ if __name__ == '__main__':
425
  maximum=0.5,
426
  label="Voice Protection",
427
  info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
428
- value=0.35,
 
 
 
 
 
 
 
 
 
429
  step=0.01,
430
  interactive=True,
431
  )
@@ -453,7 +470,6 @@ if __name__ == '__main__':
453
  vc_upload,
454
  tts_text,
455
  tts_voice,
456
- spk_item,
457
  vc_transform0,
458
  f0method0,
459
  index_rate1,
 
17
  import wave
18
  from datetime import datetime
19
  from fairseq import checkpoint_utils
20
+ from lib.infer_pack.models import (
21
  SynthesizerTrnMs256NSFsid,
22
  SynthesizerTrnMs256NSFsid_nono,
23
  SynthesizerTrnMs768NSFsid,
 
29
  logging.getLogger("numba").setLevel(logging.WARNING)
30
  limitation = os.getenv("SYSTEM") == "spaces"
31
 
32
# UI choices depend on `limitation`: when it is true only the reduced set of
# input sources and pitch extractors is offered, otherwise the full set
# (path input, Youtube, crepe) is available.
if limitation:
    audio_mode = ["Upload audio", "TTS Audio"]
    f0method_mode = ["pm", "harvest"]
    f0method_info = "PM is fast, Harvest is good but extremely slow. (Default: PM)"
else:
    audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
    f0method_mode = ["pm", "harvest", "crepe"]
    f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"
43
+
44
+ def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
45
  def vc_fn(
46
  vc_audio_mode,
47
  vc_input,
48
  vc_upload,
49
  tts_text,
50
  tts_voice,
 
51
  f0_up_key,
52
  f0_method,
53
  index_rate,
 
84
  audio_opt = vc.pipeline(
85
  hubert_model,
86
  net_g,
87
+ 0,
88
  audio,
89
  vc_input,
90
  times,
91
  f0_up_key,
92
  f0_method,
93
  file_index,
94
+ # file_big_npy,
95
  index_rate,
96
  if_f0,
97
  filter_radius,
 
103
  f0_file=None,
104
  )
105
  info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
106
+ print(f"{model_title} | {info}")
107
  return info, (tgt_sr, audio_opt)
108
  except:
109
  info = traceback.format_exc()
 
111
  return info, (None, None)
112
  return vc_fn
113
 
114
def load_model():
    """Load every enabled voice model described under ``weights/``.

    ``weights/folder_info.json`` lists the categories; for each enabled
    category ``weights/<folder_path>/model_info.json`` lists its models.
    Each enabled model's checkpoint is loaded, the matching synthesizer is
    built from the checkpoint config, and a conversion callback is created
    with ``create_vc_fn``.

    Returns:
        list: one ``[category_title, category_folder, description, models]``
        entry per enabled category, where ``models`` is a list of tuples
        ``(character_name, model_title, model_author, model_cover,
        model_version, vc_fn)``.

    Raises:
        ValueError: if a checkpoint reports a version other than ``"v1"`` or
            ``"v2"``.
    """
    categories = []
    with open("weights/folder_info.json", "r", encoding="utf-8") as f:
        folder_info = json.load(f)
    for category_info in folder_info.values():
        if not category_info['enable']:
            continue
        category_title = category_info['title']
        category_folder = category_info['folder_path']
        description = category_info['description']
        models = []
        with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
            models_info = json.load(f)
        for character_name, info in models_info.items():
            if not info['enable']:
                continue
            model_title = info['title']
            model_name = info['model_path']
            model_author = info.get("author", None)
            model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
            model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
            # NOTE(review): checkpoints are pickle-based; only load files from
            # trusted sources.
            cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
            tgt_sr = cpt["config"][-1]
            cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            if version == "v1":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
                else:
                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
                model_version = "V1"
            elif version == "v2":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                model_version = "V2"
            else:
                # Without this branch `net_g`/`model_version` would be unbound
                # (or left over from the previous model) and fail later with
                # an unrelated error.
                raise ValueError(
                    f"Unsupported model version {version!r} for {character_name}"
                )
            del net_g.enc_q
            print(net_g.load_state_dict(cpt["weight"], strict=False))
            net_g.eval().to(config.device)
            if config.is_half:
                net_g = net_g.half()
            else:
                net_g = net_g.float()
            vc = VC(tgt_sr, config)
            print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
            models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, model_index)))
        categories.append([category_title, category_folder, description, models])
    return categories
164
+
165
  def cut_vocal_and_inst(url, audio_provider, split_model):
166
  if url != "":
167
  if not os.path.exists("dl_audio"):
 
338
 
339
  if __name__ == '__main__':
340
  load_hubert()
341
+ categories = load_model()
342
  tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
343
  voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  with gr.Blocks() as app:
345
  gr.Markdown(
346
  "# <center> RVC Genshin Impact Inference\n"
347
  "### <center> [Recommended to use Google Colab to use more character & more feature](https://colab.research.google.com/drive/110kiMZTdP6Ri1lY9-NbQf17GVPPhHyeT?usp=sharing)\n"
348
  "#### From [Retrieval-based-Voice-Conversion](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)\n"
349
+ "### [This spaces use Multi Model RVC Inference](https://github.com/ArkanDash/Multi-Model-RVC-Inference)"
350
  )
351
  for (folder_title, folder, description, models) in categories:
352
  with gr.TabItem(folder_title):
 
370
  )
371
  with gr.Row():
372
  with gr.Column():
373
+ vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
374
  # Input and Upload
375
  vc_input = gr.Textbox(label="Input audio path", visible=False)
376
  vc_upload = gr.Audio(label="Upload audio file", visible=True, interactive=True)
 
386
  tts_text = gr.Textbox(visible=False, label="TTS text", info="Text to speech input")
387
  tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
388
  with gr.Column():
 
 
 
 
 
 
 
 
 
389
  vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
390
  f0method0 = gr.Radio(
391
  label="Pitch extraction algorithm",
392
+ info=f0method_info,
393
+ choices=f0method_mode,
394
  value="pm",
395
+ interactive=True
396
  )
397
  index_rate1 = gr.Slider(
398
  minimum=0,
 
433
  maximum=0.5,
434
  label="Voice Protection",
435
  info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
436
+ value=0.4,
437
+ step=0.01,
438
+ interactive=True,
439
+ )
440
+ protect0 = gr.Slider(
441
+ minimum=0,
442
+ maximum=0.5,
443
+ label="Voice Protection",
444
+ info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
445
+ value=0.5,
446
  step=0.01,
447
  interactive=True,
448
  )
 
470
  vc_upload,
471
  tts_text,
472
  tts_voice,
 
473
  vc_transform0,
474
  f0method0,
475
  index_rate1,
config.py DELETED
@@ -1,105 +0,0 @@
1
- import argparse
2
- import torch
3
- from multiprocessing import cpu_count
4
-
5
class Config:
    """Runtime configuration: command-line options plus device-dependent settings.

    Attributes set in ``__init__``:
        device: torch device string; starts as ``"cuda:0"`` and is changed to
            ``"mps"`` or ``"cpu"`` by ``device_config`` when CUDA is absent.
        is_half: whether half precision is used; forced to ``False`` for the
            GPUs listed in ``device_config`` and for MPS/CPU.
        n_cpu: CPU count (filled from ``cpu_count()`` when left at 0).
        gpu_name, gpu_mem: GPU name / memory in GiB, when detected.
        python_cmd, listen_port, colab, noparallel, noautoopen, api:
            values parsed from the command line.
        x_pad, x_query, x_center, x_max: values returned by ``device_config``.
    """

    def __init__(self):
        self.device = "cuda:0"
        self.is_half = True
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        (
            self.python_cmd,
            self.listen_port,
            self.colab,
            self.noparallel,
            self.noautoopen,
            self.api,
        ) = self.arg_parse()
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def arg_parse() -> tuple:
        """Parse ``sys.argv`` and return the recognised options.

        Returns:
            tuple: ``(pycmd, port, colab, noparallel, noautoopen, api)``. A
            port outside ``0..65535`` is replaced by the default 7865.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument("--port", type=int, default=7865, help="Listen port")
        parser.add_argument(
            "--pycmd", type=str, default="python", help="Python command"
        )
        parser.add_argument("--colab", action="store_true", help="Launch in colab")
        parser.add_argument(
            "--noparallel", action="store_true", help="Disable parallel processing"
        )
        parser.add_argument(
            "--noautoopen",
            action="store_true",
            help="Do not open in browser automatically",
        )
        parser.add_argument("--api", action="store_true", help="Launch with api")
        cmd_opts = parser.parse_args()

        # Fall back to the default port when the requested one is invalid.
        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865

        return (
            cmd_opts.pycmd,
            cmd_opts.port,
            cmd_opts.colab,
            cmd_opts.noparallel,
            cmd_opts.noautoopen,
            cmd_opts.api,
        )

    def device_config(self) -> tuple:
        """Pick the device/precision and return ``(x_pad, x_query, x_center, x_max)``.

        Updates ``device``, ``is_half``, ``gpu_name``, ``gpu_mem`` and
        ``n_cpu`` on the instance as a side effect.
        """
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                # Message: 16-series/10-series cards and the P40 are forced
                # to single precision.
                print("16系/10系显卡和P40强制单精度")
                self.is_half = False
            else:
                self.gpu_name = None
            # Total device memory in GiB; the +0.4 rounds up values that sit
            # just below a whole number.
            total_bytes = torch.cuda.get_device_properties(i_device).total_memory
            self.gpu_mem = int(total_bytes / 1024 / 1024 / 1024 + 0.4)
        elif torch.backends.mps.is_available():
            # Message: no supported NVIDIA card found, using MPS for inference.
            print("没有发现支持的N卡, 使用MPS进行推理")
            self.device = "mps"
            self.is_half = False
        else:
            # Message: no supported NVIDIA card found, using CPU for inference.
            print("没有发现支持的N卡, 使用CPU进行推理")
            self.device = "cpu"
            self.is_half = False

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.is_half:
            # Settings for 6 GB of GPU memory.
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # Settings for 5 GB of GPU memory.
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        # Smaller settings for GPUs with 4 GiB or less.
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/32k.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": true,
11
- "lr_decay": 0.999875,
12
- "segment_size": 12800,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 32000,
21
- "filter_length": 1024,
22
- "hop_length": 320,
23
- "win_length": 1024,
24
- "n_mel_channels": 80,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,4,2,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16,16,4,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/40k.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": true,
11
- "lr_decay": 0.999875,
12
- "segment_size": 12800,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 40000,
21
- "filter_length": 2048,
22
- "hop_length": 400,
23
- "win_length": 2048,
24
- "n_mel_channels": 125,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,10,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16,16,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/48k.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "seed": 1234,
5
- "epochs": 20000,
6
- "learning_rate": 1e-4,
7
- "betas": [0.8, 0.99],
8
- "eps": 1e-9,
9
- "batch_size": 4,
10
- "fp16_run": true,
11
- "lr_decay": 0.999875,
12
- "segment_size": 11520,
13
- "init_lr_ratio": 1,
14
- "warmup_epochs": 0,
15
- "c_mel": 45,
16
- "c_kl": 1.0
17
- },
18
- "data": {
19
- "max_wav_value": 32768.0,
20
- "sampling_rate": 48000,
21
- "filter_length": 2048,
22
- "hop_length": 480,
23
- "win_length": 2048,
24
- "n_mel_channels": 128,
25
- "mel_fmin": 0.0,
26
- "mel_fmax": null
27
- },
28
- "model": {
29
- "inter_channels": 192,
30
- "hidden_channels": 192,
31
- "filter_channels": 768,
32
- "n_heads": 2,
33
- "n_layers": 6,
34
- "kernel_size": 3,
35
- "p_dropout": 0,
36
- "resblock": "1",
37
- "resblock_kernel_sizes": [3,7,11],
38
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
- "upsample_rates": [10,6,2,2,2],
40
- "upsample_initial_channel": 512,
41
- "upsample_kernel_sizes": [16,16,4,4,4],
42
- "use_spectral_norm": false,
43
- "gin_channels": 256,
44
- "spk_embed_dim": 109
45
- }
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer_pack/models_onnx_moess.py DELETED
@@ -1,849 +0,0 @@
1
- import math, pdb, os
2
- from time import time as ttime
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
- from infer_pack import modules
7
- from infer_pack import attentions
8
- from infer_pack import commons
9
- from infer_pack.commons import init_weights, get_padding
10
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from infer_pack.commons import init_weights
13
- import numpy as np
14
- from infer_pack import commons
15
-
16
-
17
class TextEncoder256(nn.Module):
    """Encode 256-dimensional phone features, optionally with pitch.

    The phone features go through a linear layer, an optional pitch embedding
    is added, and the result is passed through ``attentions.Encoder`` and a
    1x1 convolution that yields ``2 * out_channels`` channels, split into
    ``m`` and ``logs``.

    Args:
        out_channels: channels of each of the two outputs ``m`` and ``logs``.
        hidden_channels: width of the embeddings and of the encoder.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout: forwarded
            to ``attentions.Encoder``.
        f0: when truthy, a pitch embedding with 256 entries is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(m, logs, x_mask)`` for the given phone/pitch inputs.

        ``pitch`` may be ``None``, in which case only the phone embedding is
        used.
        """
        x = self.emb_phone(phone)
        # Identity test: comparing a tensor with `== None` is fragile.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
62
-
63
-
64
class TextEncoder256Sim(nn.Module):
    """Variant of the 256-dimensional phone encoder with a single output.

    Same pipeline as the two-output encoder (linear phone embedding, optional
    pitch embedding, ``attentions.Encoder``), but the final 1x1 convolution
    produces ``out_channels`` channels that are returned directly.

    Args:
        out_channels: channels of the projected output.
        hidden_channels: width of the embeddings and of the encoder.
        filter_channels, n_heads, n_layers, kernel_size, p_dropout: forwarded
            to ``attentions.Encoder``.
        f0: when truthy, a pitch embedding with 256 entries is created.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(256, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

    def forward(self, phone, pitch, lengths):
        """Return ``(x, x_mask)``; ``pitch`` may be ``None``."""
        x = self.emb_phone(phone)
        # Identity test: comparing a tensor with `== None` is fragile.
        if pitch is not None:
            x = x + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x, x_mask
107
-
108
-
109
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` coupling layers, each followed by a ``Flip``.

    ``self.flows`` alternates ``modules.ResidualCouplingLayer`` (even
    positions) and ``modules.Flip`` (odd positions).
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels,
                hidden_channels,
                kernel_size,
                dilation_rate,
                n_layers,
                gin_channels=gin_channels,
                mean_only=True,
            )
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reverse order when ``reverse``."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # In the forward direction each flow returns a pair; only the
            # first element is carried on.
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Remove weight norm from the coupling layers (even positions)."""
        for position in range(0, 2 * self.n_flows, 2):
            self.flows[position].remove_weight_norm()
156
-
157
-
158
class PosteriorEncoder(nn.Module):
    """Encode an input into a sampled latent ``z`` with its ``m`` and ``logs``.

    Pipeline: 1x1 conv (``pre``) -> ``modules.WN`` (``enc``) -> 1x1 conv
    (``proj``) producing ``2 * out_channels`` channels.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)``.

        ``z`` is ``m`` plus Gaussian noise scaled by ``exp(logs)``, masked by
        ``x_mask``.
        """
        length_mask = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(length_mask, 1).to(x.dtype)
        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Remove weight norm from the ``WN`` encoder."""
        self.enc.remove_weight_norm()
201
-
202
-
203
class Generator(torch.nn.Module):
    """Upsampling generator built from transposed convolutions and residual blocks.

    Each upsampling stage halves the channel count and is followed by
    ``len(resblock_kernel_sizes)`` residual blocks whose outputs are averaged.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        # "1" selects ResBlock1, anything else ResBlock2.
        resblock_cls = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(
            zip(upsample_rates, upsample_kernel_sizes)
        ):
            upsampler = ConvTranspose1d(
                upsample_initial_channel // (2**stage),
                upsample_initial_channel // (2 ** (stage + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(
                resblock_kernel_sizes, resblock_dilation_sizes
            ):
                self.resblocks.append(resblock_cls(ch, kernel, dilation))

        # `ch` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Run the generator; ``g`` (if given) is added through ``self.cond``."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            first = stage * self.num_kernels
            xs = None
            for block in range(first, first + self.num_kernels):
                if xs is None:
                    xs = self.resblocks[block](x)
                else:
                    xs += self.resblocks[block](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Remove weight norm from the upsamplers and residual blocks."""
        for upsampler in self.ups:
            remove_weight_norm(upsampler)
        for block in self.resblocks:
            block.remove_weight_norm()
277
-
278
-
279
class SineGen(torch.nn.Module):
    """Sine-wave generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: accepted for interface compatibility; not read by this
        implementation
    """

    def __init__(
        self,
        samp_rate,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return the voiced/unvoiced mask: 1 where ``f0`` exceeds the threshold."""
        return torch.ones_like(f0) * (f0 > self.voiced_threshold)

    def forward(self, f0, upp):
        """sine_waves, uv, noise = forward(f0, upp)

        f0: F0 values; f0 for unvoiced steps should be 0
        upp: upsampling factor applied along the time axis
        Returns the noisy sine waves, the upsampled voiced/unvoiced mask and
        the noise that was added.
        """
        with torch.no_grad():
            f0 = f0[:, None].transpose(1, 2)
            harmonics = torch.zeros(
                f0.shape[0], f0.shape[1], self.dim, device=f0.device
            )
            # fundamental component
            harmonics[:, :, 0] = f0[:, :, 0]
            for k in range(self.harmonic_num):
                # k + 2: the (k+1)-th overtone, (k+2)-th harmonic
                harmonics[:, :, k + 1] = harmonics[:, :, 0] * (k + 2)
            # The % 1 here means the harmonic products cannot be optimised in
            # post-processing.
            rad_values = (harmonics / self.sampling_rate) % 1
            rand_ini = torch.rand(
                harmonics.shape[0], harmonics.shape[2], device=harmonics.device
            )
            # The fundamental gets no random initial phase.
            rand_ini[:, 0] = 0
            rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
            # No % 1 here: applying it would prevent optimising the later
            # cumsum.
            phase = torch.cumsum(rad_values, 1)
            phase *= upp
            phase = F.interpolate(
                phase.transpose(2, 1),
                scale_factor=upp,
                mode="linear",
                align_corners=True,
            ).transpose(2, 1)
            rad_values = F.interpolate(
                rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            phase %= 1
            # True where the wrapped phase decreased between adjacent steps.
            wrapped = (phase[:, 1:, :] - phase[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = wrapped * -1.0
            sine_waves = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
            )
            sine_waves = sine_waves * self.sine_amp
            uv = self._f02uv(f0)
            uv = F.interpolate(
                uv.transpose(2, 1), scale_factor=upp, mode="nearest"
            ).transpose(2, 1)
            # Voiced steps get noise_std; unvoiced steps get sine_amp / 3.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
369
-
370
-
371
- class SourceModuleHnNSF(torch.nn.Module):
372
- """SourceModule for hn-nsf
373
- SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
374
- add_noise_std=0.003, voiced_threshod=0)
375
- sampling_rate: sampling_rate in Hz
376
- harmonic_num: number of harmonic above F0 (default: 0)
377
- sine_amp: amplitude of sine source signal (default: 0.1)
378
- add_noise_std: std of additive Gaussian noise (default: 0.003)
379
- note that amplitude of noise in unvoiced is decided
380
- by sine_amp
381
- voiced_threshold: threhold to set U/V given F0 (default: 0)
382
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
383
- F0_sampled (batchsize, length, 1)
384
- Sine_source (batchsize, length, 1)
385
- noise_source (batchsize, length 1)
386
- uv (batchsize, length, 1)
387
- """
388
-
389
- def __init__(
390
- self,
391
- sampling_rate,
392
- harmonic_num=0,
393
- sine_amp=0.1,
394
- add_noise_std=0.003,
395
- voiced_threshod=0,
396
- is_half=True,
397
- ):
398
- super(SourceModuleHnNSF, self).__init__()
399
-
400
- self.sine_amp = sine_amp
401
- self.noise_std = add_noise_std
402
- self.is_half = is_half
403
- # to produce sine waveforms
404
- self.l_sin_gen = SineGen(
405
- sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
406
- )
407
-
408
- # to merge source harmonics into a single excitation
409
- self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
410
- self.l_tanh = torch.nn.Tanh()
411
-
412
- def forward(self, x, upp=None):
413
- sine_wavs, uv, _ = self.l_sin_gen(x, upp)
414
- if self.is_half:
415
- sine_wavs = sine_wavs.half()
416
- sine_merge = self.l_tanh(self.l_linear(sine_wavs))
417
- return sine_merge, None, None # noise, uv
418
-
419
-
420
- class GeneratorNSF(torch.nn.Module):
421
- def __init__(
422
- self,
423
- initial_channel,
424
- resblock,
425
- resblock_kernel_sizes,
426
- resblock_dilation_sizes,
427
- upsample_rates,
428
- upsample_initial_channel,
429
- upsample_kernel_sizes,
430
- gin_channels,
431
- sr,
432
- is_half=False,
433
- ):
434
- super(GeneratorNSF, self).__init__()
435
- self.num_kernels = len(resblock_kernel_sizes)
436
- self.num_upsamples = len(upsample_rates)
437
-
438
- self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
439
- self.m_source = SourceModuleHnNSF(
440
- sampling_rate=sr, harmonic_num=0, is_half=is_half
441
- )
442
- self.noise_convs = nn.ModuleList()
443
- self.conv_pre = Conv1d(
444
- initial_channel, upsample_initial_channel, 7, 1, padding=3
445
- )
446
- resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
447
-
448
- self.ups = nn.ModuleList()
449
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
450
- c_cur = upsample_initial_channel // (2 ** (i + 1))
451
- self.ups.append(
452
- weight_norm(
453
- ConvTranspose1d(
454
- upsample_initial_channel // (2**i),
455
- upsample_initial_channel // (2 ** (i + 1)),
456
- k,
457
- u,
458
- padding=(k - u) // 2,
459
- )
460
- )
461
- )
462
- if i + 1 < len(upsample_rates):
463
- stride_f0 = np.prod(upsample_rates[i + 1 :])
464
- self.noise_convs.append(
465
- Conv1d(
466
- 1,
467
- c_cur,
468
- kernel_size=stride_f0 * 2,
469
- stride=stride_f0,
470
- padding=stride_f0 // 2,
471
- )
472
- )
473
- else:
474
- self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
475
-
476
- self.resblocks = nn.ModuleList()
477
- for i in range(len(self.ups)):
478
- ch = upsample_initial_channel // (2 ** (i + 1))
479
- for j, (k, d) in enumerate(
480
- zip(resblock_kernel_sizes, resblock_dilation_sizes)
481
- ):
482
- self.resblocks.append(resblock(ch, k, d))
483
-
484
- self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
485
- self.ups.apply(init_weights)
486
-
487
- if gin_channels != 0:
488
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
489
-
490
- self.upp = np.prod(upsample_rates)
491
-
492
- def forward(self, x, f0, g=None):
493
- har_source, noi_source, uv = self.m_source(f0, self.upp)
494
- har_source = har_source.transpose(1, 2)
495
- x = self.conv_pre(x)
496
- if g is not None:
497
- x = x + self.cond(g)
498
-
499
- for i in range(self.num_upsamples):
500
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
501
- x = self.ups[i](x)
502
- x_source = self.noise_convs[i](har_source)
503
- x = x + x_source
504
- xs = None
505
- for j in range(self.num_kernels):
506
- if xs is None:
507
- xs = self.resblocks[i * self.num_kernels + j](x)
508
- else:
509
- xs += self.resblocks[i * self.num_kernels + j](x)
510
- x = xs / self.num_kernels
511
- x = F.leaky_relu(x)
512
- x = self.conv_post(x)
513
- x = torch.tanh(x)
514
- return x
515
-
516
- def remove_weight_norm(self):
517
- for l in self.ups:
518
- remove_weight_norm(l)
519
- for l in self.resblocks:
520
- l.remove_weight_norm()
521
-
522
-
523
- sr2sr = {
524
- "32k": 32000,
525
- "40k": 40000,
526
- "48k": 48000,
527
- }
528
-
529
-
530
- class SynthesizerTrnMs256NSFsidM(nn.Module):
531
- def __init__(
532
- self,
533
- spec_channels,
534
- segment_size,
535
- inter_channels,
536
- hidden_channels,
537
- filter_channels,
538
- n_heads,
539
- n_layers,
540
- kernel_size,
541
- p_dropout,
542
- resblock,
543
- resblock_kernel_sizes,
544
- resblock_dilation_sizes,
545
- upsample_rates,
546
- upsample_initial_channel,
547
- upsample_kernel_sizes,
548
- spk_embed_dim,
549
- gin_channels,
550
- sr,
551
- **kwargs
552
- ):
553
- super().__init__()
554
- if type(sr) == type("strr"):
555
- sr = sr2sr[sr]
556
- self.spec_channels = spec_channels
557
- self.inter_channels = inter_channels
558
- self.hidden_channels = hidden_channels
559
- self.filter_channels = filter_channels
560
- self.n_heads = n_heads
561
- self.n_layers = n_layers
562
- self.kernel_size = kernel_size
563
- self.p_dropout = p_dropout
564
- self.resblock = resblock
565
- self.resblock_kernel_sizes = resblock_kernel_sizes
566
- self.resblock_dilation_sizes = resblock_dilation_sizes
567
- self.upsample_rates = upsample_rates
568
- self.upsample_initial_channel = upsample_initial_channel
569
- self.upsample_kernel_sizes = upsample_kernel_sizes
570
- self.segment_size = segment_size
571
- self.gin_channels = gin_channels
572
- # self.hop_length = hop_length#
573
- self.spk_embed_dim = spk_embed_dim
574
- self.enc_p = TextEncoder256(
575
- inter_channels,
576
- hidden_channels,
577
- filter_channels,
578
- n_heads,
579
- n_layers,
580
- kernel_size,
581
- p_dropout,
582
- )
583
- self.dec = GeneratorNSF(
584
- inter_channels,
585
- resblock,
586
- resblock_kernel_sizes,
587
- resblock_dilation_sizes,
588
- upsample_rates,
589
- upsample_initial_channel,
590
- upsample_kernel_sizes,
591
- gin_channels=gin_channels,
592
- sr=sr,
593
- is_half=kwargs["is_half"],
594
- )
595
- self.enc_q = PosteriorEncoder(
596
- spec_channels,
597
- inter_channels,
598
- hidden_channels,
599
- 5,
600
- 1,
601
- 16,
602
- gin_channels=gin_channels,
603
- )
604
- self.flow = ResidualCouplingBlock(
605
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
606
- )
607
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
608
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
609
-
610
- def remove_weight_norm(self):
611
- self.dec.remove_weight_norm()
612
- self.flow.remove_weight_norm()
613
- self.enc_q.remove_weight_norm()
614
-
615
- def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
616
- g = self.emb_g(sid).unsqueeze(-1)
617
- m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
618
- z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
619
- z = self.flow(z_p, x_mask, g=g, reverse=True)
620
- o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
621
- return o
622
-
623
-
624
- class SynthesizerTrnMs256NSFsid_sim(nn.Module):
625
- """
626
- Synthesizer for Training
627
- """
628
-
629
- def __init__(
630
- self,
631
- spec_channels,
632
- segment_size,
633
- inter_channels,
634
- hidden_channels,
635
- filter_channels,
636
- n_heads,
637
- n_layers,
638
- kernel_size,
639
- p_dropout,
640
- resblock,
641
- resblock_kernel_sizes,
642
- resblock_dilation_sizes,
643
- upsample_rates,
644
- upsample_initial_channel,
645
- upsample_kernel_sizes,
646
- spk_embed_dim,
647
- # hop_length,
648
- gin_channels=0,
649
- use_sdp=True,
650
- **kwargs
651
- ):
652
- super().__init__()
653
- self.spec_channels = spec_channels
654
- self.inter_channels = inter_channels
655
- self.hidden_channels = hidden_channels
656
- self.filter_channels = filter_channels
657
- self.n_heads = n_heads
658
- self.n_layers = n_layers
659
- self.kernel_size = kernel_size
660
- self.p_dropout = p_dropout
661
- self.resblock = resblock
662
- self.resblock_kernel_sizes = resblock_kernel_sizes
663
- self.resblock_dilation_sizes = resblock_dilation_sizes
664
- self.upsample_rates = upsample_rates
665
- self.upsample_initial_channel = upsample_initial_channel
666
- self.upsample_kernel_sizes = upsample_kernel_sizes
667
- self.segment_size = segment_size
668
- self.gin_channels = gin_channels
669
- # self.hop_length = hop_length#
670
- self.spk_embed_dim = spk_embed_dim
671
- self.enc_p = TextEncoder256Sim(
672
- inter_channels,
673
- hidden_channels,
674
- filter_channels,
675
- n_heads,
676
- n_layers,
677
- kernel_size,
678
- p_dropout,
679
- )
680
- self.dec = GeneratorNSF(
681
- inter_channels,
682
- resblock,
683
- resblock_kernel_sizes,
684
- resblock_dilation_sizes,
685
- upsample_rates,
686
- upsample_initial_channel,
687
- upsample_kernel_sizes,
688
- gin_channels=gin_channels,
689
- is_half=kwargs["is_half"],
690
- )
691
-
692
- self.flow = ResidualCouplingBlock(
693
- inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
694
- )
695
- self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
696
- print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
697
-
698
- def remove_weight_norm(self):
699
- self.dec.remove_weight_norm()
700
- self.flow.remove_weight_norm()
701
- self.enc_q.remove_weight_norm()
702
-
703
- def forward(
704
- self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
705
- ): # y是spec不需要了现在
706
- g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1]##1是t,广播的
707
- x, x_mask = self.enc_p(phone, pitch, phone_lengths)
708
- x = self.flow(x, x_mask, g=g, reverse=True)
709
- o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
710
- return o
711
-
712
-
713
- class MultiPeriodDiscriminator(torch.nn.Module):
714
- def __init__(self, use_spectral_norm=False):
715
- super(MultiPeriodDiscriminator, self).__init__()
716
- periods = [2, 3, 5, 7, 11, 17]
717
- # periods = [3, 5, 7, 11, 17, 23, 37]
718
-
719
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
720
- discs = discs + [
721
- DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
722
- ]
723
- self.discriminators = nn.ModuleList(discs)
724
-
725
- def forward(self, y, y_hat):
726
- y_d_rs = [] #
727
- y_d_gs = []
728
- fmap_rs = []
729
- fmap_gs = []
730
- for i, d in enumerate(self.discriminators):
731
- y_d_r, fmap_r = d(y)
732
- y_d_g, fmap_g = d(y_hat)
733
- # for j in range(len(fmap_r)):
734
- # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
735
- y_d_rs.append(y_d_r)
736
- y_d_gs.append(y_d_g)
737
- fmap_rs.append(fmap_r)
738
- fmap_gs.append(fmap_g)
739
-
740
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
741
-
742
-
743
- class DiscriminatorS(torch.nn.Module):
744
- def __init__(self, use_spectral_norm=False):
745
- super(DiscriminatorS, self).__init__()
746
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
747
- self.convs = nn.ModuleList(
748
- [
749
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
750
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
751
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
752
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
753
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
754
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
755
- ]
756
- )
757
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
758
-
759
- def forward(self, x):
760
- fmap = []
761
-
762
- for l in self.convs:
763
- x = l(x)
764
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
765
- fmap.append(x)
766
- x = self.conv_post(x)
767
- fmap.append(x)
768
- x = torch.flatten(x, 1, -1)
769
-
770
- return x, fmap
771
-
772
-
773
- class DiscriminatorP(torch.nn.Module):
774
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
775
- super(DiscriminatorP, self).__init__()
776
- self.period = period
777
- self.use_spectral_norm = use_spectral_norm
778
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
779
- self.convs = nn.ModuleList(
780
- [
781
- norm_f(
782
- Conv2d(
783
- 1,
784
- 32,
785
- (kernel_size, 1),
786
- (stride, 1),
787
- padding=(get_padding(kernel_size, 1), 0),
788
- )
789
- ),
790
- norm_f(
791
- Conv2d(
792
- 32,
793
- 128,
794
- (kernel_size, 1),
795
- (stride, 1),
796
- padding=(get_padding(kernel_size, 1), 0),
797
- )
798
- ),
799
- norm_f(
800
- Conv2d(
801
- 128,
802
- 512,
803
- (kernel_size, 1),
804
- (stride, 1),
805
- padding=(get_padding(kernel_size, 1), 0),
806
- )
807
- ),
808
- norm_f(
809
- Conv2d(
810
- 512,
811
- 1024,
812
- (kernel_size, 1),
813
- (stride, 1),
814
- padding=(get_padding(kernel_size, 1), 0),
815
- )
816
- ),
817
- norm_f(
818
- Conv2d(
819
- 1024,
820
- 1024,
821
- (kernel_size, 1),
822
- 1,
823
- padding=(get_padding(kernel_size, 1), 0),
824
- )
825
- ),
826
- ]
827
- )
828
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
829
-
830
- def forward(self, x):
831
- fmap = []
832
-
833
- # 1d to 2d
834
- b, c, t = x.shape
835
- if t % self.period != 0: # pad first
836
- n_pad = self.period - (t % self.period)
837
- x = F.pad(x, (0, n_pad), "reflect")
838
- t = t + n_pad
839
- x = x.view(b, c, t // self.period, self.period)
840
-
841
- for l in self.convs:
842
- x = l(x)
843
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
844
- fmap.append(x)
845
- x = self.conv_post(x)
846
- fmap.append(x)
847
- x = torch.flatten(x, 1, -1)
848
-
849
- return x, fmap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
{infer_pack → lib/infer_pack}/attentions.py RENAMED
@@ -5,9 +5,9 @@ import torch
5
  from torch import nn
6
  from torch.nn import functional as F
7
 
8
- from infer_pack import commons
9
- from infer_pack import modules
10
- from infer_pack.modules import LayerNorm
11
 
12
 
13
  class Encoder(nn.Module):
 
5
  from torch import nn
6
  from torch.nn import functional as F
7
 
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack import modules
10
+ from lib.infer_pack.modules import LayerNorm
11
 
12
 
13
  class Encoder(nn.Module):
{infer_pack → lib/infer_pack}/commons.py RENAMED
File without changes
{infer_pack → lib/infer_pack}/models.py RENAMED
@@ -3,15 +3,15 @@ from time import time as ttime
3
  import torch
4
  from torch import nn
5
  from torch.nn import functional as F
6
- from infer_pack import modules
7
- from infer_pack import attentions
8
- from infer_pack import commons
9
- from infer_pack.commons import init_weights, get_padding
10
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from infer_pack.commons import init_weights
13
  import numpy as np
14
- from infer_pack import commons
15
 
16
 
17
  class TextEncoder256(nn.Module):
 
3
  import torch
4
  from torch import nn
5
  from torch.nn import functional as F
6
+ from lib.infer_pack import modules
7
+ from lib.infer_pack import attentions
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack.commons import init_weights, get_padding
10
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from lib.infer_pack.commons import init_weights
13
  import numpy as np
14
+ from lib.infer_pack import commons
15
 
16
 
17
  class TextEncoder256(nn.Module):
{infer_pack → lib/infer_pack}/models_onnx.py RENAMED
@@ -3,15 +3,15 @@ from time import time as ttime
3
  import torch
4
  from torch import nn
5
  from torch.nn import functional as F
6
- from infer_pack import modules
7
- from infer_pack import attentions
8
- from infer_pack import commons
9
- from infer_pack.commons import init_weights, get_padding
10
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
- from infer_pack.commons import init_weights
13
  import numpy as np
14
- from infer_pack import commons
15
 
16
 
17
  class TextEncoder256(nn.Module):
 
3
  import torch
4
  from torch import nn
5
  from torch.nn import functional as F
6
+ from lib.infer_pack import modules
7
+ from lib.infer_pack import attentions
8
+ from lib.infer_pack import commons
9
+ from lib.infer_pack.commons import init_weights, get_padding
10
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
  from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from lib.infer_pack.commons import init_weights
13
  import numpy as np
14
+ from lib.infer_pack import commons
15
 
16
 
17
  class TextEncoder256(nn.Module):
{infer_pack → lib/infer_pack}/modules.py RENAMED
@@ -9,9 +9,9 @@ from torch.nn import functional as F
9
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
  from torch.nn.utils import weight_norm, remove_weight_norm
11
 
12
- from infer_pack import commons
13
- from infer_pack.commons import init_weights, get_padding
14
- from infer_pack.transforms import piecewise_rational_quadratic_transform
15
 
16
 
17
  LRELU_SLOPE = 0.1
 
9
  from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
  from torch.nn.utils import weight_norm, remove_weight_norm
11
 
12
+ from lib.infer_pack import commons
13
+ from lib.infer_pack.commons import init_weights, get_padding
14
+ from lib.infer_pack.transforms import piecewise_rational_quadratic_transform
15
 
16
 
17
  LRELU_SLOPE = 0.1
{infer_pack → lib/infer_pack}/modules/F0Predictor/DioF0Predictor.py RENAMED
@@ -1,4 +1,4 @@
1
- from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
  import pyworld
3
  import numpy as np
4
 
 
1
+ from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
  import pyworld
3
  import numpy as np
4
 
{infer_pack → lib/infer_pack}/modules/F0Predictor/F0Predictor.py RENAMED
File without changes
{infer_pack → lib/infer_pack}/modules/F0Predictor/HarvestF0Predictor.py RENAMED
@@ -1,4 +1,4 @@
1
- from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
  import pyworld
3
  import numpy as np
4
 
 
1
+ from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
  import pyworld
3
  import numpy as np
4
 
{infer_pack → lib/infer_pack}/modules/F0Predictor/PMF0Predictor.py RENAMED
@@ -1,4 +1,4 @@
1
- from infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
  import parselmouth
3
  import numpy as np
4
 
 
1
+ from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
2
  import parselmouth
3
  import numpy as np
4
 
{infer_pack → lib/infer_pack}/modules/F0Predictor/__init__.py RENAMED
File without changes
{infer_pack → lib/infer_pack}/onnx_inference.py RENAMED
@@ -3,6 +3,7 @@ import librosa
3
  import numpy as np
4
  import soundfile
5
 
 
6
  class ContentVec:
7
  def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
8
  print("load model(s) from {}".format(vec_path))
@@ -32,19 +33,19 @@ class ContentVec:
32
 
33
  def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
34
  if f0_predictor == "pm":
35
- from infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
36
 
37
  f0_predictor_object = PMF0Predictor(
38
  hop_length=hop_length, sampling_rate=sampling_rate
39
  )
40
  elif f0_predictor == "harvest":
41
- from infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
42
 
43
  f0_predictor_object = HarvestF0Predictor(
44
  hop_length=hop_length, sampling_rate=sampling_rate
45
  )
46
  elif f0_predictor == "dio":
47
- from infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
48
 
49
  f0_predictor_object = DioF0Predictor(
50
  hop_length=hop_length, sampling_rate=sampling_rate
 
3
  import numpy as np
4
  import soundfile
5
 
6
+
7
  class ContentVec:
8
  def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
9
  print("load model(s) from {}".format(vec_path))
 
33
 
34
  def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
35
  if f0_predictor == "pm":
36
+ from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
37
 
38
  f0_predictor_object = PMF0Predictor(
39
  hop_length=hop_length, sampling_rate=sampling_rate
40
  )
41
  elif f0_predictor == "harvest":
42
+ from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor
43
 
44
  f0_predictor_object = HarvestF0Predictor(
45
  hop_length=hop_length, sampling_rate=sampling_rate
46
  )
47
  elif f0_predictor == "dio":
48
+ from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
49
 
50
  f0_predictor_object = DioF0Predictor(
51
  hop_length=hop_length, sampling_rate=sampling_rate
{infer_pack → lib/infer_pack}/transforms.py RENAMED
File without changes
requirements.txt CHANGED
@@ -1,26 +1,21 @@
1
- setuptools
2
  wheel
3
- httpx==0.23.0
4
- fairseq==0.12.2
5
- gradio
6
  ffmpeg
7
- praat-parselmouth
8
- pyworld
9
- numpy==1.23.5
10
  numba==0.56.4
11
- librosa==0.9.2
12
- faiss-cpu==1.7.3
13
- faiss-gpu
14
  scipy==1.9.3
 
 
 
 
15
  pyworld>=0.3.2
 
 
 
16
  tensorboard
17
  tensorboardX
18
- onnxruntime
19
- pyngrok==4.1.12
20
- soundfile>=0.12.1
21
- tqdm>=4.63.1
22
  torchcrepe
23
- asyncio
24
- edge-tts
25
  demucs
26
- yt_dlp
 
 
 
1
  wheel
2
+ setuptools
 
 
3
  ffmpeg
 
 
 
4
  numba==0.56.4
5
+ numpy==1.23.5
 
 
6
  scipy==1.9.3
7
+ librosa==0.9.1
8
+ fairseq==0.12.2
9
+ faiss-cpu==1.7.3
10
+ gradio==3.34.0
11
  pyworld>=0.3.2
12
+ soundfile>=0.12.1
13
+ praat-parselmouth>=0.4.2
14
+ httpx==0.23.0
15
  tensorboard
16
  tensorboardX
 
 
 
 
17
  torchcrepe
18
+ onnxruntime
 
19
  demucs
20
+ edge-tts
21
+ yt_dlp
vc_infer_pipeline.py CHANGED
@@ -184,7 +184,7 @@ class VC(object):
184
  with torch.no_grad():
185
  logits = model.extract_features(**inputs)
186
  feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
187
- if protect < 0.5 and pitch!=None and pitchf!=None:
188
  feats0 = feats.clone()
189
  if (
190
  isinstance(index, type(None)) == False
@@ -211,7 +211,7 @@ class VC(object):
211
  )
212
 
213
  feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
214
- if protect < 0.5 and pitch!=None and pitchf!=None:
215
  feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
216
  0, 2, 1
217
  )
@@ -223,7 +223,7 @@ class VC(object):
223
  pitch = pitch[:, :p_len]
224
  pitchf = pitchf[:, :p_len]
225
 
226
- if protect < 0.5 and pitch!=None and pitchf!=None:
227
  pitchff = pitchf.clone()
228
  pitchff[pitchf > 0] = 1
229
  pitchff[pitchf < 1] = protect
 
184
  with torch.no_grad():
185
  logits = model.extract_features(**inputs)
186
  feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
187
+ if protect < 0.5 and pitch != None and pitchf != None:
188
  feats0 = feats.clone()
189
  if (
190
  isinstance(index, type(None)) == False
 
211
  )
212
 
213
  feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
214
+ if protect < 0.5 and pitch != None and pitchf != None:
215
  feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
216
  0, 2, 1
217
  )
 
223
  pitch = pitch[:, :p_len]
224
  pitchf = pitchf[:, :p_len]
225
 
226
+ if protect < 0.5 and pitch != None and pitchf != None:
227
  pitchff = pitchf.clone()
228
  pitchff[pitchf > 0] = 1
229
  pitchff[pitchf < 1] = protect