File size: 5,557 Bytes
c8318dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9a45bd
 
 
c8318dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d363fb8
c8318dc
 
 
 
 
d363fb8
c8318dc
 
 
 
 
d9a45bd
c8318dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import time
import gradio as gr
import soundfile
import torch

import infer_tool

# Single-element list; nothing else in the visible part of this file reads or writes it.
convert_cnt = [0]
# Prefer CUDA when available. NOTE(review): `dev` is not passed to any call visible in this file.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Alternative checkpoint, currently disabled:
# model_name = "83_epochs.pth"
# Checkpoint loaded at import time; list_models() and change_model() may rebind this name later.
model_name = "mg_1255_epochs_v0.0.6.pth"
# Config file name; the load_model calls look it up under configs/.
config_name = "milky_green.json"
# Load the model objects once at import; vc_fn hands them to the infer_tool helpers.
net_g_ms, hubert_soft, feature_input, hps_ms = infer_tool.load_model(f"{model_name}", f"configs/{config_name}")

# Read parameters from the loaded config
target_sample = hps_ms.data.sampling_rate
# Speaker display name -> integer id that vc_fn passes to infer_tool.infer.
# NOTE(review): id 1 is not mapped here — confirm against the config's speaker list.
spk_dict = {
    "明前奶绿": 0,
    "云灏": 2,
    "即霜": 3,
    "奕兰秋": 4
}

def list_models():
    """Return the model checkpoint names found in the current working directory.

    A checkpoint is any directory entry whose name ends in ``.pth`` and does
    not start with ``D_``. Entries are visited in sorted order, so neither the
    returned list nor the default chosen below depends on the arbitrary order
    in which ``os.listdir`` yields names.

    Side effect: the module-level ``model_name`` is rebound to each visited
    checkpoint whose name is at least as long as the current value, so after
    the call it holds the last such name (or is unchanged if every checkpoint
    name is shorter).
    NOTE(review): only the name changes here; no model is (re)loaded by this
    function — confirm that the displayed default is meant to differ from the
    checkpoint loaded at import time.

    Returns:
        list[str]: checkpoint file names (not full paths), in sorted order.
    """
    global model_name
    checkpoints = []
    for entry in sorted(os.listdir(os.getcwd())):
        if entry.startswith("D_") or not entry.endswith(".pth"):
            continue
        checkpoints.append(entry)
        # ">=" lets a later name of equal length replace the current one.
        if len(entry) >= len(model_name):
            model_name = entry
    return checkpoints

def vc_fn(sid, audio_record, audio_upload, tran):
    """Convert an input recording to the selected speaker's voice.

    Args:
        sid: speaker name; must be a key of ``spk_dict``.
        audio_record: path of the microphone recording, or ``None``.
        audio_upload: path of the uploaded file, or ``None``. It takes
            precedence over ``audio_record`` when both are given.
        tran: pitch-shift value passed through to the ``infer_tool`` helpers
            (the UI labels it as a number of semitones).

    Returns:
        A 3-tuple ``(message, audio, image)``. On success ``message`` reports
        the values returned by ``infer_tool.calc_error``, ``audio`` is
        ``(target_sample, samples)`` and ``image`` is an image update pointing
        at ``temp.jpg``. When the input is missing or longer than 600 s,
        ``message`` explains why and ``audio`` and ``image`` are ``None``.
        Every path returns three values so the result always matches a
        handler wired to three output components.
    """
    print(sid, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    # An uploaded file wins over a microphone recording.
    audio_path = audio_upload if audio_upload is not None else audio_record
    if audio_path is None:
        return "你需要上传wav文件或使用网页内置的录音!", None, None

    audio, sampling_rate = infer_tool.format_wav(audio_path, target_sample)
    duration = audio.shape[0] / sampling_rate
    if duration > 600:
        return "请上传小于600s的音频,需要转换长音频请使用colab", None, None

    # The sample rate returned by infer() is not used; output is written and
    # returned at target_sample, as before.
    o_audio, _ = infer_tool.infer(audio_path, spk_dict[sid], tran, net_g_ms, hubert_soft, feature_input)
    out_path = "./out_temp.wav"
    soundfile.write(out_path, o_audio, target_sample)
    # presumably f0_plt writes the temp.jpg shown below — verify in infer_tool
    infer_tool.f0_plt(audio_path, out_path, tran, hubert_soft, feature_input)
    mistake, var = infer_tool.calc_error(audio_path, out_path, tran, feature_input)
    return (
        f"半音偏差:{mistake}\n半音方差:{var}",
        (target_sample, o_audio),
        gr.Image.update("temp.jpg"),
    )

def change_model(model):
    """Load the checkpoint *model* and make it the active model.

    The checkpoint is loaded together with ``configs/{config_name}``. The
    module-level ``net_g_ms``, ``hubert_soft``, ``feature_input``, ``hps_ms``
    and ``model_name`` are rebound only after the load has returned, so if
    ``infer_tool.load_model`` raises, the previously active model and its
    name stay in place together.

    Args:
        model: checkpoint file name, as offered by the model dropdown.

    Returns:
        str: a status message naming the model that was loaded.
    """
    global model_name, net_g_ms, hubert_soft, feature_input, hps_ms
    # Load first; commit the globals only once loading has succeeded.
    loaded = infer_tool.load_model(model, f"configs/{config_name}")
    net_g_ms, hubert_soft, feature_input, hps_ms = loaded
    model_name = model
    return "载入模型:" + model_name

# Collect the selectable checkpoints; list_models() may also rebind model_name,
# which is used below as the dropdown's initial value.
available_models = list_models()

# Build the Gradio UI: a "Basic" tab with the conversion controls and a usage-notes tab.
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""
            本模型为 [sovits_f0](https://huggingface.co/spaces/innnky/nyaru-svc2.0-advanced) 魔改。含AI奶绿(aka. [明前奶绿](https://space.bilibili.com/2132180406))音色,支持**60s以内**的**无伴奏**wav、mp3(单声道)格式,或使用**网页内置**的录音(二选一)

            转换效果取决于源音频语气、节奏是否与目标音色相近,以及音域是否超出目标音色音域范围

            奶绿高音数据效果稍差,一些音高过高的需要考虑降调

            该模型基于 [innnky/so-vits-svc](https://github.com/innnky/so-vits-svc),如果想自己制作并训练模型可以访问这个 [GitHub 仓库](https://github.com/IceKyrin/sovits_guide)

            """)
            # Model picker: changing the selection calls change_model and shows
            # its status message in the Markdown element below it.
            model_selected = gr.Dropdown(choices=available_models, label = "模型", value=model_name)
            current_model_text = gr.Markdown("")
            model_selected.change(change_model, inputs=[model_selected], outputs=[current_model_text])
            # Only one speaker is offered here although spk_dict defines more entries.
            speaker_id = gr.Dropdown(label="音色", choices=['明前奶绿'], value="明前奶绿")
            # Two alternative inputs, both delivered to vc_fn as file paths.
            # NOTE(review): both components use the same elem_id "audio_inputs".
            # NOTE(review): the text in this tab advertises a 60 s limit, while vc_fn
            # only rejects audio longer than 600 s — confirm which limit is intended.
            record_input = gr.Audio(source="microphone", label="录制你的声音", type="filepath", elem_id="audio_inputs")
            upload_input = gr.Audio(source="upload", label="上传音频(长度小于60秒)", type="filepath",
                                    elem_id="audio_inputs")
            # Pitch-shift value forwarded to vc_fn as `tran`.
            vc_transform = gr.Number(label="升降半音(整数,可以正负,半音数量,升高八度就是12)", value=0)
            vc_submit = gr.Button("转换", variant="primary")
            out_audio = gr.Audio(label="Output Audio")
            gr.Markdown(value="""
                        输出信息为音高平均偏差半音数量,体现转换音频的跑调情况(一般平均小于0.5个半音)
                        """)
            out_message = gr.Textbox(label="跑调误差信息")
            gr.Markdown(value="""f0曲线可以直观的显示跑调情况,蓝色为输入音高,橙色为合成音频的音高

                        若**只看见橙色**,说明蓝色曲线被覆盖,转换效果较好

                        """)
            f0_image = gr.Image(label="f0曲线")
        # Wire the button: vc_fn receives the four inputs in this order and its
        # results fill the message, audio and image outputs in this order.
        vc_submit.click(vc_fn, [speaker_id, record_input, upload_input, vc_transform],
                        [out_message, out_audio, f0_image])
        with gr.TabItem("使用说明"):
            gr.Markdown(value="""
                        0、合集:https://github.com/IceKyrin/sovits_guide/blob/main/README.md

                        1、仅支持sovit_f0(sovits2.0)模型

                        2、自行下载hubert-soft-0d54a1f4.pt改名为hubert.pt放置于pth文件夹下(已经下好了)
                            https://github.com/bshall/hubert/releases/tag/v0.1

                        3、pth文件夹下放置sovits2.0的模型

                        4、与模型配套的xxx.json,需有speaker项——人物列表

                        5、放无伴奏的音频、或网页内置录音,不要放奇奇怪怪的格式

                        6、仅供交流使用,不对用户行为负责

                        """)

    # Start the web server; note this call sits inside the `with app:` block.
    app.launch()