del nltk - oversplits No. 47

Files changed:
- Modules/diffusion/modules.py  +0 -366
- api.py  +9 -21
- models.py  +8 -98
- msinference.py  +9 -11
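The motivation suggested by the commit title is that NLTK's punkt sentence tokenizer over-splits text containing abbreviations such as "No. 47", so the `nltk.download('punkt')` setup is dropped. A minimal sketch of the behaviour being avoided; the example sentence and the shown split are illustrative assumptions, not taken from the repo:

import nltk
from nltk.tokenize import sent_tokenize

# This is the download that the commit removes from api.py's import-time setup.
nltk.download('punkt')

text = "Please read chapter No. 47 before the next session."
print(sent_tokenize(text))
# Depending on the punkt model, "No." can be treated as a sentence boundary,
# e.g. ['Please read chapter No.', '47 before the next session.'],
# which turns one utterance into two separate TTS calls.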
Modules/diffusion/modules.py
DELETED
@@ -1,366 +0,0 @@
-from math import floor, log, pi
-import torch.nn.functional as F
-import torch
-import torch.nn as nn
-from einops import rearrange, reduce, repeat
-from einops.layers.torch import Rearrange
-from einops_exts import rearrange_many
-from torch import Tensor, einsum
-
-
-def default(val, d):
-    if val is not None:  # exists(val):
-        return val
-    return d  # d() if isfunction(d) else d
-
-class AdaLayerNorm(nn.Module):
-    def __init__(self, style_dim, channels, eps=1e-5):
-        super().__init__()
-        self.channels = channels
-        self.eps = eps
-
-        self.fc = nn.Linear(style_dim, channels*2)
-
-    def forward(self, x, s):
-        x = x.transpose(-1, -2)
-        x = x.transpose(1, -1)
-
-        h = self.fc(s)
-        h = h.view(h.size(0), h.size(1), 1)
-        gamma, beta = torch.chunk(h, chunks=2, dim=1)
-        gamma, beta = gamma.transpose(1, -1), beta.transpose(1, -1)
-
-
-        x = F.layer_norm(x, (self.channels,), eps=self.eps)
-        x = (1 + gamma) * x + beta
-        return x.transpose(1, -1).transpose(-1, -2)
-
-class StyleTransformer1d(nn.Module):
-
-    # artificial_stylets / models.py
-
-    def __init__(
-        self,
-        num_layers: int,
-        channels: int,
-        num_heads: int,
-        head_features: int,
-        multiplier: int,
-        use_context_time: bool = True,
-        use_rel_pos: bool = False,
-        context_features_multiplier: int = 1,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-        context_features=None,
-        context_embedding_features=None,
-        embedding_max_length=512,
-    ):
-        super().__init__()
-
-        self.blocks = nn.ModuleList(
-            [
-                StyleTransformerBlock(
-                    features=channels + context_embedding_features,
-                    head_features=head_features,
-                    num_heads=num_heads,
-                    multiplier=multiplier,
-                    style_dim=context_features,
-                    use_rel_pos=use_rel_pos,
-                    # rel_pos_num_buckets=rel_pos_num_buckets,
-                    # rel_pos_max_distance=rel_pos_max_distance,
-                )
-                for i in range(num_layers)
-            ]
-        )
-
-        self.to_out = nn.Sequential(
-            Rearrange("b t c -> b c t"),
-            nn.Conv1d(
-                in_channels=channels + context_embedding_features,
-                out_channels=channels,
-                kernel_size=1,
-            ),
-        )
-
-        use_context_features = context_features is not None
-        self.use_context_features = use_context_features
-        self.use_context_time = use_context_time
-
-        if use_context_time or use_context_features:
-            # print(f'{use_context_time=} {use_context_features=}ooooooooooooooooooooooooooooooooooo')
-            # raise ValueError
-            # True True both context
-            context_mapping_features = channels + context_embedding_features
-
-            self.to_mapping = nn.Sequential(
-                nn.Linear(context_mapping_features, context_mapping_features),
-                nn.GELU(),
-                nn.Linear(context_mapping_features, context_mapping_features),
-                nn.GELU(),
-            )
-
-        if use_context_time:
-
-            self.to_time = nn.Sequential(
-                TimePositionalEmbedding(
-                    dim=channels, out_features=context_mapping_features
-                ),
-                nn.GELU(),
-            )
-
-        if use_context_features:
-
-            self.to_features = nn.Sequential(
-                nn.Linear(
-                    in_features=context_features, out_features=context_mapping_features
-                ),
-                nn.GELU(),
-            )
-
-        # self.fixed_embedding = FixedEmbedding(
-        #     max_length=embedding_max_length, features=context_embedding_features
-        # ) # Non speker-aware LookUp: EMbedding looks just the time-frame-index [0,1,2...,num-asr-time-frames]
-
-    def get_mapping(
-        self,
-        time=None,
-        features=None):
-        """Combines context time features and features into mapping"""
-        items, mapping = [], None
-        # Compute time features
-        if self.use_context_time:
-
-            items += [self.to_time(time)]
-        # Compute features
-        if self.use_context_features:
-
-            items += [self.to_features(features)]
-
-        # Compute joint mapping
-        if self.use_context_time or self.use_context_features:
-            # raise ValueError
-            mapping = reduce(torch.stack(items), "n b m -> b m", "sum")
-            mapping = self.to_mapping(mapping)
-
-        return mapping
-
-    def forward(self,
-                x,
-                time,
-                embedding=None,
-                features=None):
-
-        # --
-        # called by forward()
-
-        mapping = self.get_mapping(time, features)
-        x = torch.cat([x.expand(-1, embedding.size(1), -1), embedding], axis=-1)
-        mapping = mapping.unsqueeze(1).expand(-1, embedding.size(1), -1)
-        for block in self.blocks:
-            x = x + mapping
-            x = block(x, features)
-        x = x.mean(axis=1).unsqueeze(1)
-        x = self.to_out(x)
-        x = x.transpose(-1, -2)
-        return x
-
-
-class StyleTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        features: int,
-        num_heads: int,
-        head_features: int,
-        style_dim: int,
-        multiplier: int,
-        use_rel_pos: bool,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-        context_features=None,
-    ):
-        super().__init__()
-
-        self.use_cross_attention = (context_features is not None) and (context_features > 0)
-        # print(f'{rel_pos_num_buckets=} {rel_pos_max_distance=}') # None None
-        # raise ValueError
-        self.attention = StyleAttention(
-            features=features,
-            style_dim=style_dim,
-            num_heads=num_heads,
-            head_features=head_features
-        )
-
-        if self.use_cross_attention:
-            raise ValueError
-
-        self.feed_forward = FeedForward(features=features, multiplier=multiplier)
-
-    def forward(self, x: Tensor, s: Tensor, *, context=None) -> Tensor:
-        x = self.attention(x, s) + x
-        if self.use_cross_attention:
-            raise ValueError
-            # x = self.cross_attention(x, s, context=context) + x
-        x = self.feed_forward(x) + x
-        return x
-
-class StyleAttention(nn.Module):
-    def __init__(
-        self,
-        features: int,
-        *,
-        style_dim: int,
-        head_features: int,
-        num_heads: int,
-        context_features=None,
-        # use_rel_pos: bool,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-    ):
-        super().__init__()
-        self.context_features = context_features
-        mid_features = head_features * num_heads
-        context_features = default(context_features, features)
-
-        self.norm = AdaLayerNorm(style_dim, features)
-        self.norm_context = AdaLayerNorm(style_dim, context_features)
-        self.to_q = nn.Linear(
-            in_features=features, out_features=mid_features, bias=False
-        )
-        self.to_kv = nn.Linear(
-            in_features=context_features, out_features=mid_features * 2, bias=False
-        )
-        self.attention = AttentionBase(
-            features,
-            num_heads=num_heads,
-            head_features=head_features
-        )
-
-    def forward(self, x, s, *, context=None):
-
-        if context is not None:
-            raise ValueError
-        context = default(context, x)
-
-
-        x, context = self.norm(x, s), self.norm_context(context, s)
-
-        q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
-
-        return self.attention(q, k, v)
-
-
-def FeedForward(features,
-                multiplier):
-    mid_features = features * multiplier
-    return nn.Sequential(
-        nn.Linear(in_features=features, out_features=mid_features),
-        nn.GELU(),
-        nn.Linear(in_features=mid_features, out_features=features),
-    )
-
-
-class AttentionBase(nn.Module):
-    def __init__(
-        self,
-        features,
-        *,
-        head_features,
-        num_heads):
-        super().__init__()
-        self.scale = head_features ** -0.5
-        self.num_heads = num_heads
-        mid_features = head_features * num_heads
-        self.to_out = nn.Linear(in_features=mid_features,
-                                out_features=features)
-
-    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
-        # Split heads
-        q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=self.num_heads)
-        # Compute similarity matrix
-        sim = einsum("... n d, ... m d -> ... n m", q, k)
-
-        # _____THERE_IS_NO_rel_po
-        # sim = (sim + self.rel_pos(*sim.shape[-2:])) if self.use_rel_pos else sim
-        # print(self.rel_pos)
-
-        sim = sim * self.scale
-        # Get attention matrix with softmax
-        attn = sim.softmax(dim=-1)
-        # Compute values
-        out = einsum("... n m, ... m d -> ... n d", attn, v)
-        out = rearrange(out, "b h n d -> b n (h d)")
-        return self.to_out(out)
-
-
-class Attention(nn.Module):
-    def __init__(
-        self,
-        features,
-        *,
-        head_features,
-        num_heads,
-        out_features=None,
-        context_features=None,
-        # use_rel_pos,
-        # rel_pos_num_buckets: Optional[int] = None,
-        # rel_pos_max_distance: Optional[int] = None,
-    ):
-        super().__init__()
-        self.context_features = context_features
-        mid_features = head_features * num_heads
-        context_features = default(context_features, features)
-
-        self.norm = nn.LayerNorm(features)
-        self.norm_context = nn.LayerNorm(context_features)
-        self.to_q = nn.Linear(
-            in_features=features, out_features=mid_features, bias=False
-        )
-        self.to_kv = nn.Linear(
-            in_features=context_features, out_features=mid_features * 2, bias=False
-        )
-
-        self.attention = AttentionBase(
-            features,
-            out_features=out_features,
-            num_heads=num_heads,
-            head_features=head_features,
-            # use_rel_pos=use_rel_pos,
-            # rel_pos_num_buckets=rel_pos_num_buckets,
-            # rel_pos_max_distance=rel_pos_max_distance,
-        )
-
-    def forward(self, x: Tensor, *, context=None) -> Tensor:
-        # assert_message = "You must provide a context when using context_features"
-        # assert not self.context_features or exists(context), assert_message
-        # Use context if provided
-        context = default(context, x)
-        # Normalize then compute q from input and k,v from context
-        x, context = self.norm(x), self.norm_context(context)
-        q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
-        # Compute and return attention
-        return self.attention(q, k, v)
-
-
-class LearnedPositionalEmbedding(nn.Module):
-    """Used for continuous time"""
-
-    def __init__(self, dim: int):
-        super().__init__()
-        assert (dim % 2) == 0
-        half_dim = dim // 2
-        self.weights = nn.Parameter(torch.randn(half_dim))
-
-    def forward(self, x: Tensor) -> Tensor:
-        x = rearrange(x, "b -> b 1")
-        freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * pi
-        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
-        fouriered = torch.cat((x, fouriered), dim=-1)
-        return fouriered
-
-
-def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
-    return nn.Sequential(
-        LearnedPositionalEmbedding(dim),
-        nn.Linear(in_features=dim + 1, out_features=out_features),
-    )
-
api.py
CHANGED
@@ -10,7 +10,6 @@ import srt
 import subprocess
 import cv2
 import markdown
-import json
 from pathlib import Path
 from types import SimpleNamespace
 from flask import Flask, request, send_from_directory
@@ -25,8 +24,7 @@ sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()

 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

-
-nltk.download('punkt')
+

 # SSH AGENT
 # eval $(ssh-agent -s)
@@ -150,8 +148,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
                        text=None,
                        voice=None,
                        soundscape=None,
-                       speed=None,
-                       diffusion_steps=7):
+                       speed=None):
     '''create 24kHZ np.array with tts

     precomputed_style_vector : required if en_US or en_UK in voice, so
@@ -168,10 +165,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
         x = []
         for _sentence in text:
             x.append(msinference.inference(_sentence,
-                                           precomputed_style_vector,
-                                           alpha=0.3,
-                                           beta=0.7,
-                                           diffusion_steps=diffusion_steps)
+                                           precomputed_style_vector)
                      )
         x = np.concatenate(x)

@@ -270,7 +264,6 @@ def serve_wav():
     # ====STYLE VECTOR====

     precomputed_style_vector = None
-    diffusion_steps = 7  # 7=native / 5=non-native

     if args.native:  # Voice Cloning
         try:
@@ -307,7 +300,7 @@ def serve_wav():
                 '/', '_').replace('#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
-
+

         # Foreign Lang - MMS/TTS
         else:
@@ -448,8 +441,7 @@ def serve_wav():
                              precomputed_style_vector=precomputed_style_vector,
                              voice=args.voice,
                              soundscape=args.soundscape,
-                             speed=args.speed,
-                             diffusion_steps=diffusion_steps)
+                             speed=args.speed)
                          )
         total = np.concatenate(pieces, 0)
         # x = audresample.resample(x.astype(np.float32), 24000, 22050) # reshapes (64,) -> (1,64)
@@ -470,8 +462,7 @@ def serve_wav():
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
-                              speed=args.speed,
-                              diffusion_steps=diffusion_steps)
+                              speed=args.speed)
         soundfile.write(AUDIO_TRACK, x, 24000)

     # IMAGE 2 SPEECH
@@ -490,8 +481,7 @@ def serve_wav():
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
-                              speed=args.speed,
-                              diffusion_steps=diffusion_steps
+                              speed=args.speed
                               )
         soundfile.write(AUDIO_TRACK, x, 24000)
         if args.video or args.image:
@@ -520,8 +510,7 @@ def serve_wav():
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
-                              speed=args.speed,
-                              diffusion_steps=diffusion_steps)
+                              speed=args.speed)
         OUT_FILE = 'tmp.wav'
         soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

@@ -529,8 +518,7 @@ def serve_wav():


         # audios = [msinference.inference(text,
-        #                                 msinference.compute_style(f'voices/{voice}.wav')
-        #                                 alpha=0.3, beta=0.7, diffusion_steps=7)]
+        #                                 msinference.compute_style(f'voices/{voice}.wav'))]
         # # for t in [text]:
         # output_buffer = io.BytesIO()
         # write(output_buffer, 24000, np.concatenate(audios))
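After this change, `tts_multi_sentence` forwards only the sentence and the precomputed style vector; `alpha`, `beta`, and `diffusion_steps` are no longer threaded through from `serve_wav`, so `msinference.inference` runs with its own defaults. A rough sketch of the new call path; `compute_style` appears in the repo's commented code, but the voice file path below is a made-up example:

import numpy as np
import soundfile
import msinference

# Style vector computed once per request (hypothetical voice path).
style = msinference.compute_style('voices/en_US_example.wav')

# Per-sentence synthesis no longer passes alpha/beta/diffusion_steps explicitly.
sentences = ["Hello there.", "Chapter No. 47 begins now."]
pieces = [msinference.inference(s, style) for s in sentences]

soundfile.write('out.wav', np.concatenate(pieces), 24000)  # 24 kHz output, as in api.py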
models.py
CHANGED
@@ -1,96 +1,15 @@
 #coding:utf-8

 import os
-import os.path as osp
-import copy
 import math
-import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn.utils import weight_norm,
+from torch.nn.utils import weight_norm, spectral_norm
 from Utils.ASR.models import ASRCNN
 from Utils.JDC.model import JDCNet
-
-from Modules.diffusion.modules import StyleTransformer1d
-
 from munch import Munch
 import yaml
-from math import pi
-from random import randint
-
-import torch
-from einops import rearrange
-from torch import Tensor, nn
-from tqdm import tqdm
-
-
-
-
-
-def get_default_model_kwargs():
-    return dict(
-        channels=128,
-        patch_size=16,
-        multipliers=[1, 2, 4, 4, 4, 4, 4],
-        factors=[4, 4, 4, 2, 2, 2],
-        num_blocks=[2, 2, 2, 2, 2, 2],
-        attentions=[0, 0, 0, 1, 1, 1, 1],
-        attention_heads=8,
-        attention_features=64,
-        attention_multiplier=2,
-        attention_use_rel_pos=False,
-        diffusion_type="v",
-        diffusion_sigma_distribution=UniformDistribution(),
-    )
-
-
-def get_default_sampling_kwargs():
-    return dict(sigma_schedule=LinearSchedule(), sampler=VSampler(), clamp=True)
-
-class AudioDiffusionConditional(nn.Module):
-    def __init__(
-        self,
-        embedding_features: int,
-        embedding_max_length: int,
-        embedding_mask_proba: float = 0.1,
-        **kwargs,
-    ):
-        self.unet = None
-        self.embedding_mask_proba = embedding_mask_proba
-        # default_kwargs = dict(
-        #     **get_default_model_kwargs(),
-        #     unet_type="cfg",
-        #     context_embedding_features=embedding_features,
-        #     context_embedding_max_length=embedding_max_length,
-        # )
-        super().__init__()
-
-    def forward(self, *args, **kwargs):
-        default_kwargs = dict(embedding_mask_proba=self.embedding_mask_proba)
-        # here embedding_scale = 1.0 is passed to DiffusionSampler() - del no-op if scale = 1.0
-        return self.diffusion(*args, **{**default_kwargs, **kwargs})
-
-    # def sample(self, *args, **kwargs):
-    #     default_kwargs = dict(
-    #         **get_default_sampling_kwargs(),
-    #         embedding_scale=5.0,
-    #     )
-    #     return super().sample(*args, **{**default_kwargs, **kwargs})
-
-
-
-
-
-
-
-
-
-
-
-
-
-


 class LearnedDownSample(nn.Module):
@@ -106,10 +25,11 @@ class LearnedDownSample(nn.Module):
             self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
         else:
             raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
-
+
     def forward(self, x):
         return self.conv(x)

+
 class DownSample(nn.Module):
     def __init__(self, layer_type):
         super().__init__()
@@ -187,6 +107,7 @@ class ResBlk(nn.Module):
         x = self._shortcut(x) + self._residual(x)
         return x / math.sqrt(2)  # unit variance

+
 class StyleEncoder(nn.Module):
     def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
         super().__init__()
@@ -211,9 +132,9 @@ class StyleEncoder(nn.Module):
         h = self.shared(x)
         h = h.view(h.size(0), -1)
         s = self.unshared(h)
-
         return s

+
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
         super(LinearNorm, self).__init__()
@@ -226,6 +147,7 @@ class LinearNorm(torch.nn.Module):
     def forward(self, x):
         return self.linear_layer(x)

+
 class ResBlk1d(nn.Module):
     def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                  normalize=False, downsample='none', dropout_p=0.2):
@@ -286,6 +208,7 @@ class ResBlk1d(nn.Module):
         x = self._shortcut(x) + self._residual(x)
         return x / math.sqrt(2)  # unit variance

+
 class LayerNorm(nn.Module):
     def __init__(self, channels, eps=1e-5):
         super().__init__()
@@ -299,7 +222,7 @@ class LayerNorm(nn.Module):
         x = x.transpose(1, -1)
         x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
         return x.transpose(1, -1)
-
+
 class TextEncoder(nn.Module):
     def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
         super().__init__()
@@ -612,19 +535,6 @@ def build_model(args, text_aligner, pitch_extractor, bert):

     style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)  # acoustic style encoder
     predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)  # prosodic style encoder
-
-    # define diffusion model
-    if args.multispeaker:
-        transformer = StyleTransformer1d(channels=args.style_dim*2,
-                                         context_embedding_features=bert.config.hidden_size,
-                                         context_features=args.style_dim*2,
-                                         **args.diffusion.transformer)
-    else:
-        raise NotImplementedError
-
-
-
-
     nets = Munch(
         bert=bert,
         bert_encoder=nn.Linear(bert.config.hidden_size, args.hidden_dim),
msinference.py
CHANGED
@@ -1,25 +1,20 @@
 import torch
 from cached_path import cached_path
-import nltk
+# import nltk
 import audresample
 # nltk.download('punkt')
 import numpy as np
-np.random.seed(0)
-import time
 import yaml
-import torch.nn.functional as F
-import copy
 import torchaudio
 import librosa
 from models import *
 from munch import Munch
-from torch import nn
 from nltk.tokenize import word_tokenize

 torch.manual_seed(0)
 # torch.backends.cudnn.benchmark = False
 # torch.backends.cudnn.deterministic = True
-
+np.random.seed(0)

 # IPA Phonemizer: https://github.com/bootphon/phonemizer

@@ -164,11 +159,12 @@ _ = [model[key].eval() for key in model]

 def inference(text,
               ref_s,
-              alpha = 0.3,
-              beta = 0.7,
-              diffusion_steps=7,  # 7 if voice is native English else 5 for non-native
               use_gruut=False):
+    # Ignore .,; AT end of sentence; or just [-50:]
+
+
     text = text.strip()
+
     ps = global_phonemizer.phonemize([text])
     # print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
     ps = word_tokenize(ps[0])
@@ -245,7 +241,7 @@ def inference(text,
                          F0_pred, N_pred, ref.squeeze().unsqueeze(0))


-    x = x.squeeze().cpu().numpy()[..., :-
+    x = x.squeeze().cpu().numpy()[..., :-74]  # weird pulse at the end of the model

     x /= np.abs(x).max() + 1e-7

@@ -476,3 +472,5 @@ def foreign(text=None,  # list of text

     # x = synthesize(text=_t, lang=LANG, speed=1.14)
     # audiofile.write('_r.wav', x, 16000)  # mms-tts = 16,000
+
+
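For reference, the new `[..., :-74]` trim in `inference()` drops 74 samples from the end of each synthesized chunk, which is what removes the "weird pulse" noted in the diff comment. The duration arithmetic below is ours, assuming the 24 kHz output rate used elsewhere in the repo:

# Duration removed by the end-of-utterance trim (assumption: 24 kHz output, as in api.py).
samples_trimmed = 74
sample_rate = 24000
print(f'{1000 * samples_trimmed / sample_rate:.2f} ms trimmed')  # ~3.08 ms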