Add vits

Morgic · Feb 4, 2023 · 712a53f · 712a53f
1 parent 24cb262
commit 712a53f
Show file tree

Hide file tree

Showing 36 changed files with 2,877 additions and 291 deletions.
diff --git a/control/cli/ppg2mel_train.py b/control/cli/ppg2mel_train.py
@@ -2,7 +2,7 @@
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
 
 # For reproducibility, comment these may speed up training

diff --git a/control/cli/train_ppg2mel.py b/control/cli/train_ppg2mel.py
@@ -2,7 +2,7 @@
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
 
 # For reproducibility, comment these may speed up training

diff --git a/control/mkgui/train_vc.py b/control/mkgui/train_vc.py
@@ -4,7 +4,7 @@
 from enum import Enum
 from typing import Any, Tuple
 import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 from utils.util import AttrDict
 import torch
 

diff --git a/models/ppg2mel/__init__.py b/models/ppg2mel/__init__.py
@@ -15,7 +15,7 @@
 from .utils.cnn_postnet import Postnet
 from .utils.vc_utils import get_mask_from_lengths
 
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 
 class MelDecoderMOLv2(AbsMelDecoder):
     """Use an encoder to preprocess ppg."""

diff --git a/models/ppg2mel/train.py b/models/ppg2mel/train.py
@@ -2,7 +2,7 @@
 import torch
 import argparse
 import numpy as np
-from utils.load_yaml import HpsYaml
+from utils.hparams import HpsYaml
 from models.ppg2mel.train.train_linglf02mel_seq2seq_oneshotvc import Solver
 
 # For reproducibility, comment these may speed up training

diff --git a/models/ppg2mel/train/solver.py b/models/ppg2mel/train/solver.py
@@ -8,7 +8,6 @@
 
 from .option import default_hparas
 from utils.util import human_format, Timer
-from utils.load_yaml import HpsYaml
 
 
 class BaseSolver():

diff --git a/models/synthesizer/hparams.py b/models/synthesizer/hparams.py
@@ -1,36 +1,4 @@
-import ast
-import pprint
-import json
-
-class HParams(object):
-    def __init__(self, **kwargs): self.__dict__.update(kwargs)
-    def __setitem__(self, key, value): setattr(self, key, value)
-    def __getitem__(self, key): return getattr(self, key)
-    def __repr__(self): return pprint.pformat(self.__dict__)
-
-    def parse(self, string):
-        # Overrides hparams from a comma-separated string of name=value pairs
-        if len(string) > 0:
-            overrides = [s.split("=") for s in string.split(",")]
-            keys, values = zip(*overrides)
-            keys = list(map(str.strip, keys))
-            values = list(map(str.strip, values))
-            for k in keys:
-                self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
-        return self
-
-    def loadJson(self, dict):
-        print("\Loading the json with %s\n", dict)
-        for k in dict.keys():
-            if k not in ["tts_schedule", "tts_finetune_layers"]: 
-                self.__dict__[k] = dict[k]
-        return self
-
-    def dumpJson(self, fp):
-        print("\Saving the json with %s\n", fp)
-        with fp.open("w", encoding="utf-8") as f:
-            json.dump(self.__dict__, f)
-        return self
+from utils.hparams import HParams
 
 hparams = HParams(
         ### Signal Processing (used in both synthesizer and vocoder)
@@ -104,7 +72,7 @@ def dumpJson(self, fp):
         ### SV2TTS
         speaker_embedding_size = 256,               # Dimension for the speaker embedding
         silence_min_duration_split = 0.4,           # Duration in seconds of a silence for an utterance to be split
-        utterance_min_duration = 1.6,               # Duration in seconds below which utterances are discarded
+        utterance_min_duration = 0.5,               # Duration in seconds below which utterances are discarded
         use_gst = True,                             # Whether to use global style token    
         use_ser_for_gst = True,                     # Whether to use speaker embedding referenced for global style token  
         )
diff --git a/models/synthesizer/inference.py b/models/synthesizer/inference.py
@@ -10,7 +10,6 @@
 import numpy as np
 import librosa
 from utils import logmmse
-import json
 from pypinyin import lazy_pinyin, Style
 
 class Synthesizer:
@@ -48,8 +47,7 @@ def load(self):
         # Try to scan config file
         model_config_fpaths = list(self.model_fpath.parent.rglob("*.json"))
         if len(model_config_fpaths)>0 and model_config_fpaths[0].exists():
-            with model_config_fpaths[0].open("r", encoding="utf-8") as f:
-                hparams.loadJson(json.load(f))
+            hparams.loadJson(model_config_fpaths[0])
         """
         Instantiates and loads the model given the weights file that was passed in the constructor.
         """

diff --git a/models/synthesizer/models/base.py b/models/synthesizer/models/base.py
@@ -48,7 +48,11 @@ def log(self, path, msg):
     def load(self, path, device, optimizer=None):
         # Use device of model params as location for loaded state
         checkpoint = torch.load(str(path), map_location=device)
-        self.load_state_dict(checkpoint["model_state"], strict=False)
+        if "model_state" in checkpoint:
+            state = checkpoint["model_state"]
+        else:
+            state = checkpoint["model"]
+        self.load_state_dict(state, strict=False)
 
         if "optimizer_state" in checkpoint and optimizer is not None:
             optimizer.load_state_dict(checkpoint["optimizer_state"])

diff --git a/models/synthesizer/models/sublayer/common/transforms.py b/models/synthesizer/models/sublayer/common/transforms.py
@@ -0,0 +1,193 @@
+import torch
+from torch.nn import functional as F
+
+import numpy as np
+
+
+DEFAULT_MIN_BIN_WIDTH = 1e-3
+DEFAULT_MIN_BIN_HEIGHT = 1e-3
+DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+def piecewise_rational_quadratic_transform(inputs,
+                                           unnormalized_widths,
+                                           unnormalized_heights,
+                                           unnormalized_derivatives,
+                                           inverse=False,
+                                           tails=None,
+                                           tail_bound=1.,
+                                           min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                                           min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
+    """Apply a rational-quadratic spline, with or without tails.
+
+    When ``tails`` is ``None`` the call is forwarded to
+    ``rational_quadratic_spline`` (``tail_bound`` is then unused). Otherwise
+    it is forwarded to ``unconstrained_rational_quadratic_spline`` together
+    with ``tails`` and ``tail_bound``.
+
+    Returns:
+        The ``(outputs, logabsdet)`` pair produced by the selected spline.
+    """
+    # Arguments common to both spline implementations.
+    shared_kwargs = dict(
+        inputs=inputs,
+        unnormalized_widths=unnormalized_widths,
+        unnormalized_heights=unnormalized_heights,
+        unnormalized_derivatives=unnormalized_derivatives,
+        inverse=inverse,
+        min_bin_width=min_bin_width,
+        min_bin_height=min_bin_height,
+        min_derivative=min_derivative,
+    )
+
+    if tails is None:
+        transformed, log_det = rational_quadratic_spline(**shared_kwargs)
+    else:
+        transformed, log_det = unconstrained_rational_quadratic_spline(
+            tails=tails,
+            tail_bound=tail_bound,
+            **shared_kwargs
+        )
+    return transformed, log_det
+
+
+def searchsorted(bin_locations, inputs, eps=1e-6):
+    """Return, for each input, the index of the bin it falls into.
+
+    The index is the number of entries of ``bin_locations`` (along the last
+    dimension) that are ``<=`` the input, minus one. The last edge is raised
+    by ``eps`` before comparing, so an input equal to that edge is assigned
+    to the final bin instead of one past it.
+
+    Args:
+        bin_locations: tensor of bin edges along its last dimension.
+        inputs: tensor of values to locate; a trailing axis is added so it
+            is compared against every edge.
+        eps: amount added to the last edge before the comparison.
+
+    Returns:
+        Integer tensor of bin indices, one per element of ``inputs``.
+    """
+    # Work on a copy: adding eps in place would modify the caller's tensor
+    # and shift the last edge a little further on every repeated call.
+    edges = bin_locations.clone()
+    edges[..., -1] += eps
+    return torch.sum(inputs[..., None] >= edges, dim=-1) - 1
+
+
+def unconstrained_rational_quadratic_spline(inputs,
+                                            unnormalized_widths,
+                                            unnormalized_heights,
+                                            unnormalized_derivatives,
+                                            inverse=False,
+                                            tails='linear',
+                                            tail_bound=1.,
+                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
+    """Rational-quadratic spline on ``[-tail_bound, tail_bound]`` with identity tails.
+
+    Inputs inside the interval are transformed by
+    ``rational_quadratic_spline`` with ``left = bottom = -tail_bound`` and
+    ``right = top = tail_bound``. Inputs outside the interval are returned
+    unchanged with a log-determinant of zero.
+
+    Args:
+        inputs: tensor of values to transform.
+        unnormalized_widths, unnormalized_heights, unnormalized_derivatives:
+            spline parameters. They are indexed with the boolean mask of
+            ``inputs`` plus a trailing ``:``, so presumably they have the
+            shape of ``inputs`` plus one trailing dimension - TODO confirm.
+        inverse: forwarded to ``rational_quadratic_spline``.
+        tails: only ``'linear'`` is implemented.
+        tail_bound: half-width of the interval handled by the spline.
+        min_bin_width, min_bin_height, min_derivative: forwarded to
+            ``rational_quadratic_spline``.
+
+    Returns:
+        ``(outputs, logabsdet)``, both with the shape of ``inputs``.
+
+    Raises:
+        RuntimeError: if ``tails`` is anything other than ``'linear'``.
+    """
+    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+    outside_interval_mask = ~inside_interval_mask
+
+    outputs = torch.zeros_like(inputs)
+    logabsdet = torch.zeros_like(inputs)
+
+    if tails == 'linear':
+        # Add one derivative slot at each end for the interval boundaries.
+        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+        # Inverse softplus of (1 - min_derivative): the spline computes
+        # min_derivative + softplus(x), so the boundary slopes come out as 1
+        # and match the identity tails.
+        constant = np.log(np.exp(1 - min_derivative) - 1)
+        unnormalized_derivatives[..., 0] = constant
+        unnormalized_derivatives[..., -1] = constant
+
+        outputs[outside_interval_mask] = inputs[outside_interval_mask]
+        logabsdet[outside_interval_mask] = 0
+    else:
+        raise RuntimeError('{} tails are not implemented.'.format(tails))
+
+    # Skip the spline when nothing lies inside the interval: it reduces its
+    # inputs with torch.min / torch.max, which raise on an empty tensor.
+    if inside_interval_mask.any():
+        outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
+            inputs=inputs[inside_interval_mask],
+            unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+            unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+            unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+            inverse=inverse,
+            left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
+            min_bin_width=min_bin_width,
+            min_bin_height=min_bin_height,
+            min_derivative=min_derivative
+        )
+
+    return outputs, logabsdet
+
+def rational_quadratic_spline(inputs,
+                              unnormalized_widths,
+                              unnormalized_heights,
+                              unnormalized_derivatives,
+                              inverse=False,
+                              left=0., right=1., bottom=0., top=1.,
+                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+                              min_derivative=DEFAULT_MIN_DERIVATIVE):
+    """Evaluate a piecewise rational-quadratic spline or its inverse.
+
+    Bin widths and heights are obtained from a softmax over the last
+    dimension of the unnormalized parameters, floored at ``min_bin_width`` /
+    ``min_bin_height`` and scaled to ``[left, right]`` / ``[bottom, top]``.
+    Knot derivatives are ``min_derivative + softplus(unnormalized_derivatives)``.
+
+    Args:
+        inputs: values to transform.
+        unnormalized_widths: its last dimension gives the number of bins.
+        unnormalized_heights: softmaxed over the last dimension like widths.
+        unnormalized_derivatives: presumably one more entry than bins along
+            the last dimension (entries ``idx`` and ``idx + 1`` are both
+            gathered) - TODO confirm against callers.
+        inverse: if true, solve for the pre-image and negate the
+            log-determinant.
+        left, right, bottom, top: input range and output range of the spline.
+        min_bin_width, min_bin_height, min_derivative: lower bounds applied
+            to the normalized widths, heights and derivatives.
+
+    Returns:
+        ``(outputs, logabsdet)``; in inverse mode the second element is the
+        negated log-determinant.
+
+    Raises:
+        ValueError: if any input is below ``left`` or above ``right``, or if
+            ``min_bin_width`` / ``min_bin_height`` times the number of bins
+            exceeds 1.
+    """
+    # NOTE(review): this check uses left/right in inverse mode as well, while
+    # the inverse bin search below runs on cumheights (bottom/top) - confirm
+    # this is intended when the two ranges differ.
+    if torch.min(inputs) < left or torch.max(inputs) > right:
+        raise ValueError('Input to a transform is not within its domain')
+
+    num_bins = unnormalized_widths.shape[-1]
+
+    if min_bin_width * num_bins > 1.0:
+        raise ValueError('Minimal bin width too large for the number of bins')
+    if min_bin_height * num_bins > 1.0:
+        raise ValueError('Minimal bin height too large for the number of bins')
+
+    # Normalized widths: each at least min_bin_width, summing to 1.
+    widths = F.softmax(unnormalized_widths, dim=-1)
+    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+    # Cumulative bin edges, with a leading 0, rescaled to [left, right].
+    cumwidths = torch.cumsum(widths, dim=-1)
+    cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
+    cumwidths = (right - left) * cumwidths + left
+    # Pin the outer edges exactly, then recompute widths from the edges.
+    cumwidths[..., 0] = left
+    cumwidths[..., -1] = right
+    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+    # Same construction as for the widths, on the output axis.
+    heights = F.softmax(unnormalized_heights, dim=-1)
+    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+    cumheights = torch.cumsum(heights, dim=-1)
+    cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
+    cumheights = (top - bottom) * cumheights + bottom
+    cumheights[..., 0] = bottom
+    cumheights[..., -1] = top
+    heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+    # Locate each input's bin: on the output edges when inverting, on the
+    # input edges otherwise.
+    if inverse:
+        bin_idx = searchsorted(cumheights, inputs)[..., None]
+    else:
+        bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+    # Gather the parameters of the selected bin for every input.
+    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+    # Average slope (height / width) of each bin.
+    delta = heights / widths
+    input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+    # Derivatives at the left and right knot of the selected bin.
+    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+    input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+    if inverse:
+        # Coefficients of the quadratic a*x^2 + b*x + c = 0 whose root is the
+        # relative position inside the bin.
+        a = (((inputs - input_cumheights) * (input_derivatives
+                                             + input_derivatives_plus_one
+                                             - 2 * input_delta)
+              + input_heights * (input_delta - input_derivatives)))
+        b = (input_heights * input_derivatives
+             - (inputs - input_cumheights) * (input_derivatives
+                                              + input_derivatives_plus_one
+                                              - 2 * input_delta))
+        c = - input_delta * (inputs - input_cumheights)
+
+        discriminant = b.pow(2) - 4 * a * c
+        # NOTE(review): assert statements are removed under ``python -O``, so
+        # this check disappears in optimized runs.
+        assert (discriminant >= 0).all()
+
+        # Root written as 2c / (-b - sqrt(disc)) rather than the textbook
+        # (-b + sqrt(disc)) / (2a) form.
+        root = (2 * c) / (-b - torch.sqrt(discriminant))
+        # Map the relative position back to the input axis.
+        outputs = root * input_bin_widths + input_cumwidths
+
+        theta_one_minus_theta = root * (1 - root)
+        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+                                     * theta_one_minus_theta)
+        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+                                                     + 2 * input_delta * theta_one_minus_theta
+                                                     + input_derivatives * (1 - root).pow(2))
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        # The log-determinant of the inverse is the negated forward one.
+        return outputs, -logabsdet
+    else:
+        # Relative position of the input inside its bin.
+        theta = (inputs - input_cumwidths) / input_bin_widths
+        theta_one_minus_theta = theta * (1 - theta)
+
+        numerator = input_heights * (input_delta * theta.pow(2)
+                                     + input_derivatives * theta_one_minus_theta)
+        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+                                     * theta_one_minus_theta)
+        outputs = input_cumheights + numerator / denominator
+
+        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+                                                     + 2 * input_delta * theta_one_minus_theta
+                                                     + input_derivatives * (1 - theta).pow(2))
+        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+        return outputs, logabsdet