forked from r9y9/tacotron_pytorch
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsynthesis.py
103 lines (83 loc) · 3.36 KB
/
synthesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# coding: utf-8
"""
Synthesis waveform from trained model.
usage: tts.py [options] <checkpoint> <text_list_file> <dst_dir>
options:
--file-name-suffix=<s> File name suffix [default: ].
--max-decoder-steps=<N> Max decoder steps [default: 500].
-h, --help Show help message.
"""
from docopt import docopt
# Use text & audio modules from existing Tacotron implementation.
import sys
import os
from os.path import dirname, join
tacotron_lib_dir = join(dirname(__file__), "lib", "tacotron")
sys.path.append(tacotron_lib_dir)
from text import text_to_sequence, symbols
from util import audio
from util.plot import plot_alignment
import torch
from torch.autograd import Variable
import numpy as np
import nltk
from tacotron_pytorch import Tacotron
from hparams import hparams
from tqdm import tqdm
# True when torch reports an available CUDA device; tts() checks this flag to
# decide whether to move the model and its input sequence to the GPU.
use_cuda = torch.cuda.is_available()
def tts(model, text):
    """Synthesize a speech waveform for ``text`` with a Tacotron ``model``.

    The text is converted to a symbol-id sequence using the cleaners named in
    ``hparams.cleaners``, wrapped as a batch of one, and passed through the
    model. The first (only) item of the predicted linear outputs and of the
    alignments is pulled back to the CPU as a NumPy array.

    Returns a tuple ``(waveform, alignment, spectrogram)`` where ``waveform``
    is ``audio.inv_spectrogram`` applied to the transposed linear output,
    ``alignment`` is the alignment array for the utterance, and
    ``spectrogram`` is the linear output passed through
    ``audio._denormalize``.
    """
    if use_cuda:
        model = model.cuda()

    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why.
    # model.decoder.eval()
    # Only the encoder and the postnet are switched to eval mode; the decoder
    # is deliberately left as-is (see the TODO above).
    for submodule_name in ("encoder", "postnet"):
        getattr(model, submodule_name).eval()

    token_ids = np.array(text_to_sequence(text, [hparams.cleaners]))
    # unsqueeze(0) adds the leading batch dimension (batch of one).
    inputs = Variable(torch.from_numpy(token_ids)).unsqueeze(0)
    if use_cuda:
        inputs = inputs.cuda()

    # Greedy decoding
    _mel_outputs, linear_outputs, alignments = model(inputs)

    linear = linear_outputs[0].cpu().data.numpy()
    denormalized = audio._denormalize(linear)
    attention = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear.T)

    return signal, attention, denormalized
if __name__ == "__main__":
    # Command-line interface is defined by the module docstring (docopt).
    args = docopt(__doc__)
    print("Command line args:\n", args)
    checkpoint_path = args["<checkpoint>"]
    text_list_file_path = args["<text_list_file>"]
    dst_dir = args["<dst_dir>"]
    max_decoder_steps = int(args["--max-decoder-steps"])
    file_name_suffix = args["--file-name-suffix"]

    model = Tacotron(n_vocab=len(symbols),
                     embedding_dim=256,
                     mel_dim=hparams.num_mels,
                     linear_dim=hparams.num_freq,
                     r=hparams.outputs_per_step,
                     padding_idx=hparams.padding_idx,
                     use_memory_mask=hparams.use_memory_mask,
                     )

    # Without CUDA, remap every stored tensor onto the CPU so that a
    # checkpoint saved from a GPU run can still be loaded. With CUDA the
    # default behaviour (map_location=None) is kept unchanged.
    if use_cuda:
        map_location = None
    else:
        def map_location(storage, location):
            return storage
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint["state_dict"])
    model.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)

    # One utterance per line; the file is read as bytes and decoded as UTF-8.
    with open(text_list_file_path, "rb") as f:
        lines = f.readlines()

    for idx, line in enumerate(lines):
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would truncate the final utterance when the file
        # has no trailing newline and would leave a stray "\r" on CRLF files.
        text = line.decode("utf-8").rstrip("\r\n")
        words = nltk.word_tokenize(text)
        print("{}: {} ({} chars, {} words)".format(idx, text, len(text), len(words)))
        waveform, alignment, _ = tts(model, text)

        # Outputs are named after the zero-based line index in the text list.
        dst_wav_path = join(dst_dir, "{}{}.wav".format(idx, file_name_suffix))
        dst_alignment_path = join(dst_dir, "{}_alignment.png".format(idx))
        plot_alignment(alignment.T, dst_alignment_path,
                       info="tacotron, {}".format(checkpoint_path))
        audio.save_wav(waveform, dst_wav_path)

    print("Finished! Check out {} for generated audio samples.".format(dst_dir))
    sys.exit(0)