mireiafarrus committed on
Commit
a0aa771
1 Parent(s): af7ac2b

Upload app.py

Files changed (1)
  1. app.py +219 -0
app.py ADDED
@@ -0,0 +1,219 @@
+ import gradio as gr
+
+ from hyper_parameters import tacotron_params as hparams
+ from training import load_model
+
+ from audio_processing import griffin_lim
+ from nn_layers import TacotronSTFT
+
+
+ from text import text_to_sequence
+ from hifigan.env import AttrDict
+ from examples_taco2 import *
+
+ from hifigan.models import Generator
+
+ import torch
+ import numpy as np
+ import json
+ import os
+
+ from matplotlib import pyplot as plt
+
+ # Adjust vertical spacing between subplots
+ plt.subplots_adjust(hspace=0.15)  # You can adjust the value as needed
+
+ # Adjust the white space (margins) around the plot
+ plt.tight_layout(pad=0.5)  # You can adjust the pad value as needed
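+ # Note: these module-level pyplot calls act on the current (implicitly created)
+ # figure only; figures created later in plot_spec_align_sep() are not affected.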
+
+ torch.manual_seed(1234)
+ MAX_WAV_VALUE = 32768.0
+
+
+ def load_checkpoint(filepath, device):
+     assert os.path.isfile(filepath)
+     print("Loading '{}'".format(filepath))
+     checkpoint_dict = torch.load(filepath, map_location=device)
+     print("Complete.")
+     return checkpoint_dict
+
+
+ def plot_spec_align_sep(mel, align):
+     fig_mel = plt.figure(figsize=(4, 3))
+     ax_mel = fig_mel.add_subplot(111)
+     fig_mel.tight_layout()
+     ax_mel.imshow(mel)
+     # fig_mel.set_title('Mel-Scale Spectrogram', fontsize=12)
+
+     fig_align = plt.figure(figsize=(4, 3))
+     ax_align = fig_align.add_subplot(111)
+     fig_align.tight_layout()
+     ax_align.imshow(align)
+     # fig_align.set_title('Alignment', fontsize=12)
+
+     return fig_mel, fig_align
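+ # Note: this helper is not wired into the Gradio interface below; synthesize()
+ # returns the normalized spectrogram and alignment arrays to gr.Image directly.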
+
+
+ # load trained tacotron2 + GST model:
+ model = load_model(hparams)
+ checkpoint_path = "models/checkpoint_78000.model"
+ model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")['state_dict'])
+ # model.to('cuda')
+ _ = model.eval()
+
+ # load pre-trained HiFi-GAN model for mel2audio:
+ hifigan_checkpoint_path = "models/generator_v1"
+ config_file = os.path.join(os.path.split(hifigan_checkpoint_path)[0], 'config.json')
+ with open(config_file) as f:
+     data = f.read()
+ json_config = json.loads(data)
+ h = AttrDict(json_config)
+ device = torch.device("cpu")
+
+ generator = Generator(h).to(device)
+
+ state_dict_g = load_checkpoint(hifigan_checkpoint_path, device)
+ generator.load_state_dict(state_dict_g['generator'])
+ generator.eval()
+ generator.remove_weight_norm()
+
+
+ def synthesize(text, gst_1, gst_2, gst_3, voc):
+     sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
+     sequence = torch.from_numpy(sequence).to(device='cpu', dtype=torch.int64)
+
+     # gst_head_scores = np.array([0.5, 0.15, 0.35])
+     gst_head_scores = np.array([gst_1, gst_2, gst_3])
+     gst_scores = torch.from_numpy(gst_head_scores).float()
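+     # As suggested in the interface notes below, the three head scores should
+     # add up to a value close to 1; totals far from 1 may cause low energy,
+     # mispronunciations or distortion.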
+
+     with torch.no_grad():
+         mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, gst_scores)
+
+     if voc == 0:
+         # mel2wav inference:
+         with torch.no_grad():
+             y_g_hat = generator(mel_outputs_postnet)
+             audio = y_g_hat.squeeze()
+             audio = audio * MAX_WAV_VALUE
+             audio_numpy = audio.cpu().numpy().astype('int16')
+         # audio = vocoder_model.inference(mel_outputs_postnet)
+         # audio_numpy = audio.data.cpu().detach().numpy()
+
+     else:
+         # Griffin-Lim vocoder synthesis:
+         griffin_iters = 60
+         taco_stft = TacotronSTFT(hparams['filter_length'], hparams['hop_length'], hparams['win_length'],
+                                  sampling_rate=hparams['sampling_rate'])
+
+         mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
+         mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
+
+         spec_from_mel_scaling = 60
+         spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
+         spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
+         spec_from_mel = spec_from_mel * spec_from_mel_scaling
+
+         audio = griffin_lim(spec_from_mel[:, :, :-1], taco_stft.stft_fn, griffin_iters)
+
+         audio = audio.squeeze()
+         audio_numpy = audio.cpu().numpy()
+
+     # prepare plot for the output:
+     mel_outputs_postnet = torch.flip(mel_outputs_postnet.squeeze(), [0])
+     mel_outputs_postnet = mel_outputs_postnet.detach().numpy()
+     alignments = alignments.squeeze().T.detach().numpy()
+
+     # normalize numpy arrays between [-1, 1]
+     min_val = np.min(mel_outputs_postnet)
+     max_val = np.max(mel_outputs_postnet)
+     scaled_mel = (mel_outputs_postnet - min_val) / (max_val - min_val)
+     normalized_mel = 2 * scaled_mel - 1
+
+     min_val = np.min(alignments)
+     max_val = np.max(alignments)
+     scaled_align = (alignments - min_val) / (max_val - min_val)
+     normalized_align = 2 * scaled_align - 1
+
+     aw = gr.make_waveform((22050, audio_numpy), bg_image='background_images/wallpaper_test_1_crop_3.jpg',
+                           bars_color=('#f3df4b', '#63edb7'), bar_count=100, bar_width=0.7, animate=True)
+
+     return aw, normalized_mel, normalized_align  # (22050, audio_numpy), fig_mel, fig_align
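+ # A minimal sketch of calling the synthesis function directly, outside the UI,
+ # using the interface's default slider values (the input text is illustrative):
+ #
+ #     video_path, mel_img, align_img = synthesize("Hello world.", 0.4, 0.26, 0.33, voc=0)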
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown("<center><h1>English Neural Text-to-Speech</h1> "
+                 "<h2>Speech Synthesis with Partial Style Control</h2></center><br>")
+     # gr.Markdown("## <center>Unsupervised Style Tokens using Single-Head Attention Parallel Encoder "
+     #             "with Tacotron2</center>")
+     with gr.Row():
+         with gr.Column(scale=1):
+             # , value="Speech synthesis has evolved dramatically since the development of neural architectures capable of generating high quality samples."
+             inp = gr.Textbox(label="Input Text")
+             clear_btn = gr.ClearButton(value='Clear Text', size='sm', components=[inp])
+             # gr.Markdown("Next, we calibrate the *style token* weights:")
+             with gr.Row():
+                 with gr.Column(scale=2):
+                     with gr.Tab("Global Style Tokens"):
+                         gst_1 = gr.Slider(0.2, 0.45, label="GST 1", value=0.4)
+                         gst_2 = gr.Slider(0.2, 0.45, label="GST 2", value=0.26)
+                         gst_3 = gr.Slider(0.2, 0.45, label="GST 3", value=0.33)
+                 with gr.Column(scale=0):
+                     with gr.Tab("Vocoder"):
+                         vocoder = gr.Radio([("HiFi-GAN", 0), ("Griffin-Lim", 1)],
+                                            container=False, value=0, min_width=300)  # label="Vocoder")
+             greet_btn = gr.Button("Synthesize!", scale=1)
+         with gr.Column():
+             with gr.Tab("Spectrogram"):
+                 spec_plot = gr.Image(container=False)
+             with gr.Tab("Alignment"):
+                 align_plot = gr.Image(container=False)
+             wave_video = gr.Video(label="Waveform", height=150, width=800, container=False)
+
+     def display_video():
+         return wave_video
+     greet_btn.click(fn=synthesize, inputs=[inp, gst_1, gst_2, gst_3, vocoder],
+                     outputs=[wave_video, spec_plot, align_plot],
+                     api_name="synthesize")
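+     # Because the click handler is exposed with api_name="synthesize", the demo
+     # can also be queried programmatically. A sketch with gradio_client, assuming
+     # the app runs on the default local address:
+     #
+     #     from gradio_client import Client
+     #     client = Client("http://127.0.0.1:7860/")
+     #     result = client.predict("Hello world.", 0.4, 0.26, 0.33, 0,
+     #                             api_name="/synthesize")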
+
+     with gr.Row():
+         with gr.Column():
+             gr.Examples(examples=infer_from_text_examples,
+                         inputs=[inp, gst_1, gst_2, gst_3, vocoder],
+                         outputs=[wave_video, spec_plot, align_plot],
+                         fn=synthesize,
+                         cache_examples=False)
+             gr.Markdown("""
+             ### Details and Indications
+             This Text-to-Speech (TTS) system consists of two modules: 1) a replicated Tacotron2 model, which generates
+             the spectrogram of the speech corresponding to the input text, and 2) a pre-trained HiFi-GAN vocoder, which
+             maps spectrograms to digital waveforms. Global Style Tokens (GST) have been implemented to capture style
+             information from the female speaker on whose voice the model was trained (see the links below for more
+             information).
+             Feel free to play with the GST scores and observe how the synthetic voice renders the input text.
+             Keep in mind that the GSTs were trained in an unsupervised way, so there is no explicit control over
+             specific style attributes. Try to balance the GST scores so that they add up to a value close to 1;
+             totals well below or above 1 may cause low energy, mispronunciations or distortion.
+             You can choose between the trained HiFi-GAN vocoder and the iterative Griffin-Lim algorithm, which
+             requires no training but produces a "robotic" effect.
+
+             ### More Information
+             The spectrogram generator was adapted and trained from
+             [NVIDIA's](https://github.com/NVIDIA/tacotron2) Tacotron2 implementation, published in
+             <a href="https://arxiv.org/abs/1712.05884" style="display: inline-block;margin-top: .5em;margin-right: .25em;"
+             target="_blank"> <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
+             src="https://img.shields.io/badge/ArXiv-Tacotron2-b31b1b" alt="Tacotron2"></a>
+             <br>
+             The neural vocoder is a pre-trained model replicated from <a href="https://arxiv.org/abs/2010.05646"
+             style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom:
+             0em;display: inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-HiFi%20GAN-b31b1b"
+             alt="HiFiGAN"></a>
+             <br>
+             Unsupervised style control has been implemented following <a href="https://arxiv.org/abs/1803.09017" style="display:
+             inline-block;margin-top: .5em;margin-right: .25em;" target="_blank"> <img style="margin-bottom: 0em;display:
+             inline;margin-top: -.25em;" src="https://img.shields.io/badge/ArXiv-Global%20Style%20Tokens-b31b1b"
+             alt="Global Style Tokens"></a>
+             <br>
+             """)
+
+ demo.launch()