David Portes committed on
Commit
810cc75
1 Parent(s): 83f5570
Files changed (1)
  1. hyperparams.yaml +237 -0
hyperparams.yaml ADDED
@@ -0,0 +1,237 @@
# Generated 2022-10-08 from:
# /home/xportes/projects/speechbrain/recipes/LJSpeech/TTS/tacotron2/hparams/train.yaml
# yamllint disable
############################################################################
# Model: Tacotron2
# Tokens: Raw characters (English text)
# Losses: MSE (mel) + BCE (gate) + guided attention
# Training: LJSpeech
# Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
# ############################################################################


###################################
# Experiment Parameters and setup #
###################################
seed: 7234
__set_seed: !apply:torch.manual_seed [7234]
output_folder: ./results/tacotron2/7234
save_folder: ./results/tacotron2/7234/save
train_log: ./results/tacotron2/7234/train_log.txt
epochs: 750
keep_checkpoint_interval: 50

###################################
# Progress Samples #
###################################
# Progress samples are used to monitor the progress
# of an ongoing training session by outputting samples
# of spectrograms, alignments, etc. at regular intervals

# Whether to enable progress samples
progress_samples: true

# The path where the samples will be stored
progress_sample_path: ./results/tacotron2/7234/samples
# The interval, in epochs. For instance, if it is set to 5,
# progress samples will be output every 5 epochs
progress_samples_interval: 1
# The sample size for raw batch samples saved in batch.pth
# (useful mostly for model debugging)
progress_batch_sample_size: 3

#################################
# Data files and pre-processing #
#################################
data_folder: ../../../../LJSpeech-1.1
# e.g., /localscratch/ljspeech

train_json: ./results/tacotron2/7234/save/train.json
valid_json: ./results/tacotron2/7234/save/valid.json
test_json: ./results/tacotron2/7234/save/test.json

splits: [train, valid]
split_ratio: [90, 10]

skip_prep: false

# Use the original preprocessing from nvidia
# The cleaners to be used (applicable to nvidia only)
text_cleaners: [english_cleaners]

################################
# Audio Parameters #
################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized:
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true

################################
# Optimization Hyperparameters #
################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 64 #minimum 2
mask_padding: true
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0

train_dataloader_opts:
  batch_size: 64
  drop_last: false #True #False
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

valid_dataloader_opts:
  batch_size: 64
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

test_dataloader_opts:
  batch_size: 64
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

################################
# Model Parameters and model #
################################
n_symbols: 150 #fixed depending on symbols in textToSequence
symbols_embedding_dim: 512

# Encoder parameters
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512

# Decoder parameters
# The number of frames in the target per encoder step
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false

# Attention parameters
attention_rnn_dim: 1024
attention_dim: 128

# Location Layer parameters
attention_location_n_filters: 32
attention_location_kernel_size: 31

# Mel-post processing network parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5

mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_fft: 1024
  n_mels: 80
  f_min: 0.0
  f_max: 8000.0
  power: 1
  normalized:
  norm: slaney
  mel_scale: slaney
  compression: true

#model
model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2

#optimizer
  mask_padding: true
  n_mel_channels: 80
  # symbols
  n_symbols: 150
  symbols_embedding_dim: 512
  # encoder
  encoder_kernel_size: 5
  encoder_n_convolutions: 3
  encoder_embedding_dim: 512
  # attention
  attention_rnn_dim: 1024
  attention_dim: 128
  # attention location
  attention_location_n_filters: 32
  attention_location_kernel_size: 31
  # decoder
  n_frames_per_step: 1
  decoder_rnn_dim: 1024
  prenet_dim: 256
  max_decoder_steps: 1000
  gate_threshold: 0.5
  p_attention_dropout: 0.1
  p_decoder_dropout: 0.1
  # postnet
  postnet_embedding_dim: 512
  postnet_kernel_size: 5
  postnet_n_convolutions: 5
  decoder_no_early_stopping: false

guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
  initial_value: 50.0
  half_life: 10.

criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
  gate_loss_weight: 1.0
  guided_attention_weight: 50.0
  guided_attention_sigma: 0.2
  guided_attention_scheduler: *id001
  guided_attention_hard_stop: 50

modules:
  model: *id002
opt_class: !name:torch.optim.Adam
  lr: 0.001
  weight_decay: 0.000006

#epoch object
epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 750

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: ./results/tacotron2/7234/train_log.txt

#annealing_function
lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler

#infer: !name:speechbrain.lobes.models.Tacotron2.infer

  intervals:
  - steps: 6000
    lr: 0.0005
  - steps: 8000
    lr: 0.0003
  - steps: 10000
    lr: 0.0001

#checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/tacotron2/7234/save
  recoverables:
    model: *id002
    counter: *id003
    scheduler: *id004
progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
  output_path: ./results/tacotron2/7234/samples
  batch_sample_size: 3
  formats:
    raw_batch: raw
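
Note: the file above is a SpeechBrain HyperPyYAML dump, so the `!new:` tags instantiate Python objects (the Tacotron2 model, schedulers, the checkpointer), `!name:` wraps callables such as `torch.optim.Adam` and `mel_spectogram`, and `!apply:` runs `torch.manual_seed` at load time. As a rough illustration only (not part of this commit), a file like this is typically resolved with `hyperpyyaml.load_hyperpyyaml`; the sketch below assumes `speechbrain` and `hyperpyyaml` are installed and that the file is saved locally as `hyperparams.yaml`.

from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as fin:
    # Resolving the YAML executes the tags: the Tacotron2 model, the
    # guided-attention/LR schedulers, and the Checkpointer are created here.
    hparams = load_hyperpyyaml(fin)

tacotron2 = hparams["model"]           # instance of speechbrain.lobes.models.Tacotron2.Tacotron2
make_optimizer = hparams["opt_class"]  # callable building torch.optim.Adam(lr=0.001, weight_decay=6e-6)
optimizer = make_optimizer(tacotron2.parameters())
print(hparams["sample_rate"], hparams["n_mel_channels"])  # 22050 80

In a SpeechBrain recipe these resolved entries are normally handed to a `Brain` subclass (e.g. `modules=hparams["modules"]`, `opt_class=hparams["opt_class"]`, `checkpointer=hparams["checkpointer"]`) rather than used directly as above.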