# Model section of a class_path / init_args style experiment config.
#
# NOTE(review): the previous text had lost all indentation and carried a stray
# trailing "|" on every line. The nesting below is reconstructed from the key
# order (every class_path is followed by its own init_args). Confirm it against
# the consumer's schema, in particular that melspec_loss belongs under
# model.init_args rather than at another level.
model:
  class_path: vocos.experiment.VocosEncodecExp
  init_args:
    sample_rate: 24000
    # Written with an explicit mantissa dot so that YAML 1.1 loaders (PyYAML)
    # resolve it as a float instead of the string "5e-4".
    initial_learning_rate: 5.0e-4
    mel_loss_coeff: 45
    mrd_loss_coeff: 1.0
    num_warmup_steps: 0
    pretrain_mel_steps: 0

    # Evaluation switches. "evaluate_periodicty" is kept exactly as found:
    # it is a runtime key, presumably matching a parameter name of the class
    # above, so do not respell it without checking the consumer.
    evaluate_utmos: true
    evaluate_pesq: true
    evaluate_periodicty: true

    feature_extractor:
      class_path: vocos.feature_extractors.EncodecFeatures
      init_args:
        encodec_model: encodec_24khz
        bandwidths: [1.5, 3.0, 6.0, 12.0]
        train_codebooks: false

    backbone:
      class_path: vocos.models.VocosBackbone
      init_args:
        input_channels: 128
        dim: 384
        intermediate_dim: 1152
        num_layers: 8
        # Equals the number of entries in feature_extractor bandwidths (4);
        # presumably the two must stay in sync - TODO confirm.
        adanorm_num_embeddings: 4

    head:
      class_path: vocos.heads.WaveNextHead
      init_args:
        # Same value as backbone dim above (384).
        dim: 384
        n_fft: 1280
        hop_length: 320
        padding: same

    melspec_loss:
      class_path: vocos.loss.MelSpecReconstructionLoss
      init_args:
        # Same value as the model-level sample_rate above (24000).
        sample_rate: 24000
        n_fft: 1024
        hop_length: 256
        n_mels: 128
        f_min: 0
        # Half of sample_rate.
        f_max: 12000
        # Explicit mantissa dot for the same float-resolution reason as
        # initial_learning_rate.
        clip_val: 1.0e-7