# Model section: experiment class plus the components it is built from.
# NOTE(review): the block nesting below was reconstructed from a source whose
# line breaks and indentation had been lost; confirm it against the consumer.
model:
  class_path: vocos.experiment.VocosEncodecExp
  init_args:
    sample_rate: 24000
    # Written with an explicit decimal point so that YAML 1.1 loaders
    # (e.g. stock PyYAML) resolve it as a float rather than a string.
    initial_learning_rate: 5.0e-4
    mel_loss_coeff: 45
    mrd_loss_coeff: 1.0
    num_warmup_steps: 0  # Optimizers warmup steps
    pretrain_mel_steps: 0  # 0 means GAN objective from the first iteration

    # automatic evaluation
    evaluate_utmos: true
    evaluate_pesq: true
    # NOTE(review): key is spelled "periodicty" (sic). It is kept unchanged
    # because it must match the parameter name the experiment class reads;
    # verify before renaming.
    evaluate_periodicty: true

    feature_extractor:
      class_path: vocos.feature_extractors.EncodecFeatures
      init_args:
        encodec_model: encodec_24khz
        bandwidths: [1.5, 3.0, 6.0, 12.0]
        train_codebooks: false

    backbone:
      class_path: vocos.models.VocosBackbone
      init_args:
        input_channels: 128
        dim: 384
        intermediate_dim: 1152
        num_layers: 8
        # Keep in sync with the number of entries in
        # feature_extractor.init_args.bandwidths (currently 4).
        adanorm_num_embeddings: 4  # len(bandwidths)

    head:
      class_path: vocos.heads.WaveNextHead
      init_args:
        dim: 384
        n_fft: 1280
        hop_length: 320
        padding: same

    # NOTE(review): assumed to be an init arg of the model, i.e. a sibling of
    # `head` - confirm against the experiment class signature.
    melspec_loss:
      class_path: vocos.loss.MelSpecReconstructionLoss
      init_args:
        sample_rate: 24000
        n_fft: 1024
        hop_length: 256
        n_mels: 128
        f_min: 0
        f_max: 12000
        # Explicit decimal point for the same YAML 1.1 float-resolution reason
        # as initial_learning_rate.
        clip_val: 1.0e-7