Emrys365 committed on
Commit
e9113df
1 Parent(s): afc9868

Update model

Browse files
Files changed (45) hide show
  1. README.md +327 -3
  2. exp_vctk/enh_train_enh_bsrnn_large_double_raw/46epoch.pth +3 -0
  3. exp_vctk/enh_train_enh_bsrnn_large_double_raw/config.yaml +240 -0
  4. exp_vctk/enh_train_enh_bsrnn_large_double_raw/enhanced_test_16k/RESULTS.md +23 -0
  5. exp_vctk/enh_train_enh_bsrnn_large_double_raw/enhanced_test_48k/RESULTS.md +18 -0
  6. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/backward_time.png +0 -0
  7. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/clip.png +0 -0
  8. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/forward_time.png +0 -0
  9. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/gpu_max_cached_mem_GB.png +0 -0
  10. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/grad_norm.png +0 -0
  11. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/iter_time.png +0 -0
  12. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_16k.png +0 -0
  13. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_16k_r.png +0 -0
  14. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_24k.png +0 -0
  15. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_48k.png +0 -0
  16. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_8k.png +0 -0
  17. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_8k_r.png +0 -0
  18. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_16k.png +0 -0
  19. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_16k_r.png +0 -0
  20. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_8k.png +0 -0
  21. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_8k_r.png +0 -0
  22. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_5ch_16k.png +0 -0
  23. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_5ch_8k.png +0 -0
  24. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_8ch_16k_r.png +0 -0
  25. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_8ch_8k_r.png +0 -0
  26. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/loss.png +0 -0
  27. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/loss_scale.png +0 -0
  28. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/optim0_lr0.png +0 -0
  29. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/optim_step_time.png +0 -0
  30. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_16k.png +0 -0
  31. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_16k_r.png +0 -0
  32. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_24k.png +0 -0
  33. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_48k.png +0 -0
  34. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_8k.png +0 -0
  35. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_8k_r.png +0 -0
  36. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_16k.png +0 -0
  37. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_16k_r.png +0 -0
  38. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_8k.png +0 -0
  39. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_8k_r.png +0 -0
  40. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_5ch_16k.png +0 -0
  41. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_5ch_8k.png +0 -0
  42. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_8ch_16k_r.png +0 -0
  43. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_8ch_8k_r.png +0 -0
  44. exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/train_time.png +0 -0
  45. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,327 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - audio-to-audio
6
+ language: en
7
+ datasets:
8
+ - universal_se
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ENH model
13
+
14
+ ### `wyz/vctk_bsrnn_large_double_causal`
15
+
16
+ This model was trained by Emrys365 using the universal_se recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 443028662106472c60fe8bd892cb277e5b488651
26
+ pip install -e .
27
+ cd egs2/universal_se/enh1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model wyz/vctk_bsrnn_large_double_causal
29
+ ```
30
+
31
+
32
+
33
+ ## ENH config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_enh_bsrnn_large_double.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: chunk
43
+ output_dir: exp_vctk/enh_train_enh_bsrnn_large_double_raw
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 4
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: 2
51
+ dist_rank: 0
52
+ local_rank: 0
53
+ dist_master_addr: localhost
54
+ dist_master_port: 50679
55
+ dist_launcher: null
56
+ multiprocessing_distributed: true
57
+ unused_parameters: true
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 100
65
+ patience: 30
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 1
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 5.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ save_interval: 1000
86
+ train_dtype: float32
87
+ use_amp: false
88
+ log_interval: null
89
+ use_matplotlib: true
90
+ use_tensorboard: true
91
+ create_graph_in_tensorboard: false
92
+ use_wandb: false
93
+ wandb_project: null
94
+ wandb_id: null
95
+ wandb_entity: null
96
+ wandb_name: null
97
+ wandb_model_log_interval: -1
98
+ detect_anomaly: false
99
+ pretrain_path: null
100
+ init_param: []
101
+ ignore_init_mismatch: false
102
+ freeze_param: []
103
+ num_iters_per_epoch: 8000
104
+ num_iters_valid: null
105
+ batch_size: 4
106
+ valid_batch_size: null
107
+ batch_bins: 1000000
108
+ valid_batch_bins: null
109
+ train_shape_file:
110
+ - exp_vctk/enh_stats_16k/train/speech_mix_shape
111
+ - exp_vctk/enh_stats_16k/train/speech_ref1_shape
112
+ - exp_vctk/enh_stats_16k/train/dereverb_ref1_shape
113
+ valid_shape_file:
114
+ - exp_vctk/enh_stats_16k/valid/speech_mix_shape
115
+ - exp_vctk/enh_stats_16k/valid/speech_ref1_shape
116
+ - exp_vctk/enh_stats_16k/valid/dereverb_ref1_shape
117
+ batch_type: folded
118
+ valid_batch_type: null
119
+ fold_length:
120
+ - 80000
121
+ - 80000
122
+ - 80000
123
+ sort_in_batch: descending
124
+ sort_batch: descending
125
+ multiple_iterator: false
126
+ chunk_length: 32000
127
+ chunk_shift_ratio: 0.5
128
+ num_cache_chunks: 1024
129
+ chunk_excluded_key_prefixes: []
130
+ chunk_discard_short_samples: false
131
+ train_data_path_and_name_and_type:
132
+ - - dump/raw/vctk_noisy_tr_26spk/wav.scp
133
+ - speech_mix
134
+ - sound
135
+ - - dump/raw/vctk_noisy_tr_26spk/spk1.scp
136
+ - speech_ref1
137
+ - sound
138
+ - - dump/raw/vctk_noisy_tr_26spk/dereverb1.scp
139
+ - dereverb_ref1
140
+ - sound
141
+ - - dump/raw/vctk_noisy_tr_26spk/utt2category
142
+ - category
143
+ - text
144
+ - - dump/raw/vctk_noisy_tr_26spk/utt2fs
145
+ - fs
146
+ - text_int
147
+ valid_data_path_and_name_and_type:
148
+ - - dump/raw/vctk_noisy_cv_2spk/wav.scp
149
+ - speech_mix
150
+ - sound
151
+ - - dump/raw/vctk_noisy_cv_2spk/spk1.scp
152
+ - speech_ref1
153
+ - sound
154
+ - - dump/raw/vctk_noisy_cv_2spk/dereverb1.scp
155
+ - dereverb_ref1
156
+ - sound
157
+ - - dump/raw/vctk_noisy_cv_2spk/utt2category
158
+ - category
159
+ - text
160
+ - - dump/raw/vctk_noisy_cv_2spk/utt2fs
161
+ - fs
162
+ - text_int
163
+ allow_variable_data_keys: false
164
+ max_cache_size: 0.0
165
+ max_cache_fd: 32
166
+ allow_multi_rates: true
167
+ valid_max_cache_size: null
168
+ exclude_weight_decay: false
169
+ exclude_weight_decay_conf: {}
170
+ optim: adam
171
+ optim_conf:
172
+ lr: 0.001
173
+ eps: 1.0e-08
174
+ weight_decay: 1.0e-05
175
+ scheduler: steplr
176
+ scheduler_conf:
177
+ step_size: 2
178
+ gamma: 0.99
179
+ init: null
180
+ model_conf:
181
+ normalize_variance_per_ch: true
182
+ categories:
183
+ - 1ch_8k
184
+ - 1ch_8k_r
185
+ - 1ch_16k_r
186
+ - 1ch_48k
187
+ - 1ch_24k
188
+ - 1ch_16k
189
+ - 2ch_8k
190
+ - 2ch_8k_r
191
+ - 2ch_16k
192
+ - 2ch_16k_r
193
+ - 5ch_8k
194
+ - 5ch_16k
195
+ - 8ch_8k_r
196
+ - 8ch_16k_r
197
+ criterions:
198
+ - name: mr_l1_tfd
199
+ conf:
200
+ window_sz:
201
+ - 256
202
+ - 512
203
+ - 768
204
+ - 1024
205
+ hop_sz: null
206
+ eps: 1.0e-08
207
+ time_domain_weight: 0.5
208
+ normalize_variance: true
209
+ wrapper: fixed_order
210
+ wrapper_conf:
211
+ weight: 1.0
212
+ - name: si_snr
213
+ conf:
214
+ eps: 1.0e-07
215
+ wrapper: fixed_order
216
+ wrapper_conf:
217
+ weight: 0.0
218
+ speech_volume_normalize: null
219
+ rir_scp: null
220
+ rir_apply_prob: 1.0
221
+ noise_scp: null
222
+ noise_apply_prob: 1.0
223
+ noise_db_range: '13_15'
224
+ short_noise_thres: 0.5
225
+ use_reverberant_ref: false
226
+ num_spk: 1
227
+ num_noise_type: 1
228
+ sample_rate: 8000
229
+ force_single_channel: true
230
+ channel_reordering: true
231
+ categories:
232
+ - 1ch_8k
233
+ - 1ch_8k_r
234
+ - 1ch_16k_r
235
+ - 1ch_48k
236
+ - 1ch_24k
237
+ - 1ch_16k
238
+ - 2ch_8k
239
+ - 2ch_8k_r
240
+ - 2ch_16k
241
+ - 2ch_16k_r
242
+ - 5ch_8k
243
+ - 5ch_16k
244
+ - 8ch_8k_r
245
+ - 8ch_16k_r
246
+ speech_segment: null
247
+ avoid_allzero_segment: true
248
+ flexible_numspk: false
249
+ dynamic_mixing: false
250
+ utt2spk: null
251
+ dynamic_mixing_gain_db: 0.0
252
+ encoder: stft
253
+ encoder_conf:
254
+ n_fft: 960
255
+ hop_length: 480
256
+ use_builtin_complex: true
257
+ default_fs: 48000
258
+ separator: bsrnn
259
+ separator_conf:
260
+ num_spk: 1
261
+ num_channels: 256
262
+ num_layers: 12
263
+ target_fs: 48000
264
+ ref_channel: 0
265
+ decoder: stft
266
+ decoder_conf:
267
+ n_fft: 960
268
+ hop_length: 480
269
+ default_fs: 48000
270
+ mask_module: multi_mask
271
+ mask_module_conf: {}
272
+ preprocessor: enh
273
+ preprocessor_conf: {}
274
+ required:
275
+ - output_dir
276
+ version: '202304'
277
+ distributed: true
278
+ ```
279
+
280
+ </details>
281
+
282
+
283
+
284
+ ### Citing ESPnet
285
+
286
+ ```bibtex
287
+ @inproceedings{watanabe2018espnet,
288
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
289
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
290
+ year={2018},
291
+ booktitle={Proceedings of Interspeech},
292
+ pages={2207--2211},
293
+ doi={10.21437/Interspeech.2018-1456},
294
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
295
+ }
296
+
297
+
298
+ @inproceedings{ESPnet-SE,
299
+ author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
300
+ Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
301
+ title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
302
+ booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
303
+ pages = {785--792},
304
+ publisher = {{IEEE}},
305
+ year = {2021},
306
+ url = {https://doi.org/10.1109/SLT48900.2021.9383615},
307
+ doi = {10.1109/SLT48900.2021.9383615},
308
+ timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
309
+ biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
310
+ bibsource = {dblp computer science bibliography, https://dblp.org}
311
+ }
312
+
313
+
314
+ ```
315
+
316
+ or arXiv:
317
+
318
+ ```bibtex
319
+ @misc{watanabe2018espnet,
320
+ title={ESPnet: End-to-End Speech Processing Toolkit},
321
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
322
+ year={2018},
323
+ eprint={1804.00015},
324
+ archivePrefix={arXiv},
325
+ primaryClass={cs.CL}
326
+ }
327
+ ```
exp_vctk/enh_train_enh_bsrnn_large_double_raw/46epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b269d10efd9e2eacd41e8a25d3432059f90548154fae36ebab195b0f313aaa7c
3
+ size 334827921
exp_vctk/enh_train_enh_bsrnn_large_double_raw/config.yaml ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_bsrnn_large_double.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp_vctk/enh_train_enh_bsrnn_large_double_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 50679
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: 30
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ keep_nbest_models: 1
41
+ nbest_averaging_interval: 0
42
+ grad_clip: 5.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ save_interval: 1000
49
+ train_dtype: float32
50
+ use_amp: false
51
+ log_interval: null
52
+ use_matplotlib: true
53
+ use_tensorboard: true
54
+ create_graph_in_tensorboard: false
55
+ use_wandb: false
56
+ wandb_project: null
57
+ wandb_id: null
58
+ wandb_entity: null
59
+ wandb_name: null
60
+ wandb_model_log_interval: -1
61
+ detect_anomaly: false
62
+ pretrain_path: null
63
+ init_param: []
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 8000
67
+ num_iters_valid: null
68
+ batch_size: 4
69
+ valid_batch_size: null
70
+ batch_bins: 1000000
71
+ valid_batch_bins: null
72
+ train_shape_file:
73
+ - exp_vctk/enh_stats_16k/train/speech_mix_shape
74
+ - exp_vctk/enh_stats_16k/train/speech_ref1_shape
75
+ - exp_vctk/enh_stats_16k/train/dereverb_ref1_shape
76
+ valid_shape_file:
77
+ - exp_vctk/enh_stats_16k/valid/speech_mix_shape
78
+ - exp_vctk/enh_stats_16k/valid/speech_ref1_shape
79
+ - exp_vctk/enh_stats_16k/valid/dereverb_ref1_shape
80
+ batch_type: folded
81
+ valid_batch_type: null
82
+ fold_length:
83
+ - 80000
84
+ - 80000
85
+ - 80000
86
+ sort_in_batch: descending
87
+ sort_batch: descending
88
+ multiple_iterator: false
89
+ chunk_length: 32000
90
+ chunk_shift_ratio: 0.5
91
+ num_cache_chunks: 1024
92
+ chunk_excluded_key_prefixes: []
93
+ chunk_discard_short_samples: false
94
+ train_data_path_and_name_and_type:
95
+ - - dump/raw/vctk_noisy_tr_26spk/wav.scp
96
+ - speech_mix
97
+ - sound
98
+ - - dump/raw/vctk_noisy_tr_26spk/spk1.scp
99
+ - speech_ref1
100
+ - sound
101
+ - - dump/raw/vctk_noisy_tr_26spk/dereverb1.scp
102
+ - dereverb_ref1
103
+ - sound
104
+ - - dump/raw/vctk_noisy_tr_26spk/utt2category
105
+ - category
106
+ - text
107
+ - - dump/raw/vctk_noisy_tr_26spk/utt2fs
108
+ - fs
109
+ - text_int
110
+ valid_data_path_and_name_and_type:
111
+ - - dump/raw/vctk_noisy_cv_2spk/wav.scp
112
+ - speech_mix
113
+ - sound
114
+ - - dump/raw/vctk_noisy_cv_2spk/spk1.scp
115
+ - speech_ref1
116
+ - sound
117
+ - - dump/raw/vctk_noisy_cv_2spk/dereverb1.scp
118
+ - dereverb_ref1
119
+ - sound
120
+ - - dump/raw/vctk_noisy_cv_2spk/utt2category
121
+ - category
122
+ - text
123
+ - - dump/raw/vctk_noisy_cv_2spk/utt2fs
124
+ - fs
125
+ - text_int
126
+ allow_variable_data_keys: false
127
+ max_cache_size: 0.0
128
+ max_cache_fd: 32
129
+ allow_multi_rates: true
130
+ valid_max_cache_size: null
131
+ exclude_weight_decay: false
132
+ exclude_weight_decay_conf: {}
133
+ optim: adam
134
+ optim_conf:
135
+ lr: 0.001
136
+ eps: 1.0e-08
137
+ weight_decay: 1.0e-05
138
+ scheduler: steplr
139
+ scheduler_conf:
140
+ step_size: 2
141
+ gamma: 0.99
142
+ init: null
143
+ model_conf:
144
+ normalize_variance_per_ch: true
145
+ categories:
146
+ - 1ch_8k
147
+ - 1ch_8k_r
148
+ - 1ch_16k_r
149
+ - 1ch_48k
150
+ - 1ch_24k
151
+ - 1ch_16k
152
+ - 2ch_8k
153
+ - 2ch_8k_r
154
+ - 2ch_16k
155
+ - 2ch_16k_r
156
+ - 5ch_8k
157
+ - 5ch_16k
158
+ - 8ch_8k_r
159
+ - 8ch_16k_r
160
+ criterions:
161
+ - name: mr_l1_tfd
162
+ conf:
163
+ window_sz:
164
+ - 256
165
+ - 512
166
+ - 768
167
+ - 1024
168
+ hop_sz: null
169
+ eps: 1.0e-08
170
+ time_domain_weight: 0.5
171
+ normalize_variance: true
172
+ wrapper: fixed_order
173
+ wrapper_conf:
174
+ weight: 1.0
175
+ - name: si_snr
176
+ conf:
177
+ eps: 1.0e-07
178
+ wrapper: fixed_order
179
+ wrapper_conf:
180
+ weight: 0.0
181
+ speech_volume_normalize: null
182
+ rir_scp: null
183
+ rir_apply_prob: 1.0
184
+ noise_scp: null
185
+ noise_apply_prob: 1.0
186
+ noise_db_range: '13_15'
187
+ short_noise_thres: 0.5
188
+ use_reverberant_ref: false
189
+ num_spk: 1
190
+ num_noise_type: 1
191
+ sample_rate: 8000
192
+ force_single_channel: true
193
+ channel_reordering: true
194
+ categories:
195
+ - 1ch_8k
196
+ - 1ch_8k_r
197
+ - 1ch_16k_r
198
+ - 1ch_48k
199
+ - 1ch_24k
200
+ - 1ch_16k
201
+ - 2ch_8k
202
+ - 2ch_8k_r
203
+ - 2ch_16k
204
+ - 2ch_16k_r
205
+ - 5ch_8k
206
+ - 5ch_16k
207
+ - 8ch_8k_r
208
+ - 8ch_16k_r
209
+ speech_segment: null
210
+ avoid_allzero_segment: true
211
+ flexible_numspk: false
212
+ dynamic_mixing: false
213
+ utt2spk: null
214
+ dynamic_mixing_gain_db: 0.0
215
+ encoder: stft
216
+ encoder_conf:
217
+ n_fft: 960
218
+ hop_length: 480
219
+ use_builtin_complex: true
220
+ default_fs: 48000
221
+ separator: bsrnn
222
+ separator_conf:
223
+ num_spk: 1
224
+ num_channels: 256
225
+ num_layers: 12
226
+ target_fs: 48000
227
+ ref_channel: 0
228
+ decoder: stft
229
+ decoder_conf:
230
+ n_fft: 960
231
+ hop_length: 480
232
+ default_fs: 48000
233
+ mask_module: multi_mask
234
+ mask_module_conf: {}
235
+ preprocessor: enh
236
+ preprocessor_conf: {}
237
+ required:
238
+ - output_dir
239
+ version: '202304'
240
+ distributed: true
exp_vctk/enh_train_enh_bsrnn_large_double_raw/enhanced_test_16k/RESULTS.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Wed Feb 28 12:11:05 EST 2024`
5
+ - python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
6
+ - espnet version: `espnet 202304`
7
+ - pytorch version: `pytorch 2.0.1+cu118`
8
+ - Git hash: `443028662106472c60fe8bd892cb277e5b488651`
9
+ - Commit date: `Thu May 11 03:32:59 2023 +0000`
10
+
11
+
12
+ ## enhanced_test_16k
13
+
14
+
15
+ |dataset|PESQ_WB|STOI|SAR|SDR|SIR|SI_SNR|OVRL|SIG|BAK|P808_MOS|
16
+ |---|---|---|---|---|---|---|---|---|---|---|
17
+ |chime4_et05_real_isolated_6ch_track|1.13|46.06|-4.10|-4.10|0.00|-31.50|2.32|2.75|3.26|3.07|
18
+ |chime4_et05_simu_isolated_6ch_track|1.14|63.98|3.58|3.58|0.00|-2.04|2.19|2.54|3.36|2.75|
19
+ |dns20_tt_synthetic_no_reverb|2.18|93.14|13.28|13.28|0.00|12.47|3.05|3.46|3.71|3.73|
20
+ |reverb_et_real_8ch_multich|1.10|59.67|3.75|3.75|0.00|0.44|2.28|2.62|3.50|3.27|
21
+ |reverb_et_simu_8ch_multich|1.61|83.19|9.07|9.07|0.00|-10.76|2.84|3.23|3.71|3.62|
22
+ |whamr_tt_mix_single_reverb_max_16k|1.24|76.23|4.73|4.73|0.00|0.59|2.32|2.66|3.53|3.19|
23
+
exp_vctk/enh_train_enh_bsrnn_large_double_raw/enhanced_test_48k/RESULTS.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Thu Dec 28 13:50:59 EST 2023`
5
+ - python version: `3.8.16 (default, Mar 2 2023, 03:21:46) [GCC 11.2.0]`
6
+ - espnet version: `espnet 202304`
7
+ - pytorch version: `pytorch 2.0.1+cu118`
8
+ - Git hash: `443028662106472c60fe8bd892cb277e5b488651`
9
+ - Commit date: `Thu May 11 03:32:59 2023 +0000`
10
+
11
+
12
+ ## enhanced_test_48k
13
+
14
+
15
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|OVRL|SIG|BAK|P808_MOS|
16
+ |---|---|---|---|---|---|---|---|---|---|
17
+ |vctk_noisy_tt_2spk|94.80|19.49|19.49|0.00|18.45|3.09|3.42|3.92|3.48|
18
+
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/backward_time.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/clip.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/forward_time.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/gpu_max_cached_mem_GB.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/grad_norm.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/iter_time.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_16k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_16k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_24k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_48k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_8k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_1ch_8k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_16k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_16k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_8k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_2ch_8k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_5ch_16k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_5ch_8k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_8ch_16k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/l1_timedomain+magspec_loss_8ch_8k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/loss.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/loss_scale.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/optim0_lr0.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/optim_step_time.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_16k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_16k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_24k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_48k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_8k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_1ch_8k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_16k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_16k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_8k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_2ch_8k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_5ch_16k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_5ch_8k.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_8ch_16k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/si_snr_loss_8ch_8k_r.png ADDED
exp_vctk/enh_train_enh_bsrnn_large_double_raw/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202304'
2
+ files:
3
+ model_file: exp_vctk/enh_train_enh_bsrnn_large_double_raw/46epoch.pth
4
+ python: "3.8.16 (default, Mar 2 2023, 03:21:46) \n[GCC 11.2.0]"
5
+ timestamp: 1722935361.497622
6
+ torch: 2.0.1+cu118
7
+ yaml_files:
8
+ train_config: exp_vctk/enh_train_enh_bsrnn_large_double_raw/config.yaml