2023-10-25 08:00:15,628 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,629 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=13, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Train: 14465 sentences 2023-10-25 08:00:15,630 (train_with_dev=False, train_with_test=False) 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Training Params: 2023-10-25 08:00:15,630 - learning_rate: "3e-05" 2023-10-25 08:00:15,630 - mini_batch_size: "8" 2023-10-25 08:00:15,630 - max_epochs: "10" 2023-10-25 08:00:15,630 - shuffle: "True" 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Plugins: 2023-10-25 08:00:15,630 - TensorboardLogger 2023-10-25 08:00:15,630 - LinearScheduler | warmup_fraction: '0.1' 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Final evaluation on model from best epoch (best-model.pt) 2023-10-25 08:00:15,630 - metric: "('micro avg', 'f1-score')" 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Computation: 2023-10-25 08:00:15,630 - compute on device: cuda:0 2023-10-25 08:00:15,630 - embedding storage: none 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1" 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:00:15,630 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-25 08:00:31,579 epoch 1 - iter 180/1809 - loss 1.59994791 - time (sec): 15.95 - samples/sec: 2365.16 - lr: 0.000003 - momentum: 0.000000 2023-10-25 08:00:46,626 epoch 1 - iter 360/1809 - loss 0.90942152 - time (sec): 31.00 - samples/sec: 2420.21 - lr: 0.000006 - momentum: 0.000000 2023-10-25 08:01:02,121 epoch 1 - iter 540/1809 - loss 0.65057194 - time (sec): 46.49 - samples/sec: 2437.06 - lr: 0.000009 - momentum: 0.000000 2023-10-25 08:01:17,548 epoch 1 - iter 720/1809 - loss 0.52234297 - time (sec): 61.92 - samples/sec: 2445.01 - lr: 0.000012 - momentum: 0.000000 2023-10-25 08:01:32,921 epoch 1 - iter 900/1809 - loss 0.44356097 - time (sec): 77.29 - samples/sec: 2440.72 - lr: 0.000015 - momentum: 0.000000 2023-10-25 08:01:48,608 epoch 1 - iter 1080/1809 - loss 0.38806485 - time (sec): 92.98 - samples/sec: 2438.35 - lr: 0.000018 - momentum: 0.000000 2023-10-25 08:02:04,004 epoch 1 - iter 1260/1809 - loss 0.34665146 - time (sec): 108.37 - samples/sec: 2444.01 - lr: 0.000021 - momentum: 0.000000 2023-10-25 08:02:19,487 epoch 1 - iter 1440/1809 - loss 0.31692748 - time (sec): 123.86 - samples/sec: 2440.98 - lr: 0.000024 - momentum: 0.000000 2023-10-25 08:02:35,196 epoch 1 - iter 1620/1809 - loss 0.29267973 - time (sec): 139.57 - samples/sec: 2437.07 - lr: 0.000027 - momentum: 0.000000 2023-10-25 08:02:50,837 epoch 1 - iter 1800/1809 - loss 0.27406237 - time (sec): 155.21 - samples/sec: 2436.52 - lr: 0.000030 - momentum: 0.000000 2023-10-25 08:02:51,583 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:02:51,583 EPOCH 1 done: loss 0.2733 - lr: 0.000030 2023-10-25 08:02:56,022 DEV : loss 0.11878068745136261 - f1-score (micro avg) 0.6243 2023-10-25 08:02:56,043 saving best model 2023-10-25 08:02:56,600 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:03:12,137 epoch 2 - iter 180/1809 - loss 0.08520146 - time (sec): 15.54 - samples/sec: 2457.95 - lr: 0.000030 - momentum: 0.000000 2023-10-25 08:03:28,446 epoch 2 - iter 360/1809 - loss 0.09154462 - time (sec): 31.84 - samples/sec: 2418.86 - lr: 0.000029 - momentum: 0.000000 2023-10-25 08:03:44,444 epoch 2 - iter 540/1809 - loss 0.09248862 - time (sec): 47.84 - samples/sec: 2412.53 - lr: 0.000029 - momentum: 0.000000 2023-10-25 08:04:00,307 epoch 2 - iter 720/1809 - loss 0.08973269 - time (sec): 63.71 - samples/sec: 2404.62 - lr: 0.000029 - momentum: 0.000000 2023-10-25 08:04:16,033 epoch 2 - iter 900/1809 - loss 0.08876132 - time (sec): 79.43 - samples/sec: 2403.58 - lr: 0.000028 - momentum: 0.000000 2023-10-25 08:04:31,870 epoch 2 - iter 1080/1809 - loss 0.08756439 - time (sec): 95.27 - samples/sec: 2394.03 - lr: 0.000028 - momentum: 0.000000 2023-10-25 08:04:47,396 epoch 2 - iter 1260/1809 - loss 0.08711257 - time (sec): 110.79 - samples/sec: 2393.13 - lr: 0.000028 - momentum: 0.000000 2023-10-25 08:05:03,398 epoch 2 - iter 1440/1809 - loss 0.08478479 - time (sec): 126.80 - samples/sec: 2393.23 - lr: 0.000027 - momentum: 0.000000 2023-10-25 08:05:19,435 epoch 2 - iter 1620/1809 - loss 0.08360993 - time (sec): 142.83 - samples/sec: 2388.17 - lr: 0.000027 - momentum: 0.000000 2023-10-25 08:05:34,900 epoch 2 - iter 1800/1809 - loss 0.08306504 - time (sec): 158.30 - samples/sec: 2388.83 - lr: 0.000027 - momentum: 0.000000 2023-10-25 08:05:35,628 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:05:35,629 EPOCH 2 done: loss 0.0831 - lr: 0.000027 2023-10-25 08:05:40,837 DEV : loss 0.13267631828784943 - f1-score (micro avg) 0.6358 2023-10-25 08:05:40,859 saving best model 2023-10-25 08:05:41,675 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:05:57,585 epoch 3 - iter 180/1809 - loss 0.06089348 - time (sec): 15.91 - samples/sec: 2355.16 - lr: 0.000026 - momentum: 0.000000 2023-10-25 08:06:13,768 epoch 3 - iter 360/1809 - loss 0.06038894 - time (sec): 32.09 - samples/sec: 2360.88 - lr: 0.000026 - momentum: 0.000000 2023-10-25 08:06:29,084 epoch 3 - iter 540/1809 - loss 0.05526036 - time (sec): 47.41 - samples/sec: 2389.71 - lr: 0.000026 - momentum: 0.000000 2023-10-25 08:06:44,661 epoch 3 - iter 720/1809 - loss 0.05749612 - time (sec): 62.98 - samples/sec: 2390.73 - lr: 0.000025 - momentum: 0.000000 2023-10-25 08:07:00,404 epoch 3 - iter 900/1809 - loss 0.05617974 - time (sec): 78.73 - samples/sec: 2403.04 - lr: 0.000025 - momentum: 0.000000 2023-10-25 08:07:16,708 epoch 3 - iter 1080/1809 - loss 0.05706057 - time (sec): 95.03 - samples/sec: 2404.56 - lr: 0.000025 - momentum: 0.000000 2023-10-25 08:07:32,106 epoch 3 - iter 1260/1809 - loss 0.05724190 - time (sec): 110.43 - samples/sec: 2401.49 - lr: 0.000024 - momentum: 0.000000 2023-10-25 08:07:48,254 epoch 3 - iter 1440/1809 - loss 0.05718478 - time (sec): 126.58 - samples/sec: 2408.93 - lr: 0.000024 - momentum: 0.000000 2023-10-25 08:08:04,408 epoch 3 - iter 1620/1809 - loss 0.05826610 - time (sec): 142.73 - samples/sec: 2395.50 - lr: 0.000024 - momentum: 0.000000 2023-10-25 08:08:19,957 epoch 3 - iter 1800/1809 - loss 0.05919743 - time (sec): 158.28 - samples/sec: 2391.38 - lr: 0.000023 - momentum: 0.000000 2023-10-25 08:08:20,676 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:08:20,676 EPOCH 3 done: loss 0.0592 - lr: 0.000023 2023-10-25 08:08:25,440 DEV : loss 0.1354532539844513 - f1-score (micro avg) 0.6314 2023-10-25 08:08:25,462 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:08:41,845 epoch 4 - iter 180/1809 - loss 0.03568652 - time (sec): 16.38 - samples/sec: 2312.63 - lr: 0.000023 - momentum: 0.000000 2023-10-25 08:08:58,273 epoch 4 - iter 360/1809 - loss 0.03716226 - time (sec): 32.81 - samples/sec: 2346.64 - lr: 0.000023 - momentum: 0.000000 2023-10-25 08:09:13,745 epoch 4 - iter 540/1809 - loss 0.03968774 - time (sec): 48.28 - samples/sec: 2347.42 - lr: 0.000022 - momentum: 0.000000 2023-10-25 08:09:29,537 epoch 4 - iter 720/1809 - loss 0.04000489 - time (sec): 64.07 - samples/sec: 2355.52 - lr: 0.000022 - momentum: 0.000000 2023-10-25 08:09:45,388 epoch 4 - iter 900/1809 - loss 0.03962584 - time (sec): 79.93 - samples/sec: 2362.15 - lr: 0.000022 - momentum: 0.000000 2023-10-25 08:10:01,321 epoch 4 - iter 1080/1809 - loss 0.03940596 - time (sec): 95.86 - samples/sec: 2371.85 - lr: 0.000021 - momentum: 0.000000 2023-10-25 08:10:17,098 epoch 4 - iter 1260/1809 - loss 0.04055800 - time (sec): 111.64 - samples/sec: 2371.02 - lr: 0.000021 - momentum: 0.000000 2023-10-25 08:10:32,621 epoch 4 - iter 1440/1809 - loss 0.04022573 - time (sec): 127.16 - samples/sec: 2373.19 - lr: 0.000021 - momentum: 0.000000 2023-10-25 08:10:48,660 epoch 4 - iter 1620/1809 - loss 0.04065113 - time (sec): 143.20 - samples/sec: 2370.71 - lr: 0.000020 - momentum: 0.000000 2023-10-25 08:11:04,972 epoch 4 - iter 1800/1809 - loss 0.04133841 - time (sec): 159.51 - samples/sec: 2370.42 - lr: 0.000020 - momentum: 0.000000 2023-10-25 08:11:05,824 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:11:05,824 EPOCH 4 done: loss 0.0414 - lr: 0.000020 2023-10-25 08:11:10,594 DEV : loss 0.2289542257785797 - f1-score (micro avg) 0.6386 2023-10-25 08:11:10,616 saving best model 2023-10-25 08:11:11,305 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:11:26,893 epoch 5 - iter 180/1809 - loss 0.02492765 - time (sec): 15.59 - samples/sec: 2342.07 - lr: 0.000020 - momentum: 0.000000 2023-10-25 08:11:42,990 epoch 5 - iter 360/1809 - loss 0.02595936 - time (sec): 31.68 - samples/sec: 2334.66 - lr: 0.000019 - momentum: 0.000000 2023-10-25 08:11:58,875 epoch 5 - iter 540/1809 - loss 0.02591071 - time (sec): 47.57 - samples/sec: 2350.63 - lr: 0.000019 - momentum: 0.000000 2023-10-25 08:12:14,762 epoch 5 - iter 720/1809 - loss 0.02549117 - time (sec): 63.46 - samples/sec: 2358.16 - lr: 0.000019 - momentum: 0.000000 2023-10-25 08:12:30,706 epoch 5 - iter 900/1809 - loss 0.02448476 - time (sec): 79.40 - samples/sec: 2376.53 - lr: 0.000018 - momentum: 0.000000 2023-10-25 08:12:46,502 epoch 5 - iter 1080/1809 - loss 0.02533076 - time (sec): 95.20 - samples/sec: 2368.81 - lr: 0.000018 - momentum: 0.000000 2023-10-25 08:13:02,210 epoch 5 - iter 1260/1809 - loss 0.02562868 - time (sec): 110.90 - samples/sec: 2367.94 - lr: 0.000018 - momentum: 0.000000 2023-10-25 08:13:18,742 epoch 5 - iter 1440/1809 - loss 0.02590139 - time (sec): 127.44 - samples/sec: 2370.63 - lr: 0.000017 - momentum: 0.000000 2023-10-25 08:13:34,484 epoch 5 - iter 1620/1809 - loss 0.02625493 - time (sec): 143.18 - samples/sec: 2370.26 - lr: 0.000017 - momentum: 0.000000 2023-10-25 08:13:50,871 epoch 5 - iter 1800/1809 - loss 0.02688381 - time (sec): 159.57 - samples/sec: 2371.05 - lr: 0.000017 - momentum: 0.000000 2023-10-25 08:13:51,555 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:13:51,555 EPOCH 5 done: loss 0.0269 - lr: 0.000017 2023-10-25 08:13:56,313 DEV : loss 0.26100045442581177 - f1-score (micro avg) 0.6625 2023-10-25 08:13:56,335 saving best model 2023-10-25 08:13:57,053 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:14:12,919 epoch 6 - iter 180/1809 - loss 0.01378039 - time (sec): 15.86 - samples/sec: 2282.43 - lr: 0.000016 - momentum: 0.000000 2023-10-25 08:14:28,961 epoch 6 - iter 360/1809 - loss 0.01816354 - time (sec): 31.91 - samples/sec: 2360.47 - lr: 0.000016 - momentum: 0.000000 2023-10-25 08:14:45,179 epoch 6 - iter 540/1809 - loss 0.01885418 - time (sec): 48.12 - samples/sec: 2356.04 - lr: 0.000016 - momentum: 0.000000 2023-10-25 08:15:00,786 epoch 6 - iter 720/1809 - loss 0.01972614 - time (sec): 63.73 - samples/sec: 2349.29 - lr: 0.000015 - momentum: 0.000000 2023-10-25 08:15:16,751 epoch 6 - iter 900/1809 - loss 0.01899198 - time (sec): 79.70 - samples/sec: 2361.69 - lr: 0.000015 - momentum: 0.000000 2023-10-25 08:15:32,360 epoch 6 - iter 1080/1809 - loss 0.01852834 - time (sec): 95.31 - samples/sec: 2362.64 - lr: 0.000015 - momentum: 0.000000 2023-10-25 08:15:48,278 epoch 6 - iter 1260/1809 - loss 0.01797562 - time (sec): 111.22 - samples/sec: 2363.36 - lr: 0.000014 - momentum: 0.000000 2023-10-25 08:16:04,333 epoch 6 - iter 1440/1809 - loss 0.01764648 - time (sec): 127.28 - samples/sec: 2373.74 - lr: 0.000014 - momentum: 0.000000 2023-10-25 08:16:20,288 epoch 6 - iter 1620/1809 - loss 0.01787887 - time (sec): 143.23 - samples/sec: 2372.76 - lr: 0.000014 - momentum: 0.000000 2023-10-25 08:16:36,075 epoch 6 - iter 1800/1809 - loss 0.01811722 - time (sec): 159.02 - samples/sec: 2376.53 - lr: 0.000013 - momentum: 0.000000 2023-10-25 08:16:36,858 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:16:36,858 EPOCH 6 done: loss 0.0182 - lr: 0.000013 2023-10-25 08:16:42,101 DEV : loss 0.3313358724117279 - f1-score (micro avg) 0.6553 2023-10-25 08:16:42,123 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:16:57,953 epoch 7 - iter 180/1809 - loss 0.00872843 - time (sec): 15.83 - samples/sec: 2415.85 - lr: 0.000013 - momentum: 0.000000 2023-10-25 08:17:13,241 epoch 7 - iter 360/1809 - loss 0.00854091 - time (sec): 31.12 - samples/sec: 2417.71 - lr: 0.000013 - momentum: 0.000000 2023-10-25 08:17:29,025 epoch 7 - iter 540/1809 - loss 0.01084116 - time (sec): 46.90 - samples/sec: 2397.20 - lr: 0.000012 - momentum: 0.000000 2023-10-25 08:17:44,926 epoch 7 - iter 720/1809 - loss 0.01304482 - time (sec): 62.80 - samples/sec: 2399.67 - lr: 0.000012 - momentum: 0.000000 2023-10-25 08:18:01,388 epoch 7 - iter 900/1809 - loss 0.01267124 - time (sec): 79.26 - samples/sec: 2410.58 - lr: 0.000012 - momentum: 0.000000 2023-10-25 08:18:16,849 epoch 7 - iter 1080/1809 - loss 0.01242008 - time (sec): 94.73 - samples/sec: 2406.32 - lr: 0.000011 - momentum: 0.000000 2023-10-25 08:18:33,141 epoch 7 - iter 1260/1809 - loss 0.01230193 - time (sec): 111.02 - samples/sec: 2391.84 - lr: 0.000011 - momentum: 0.000000 2023-10-25 08:18:48,939 epoch 7 - iter 1440/1809 - loss 0.01248631 - time (sec): 126.82 - samples/sec: 2391.04 - lr: 0.000011 - momentum: 0.000000 2023-10-25 08:19:04,921 epoch 7 - iter 1620/1809 - loss 0.01261317 - time (sec): 142.80 - samples/sec: 2390.66 - lr: 0.000010 - momentum: 0.000000 2023-10-25 08:19:20,957 epoch 7 - iter 1800/1809 - loss 0.01276577 - time (sec): 158.83 - samples/sec: 2380.11 - lr: 0.000010 - momentum: 0.000000 2023-10-25 08:19:21,677 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:19:21,677 EPOCH 7 done: loss 0.0127 - lr: 0.000010 2023-10-25 08:19:26,940 DEV : loss 0.36011332273483276 - f1-score (micro avg) 0.6616 2023-10-25 08:19:26,962 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:19:43,149 epoch 8 - iter 180/1809 - loss 0.00761376 - time (sec): 16.19 - samples/sec: 2367.10 - lr: 0.000010 - momentum: 0.000000 2023-10-25 08:19:59,316 epoch 8 - iter 360/1809 - loss 0.00758239 - time (sec): 32.35 - samples/sec: 2344.06 - lr: 0.000009 - momentum: 0.000000 2023-10-25 08:20:15,488 epoch 8 - iter 540/1809 - loss 0.00857590 - time (sec): 48.52 - samples/sec: 2374.72 - lr: 0.000009 - momentum: 0.000000 2023-10-25 08:20:30,539 epoch 8 - iter 720/1809 - loss 0.00895513 - time (sec): 63.58 - samples/sec: 2393.92 - lr: 0.000009 - momentum: 0.000000 2023-10-25 08:20:46,448 epoch 8 - iter 900/1809 - loss 0.00825738 - time (sec): 79.49 - samples/sec: 2390.01 - lr: 0.000008 - momentum: 0.000000 2023-10-25 08:21:02,541 epoch 8 - iter 1080/1809 - loss 0.00881305 - time (sec): 95.58 - samples/sec: 2386.85 - lr: 0.000008 - momentum: 0.000000 2023-10-25 08:21:18,012 epoch 8 - iter 1260/1809 - loss 0.00882209 - time (sec): 111.05 - samples/sec: 2382.50 - lr: 0.000008 - momentum: 0.000000 2023-10-25 08:21:34,428 epoch 8 - iter 1440/1809 - loss 0.00827827 - time (sec): 127.47 - samples/sec: 2379.38 - lr: 0.000007 - momentum: 0.000000 2023-10-25 08:21:50,011 epoch 8 - iter 1620/1809 - loss 0.00824322 - time (sec): 143.05 - samples/sec: 2380.33 - lr: 0.000007 - momentum: 0.000000 2023-10-25 08:22:05,668 epoch 8 - iter 1800/1809 - loss 0.00850028 - time (sec): 158.70 - samples/sec: 2383.21 - lr: 0.000007 - momentum: 0.000000 2023-10-25 08:22:06,376 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:22:06,376 EPOCH 8 done: loss 0.0086 - lr: 0.000007 2023-10-25 08:22:11,644 DEV : loss 0.39194777607917786 - f1-score (micro avg) 0.6577 2023-10-25 08:22:11,666 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:22:28,097 epoch 9 - iter 180/1809 - loss 0.00369902 - time (sec): 16.43 - samples/sec: 2368.97 - lr: 0.000006 - momentum: 0.000000 2023-10-25 08:22:44,007 epoch 9 - iter 360/1809 - loss 0.00469730 - time (sec): 32.34 - samples/sec: 2414.47 - lr: 0.000006 - momentum: 0.000000 2023-10-25 08:22:59,689 epoch 9 - iter 540/1809 - loss 0.00431458 - time (sec): 48.02 - samples/sec: 2412.11 - lr: 0.000006 - momentum: 0.000000 2023-10-25 08:23:15,295 epoch 9 - iter 720/1809 - loss 0.00481666 - time (sec): 63.63 - samples/sec: 2391.66 - lr: 0.000005 - momentum: 0.000000 2023-10-25 08:23:31,490 epoch 9 - iter 900/1809 - loss 0.00493696 - time (sec): 79.82 - samples/sec: 2402.05 - lr: 0.000005 - momentum: 0.000000 2023-10-25 08:23:47,176 epoch 9 - iter 1080/1809 - loss 0.00523981 - time (sec): 95.51 - samples/sec: 2394.12 - lr: 0.000005 - momentum: 0.000000 2023-10-25 08:24:02,926 epoch 9 - iter 1260/1809 - loss 0.00497472 - time (sec): 111.26 - samples/sec: 2386.83 - lr: 0.000004 - momentum: 0.000000 2023-10-25 08:24:18,624 epoch 9 - iter 1440/1809 - loss 0.00565406 - time (sec): 126.96 - samples/sec: 2386.65 - lr: 0.000004 - momentum: 0.000000 2023-10-25 08:24:34,491 epoch 9 - iter 1620/1809 - loss 0.00563871 - time (sec): 142.82 - samples/sec: 2386.99 - lr: 0.000004 - momentum: 0.000000 2023-10-25 08:24:50,211 epoch 9 - iter 1800/1809 - loss 0.00567494 - time (sec): 158.54 - samples/sec: 2383.99 - lr: 0.000003 - momentum: 0.000000 2023-10-25 08:24:51,042 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:24:51,043 EPOCH 9 done: loss 0.0057 - lr: 0.000003 2023-10-25 08:24:55,799 DEV : loss 0.393858402967453 - f1-score (micro avg) 0.6654 2023-10-25 08:24:55,821 saving best model 2023-10-25 08:24:56,521 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:25:12,728 epoch 10 - iter 180/1809 - loss 0.00196544 - time (sec): 16.21 - samples/sec: 2353.56 - lr: 0.000003 - momentum: 0.000000 2023-10-25 08:25:28,376 epoch 10 - iter 360/1809 - loss 0.00228683 - time (sec): 31.85 - samples/sec: 2391.48 - lr: 0.000003 - momentum: 0.000000 2023-10-25 08:25:44,475 epoch 10 - iter 540/1809 - loss 0.00299234 - time (sec): 47.95 - samples/sec: 2361.65 - lr: 0.000002 - momentum: 0.000000 2023-10-25 08:26:00,434 epoch 10 - iter 720/1809 - loss 0.00293109 - time (sec): 63.91 - samples/sec: 2370.47 - lr: 0.000002 - momentum: 0.000000 2023-10-25 08:26:16,100 epoch 10 - iter 900/1809 - loss 0.00302326 - time (sec): 79.58 - samples/sec: 2361.79 - lr: 0.000002 - momentum: 0.000000 2023-10-25 08:26:31,796 epoch 10 - iter 1080/1809 - loss 0.00327400 - time (sec): 95.27 - samples/sec: 2365.70 - lr: 0.000001 - momentum: 0.000000 2023-10-25 08:26:47,735 epoch 10 - iter 1260/1809 - loss 0.00338707 - time (sec): 111.21 - samples/sec: 2357.01 - lr: 0.000001 - momentum: 0.000000 2023-10-25 08:27:03,951 epoch 10 - iter 1440/1809 - loss 0.00361125 - time (sec): 127.43 - samples/sec: 2361.94 - lr: 0.000001 - momentum: 0.000000 2023-10-25 08:27:20,042 epoch 10 - iter 1620/1809 - loss 0.00365904 - time (sec): 143.52 - samples/sec: 2367.04 - lr: 0.000000 - momentum: 0.000000 2023-10-25 08:27:36,103 epoch 10 - iter 1800/1809 - loss 0.00356670 - time (sec): 159.58 - samples/sec: 2371.48 - lr: 0.000000 - momentum: 0.000000 2023-10-25 08:27:36,804 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:27:36,804 EPOCH 10 done: loss 0.0036 - lr: 0.000000 2023-10-25 08:27:41,566 DEV : loss 0.40507274866104126 - f1-score (micro avg) 0.6612 2023-10-25 08:27:42,142 ---------------------------------------------------------------------------------------------------- 2023-10-25 08:27:42,143 Loading model from best epoch ... 2023-10-25 08:27:44,091 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org 2023-10-25 08:27:50,312 Results: - F-score (micro) 0.6545 - F-score (macro) 0.5095 - Accuracy 0.4987 By class: precision recall f1-score support loc 0.6376 0.7919 0.7064 591 pers 0.5787 0.7619 0.6578 357 org 0.1791 0.1519 0.1644 79 micro avg 0.5917 0.7322 0.6545 1027 macro avg 0.4651 0.5686 0.5095 1027 weighted avg 0.5819 0.7322 0.6478 1027 2023-10-25 08:27:50,312 ----------------------------------------------------------------------------------------------------