diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8 +1,8 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, - "global_step": 2105, + "epoch": 5.0, + "global_step": 10525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -13,12 +13,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.9824163913726807, + "distillation_loss": 1.9993383884429932, "epoch": 0.0, - "learning_rate": 1.99144893111639e-05, - "loss": 1.8791, + "learning_rate": 1.998289786223278e-05, + "loss": 1.8842, "step": 10, - "task_loss": 0.6514205932617188 + "task_loss": 0.6696395874023438 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -26,12 +26,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.9337328672409058, + "distillation_loss": 1.9561816453933716, "epoch": 0.01, - "learning_rate": 1.9819477434679337e-05, - "loss": 1.8084, + "learning_rate": 1.996389548693587e-05, + "loss": 1.8112, "step": 20, - "task_loss": 0.68878173828125 + "task_loss": 0.6945114135742188 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -39,12 +39,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.7075116634368896, + "distillation_loss": 1.705786108970642, "epoch": 0.01, - "learning_rate": 1.9724465558194775e-05, - "loss": 1.7447, + "learning_rate": 1.9944893111638956e-05, + "loss": 1.7851, "step": 30, - "task_loss": 0.5935821533203125 + "task_loss": 0.599273681640625 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -52,12 +52,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.5379259586334229, + "distillation_loss": 1.6281533241271973, "epoch": 0.02, - "learning_rate": 1.9629453681710216e-05, - "loss": 1.4292, + "learning_rate": 1.9925890736342042e-05, + "loss": 1.5178, "step": 40, - "task_loss": 0.5262355804443359 + "task_loss": 0.555267333984375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -65,12 +65,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7455981969833374, + "distillation_loss": 0.9100077152252197, "epoch": 0.02, - "learning_rate": 1.954394299287411e-05, - "loss": 1.2067, + "learning_rate": 1.9908788598574825e-05, + "loss": 1.2264, "step": 50, - "task_loss": 0.20388412475585938 + "task_loss": 0.27131175994873047 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -78,12 +78,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.1262179613113403, + "distillation_loss": 1.0724254846572876, "epoch": 0.03, - "learning_rate": 1.944893111638955e-05, - "loss": 0.9282, + "learning_rate": 1.988978622327791e-05, + "loss": 0.9712, "step": 60, - "task_loss": 0.45258522033691406 + "task_loss": 0.4018378257751465 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -91,12 +91,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.2520625591278076, + "distillation_loss": 1.2103850841522217, "epoch": 0.03, - "learning_rate": 1.935391923990499e-05, - "loss": 0.9656, + "learning_rate": 1.9870783847981e-05, + "loss": 0.9075, "step": 70, - "task_loss": 0.6228516101837158 + "task_loss": 0.5983531475067139 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -104,12 +104,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.8546397686004639, + "distillation_loss": 0.6644728183746338, "epoch": 0.04, - "learning_rate": 1.9268408551068884e-05, - "loss": 0.9789, + "learning_rate": 1.9851781472684087e-05, + "loss": 0.8009, "step": 80, - "task_loss": 0.31339454650878906 + "task_loss": 0.25090086460113525 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -117,12 +117,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.1221280097961426, + "distillation_loss": 1.2112879753112793, "epoch": 0.04, - "learning_rate": 1.9173396674584325e-05, - "loss": 0.8387, + "learning_rate": 1.9832779097387176e-05, + "loss": 0.8291, "step": 90, - "task_loss": 0.44170093536376953 + "task_loss": 0.4889531135559082 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -130,12 +130,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4717516005039215, + "distillation_loss": 0.6701904535293579, "epoch": 0.05, - "learning_rate": 1.9078384798099766e-05, - "loss": 0.6816, + "learning_rate": 1.9813776722090262e-05, + "loss": 0.714, "step": 100, - "task_loss": 0.15700101852416992 + "task_loss": 0.25854694843292236 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -143,12 +143,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7137082815170288, + "distillation_loss": 0.6915435791015625, "epoch": 0.05, - "learning_rate": 1.8983372921615203e-05, - "loss": 0.6424, + "learning_rate": 1.9794774346793352e-05, + "loss": 0.6274, "step": 110, - "task_loss": 0.3361825942993164 + "task_loss": 0.3031274080276489 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -156,12 +156,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7122435569763184, + "distillation_loss": 0.6346196532249451, "epoch": 0.06, - "learning_rate": 1.8888361045130644e-05, - "loss": 0.7162, + "learning_rate": 1.9775771971496438e-05, + "loss": 0.7066, "step": 120, - "task_loss": 0.5846670866012573 + "task_loss": 0.5627198815345764 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -169,12 +169,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5021461248397827, + "distillation_loss": 0.8290033340454102, "epoch": 0.06, - "learning_rate": 1.8793349168646082e-05, - "loss": 0.5908, + "learning_rate": 1.9756769596199528e-05, + "loss": 0.5735, "step": 130, - "task_loss": 0.18646156787872314 + "task_loss": 0.3992866575717926 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -182,12 +182,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.33661046624183655, + "distillation_loss": 0.5018503665924072, "epoch": 0.07, - "learning_rate": 1.8698337292161523e-05, - "loss": 0.8316, + "learning_rate": 1.9737767220902614e-05, + "loss": 0.7937, "step": 140, - "task_loss": 0.11925870180130005 + "task_loss": 0.16568666696548462 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -195,12 +195,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4071422219276428, + "distillation_loss": 0.49170026183128357, "epoch": 0.07, - "learning_rate": 1.860332541567696e-05, - "loss": 0.6662, + "learning_rate": 1.9718764845605703e-05, + "loss": 0.7142, "step": 150, - "task_loss": 0.38656365871429443 + "task_loss": 0.3975885510444641 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -208,12 +208,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3433850407600403, + "distillation_loss": 0.5039875507354736, "epoch": 0.08, - "learning_rate": 1.8508313539192398e-05, - "loss": 0.6492, + "learning_rate": 1.969976247030879e-05, + "loss": 0.591, "step": 160, - "task_loss": 0.1292821764945984 + "task_loss": 0.24026933312416077 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -221,12 +221,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.853670597076416, + "distillation_loss": 0.9136797189712524, "epoch": 0.08, - "learning_rate": 1.8413301662707842e-05, - "loss": 0.6022, + "learning_rate": 1.9680760095011876e-05, + "loss": 0.5924, "step": 170, - "task_loss": 0.4025900363922119 + "task_loss": 0.4351678192615509 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -234,12 +234,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6042287945747375, + "distillation_loss": 0.7978807091712952, "epoch": 0.09, - "learning_rate": 1.831828978622328e-05, - "loss": 0.7525, + "learning_rate": 1.9661757719714965e-05, + "loss": 0.7811, "step": 180, - "task_loss": 0.23989427089691162 + "task_loss": 0.3571215867996216 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -247,12 +247,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.0951387882232666, + "distillation_loss": 1.0396177768707275, "epoch": 0.09, - "learning_rate": 1.8223277909738718e-05, - "loss": 0.7468, + "learning_rate": 1.9642755344418055e-05, + "loss": 0.7498, "step": 190, - "task_loss": 0.5307990908622742 + "task_loss": 0.5442102551460266 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -260,12 +260,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.27730026841163635, + "distillation_loss": 0.20894655585289001, "epoch": 0.1, - "learning_rate": 1.812826603325416e-05, - "loss": 0.5985, + "learning_rate": 1.962375296912114e-05, + "loss": 0.5461, "step": 200, - "task_loss": 0.05895298719406128 + "task_loss": 0.05690506100654602 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -273,12 +273,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.9861497282981873, + "distillation_loss": 0.8320358991622925, "epoch": 0.1, - "learning_rate": 1.8033254156769596e-05, - "loss": 0.6073, + "learning_rate": 1.960475059382423e-05, + "loss": 0.6077, "step": 210, - "task_loss": 0.5837754011154175 + "task_loss": 0.4803912937641144 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -286,12 +286,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.532859206199646, + "distillation_loss": 0.5520890951156616, "epoch": 0.1, - "learning_rate": 1.7938242280285037e-05, - "loss": 0.5815, + "learning_rate": 1.958574821852732e-05, + "loss": 0.5951, "step": 220, - "task_loss": 0.2384859323501587 + "task_loss": 0.21851623058319092 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -299,12 +299,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7019934058189392, + "distillation_loss": 0.6869984865188599, "epoch": 0.11, - "learning_rate": 1.7843230403800475e-05, - "loss": 0.778, + "learning_rate": 1.9566745843230406e-05, + "loss": 0.7494, "step": 230, - "task_loss": 0.3135228157043457 + "task_loss": 0.30402871966362 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -312,12 +312,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.9965037703514099, + "distillation_loss": 1.0405832529067993, "epoch": 0.11, - "learning_rate": 1.7748218527315916e-05, - "loss": 0.6839, + "learning_rate": 1.9547743467933492e-05, + "loss": 0.6492, "step": 240, - "task_loss": 0.4901553988456726 + "task_loss": 0.5161008834838867 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -325,20 +325,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5550552606582642, + "distillation_loss": 0.44516924023628235, "epoch": 0.12, - "learning_rate": 1.7653206650831357e-05, - "loss": 0.4129, + "learning_rate": 1.9528741092636582e-05, + "loss": 0.392, "step": 250, - "task_loss": 0.2684090733528137 + "task_loss": 0.1959661990404129 }, { "epoch": 0.12, - "eval_accuracy": 0.8761467889908257, - "eval_loss": 0.44158032536506653, - "eval_runtime": 21.9879, - "eval_samples_per_second": 39.658, - "eval_steps_per_second": 4.957, + "eval_accuracy": 0.8887614678899083, + "eval_loss": 0.45345592498779297, + "eval_runtime": 28.6659, + "eval_samples_per_second": 30.419, + "eval_steps_per_second": 3.802, "step": 250 }, { @@ -347,12 +347,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5287630558013916, + "distillation_loss": 0.5078727006912231, "epoch": 0.12, - "learning_rate": 1.7558194774346795e-05, - "loss": 0.5833, + "learning_rate": 1.9509738717339668e-05, + "loss": 0.6549, "step": 260, - "task_loss": 0.2730463147163391 + "task_loss": 0.22611352801322937 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -360,12 +360,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.9515942335128784, + "distillation_loss": 0.772598147392273, "epoch": 0.13, - "learning_rate": 1.7463182897862236e-05, - "loss": 0.6329, + "learning_rate": 1.9490736342042758e-05, + "loss": 0.646, "step": 270, - "task_loss": 0.4847598969936371 + "task_loss": 0.345528244972229 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -373,12 +373,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5145795345306396, + "distillation_loss": 0.5505326986312866, "epoch": 0.13, - "learning_rate": 1.7368171021377673e-05, - "loss": 0.6194, + "learning_rate": 1.9471733966745844e-05, + "loss": 0.5464, "step": 280, - "task_loss": 0.4189565181732178 + "task_loss": 0.45021820068359375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -386,12 +386,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7319836020469666, + "distillation_loss": 0.6786664128303528, "epoch": 0.14, - "learning_rate": 1.727315914489311e-05, - "loss": 0.4816, + "learning_rate": 1.9452731591448933e-05, + "loss": 0.4428, "step": 290, - "task_loss": 0.3963426649570465 + "task_loss": 0.3662004768848419 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -399,12 +399,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4611780047416687, + "distillation_loss": 0.5807572603225708, "epoch": 0.14, - "learning_rate": 1.7178147268408552e-05, - "loss": 0.5048, + "learning_rate": 1.943372921615202e-05, + "loss": 0.5107, "step": 300, - "task_loss": 0.19796603918075562 + "task_loss": 0.26058703660964966 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -412,12 +412,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5850609540939331, + "distillation_loss": 0.3432176411151886, "epoch": 0.15, - "learning_rate": 1.7083135391923993e-05, - "loss": 0.5864, + "learning_rate": 1.941472684085511e-05, + "loss": 0.5331, "step": 310, - "task_loss": 0.2181399166584015 + "task_loss": 0.10017021000385284 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -425,12 +425,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4029274880886078, + "distillation_loss": 0.4090806841850281, "epoch": 0.15, - "learning_rate": 1.698812351543943e-05, - "loss": 0.534, + "learning_rate": 1.9395724465558195e-05, + "loss": 0.5085, "step": 320, - "task_loss": 0.1707906723022461 + "task_loss": 0.1661628633737564 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -438,12 +438,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5725364685058594, + "distillation_loss": 0.6101839542388916, "epoch": 0.16, - "learning_rate": 1.689311163895487e-05, - "loss": 0.6037, + "learning_rate": 1.9376722090261285e-05, + "loss": 0.5719, "step": 330, - "task_loss": 0.23323196172714233 + "task_loss": 0.2671370506286621 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -451,12 +451,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6093335747718811, + "distillation_loss": 0.6847037672996521, "epoch": 0.16, - "learning_rate": 1.679809976247031e-05, - "loss": 0.7098, + "learning_rate": 1.935771971496437e-05, + "loss": 0.7526, "step": 340, - "task_loss": 0.4206717610359192 + "task_loss": 0.5243218541145325 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -464,12 +464,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.0119699239730835, + "distillation_loss": 1.0114264488220215, "epoch": 0.17, - "learning_rate": 1.670308788598575e-05, - "loss": 0.4015, + "learning_rate": 1.933871733966746e-05, + "loss": 0.4603, "step": 350, - "task_loss": 0.6245448589324951 + "task_loss": 0.5498969554901123 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -477,12 +477,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6928797960281372, + "distillation_loss": 0.6255608797073364, "epoch": 0.17, - "learning_rate": 1.6608076009501188e-05, - "loss": 0.6882, + "learning_rate": 1.9319714964370547e-05, + "loss": 0.5996, "step": 360, - "task_loss": 0.32340091466903687 + "task_loss": 0.2702621519565582 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -490,12 +490,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2802911400794983, + "distillation_loss": 0.266034334897995, "epoch": 0.18, - "learning_rate": 1.651306413301663e-05, - "loss": 0.5088, + "learning_rate": 1.9300712589073636e-05, + "loss": 0.4917, "step": 370, - "task_loss": 0.12505391240119934 + "task_loss": 0.13177835941314697 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -503,12 +503,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6381040811538696, + "distillation_loss": 0.7009546756744385, "epoch": 0.18, - "learning_rate": 1.641805225653207e-05, - "loss": 0.5281, + "learning_rate": 1.9281710213776723e-05, + "loss": 0.5101, "step": 380, - "task_loss": 0.25593793392181396 + "task_loss": 0.28630542755126953 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -516,12 +516,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 1.2553234100341797, + "distillation_loss": 0.9093587398529053, "epoch": 0.19, - "learning_rate": 1.6323040380047507e-05, - "loss": 0.4673, + "learning_rate": 1.9262707838479812e-05, + "loss": 0.5288, "step": 390, - "task_loss": 0.7861297130584717 + "task_loss": 0.5337154865264893 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -529,12 +529,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.32122546434402466, + "distillation_loss": 0.2342422604560852, "epoch": 0.19, - "learning_rate": 1.622802850356295e-05, - "loss": 0.5358, + "learning_rate": 1.92437054631829e-05, + "loss": 0.5478, "step": 400, - "task_loss": 0.14392141997814178 + "task_loss": 0.10641683638095856 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -542,12 +542,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4124768078327179, + "distillation_loss": 0.4991934597492218, "epoch": 0.19, - "learning_rate": 1.6133016627078386e-05, - "loss": 0.388, + "learning_rate": 1.9224703087885988e-05, + "loss": 0.3909, "step": 410, - "task_loss": 0.18850776553153992 + "task_loss": 0.23908966779708862 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -555,12 +555,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5564821362495422, + "distillation_loss": 0.4699850380420685, "epoch": 0.2, - "learning_rate": 1.6038004750593824e-05, - "loss": 0.4706, + "learning_rate": 1.9205700712589074e-05, + "loss": 0.4033, "step": 420, - "task_loss": 0.35438424348831177 + "task_loss": 0.28487616777420044 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -568,12 +568,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.8114709854125977, + "distillation_loss": 0.9620916247367859, "epoch": 0.2, - "learning_rate": 1.5942992874109265e-05, - "loss": 0.3992, + "learning_rate": 1.9188598574821856e-05, + "loss": 0.4304, "step": 430, - "task_loss": 0.5884829163551331 + "task_loss": 0.6841728687286377 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -581,12 +581,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.9019944071769714, + "distillation_loss": 0.7321962714195251, "epoch": 0.21, - "learning_rate": 1.5847980997624702e-05, - "loss": 0.5377, + "learning_rate": 1.9169596199524942e-05, + "loss": 0.6077, "step": 440, - "task_loss": 0.4972737431526184 + "task_loss": 0.4345829486846924 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -594,12 +594,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4662189781665802, + "distillation_loss": 0.46323296427726746, "epoch": 0.21, - "learning_rate": 1.5752969121140143e-05, - "loss": 0.4676, + "learning_rate": 1.915059382422803e-05, + "loss": 0.4439, "step": 450, - "task_loss": 0.22651702165603638 + "task_loss": 0.17068199813365936 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -607,12 +607,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7801036834716797, + "distillation_loss": 0.9150896072387695, "epoch": 0.22, - "learning_rate": 1.5657957244655584e-05, - "loss": 0.5115, + "learning_rate": 1.9131591448931118e-05, + "loss": 0.5283, "step": 460, - "task_loss": 0.3273088335990906 + "task_loss": 0.42760002613067627 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -620,12 +620,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.24446932971477509, + "distillation_loss": 0.3148682713508606, "epoch": 0.22, - "learning_rate": 1.5562945368171022e-05, - "loss": 0.4323, + "learning_rate": 1.9112589073634208e-05, + "loss": 0.459, "step": 470, - "task_loss": 0.28846049308776855 + "task_loss": 0.3109205663204193 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -633,12 +633,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.31131500005722046, + "distillation_loss": 0.455905556678772, "epoch": 0.23, - "learning_rate": 1.5467933491686463e-05, - "loss": 0.4727, + "learning_rate": 1.9093586698337294e-05, + "loss": 0.4673, "step": 480, - "task_loss": 0.06133585423231125 + "task_loss": 0.13838760554790497 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -646,12 +646,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2505212426185608, + "distillation_loss": 0.25112056732177734, "epoch": 0.23, - "learning_rate": 1.53729216152019e-05, - "loss": 0.6103, + "learning_rate": 1.907458432304038e-05, + "loss": 0.5656, "step": 490, - "task_loss": 0.16016395390033722 + "task_loss": 0.1593867689371109 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -659,20 +659,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6978294849395752, + "distillation_loss": 0.8803852796554565, "epoch": 0.24, - "learning_rate": 1.527790973871734e-05, - "loss": 0.412, + "learning_rate": 1.905558194774347e-05, + "loss": 0.4413, "step": 500, - "task_loss": 0.3384208679199219 + "task_loss": 0.46672773361206055 }, { "epoch": 0.24, "eval_accuracy": 0.8899082568807339, - "eval_loss": 0.49690014123916626, - "eval_runtime": 22.004, - "eval_samples_per_second": 39.629, - "eval_steps_per_second": 4.954, + "eval_loss": 0.4671143591403961, + "eval_runtime": 23.1207, + "eval_samples_per_second": 37.715, + "eval_steps_per_second": 4.714, "step": 500 }, { @@ -681,12 +681,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5271720886230469, + "distillation_loss": 0.4302404522895813, "epoch": 0.24, - "learning_rate": 1.5182897862232779e-05, - "loss": 0.3288, + "learning_rate": 1.9036579572446556e-05, + "loss": 0.3487, "step": 510, - "task_loss": 0.3533409833908081 + "task_loss": 0.34454867243766785 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -694,12 +694,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.39492568373680115, + "distillation_loss": 0.339828759431839, "epoch": 0.25, - "learning_rate": 1.508788598574822e-05, - "loss": 0.3691, + "learning_rate": 1.9017577197149645e-05, + "loss": 0.417, "step": 520, - "task_loss": 0.17543694376945496 + "task_loss": 0.14348742365837097 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -707,12 +707,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5424761772155762, + "distillation_loss": 0.4093138575553894, "epoch": 0.25, - "learning_rate": 1.499287410926366e-05, - "loss": 0.5957, + "learning_rate": 1.8998574821852735e-05, + "loss": 0.6335, "step": 530, - "task_loss": 0.35330963134765625 + "task_loss": 0.2882145345211029 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -720,12 +720,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2259323000907898, + "distillation_loss": 0.2194601595401764, "epoch": 0.26, - "learning_rate": 1.4897862232779099e-05, - "loss": 0.4177, + "learning_rate": 1.897957244655582e-05, + "loss": 0.4188, "step": 540, - "task_loss": 0.05318892002105713 + "task_loss": 0.058180660009384155 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -733,12 +733,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.912689208984375, + "distillation_loss": 0.5428069829940796, "epoch": 0.26, - "learning_rate": 1.4802850356294538e-05, - "loss": 0.5001, + "learning_rate": 1.8960570071258907e-05, + "loss": 0.4371, "step": 550, - "task_loss": 0.447698175907135 + "task_loss": 0.259736031293869 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -746,12 +746,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7216652631759644, + "distillation_loss": 0.8619398474693298, "epoch": 0.27, - "learning_rate": 1.4707838479809977e-05, - "loss": 0.5728, + "learning_rate": 1.8941567695961997e-05, + "loss": 0.5938, "step": 560, - "task_loss": 0.5098874568939209 + "task_loss": 0.5211101174354553 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -759,12 +759,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2023361772298813, + "distillation_loss": 0.1949978768825531, "epoch": 0.27, - "learning_rate": 1.4612826603325417e-05, - "loss": 0.3707, + "learning_rate": 1.8922565320665086e-05, + "loss": 0.3784, "step": 570, - "task_loss": 0.22737565636634827 + "task_loss": 0.20978039503097534 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -772,12 +772,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.46668243408203125, + "distillation_loss": 0.4044320583343506, "epoch": 0.28, - "learning_rate": 1.4517814726840856e-05, - "loss": 0.4068, + "learning_rate": 1.8903562945368172e-05, + "loss": 0.3817, "step": 580, - "task_loss": 0.2710917294025421 + "task_loss": 0.22608956694602966 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -785,12 +785,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5557296276092529, + "distillation_loss": 0.7097648978233337, "epoch": 0.28, - "learning_rate": 1.4422802850356297e-05, - "loss": 0.3836, + "learning_rate": 1.888456057007126e-05, + "loss": 0.413, "step": 590, - "task_loss": 0.43243685364723206 + "task_loss": 0.5404372811317444 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -798,12 +798,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6273843050003052, + "distillation_loss": 0.6512647867202759, "epoch": 0.29, - "learning_rate": 1.4327790973871736e-05, - "loss": 0.5479, + "learning_rate": 1.8865558194774348e-05, + "loss": 0.5914, "step": 600, - "task_loss": 0.2661285400390625 + "task_loss": 0.2793722450733185 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -811,12 +811,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3024005889892578, + "distillation_loss": 0.3936828374862671, "epoch": 0.29, - "learning_rate": 1.4232779097387176e-05, - "loss": 0.3326, + "learning_rate": 1.8846555819477438e-05, + "loss": 0.3537, "step": 610, - "task_loss": 0.11362183094024658 + "task_loss": 0.1763158142566681 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -824,12 +824,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6180804967880249, + "distillation_loss": 0.5243096947669983, "epoch": 0.29, - "learning_rate": 1.4137767220902613e-05, - "loss": 0.2927, + "learning_rate": 1.8827553444180524e-05, + "loss": 0.3381, "step": 620, - "task_loss": 0.3157673478126526 + "task_loss": 0.24796336889266968 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -837,12 +837,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2821282148361206, + "distillation_loss": 0.32880261540412903, "epoch": 0.3, - "learning_rate": 1.4042755344418053e-05, - "loss": 0.3051, + "learning_rate": 1.880855106888361e-05, + "loss": 0.329, "step": 630, - "task_loss": 0.1337086707353592 + "task_loss": 0.1616968810558319 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -850,12 +850,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5159429907798767, + "distillation_loss": 0.3982900381088257, "epoch": 0.3, - "learning_rate": 1.3947743467933492e-05, - "loss": 0.4209, + "learning_rate": 1.87895486935867e-05, + "loss": 0.3864, "step": 640, - "task_loss": 0.35967183113098145 + "task_loss": 0.2720021903514862 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -863,12 +863,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.8615990281105042, + "distillation_loss": 0.59184730052948, "epoch": 0.31, - "learning_rate": 1.3852731591448931e-05, - "loss": 0.4166, + "learning_rate": 1.877054631828979e-05, + "loss": 0.4413, "step": 650, - "task_loss": 0.6024147868156433 + "task_loss": 0.41478922963142395 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -876,12 +876,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.1939113736152649, + "distillation_loss": 0.2740156352519989, "epoch": 0.31, - "learning_rate": 1.3757719714964372e-05, - "loss": 0.3193, + "learning_rate": 1.8751543942992875e-05, + "loss": 0.363, "step": 660, - "task_loss": 0.023959562182426453 + "task_loss": 0.04690548777580261 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -889,12 +889,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.31010207533836365, + "distillation_loss": 0.2690792977809906, "epoch": 0.32, - "learning_rate": 1.3662707838479811e-05, - "loss": 0.3788, + "learning_rate": 1.873254156769596e-05, + "loss": 0.3805, "step": 670, - "task_loss": 0.17067261040210724 + "task_loss": 0.13159048557281494 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -902,12 +902,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6690168976783752, + "distillation_loss": 0.6759920120239258, "epoch": 0.32, - "learning_rate": 1.356769596199525e-05, - "loss": 0.3763, + "learning_rate": 1.871353919239905e-05, + "loss": 0.3392, "step": 680, - "task_loss": 0.355002760887146 + "task_loss": 0.37233513593673706 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -915,12 +915,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.23823483288288116, + "distillation_loss": 0.2218218445777893, "epoch": 0.33, - "learning_rate": 1.347268408551069e-05, - "loss": 0.4861, + "learning_rate": 1.869453681710214e-05, + "loss": 0.4838, "step": 690, - "task_loss": 0.0851842388510704 + "task_loss": 0.0667574405670166 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -928,12 +928,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.19940122961997986, + "distillation_loss": 0.35146889090538025, "epoch": 0.33, - "learning_rate": 1.337767220902613e-05, - "loss": 0.3373, + "learning_rate": 1.8675534441805227e-05, + "loss": 0.3663, "step": 700, - "task_loss": 0.1375439167022705 + "task_loss": 0.19517013430595398 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -941,12 +941,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.34697651863098145, + "distillation_loss": 0.4373435974121094, "epoch": 0.34, - "learning_rate": 1.3282660332541569e-05, - "loss": 0.2368, + "learning_rate": 1.8656532066508316e-05, + "loss": 0.2644, "step": 710, - "task_loss": 0.32895755767822266 + "task_loss": 0.3898613750934601 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -954,12 +954,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5809268951416016, + "distillation_loss": 0.4680957794189453, "epoch": 0.34, - "learning_rate": 1.3187648456057008e-05, - "loss": 0.4335, + "learning_rate": 1.8637529691211403e-05, + "loss": 0.4633, "step": 720, - "task_loss": 0.350343257188797 + "task_loss": 0.2719492018222809 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -967,12 +967,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4058021605014801, + "distillation_loss": 0.9040226936340332, "epoch": 0.35, - "learning_rate": 1.3092636579572449e-05, - "loss": 0.3762, + "learning_rate": 1.8618527315914492e-05, + "loss": 0.4141, "step": 730, - "task_loss": 0.4189315438270569 + "task_loss": 0.6466548442840576 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -980,12 +980,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5029650330543518, + "distillation_loss": 0.5508012771606445, "epoch": 0.35, - "learning_rate": 1.2997624703087888e-05, - "loss": 0.3535, + "learning_rate": 1.8599524940617578e-05, + "loss": 0.3819, "step": 740, - "task_loss": 0.352708101272583 + "task_loss": 0.3917207717895508 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -993,20 +993,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.15115290880203247, + "distillation_loss": 0.10818565636873245, "epoch": 0.36, - "learning_rate": 1.2902612826603326e-05, - "loss": 0.3191, + "learning_rate": 1.8580522565320668e-05, + "loss": 0.29, "step": 750, - "task_loss": 0.05512949079275131 + "task_loss": 0.033290036022663116 }, { "epoch": 0.36, - "eval_accuracy": 0.9162844036697247, - "eval_loss": 0.2716875970363617, - "eval_runtime": 21.9554, - "eval_samples_per_second": 39.717, - "eval_steps_per_second": 4.965, + "eval_accuracy": 0.9128440366972477, + "eval_loss": 0.32853972911834717, + "eval_runtime": 31.1664, + "eval_samples_per_second": 27.979, + "eval_steps_per_second": 3.497, "step": 750 }, { @@ -1015,12 +1015,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.42094355821609497, + "distillation_loss": 0.5789792537689209, "epoch": 0.36, - "learning_rate": 1.2807600950118765e-05, - "loss": 0.3572, + "learning_rate": 1.8561520190023754e-05, + "loss": 0.3252, "step": 760, - "task_loss": 0.22914057970046997 + "task_loss": 0.3467680811882019 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1028,12 +1028,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.22227834165096283, + "distillation_loss": 0.1704629510641098, "epoch": 0.37, - "learning_rate": 1.2712589073634205e-05, - "loss": 0.5129, + "learning_rate": 1.854251781472684e-05, + "loss": 0.4369, "step": 770, - "task_loss": 0.13911421597003937 + "task_loss": 0.11706624180078506 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1041,12 +1041,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6405156254768372, + "distillation_loss": 0.4032849967479706, "epoch": 0.37, - "learning_rate": 1.2617577197149644e-05, - "loss": 0.4576, + "learning_rate": 1.852351543942993e-05, + "loss": 0.4449, "step": 780, - "task_loss": 0.4629451036453247 + "task_loss": 0.26390761137008667 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1054,12 +1054,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.432763010263443, + "distillation_loss": 0.3772156238555908, "epoch": 0.38, - "learning_rate": 1.2522565320665083e-05, - "loss": 0.3085, + "learning_rate": 1.850451306413302e-05, + "loss": 0.3257, "step": 790, - "task_loss": 0.36907055974006653 + "task_loss": 0.3016626834869385 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1067,12 +1067,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.30914443731307983, + "distillation_loss": 0.39227956533432007, "epoch": 0.38, - "learning_rate": 1.2427553444180524e-05, - "loss": 0.4019, + "learning_rate": 1.8485510688836105e-05, + "loss": 0.412, "step": 800, - "task_loss": 0.21935078501701355 + "task_loss": 0.2939774990081787 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1080,12 +1080,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5716268420219421, + "distillation_loss": 0.3966776728630066, "epoch": 0.38, - "learning_rate": 1.2332541567695964e-05, - "loss": 0.3996, + "learning_rate": 1.846650831353919e-05, + "loss": 0.4206, "step": 810, - "task_loss": 0.4366046190261841 + "task_loss": 0.32761743664741516 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1093,12 +1093,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.20762380957603455, + "distillation_loss": 0.14727869629859924, "epoch": 0.39, - "learning_rate": 1.2237529691211403e-05, - "loss": 0.4031, + "learning_rate": 1.844750593824228e-05, + "loss": 0.4085, "step": 820, - "task_loss": 0.07009849697351456 + "task_loss": 0.03596585988998413 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1106,12 +1106,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.48847681283950806, + "distillation_loss": 0.5814430713653564, "epoch": 0.39, - "learning_rate": 1.2142517814726842e-05, - "loss": 0.4559, + "learning_rate": 1.842850356294537e-05, + "loss": 0.5506, "step": 830, - "task_loss": 0.20559999346733093 + "task_loss": 0.2807302474975586 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1119,12 +1119,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.38028720021247864, + "distillation_loss": 0.3324885964393616, "epoch": 0.4, - "learning_rate": 1.2047505938242281e-05, - "loss": 0.2783, + "learning_rate": 1.8409501187648457e-05, + "loss": 0.2877, "step": 840, - "task_loss": 0.12343298643827438 + "task_loss": 0.11497768759727478 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1132,12 +1132,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6831973791122437, + "distillation_loss": 0.4697301387786865, "epoch": 0.4, - "learning_rate": 1.195249406175772e-05, - "loss": 0.4664, + "learning_rate": 1.8390498812351546e-05, + "loss": 0.4512, "step": 850, - "task_loss": 0.6523771286010742 + "task_loss": 0.42074650526046753 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1145,12 +1145,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.21925556659698486, + "distillation_loss": 0.1672717034816742, "epoch": 0.41, - "learning_rate": 1.1857482185273158e-05, - "loss": 0.366, + "learning_rate": 1.8371496437054633e-05, + "loss": 0.3373, "step": 860, - "task_loss": 0.15919412672519684 + "task_loss": 0.10989774763584137 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1158,12 +1158,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.07446402311325073, + "distillation_loss": 0.07770746946334839, "epoch": 0.41, - "learning_rate": 1.1762470308788601e-05, - "loss": 0.3713, + "learning_rate": 1.8352494061757722e-05, + "loss": 0.3943, "step": 870, - "task_loss": 0.014185778796672821 + "task_loss": 0.013416633009910583 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1171,12 +1171,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.31408172845840454, + "distillation_loss": 0.4550301730632782, "epoch": 0.42, - "learning_rate": 1.166745843230404e-05, - "loss": 0.3861, + "learning_rate": 1.833349168646081e-05, + "loss": 0.3908, "step": 880, - "task_loss": 0.13161490857601166 + "task_loss": 0.20334404706954956 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1184,12 +1184,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.14646156132221222, + "distillation_loss": 0.14840258657932281, "epoch": 0.42, - "learning_rate": 1.1572446555819478e-05, - "loss": 0.2613, + "learning_rate": 1.8314489311163898e-05, + "loss": 0.2848, "step": 890, - "task_loss": 0.07529384642839432 + "task_loss": 0.0623587965965271 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1197,12 +1197,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.05397426709532738, + "distillation_loss": 0.10735790431499481, "epoch": 0.43, - "learning_rate": 1.1477434679334917e-05, - "loss": 0.3604, + "learning_rate": 1.8295486935866984e-05, + "loss": 0.3985, "step": 900, - "task_loss": 0.043072961270809174 + "task_loss": 0.09991317242383957 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1210,12 +1210,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.12949523329734802, + "distillation_loss": 0.09649023413658142, "epoch": 0.43, - "learning_rate": 1.1382422802850357e-05, - "loss": 0.2215, + "learning_rate": 1.8276484560570074e-05, + "loss": 0.2586, "step": 910, - "task_loss": 0.029655031859874725 + "task_loss": 0.022964343428611755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1223,12 +1223,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5533275008201599, + "distillation_loss": 0.565139651298523, "epoch": 0.44, - "learning_rate": 1.1287410926365796e-05, - "loss": 0.3736, + "learning_rate": 1.825748218527316e-05, + "loss": 0.3881, "step": 920, - "task_loss": 0.3869485855102539 + "task_loss": 0.3771653473377228 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1236,12 +1236,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.06997716426849365, + "distillation_loss": 0.07271742820739746, "epoch": 0.44, - "learning_rate": 1.1192399049881235e-05, - "loss": 0.2925, + "learning_rate": 1.823847980997625e-05, + "loss": 0.3163, "step": 930, - "task_loss": 0.17637060582637787 + "task_loss": 0.1788436472415924 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1249,12 +1249,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5217384099960327, + "distillation_loss": 0.5031943321228027, "epoch": 0.45, - "learning_rate": 1.1097387173396676e-05, - "loss": 0.3657, + "learning_rate": 1.8219477434679336e-05, + "loss": 0.379, "step": 940, - "task_loss": 0.35078302025794983 + "task_loss": 0.3599282503128052 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1262,12 +1262,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.20354126393795013, + "distillation_loss": 0.14190588891506195, "epoch": 0.45, - "learning_rate": 1.1002375296912116e-05, - "loss": 0.2885, + "learning_rate": 1.8200475059382425e-05, + "loss": 0.2631, "step": 950, - "task_loss": 0.33367669582366943 + "task_loss": 0.2649960517883301 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1275,12 +1275,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6612205505371094, + "distillation_loss": 0.8079104423522949, "epoch": 0.46, - "learning_rate": 1.0907363420427555e-05, - "loss": 0.4112, + "learning_rate": 1.818147268408551e-05, + "loss": 0.4097, "step": 960, - "task_loss": 0.38991737365722656 + "task_loss": 0.49955570697784424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1288,12 +1288,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5148907899856567, + "distillation_loss": 0.32833898067474365, "epoch": 0.46, - "learning_rate": 1.0812351543942994e-05, - "loss": 0.3226, + "learning_rate": 1.81624703087886e-05, + "loss": 0.2893, "step": 970, - "task_loss": 0.2609829604625702 + "task_loss": 0.13918891549110413 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1301,12 +1301,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6710062026977539, + "distillation_loss": 0.8008242845535278, "epoch": 0.47, - "learning_rate": 1.0717339667458434e-05, - "loss": 0.2813, + "learning_rate": 1.8143467933491687e-05, + "loss": 0.3336, "step": 980, - "task_loss": 0.34536463022232056 + "task_loss": 0.39846181869506836 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1314,12 +1314,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.35892608761787415, + "distillation_loss": 0.4929274320602417, "epoch": 0.47, - "learning_rate": 1.0622327790973871e-05, - "loss": 0.3694, + "learning_rate": 1.8124465558194773e-05, + "loss": 0.3877, "step": 990, - "task_loss": 0.190689355134964 + "task_loss": 0.2788354456424713 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1327,20 +1327,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.1477653980255127, + "distillation_loss": 0.24550215899944305, "epoch": 0.48, - "learning_rate": 1.052731591448931e-05, - "loss": 0.2688, + "learning_rate": 1.8105463182897863e-05, + "loss": 0.2851, "step": 1000, - "task_loss": 0.04818693548440933 + "task_loss": 0.10739203542470932 }, { "epoch": 0.48, - "eval_accuracy": 0.911697247706422, - "eval_loss": 0.24315589666366577, - "eval_runtime": 22.0177, - "eval_samples_per_second": 39.604, - "eval_steps_per_second": 4.951, + "eval_accuracy": 0.9151376146788991, + "eval_loss": 0.24980628490447998, + "eval_runtime": 24.2778, + "eval_samples_per_second": 35.918, + "eval_steps_per_second": 4.49, "step": 1000 }, { @@ -1349,12 +1349,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.47181323170661926, + "distillation_loss": 0.32931697368621826, "epoch": 0.48, - "learning_rate": 1.0432304038004753e-05, - "loss": 0.2546, + "learning_rate": 1.8086460807600952e-05, + "loss": 0.2591, "step": 1010, - "task_loss": 0.2418743222951889 + "task_loss": 0.17109191417694092 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1362,12 +1362,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.21640026569366455, + "distillation_loss": 0.19154950976371765, "epoch": 0.48, - "learning_rate": 1.033729216152019e-05, - "loss": 0.3818, + "learning_rate": 1.806745843230404e-05, + "loss": 0.3552, "step": 1020, - "task_loss": 0.14040601253509521 + "task_loss": 0.13995346426963806 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1375,12 +1375,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.27131223678588867, + "distillation_loss": 0.2809268534183502, "epoch": 0.49, - "learning_rate": 1.024228028503563e-05, - "loss": 0.253, + "learning_rate": 1.8048456057007128e-05, + "loss": 0.2895, "step": 1030, - "task_loss": 0.1497097909450531 + "task_loss": 0.18339580297470093 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1388,12 +1388,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7797483801841736, + "distillation_loss": 0.7353945970535278, "epoch": 0.49, - "learning_rate": 1.014726840855107e-05, - "loss": 0.4507, + "learning_rate": 1.8029453681710218e-05, + "loss": 0.4529, "step": 1040, - "task_loss": 0.46980684995651245 + "task_loss": 0.516687273979187 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1401,12 +1401,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.11180303990840912, + "distillation_loss": 0.1993633657693863, "epoch": 0.5, - "learning_rate": 1.0052256532066509e-05, - "loss": 0.2814, + "learning_rate": 1.8010451306413304e-05, + "loss": 0.3296, "step": 1050, - "task_loss": 0.4403308629989624 + "task_loss": 0.4725501835346222 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1414,12 +1414,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.18791627883911133, + "distillation_loss": 0.17681953310966492, "epoch": 0.5, - "learning_rate": 9.95724465558195e-06, - "loss": 0.387, + "learning_rate": 1.799144893111639e-05, + "loss": 0.3377, "step": 1060, - "task_loss": 0.06846746802330017 + "task_loss": 0.06336037814617157 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1427,12 +1427,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.16468051075935364, + "distillation_loss": 0.13305272161960602, "epoch": 0.51, - "learning_rate": 9.862232779097387e-06, - "loss": 0.3529, + "learning_rate": 1.797244655581948e-05, + "loss": 0.2699, "step": 1070, - "task_loss": 0.18533943593502045 + "task_loss": 0.2189784049987793 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1440,12 +1440,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.42276692390441895, + "distillation_loss": 0.5291624665260315, "epoch": 0.51, - "learning_rate": 9.767220902612827e-06, - "loss": 0.2331, + "learning_rate": 1.795344418052257e-05, + "loss": 0.2867, "step": 1080, - "task_loss": 0.23515379428863525 + "task_loss": 0.299777090549469 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1453,12 +1453,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.31465187668800354, + "distillation_loss": 0.11125596612691879, "epoch": 0.52, - "learning_rate": 9.672209026128266e-06, - "loss": 0.2775, + "learning_rate": 1.7934441805225655e-05, + "loss": 0.2417, "step": 1090, - "task_loss": 0.2029043436050415 + "task_loss": 0.0719640851020813 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1466,12 +1466,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7025846838951111, + "distillation_loss": 0.6146207451820374, "epoch": 0.52, - "learning_rate": 9.577197149643707e-06, - "loss": 0.3614, + "learning_rate": 1.791543942992874e-05, + "loss": 0.3645, "step": 1100, - "task_loss": 0.43116331100463867 + "task_loss": 0.3506692051887512 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1479,12 +1479,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.43838998675346375, + "distillation_loss": 0.43747201561927795, "epoch": 0.53, - "learning_rate": 9.482185273159146e-06, - "loss": 0.2565, + "learning_rate": 1.789643705463183e-05, + "loss": 0.2669, "step": 1110, - "task_loss": 0.24070000648498535 + "task_loss": 0.2219913899898529 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1492,12 +1492,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.526702880859375, + "distillation_loss": 0.3032437562942505, "epoch": 0.53, - "learning_rate": 9.387173396674586e-06, - "loss": 0.3354, + "learning_rate": 1.7877434679334917e-05, + "loss": 0.2974, "step": 1120, - "task_loss": 0.28051871061325073 + "task_loss": 0.12568572163581848 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1505,12 +1505,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.471240758895874, + "distillation_loss": 0.3314521610736847, "epoch": 0.54, - "learning_rate": 9.292161520190025e-06, - "loss": 0.3342, + "learning_rate": 1.7858432304038007e-05, + "loss": 0.2802, "step": 1130, - "task_loss": 0.2174275815486908 + "task_loss": 0.1527169644832611 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1518,12 +1518,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.13051442801952362, + "distillation_loss": 0.31865638494491577, "epoch": 0.54, - "learning_rate": 9.197149643705464e-06, - "loss": 0.2616, + "learning_rate": 1.7839429928741093e-05, + "loss": 0.3081, "step": 1140, - "task_loss": 0.08567549288272858 + "task_loss": 0.22448736429214478 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1531,12 +1531,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5350298881530762, + "distillation_loss": 0.6496268510818481, "epoch": 0.55, - "learning_rate": 9.102137767220904e-06, - "loss": 0.3387, + "learning_rate": 1.7820427553444182e-05, + "loss": 0.3204, "step": 1150, - "task_loss": 0.3132125437259674 + "task_loss": 0.40611302852630615 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1544,12 +1544,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.32484662532806396, + "distillation_loss": 0.6914563179016113, "epoch": 0.55, - "learning_rate": 9.007125890736343e-06, - "loss": 0.1753, + "learning_rate": 1.780142517814727e-05, + "loss": 0.2414, "step": 1160, - "task_loss": 0.18460990488529205 + "task_loss": 0.3544915020465851 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1557,12 +1557,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3009415864944458, + "distillation_loss": 0.152155801653862, "epoch": 0.56, - "learning_rate": 8.912114014251782e-06, - "loss": 0.3577, + "learning_rate": 1.7782422802850358e-05, + "loss": 0.3349, "step": 1170, - "task_loss": 0.22530747950077057 + "task_loss": 0.07939426600933075 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1570,12 +1570,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4420413672924042, + "distillation_loss": 0.48079612851142883, "epoch": 0.56, - "learning_rate": 8.817102137767222e-06, - "loss": 0.3324, + "learning_rate": 1.7763420427553448e-05, + "loss": 0.3187, "step": 1180, - "task_loss": 0.4033554196357727 + "task_loss": 0.44884148240089417 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1583,12 +1583,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.06501191854476929, + "distillation_loss": 0.05774353817105293, "epoch": 0.57, - "learning_rate": 8.722090261282661e-06, - "loss": 0.3307, + "learning_rate": 1.7744418052256534e-05, + "loss": 0.354, "step": 1190, - "task_loss": 0.012013241648674011 + "task_loss": 0.007377400994300842 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1596,12 +1596,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2965516448020935, + "distillation_loss": 0.11594004929065704, "epoch": 0.57, - "learning_rate": 8.6270783847981e-06, - "loss": 0.3437, + "learning_rate": 1.772541567695962e-05, + "loss": 0.3231, "step": 1200, - "task_loss": 0.13238894939422607 + "task_loss": 0.03987376019358635 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1609,12 +1609,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.36214399337768555, + "distillation_loss": 0.19646455347537994, "epoch": 0.57, - "learning_rate": 8.53206650831354e-06, - "loss": 0.3629, + "learning_rate": 1.770641330166271e-05, + "loss": 0.354, "step": 1210, - "task_loss": 0.26368093490600586 + "task_loss": 0.19836881756782532 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1622,12 +1622,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3077784478664398, + "distillation_loss": 0.24901585280895233, "epoch": 0.58, - "learning_rate": 8.437054631828979e-06, - "loss": 0.3036, + "learning_rate": 1.76874109263658e-05, + "loss": 0.3213, "step": 1220, - "task_loss": 0.0927947610616684 + "task_loss": 0.055774152278900146 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1635,12 +1635,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.20123688876628876, + "distillation_loss": 0.223684161901474, "epoch": 0.58, - "learning_rate": 8.342042755344418e-06, - "loss": 0.2894, + "learning_rate": 1.7668408551068885e-05, + "loss": 0.2828, "step": 1230, - "task_loss": 0.010961085557937622 + "task_loss": 0.01634085178375244 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1648,12 +1648,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5563689470291138, + "distillation_loss": 0.2887876033782959, "epoch": 0.59, - "learning_rate": 8.247030878859859e-06, - "loss": 0.3299, + "learning_rate": 1.764940617577197e-05, + "loss": 0.331, "step": 1240, - "task_loss": 0.3913767337799072 + "task_loss": 0.1671813428401947 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1661,20 +1661,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.471743643283844, + "distillation_loss": 0.4364287257194519, "epoch": 0.59, - "learning_rate": 8.152019002375298e-06, - "loss": 0.3306, + "learning_rate": 1.763040380047506e-05, + "loss": 0.3717, "step": 1250, - "task_loss": 0.19782987236976624 + "task_loss": 0.1999722123146057 }, { "epoch": 0.59, "eval_accuracy": 0.9243119266055045, - "eval_loss": 0.20327819883823395, - "eval_runtime": 22.009, - "eval_samples_per_second": 39.62, - "eval_steps_per_second": 4.953, + "eval_loss": 0.2037193924188614, + "eval_runtime": 23.0016, + "eval_samples_per_second": 37.91, + "eval_steps_per_second": 4.739, "step": 1250 }, { @@ -1683,12 +1683,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.01976114884018898, + "distillation_loss": 0.017401084303855896, "epoch": 0.6, - "learning_rate": 8.057007125890736e-06, - "loss": 0.2307, + "learning_rate": 1.761140142517815e-05, + "loss": 0.1969, "step": 1260, - "task_loss": 0.006360933184623718 + "task_loss": 0.00458671897649765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1696,12 +1696,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2867504358291626, + "distillation_loss": 0.26557794213294983, "epoch": 0.6, - "learning_rate": 7.961995249406177e-06, - "loss": 0.4469, + "learning_rate": 1.7592399049881237e-05, + "loss": 0.493, "step": 1270, - "task_loss": 0.1376619189977646 + "task_loss": 0.1353331208229065 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1709,12 +1709,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3130514919757843, + "distillation_loss": 0.45439109206199646, "epoch": 0.61, - "learning_rate": 7.866983372921616e-06, - "loss": 0.2792, + "learning_rate": 1.7573396674584323e-05, + "loss": 0.3094, "step": 1280, - "task_loss": 0.30938223004341125 + "task_loss": 0.35616135597229004 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1722,12 +1722,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.21932274103164673, + "distillation_loss": 0.3577348589897156, "epoch": 0.61, - "learning_rate": 7.771971496437056e-06, - "loss": 0.2458, + "learning_rate": 1.7554394299287412e-05, + "loss": 0.3268, "step": 1290, - "task_loss": 0.07220742851495743 + "task_loss": 0.12562984228134155 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1735,12 +1735,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2027844786643982, + "distillation_loss": 0.35081303119659424, "epoch": 0.62, - "learning_rate": 7.676959619952495e-06, - "loss": 0.2947, + "learning_rate": 1.7535391923990502e-05, + "loss": 0.3708, "step": 1300, - "task_loss": 0.1091059073805809 + "task_loss": 0.22274452447891235 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1748,12 +1748,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.4004560708999634, + "distillation_loss": 0.2592296004295349, "epoch": 0.62, - "learning_rate": 7.581947743467934e-06, - "loss": 0.437, + "learning_rate": 1.7516389548693588e-05, + "loss": 0.4674, "step": 1310, - "task_loss": 0.13185200095176697 + "task_loss": 0.09062568843364716 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1761,12 +1761,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.11762793362140656, + "distillation_loss": 0.18336069583892822, "epoch": 0.63, - "learning_rate": 7.486935866983374e-06, - "loss": 0.2396, + "learning_rate": 1.7497387173396674e-05, + "loss": 0.2377, "step": 1320, - "task_loss": 0.035176947712898254 + "task_loss": 0.055865660309791565 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1774,12 +1774,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.38001149892807007, + "distillation_loss": 0.3750176429748535, "epoch": 0.63, - "learning_rate": 7.391923990498813e-06, - "loss": 0.225, + "learning_rate": 1.7478384798099764e-05, + "loss": 0.2343, "step": 1330, - "task_loss": 0.49264460802078247 + "task_loss": 0.4618932902812958 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1787,12 +1787,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.16133537888526917, + "distillation_loss": 0.277113139629364, "epoch": 0.64, - "learning_rate": 7.296912114014253e-06, - "loss": 0.2908, + "learning_rate": 1.745938242280285e-05, + "loss": 0.4476, "step": 1340, - "task_loss": 0.05434826388955116 + "task_loss": 0.11207205802202225 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1800,12 +1800,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5223551988601685, + "distillation_loss": 0.37496018409729004, "epoch": 0.64, - "learning_rate": 7.201900237529692e-06, - "loss": 0.2895, + "learning_rate": 1.744038004750594e-05, + "loss": 0.3001, "step": 1350, - "task_loss": 0.27843406796455383 + "task_loss": 0.2029455304145813 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1813,12 +1813,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.09743548929691315, + "distillation_loss": 0.08501767367124557, "epoch": 0.65, - "learning_rate": 7.106888361045131e-06, - "loss": 0.2579, + "learning_rate": 1.742137767220903e-05, + "loss": 0.2503, "step": 1360, - "task_loss": 0.014732744544744492 + "task_loss": 0.02285398542881012 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1826,12 +1826,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.06801208853721619, + "distillation_loss": 0.14294752478599548, "epoch": 0.65, - "learning_rate": 7.01187648456057e-06, - "loss": 0.2038, + "learning_rate": 1.7402375296912115e-05, + "loss": 0.1823, "step": 1370, - "task_loss": 0.01958051323890686 + "task_loss": 0.0727246105670929 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1839,12 +1839,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.49920564889907837, + "distillation_loss": 0.2509918510913849, "epoch": 0.66, - "learning_rate": 6.91686460807601e-06, - "loss": 0.3022, + "learning_rate": 1.73833729216152e-05, + "loss": 0.2555, "step": 1380, - "task_loss": 0.27152562141418457 + "task_loss": 0.12989100813865662 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1852,12 +1852,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.07850401848554611, + "distillation_loss": 0.068404421210289, "epoch": 0.66, - "learning_rate": 6.82185273159145e-06, - "loss": 0.1903, + "learning_rate": 1.736437054631829e-05, + "loss": 0.2121, "step": 1390, - "task_loss": 0.19040407240390778 + "task_loss": 0.10038695484399796 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1865,12 +1865,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.05175900086760521, + "distillation_loss": 0.04248424619436264, "epoch": 0.67, - "learning_rate": 6.726840855106889e-06, - "loss": 0.1951, + "learning_rate": 1.734536817102138e-05, + "loss": 0.22, "step": 1400, - "task_loss": 0.0510752871632576 + "task_loss": 0.08192986994981766 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1878,12 +1878,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.11907292902469635, + "distillation_loss": 0.0670127123594284, "epoch": 0.67, - "learning_rate": 6.631828978622329e-06, - "loss": 0.1969, + "learning_rate": 1.7326365795724467e-05, + "loss": 0.2685, "step": 1410, - "task_loss": 0.10641947388648987 + "task_loss": 0.07222311943769455 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1891,12 +1891,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3327619433403015, + "distillation_loss": 0.37206175923347473, "epoch": 0.67, - "learning_rate": 6.536817102137768e-06, - "loss": 0.2099, + "learning_rate": 1.7307363420427553e-05, + "loss": 0.2674, "step": 1420, - "task_loss": 0.3468553125858307 + "task_loss": 0.35340070724487305 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1904,12 +1904,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2979893088340759, + "distillation_loss": 0.2714982032775879, "epoch": 0.68, - "learning_rate": 6.441805225653207e-06, - "loss": 0.3192, + "learning_rate": 1.7288361045130643e-05, + "loss": 0.3446, "step": 1430, - "task_loss": 0.26997804641723633 + "task_loss": 0.2567555904388428 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1917,12 +1917,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.03590167686343193, + "distillation_loss": 0.07237125188112259, "epoch": 0.68, - "learning_rate": 6.346793349168646e-06, - "loss": 0.179, + "learning_rate": 1.7269358669833732e-05, + "loss": 0.197, "step": 1440, - "task_loss": 0.00364762544631958 + "task_loss": 0.00902317464351654 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1930,12 +1930,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.16363513469696045, + "distillation_loss": 0.10892651230096817, "epoch": 0.69, - "learning_rate": 6.251781472684086e-06, - "loss": 0.3228, + "learning_rate": 1.7250356294536818e-05, + "loss": 0.2894, "step": 1450, - "task_loss": 0.20806732773780823 + "task_loss": 0.13919095695018768 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1943,12 +1943,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.28741005063056946, + "distillation_loss": 0.3349069356918335, "epoch": 0.69, - "learning_rate": 6.156769596199526e-06, - "loss": 0.2762, + "learning_rate": 1.7231353919239904e-05, + "loss": 0.2457, "step": 1460, - "task_loss": 0.12442530691623688 + "task_loss": 0.14725209772586823 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1956,12 +1956,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.49749070405960083, + "distillation_loss": 0.3073666989803314, "epoch": 0.7, - "learning_rate": 6.061757719714965e-06, - "loss": 0.3035, + "learning_rate": 1.7212351543942994e-05, + "loss": 0.2578, "step": 1470, - "task_loss": 0.22260920703411102 + "task_loss": 0.07342517375946045 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1969,12 +1969,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5554820895195007, + "distillation_loss": 0.38823768496513367, "epoch": 0.7, - "learning_rate": 5.9667458432304035e-06, - "loss": 0.3454, + "learning_rate": 1.7193349168646084e-05, + "loss": 0.331, "step": 1480, - "task_loss": 0.2797449231147766 + "task_loss": 0.1779521405696869 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1982,12 +1982,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.13691575825214386, + "distillation_loss": 0.2724040448665619, "epoch": 0.71, - "learning_rate": 5.871733966745844e-06, - "loss": 0.2283, + "learning_rate": 1.717434679334917e-05, + "loss": 0.2154, "step": 1490, - "task_loss": 0.18624231219291687 + "task_loss": 0.2927827835083008 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -1995,20 +1995,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.80184406042099, + "distillation_loss": 0.8642085790634155, "epoch": 0.71, - "learning_rate": 5.776722090261283e-06, - "loss": 0.224, + "learning_rate": 1.715534441805226e-05, + "loss": 0.2467, "step": 1500, - "task_loss": 0.4933280646800995 + "task_loss": 0.5641911029815674 }, { "epoch": 0.71, - "eval_accuracy": 0.9243119266055045, - "eval_loss": 0.2382841557264328, - "eval_runtime": 22.0278, - "eval_samples_per_second": 39.586, - "eval_steps_per_second": 4.948, + "eval_accuracy": 0.9174311926605505, + "eval_loss": 0.28397560119628906, + "eval_runtime": 22.6893, + "eval_samples_per_second": 38.432, + "eval_steps_per_second": 4.804, "step": 1500 }, { @@ -2017,12 +2017,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.07949607074260712, + "distillation_loss": 0.05709172412753105, "epoch": 0.72, - "learning_rate": 5.681710213776722e-06, - "loss": 0.2137, + "learning_rate": 1.7136342042755345e-05, + "loss": 0.2719, "step": 1510, - "task_loss": 0.043377894908189774 + "task_loss": 0.025617174804210663 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2030,12 +2030,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.42658352851867676, + "distillation_loss": 0.45153453946113586, "epoch": 0.72, - "learning_rate": 5.5866983372921624e-06, - "loss": 0.2807, + "learning_rate": 1.7117339667458435e-05, + "loss": 0.2957, "step": 1520, - "task_loss": 0.24954530596733093 + "task_loss": 0.2549566328525543 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2043,12 +2043,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.244135782122612, + "distillation_loss": 0.2257353961467743, "epoch": 0.73, - "learning_rate": 5.491686460807602e-06, - "loss": 0.3032, + "learning_rate": 1.709833729216152e-05, + "loss": 0.2892, "step": 1530, - "task_loss": 0.5667487382888794 + "task_loss": 0.5615079998970032 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2056,12 +2056,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.12168382853269577, + "distillation_loss": 0.16512510180473328, "epoch": 0.73, - "learning_rate": 5.39667458432304e-06, - "loss": 0.2224, + "learning_rate": 1.707933491686461e-05, + "loss": 0.1975, "step": 1540, - "task_loss": 0.051823940128088 + "task_loss": 0.09258662164211273 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2069,12 +2069,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5252543091773987, + "distillation_loss": 0.4988386929035187, "epoch": 0.74, - "learning_rate": 5.3016627078384795e-06, - "loss": 0.3264, + "learning_rate": 1.7060332541567697e-05, + "loss": 0.3206, "step": 1550, - "task_loss": 0.5932276248931885 + "task_loss": 0.6115778088569641 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2082,12 +2082,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.10500893741846085, + "distillation_loss": 0.18809723854064941, "epoch": 0.74, - "learning_rate": 5.20665083135392e-06, - "loss": 0.2, + "learning_rate": 1.7041330166270783e-05, + "loss": 0.1958, "step": 1560, - "task_loss": 0.06464926898479462 + "task_loss": 0.10413852334022522 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2095,12 +2095,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.46214228868484497, + "distillation_loss": 0.4897454082965851, "epoch": 0.75, - "learning_rate": 5.111638954869359e-06, - "loss": 0.3135, + "learning_rate": 1.7022327790973873e-05, + "loss": 0.2965, "step": 1570, - "task_loss": 0.27241963148117065 + "task_loss": 0.2838747799396515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2108,12 +2108,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.41102948784828186, + "distillation_loss": 0.4037947654724121, "epoch": 0.75, - "learning_rate": 5.016627078384798e-06, - "loss": 0.2825, + "learning_rate": 1.7003325415676962e-05, + "loss": 0.2892, "step": 1580, - "task_loss": 0.25513461232185364 + "task_loss": 0.25087568163871765 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2121,12 +2121,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.0665275901556015, + "distillation_loss": 0.05394119769334793, "epoch": 0.76, - "learning_rate": 4.921615201900238e-06, - "loss": 0.3124, + "learning_rate": 1.698432304038005e-05, + "loss": 0.2991, "step": 1590, - "task_loss": 0.01647743582725525 + "task_loss": 0.015048503875732422 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2134,12 +2134,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.20230551064014435, + "distillation_loss": 0.30840539932250977, "epoch": 0.76, - "learning_rate": 4.826603325415678e-06, - "loss": 0.2472, + "learning_rate": 1.6965320665083134e-05, + "loss": 0.2293, "step": 1600, - "task_loss": 0.6591448783874512 + "task_loss": 0.634148120880127 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2147,12 +2147,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.1525014340877533, + "distillation_loss": 0.1973361074924469, "epoch": 0.76, - "learning_rate": 4.731591448931116e-06, - "loss": 0.1319, + "learning_rate": 1.6946318289786224e-05, + "loss": 0.1691, "step": 1610, - "task_loss": 0.06440484523773193 + "task_loss": 0.09310252964496613 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2160,12 +2160,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.38346511125564575, + "distillation_loss": 0.3464374244213104, "epoch": 0.77, - "learning_rate": 4.636579572446556e-06, - "loss": 0.2337, + "learning_rate": 1.6927315914489314e-05, + "loss": 0.2329, "step": 1620, - "task_loss": 0.27461180090904236 + "task_loss": 0.28547829389572144 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2173,12 +2173,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.26302579045295715, + "distillation_loss": 0.2688274383544922, "epoch": 0.77, - "learning_rate": 4.541567695961996e-06, - "loss": 0.2821, + "learning_rate": 1.69083135391924e-05, + "loss": 0.2133, "step": 1630, - "task_loss": 0.1236814334988594 + "task_loss": 0.1428394615650177 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2186,12 +2186,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.624555230140686, + "distillation_loss": 0.18282316625118256, "epoch": 0.78, - "learning_rate": 4.446555819477435e-06, - "loss": 0.3081, + "learning_rate": 1.6889311163895486e-05, + "loss": 0.2413, "step": 1640, - "task_loss": 0.3441321849822998 + "task_loss": 0.05560823902487755 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2199,12 +2199,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.04459763318300247, + "distillation_loss": 0.2214377224445343, "epoch": 0.78, - "learning_rate": 4.351543942992874e-06, - "loss": 0.1871, + "learning_rate": 1.687030878859858e-05, + "loss": 0.2201, "step": 1650, - "task_loss": 0.009879574179649353 + "task_loss": 0.10058430582284927 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2212,12 +2212,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.47281283140182495, + "distillation_loss": 0.5333009362220764, "epoch": 0.79, - "learning_rate": 4.256532066508314e-06, - "loss": 0.3221, + "learning_rate": 1.6851306413301665e-05, + "loss": 0.2844, "step": 1660, - "task_loss": 0.3637796640396118 + "task_loss": 0.47247129678726196 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2225,12 +2225,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2519533634185791, + "distillation_loss": 0.2876150906085968, "epoch": 0.79, - "learning_rate": 4.161520190023753e-06, - "loss": 0.2995, + "learning_rate": 1.683230403800475e-05, + "loss": 0.3131, "step": 1670, - "task_loss": 0.24460454285144806 + "task_loss": 0.09321459382772446 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2238,12 +2238,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3187342584133148, + "distillation_loss": 0.22139661014080048, "epoch": 0.8, - "learning_rate": 4.066508313539192e-06, - "loss": 0.3526, + "learning_rate": 1.681330166270784e-05, + "loss": 0.3365, "step": 1680, - "task_loss": 0.25094783306121826 + "task_loss": 0.2087613344192505 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2251,12 +2251,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3183995485305786, + "distillation_loss": 0.21150922775268555, "epoch": 0.8, - "learning_rate": 3.9714964370546325e-06, - "loss": 0.2348, + "learning_rate": 1.6794299287410927e-05, + "loss": 0.1991, "step": 1690, - "task_loss": 0.16011971235275269 + "task_loss": 0.1047440767288208 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2264,12 +2264,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5012897849082947, + "distillation_loss": 0.3274872303009033, "epoch": 0.81, - "learning_rate": 3.876484560570072e-06, - "loss": 0.2874, + "learning_rate": 1.6775296912114017e-05, + "loss": 0.2242, "step": 1700, - "task_loss": 0.30354738235473633 + "task_loss": 0.2223893105983734 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2277,12 +2277,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.09144863486289978, + "distillation_loss": 0.11536785960197449, "epoch": 0.81, - "learning_rate": 3.781472684085511e-06, - "loss": 0.2732, + "learning_rate": 1.6756294536817103e-05, + "loss": 0.3133, "step": 1710, - "task_loss": 0.11224113404750824 + "task_loss": 0.07253136485815048 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2290,12 +2290,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.39011454582214355, + "distillation_loss": 0.3560354709625244, "epoch": 0.82, - "learning_rate": 3.6864608076009504e-06, - "loss": 0.205, + "learning_rate": 1.6737292161520192e-05, + "loss": 0.214, "step": 1720, - "task_loss": 0.3499948978424072 + "task_loss": 0.32433855533599854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2303,12 +2303,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.0968744307756424, + "distillation_loss": 0.25568869709968567, "epoch": 0.82, - "learning_rate": 3.5914489311163897e-06, - "loss": 0.2088, + "learning_rate": 1.671828978622328e-05, + "loss": 0.2308, "step": 1730, - "task_loss": 0.03987715393304825 + "task_loss": 0.1160731092095375 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2316,12 +2316,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.49030616879463196, + "distillation_loss": 0.5275914669036865, "epoch": 0.83, - "learning_rate": 3.4964370546318295e-06, - "loss": 0.3065, + "learning_rate": 1.6699287410926368e-05, + "loss": 0.3426, "step": 1740, - "task_loss": 0.33337563276290894 + "task_loss": 0.3464980125427246 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2329,20 +2329,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3354378938674927, + "distillation_loss": 0.33361896872520447, "epoch": 0.83, - "learning_rate": 3.4014251781472683e-06, - "loss": 0.2082, + "learning_rate": 1.6680285035629454e-05, + "loss": 0.2114, "step": 1750, - "task_loss": 0.2503882646560669 + "task_loss": 0.37090158462524414 }, { "epoch": 0.83, - "eval_accuracy": 0.9254587155963303, - "eval_loss": 0.22330859303474426, - "eval_runtime": 21.9928, - "eval_samples_per_second": 39.649, - "eval_steps_per_second": 4.956, + "eval_accuracy": 0.9243119266055045, + "eval_loss": 0.2238595187664032, + "eval_runtime": 28.7109, + "eval_samples_per_second": 30.372, + "eval_steps_per_second": 3.796, "step": 1750 }, { @@ -2351,12 +2351,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.07079672068357468, + "distillation_loss": 0.09009531885385513, "epoch": 0.84, - "learning_rate": 3.306413301662708e-06, - "loss": 0.3192, + "learning_rate": 1.6661282660332544e-05, + "loss": 0.3014, "step": 1760, - "task_loss": 0.006712011992931366 + "task_loss": 0.005033731460571289 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2364,12 +2364,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.12471574544906616, + "distillation_loss": 0.25561287999153137, "epoch": 0.84, - "learning_rate": 3.211401425178148e-06, - "loss": 0.1666, + "learning_rate": 1.664228028503563e-05, + "loss": 0.2109, "step": 1770, - "task_loss": 0.047819193452596664 + "task_loss": 0.12138234078884125 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2377,12 +2377,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.11018840968608856, + "distillation_loss": 0.0638018250465393, "epoch": 0.85, - "learning_rate": 3.1163895486935867e-06, - "loss": 0.2473, + "learning_rate": 1.662327790973872e-05, + "loss": 0.2247, "step": 1780, - "task_loss": 0.029587876051664352 + "task_loss": 0.013234104961156845 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2390,12 +2390,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.27604231238365173, + "distillation_loss": 0.2950674891471863, "epoch": 0.85, - "learning_rate": 3.0213776722090264e-06, - "loss": 0.299, + "learning_rate": 1.6604275534441806e-05, + "loss": 0.2714, "step": 1790, - "task_loss": 0.12574651837348938 + "task_loss": 0.13912129402160645 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2403,12 +2403,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.19024261832237244, + "distillation_loss": 0.27107563614845276, "epoch": 0.86, - "learning_rate": 2.9263657957244658e-06, - "loss": 0.2579, + "learning_rate": 1.6585273159144895e-05, + "loss": 0.3252, "step": 1800, - "task_loss": 0.11378154903650284 + "task_loss": 0.16670764982700348 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2416,12 +2416,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.24252384901046753, + "distillation_loss": 0.07524740695953369, "epoch": 0.86, - "learning_rate": 2.8313539192399055e-06, - "loss": 0.2374, + "learning_rate": 1.656627078384798e-05, + "loss": 0.2284, "step": 1810, - "task_loss": 0.08886312693357468 + "task_loss": 0.01689385622739792 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2429,12 +2429,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.40556859970092773, + "distillation_loss": 0.49834099411964417, "epoch": 0.86, - "learning_rate": 2.7363420427553444e-06, - "loss": 0.2556, + "learning_rate": 1.654726840855107e-05, + "loss": 0.2676, "step": 1820, - "task_loss": 0.16999585926532745 + "task_loss": 0.21745765209197998 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2442,12 +2442,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.6070685386657715, + "distillation_loss": 0.5565487742424011, "epoch": 0.87, - "learning_rate": 2.641330166270784e-06, - "loss": 0.3513, + "learning_rate": 1.652826603325416e-05, + "loss": 0.3219, "step": 1830, - "task_loss": 0.21770261228084564 + "task_loss": 0.20299580693244934 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2455,12 +2455,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.11093200743198395, + "distillation_loss": 0.1903807818889618, "epoch": 0.87, - "learning_rate": 2.546318289786224e-06, - "loss": 0.2251, + "learning_rate": 1.6509263657957247e-05, + "loss": 0.2616, "step": 1840, - "task_loss": 0.07486331462860107 + "task_loss": 0.14985938370227814 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2468,12 +2468,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.15582458674907684, + "distillation_loss": 0.07635773718357086, "epoch": 0.88, - "learning_rate": 2.4513064133016627e-06, - "loss": 0.3013, + "learning_rate": 1.6490261282660333e-05, + "loss": 0.2512, "step": 1850, - "task_loss": 0.2854090631008148 + "task_loss": 0.20876792073249817 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2481,12 +2481,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5564573407173157, + "distillation_loss": 0.4339378774166107, "epoch": 0.88, - "learning_rate": 2.356294536817102e-06, - "loss": 0.3546, + "learning_rate": 1.6471258907363422e-05, + "loss": 0.3134, "step": 1860, - "task_loss": 0.2976299524307251 + "task_loss": 0.23869368433952332 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2494,12 +2494,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.35662102699279785, + "distillation_loss": 0.5155743360519409, "epoch": 0.89, - "learning_rate": 2.261282660332542e-06, - "loss": 0.1928, + "learning_rate": 1.6452256532066512e-05, + "loss": 0.2005, "step": 1870, - "task_loss": 0.17799919843673706 + "task_loss": 0.27464839816093445 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2507,12 +2507,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.07805321365594864, + "distillation_loss": 0.11325311660766602, "epoch": 0.89, - "learning_rate": 2.166270783847981e-06, - "loss": 0.273, + "learning_rate": 1.6433254156769598e-05, + "loss": 0.271, "step": 1880, - "task_loss": 0.032489631325006485 + "task_loss": 0.07444935292005539 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2520,12 +2520,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.0970858484506607, + "distillation_loss": 0.11618862301111221, "epoch": 0.9, - "learning_rate": 2.071258907363421e-06, - "loss": 0.2183, + "learning_rate": 1.6414251781472684e-05, + "loss": 0.2204, "step": 1890, - "task_loss": 0.011649325489997864 + "task_loss": 0.03171085566282272 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2533,12 +2533,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.06496274471282959, + "distillation_loss": 0.08980407565832138, "epoch": 0.9, - "learning_rate": 1.97624703087886e-06, - "loss": 0.3183, + "learning_rate": 1.6395249406175774e-05, + "loss": 0.2843, "step": 1900, - "task_loss": 0.09150334447622299 + "task_loss": 0.09885088354349136 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2546,12 +2546,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.15299738943576813, + "distillation_loss": 0.2316761016845703, "epoch": 0.91, - "learning_rate": 1.8812351543942995e-06, - "loss": 0.2088, + "learning_rate": 1.637624703087886e-05, + "loss": 0.1765, "step": 1910, - "task_loss": 0.06896167993545532 + "task_loss": 0.12237384915351868 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2559,12 +2559,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3061752915382385, + "distillation_loss": 0.03400692343711853, "epoch": 0.91, - "learning_rate": 1.7862232779097388e-06, - "loss": 0.1873, + "learning_rate": 1.635724465558195e-05, + "loss": 0.2091, "step": 1920, - "task_loss": 0.14114254713058472 + "task_loss": 0.006525538861751556 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2572,12 +2572,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.04780574142932892, + "distillation_loss": 0.03192061930894852, "epoch": 0.92, - "learning_rate": 1.691211401425178e-06, - "loss": 0.1784, + "learning_rate": 1.6338242280285036e-05, + "loss": 0.2077, "step": 1930, - "task_loss": 0.009304128587245941 + "task_loss": 0.005565345287322998 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2585,12 +2585,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.08279172331094742, + "distillation_loss": 0.056986477226018906, "epoch": 0.92, - "learning_rate": 1.5961995249406176e-06, - "loss": 0.2511, + "learning_rate": 1.6319239904988125e-05, + "loss": 0.2957, "step": 1940, - "task_loss": 0.017669253051280975 + "task_loss": 0.011667303740978241 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2598,12 +2598,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.033537302166223526, + "distillation_loss": 0.03923925757408142, "epoch": 0.93, - "learning_rate": 1.5011876484560572e-06, - "loss": 0.2502, + "learning_rate": 1.630023752969121e-05, + "loss": 0.2349, "step": 1950, - "task_loss": 0.04457058385014534 + "task_loss": 0.07591888308525085 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2611,12 +2611,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.1866932362318039, + "distillation_loss": 0.30294090509414673, "epoch": 0.93, - "learning_rate": 1.4061757719714967e-06, - "loss": 0.251, + "learning_rate": 1.62812351543943e-05, + "loss": 0.2222, "step": 1960, - "task_loss": 0.1178937703371048 + "task_loss": 0.21820741891860962 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2624,12 +2624,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.3302549719810486, + "distillation_loss": 0.23649045825004578, "epoch": 0.94, - "learning_rate": 1.311163895486936e-06, - "loss": 0.1995, + "learning_rate": 1.626223277909739e-05, + "loss": 0.2035, "step": 1970, - "task_loss": 0.2511519193649292 + "task_loss": 0.21682609617710114 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2637,12 +2637,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2090863585472107, + "distillation_loss": 0.39186713099479675, "epoch": 0.94, - "learning_rate": 1.2161520190023753e-06, - "loss": 0.2799, + "learning_rate": 1.6243230403800477e-05, + "loss": 0.2666, "step": 1980, - "task_loss": 0.14533783495426178 + "task_loss": 0.11818552017211914 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2650,12 +2650,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.064464271068573, + "distillation_loss": 0.3313908576965332, "epoch": 0.95, - "learning_rate": 1.1211401425178148e-06, - "loss": 0.3001, + "learning_rate": 1.6224228028503563e-05, + "loss": 0.3135, "step": 1990, - "task_loss": 0.11410848796367645 + "task_loss": 0.2329636514186859 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2663,20 +2663,20 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.5515795946121216, + "distillation_loss": 0.314828097820282, "epoch": 0.95, - "learning_rate": 1.0261282660332544e-06, - "loss": 0.2161, + "learning_rate": 1.6205225653206652e-05, + "loss": 0.1777, "step": 2000, - "task_loss": 0.21116505563259125 + "task_loss": 0.31898385286331177 }, { "epoch": 0.95, - "eval_accuracy": 0.9254587155963303, - "eval_loss": 0.22065171599388123, - "eval_runtime": 21.9957, - "eval_samples_per_second": 39.644, - "eval_steps_per_second": 4.956, + "eval_accuracy": 0.926605504587156, + "eval_loss": 0.1968172788619995, + "eval_runtime": 29.4372, + "eval_samples_per_second": 29.622, + "eval_steps_per_second": 3.703, "step": 2000 }, { @@ -2685,12 +2685,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.33025607466697693, + "distillation_loss": 0.5318342447280884, "epoch": 0.95, - "learning_rate": 9.311163895486937e-07, - "loss": 0.1835, + "learning_rate": 1.6186223277909742e-05, + "loss": 0.2675, "step": 2010, - "task_loss": 0.4712632894515991 + "task_loss": 0.5134344100952148 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2698,12 +2698,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.30521997809410095, + "distillation_loss": 0.3911985158920288, "epoch": 0.96, - "learning_rate": 8.361045130641331e-07, - "loss": 0.2597, + "learning_rate": 1.6167220902612828e-05, + "loss": 0.2213, "step": 2020, - "task_loss": 0.2004762589931488 + "task_loss": 0.2515479028224945 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2711,12 +2711,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.10356535017490387, + "distillation_loss": 0.1401204764842987, "epoch": 0.96, - "learning_rate": 7.410926365795724e-07, - "loss": 0.2461, + "learning_rate": 1.6148218527315914e-05, + "loss": 0.2272, "step": 2030, - "task_loss": 0.07815247774124146 + "task_loss": 0.09529782831668854 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2724,12 +2724,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.07229259610176086, + "distillation_loss": 0.16212940216064453, "epoch": 0.97, - "learning_rate": 6.460807600950119e-07, - "loss": 0.1703, + "learning_rate": 1.6129216152019004e-05, + "loss": 0.212, "step": 2040, - "task_loss": 0.188236802816391 + "task_loss": 0.31426337361335754 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2737,12 +2737,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.8154426217079163, + "distillation_loss": 0.21590156853199005, "epoch": 0.97, - "learning_rate": 5.510688836104513e-07, - "loss": 0.2093, + "learning_rate": 1.6110213776722093e-05, + "loss": 0.1846, "step": 2050, - "task_loss": 0.6037241220474243 + "task_loss": 0.2860991060733795 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2750,12 +2750,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.26825618743896484, + "distillation_loss": 0.38810980319976807, "epoch": 0.98, - "learning_rate": 4.560570071258908e-07, - "loss": 0.2883, + "learning_rate": 1.609121140142518e-05, + "loss": 0.2844, "step": 2060, - "task_loss": 0.31348007917404175 + "task_loss": 0.39935895800590515 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2763,12 +2763,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.7826902270317078, + "distillation_loss": 0.7248147130012512, "epoch": 0.98, - "learning_rate": 3.610451306413302e-07, - "loss": 0.3093, + "learning_rate": 1.6072209026128266e-05, + "loss": 0.3505, "step": 2070, - "task_loss": 0.6269980669021606 + "task_loss": 0.5984583497047424 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2776,12 +2776,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.39573800563812256, + "distillation_loss": 0.2939862608909607, "epoch": 0.99, - "learning_rate": 2.660332541567696e-07, - "loss": 0.3111, + "learning_rate": 1.6053206650831355e-05, + "loss": 0.2442, "step": 2080, - "task_loss": 0.24385812878608704 + "task_loss": 0.15204966068267822 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2789,12 +2789,12 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.23440049588680267, + "distillation_loss": 0.13827058672904968, "epoch": 0.99, - "learning_rate": 1.7102137767220902e-07, - "loss": 0.2771, + "learning_rate": 1.6034204275534445e-05, + "loss": 0.1982, "step": 2090, - "task_loss": 0.12204352021217346 + "task_loss": 0.05438768118619919 }, { "compression/movement_sparsity/importance_regularization_factor": 0.0, @@ -2802,26 +2802,11278 @@ "compression/movement_sparsity/linear_layer_sparsity": 0.0, "compression/movement_sparsity/model_sparsity": 0.0, "compression_loss": 0.0, - "distillation_loss": 0.2844744920730591, + "distillation_loss": 0.18541285395622253, "epoch": 1.0, - "learning_rate": 7.600950118764846e-08, - "loss": 0.2458, + "learning_rate": 1.601520190023753e-05, + "loss": 0.2719, "step": 2100, - "task_loss": 0.3573310077190399 + "task_loss": 0.3633626699447632 }, { + "compression/movement_sparsity/importance_regularization_factor": 0.00028449433713194084, + "compression/movement_sparsity/importance_threshold": -0.0026588774864763865, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010491357083709726, + "compression/movement_sparsity/model_sparsity": 0.0008146869027482775, + "compression_loss": 0.07738782465457916, + "distillation_loss": 0.10512904077768326, "epoch": 1.0, - "step": 2105, - "total_flos": 4441630972486656.0, - "train_loss": 0.40093172477146793, - "train_runtime": 1354.5918, - "train_samples_per_second": 49.719, - "train_steps_per_second": 1.554 + "learning_rate": 1.5996199524940617e-05, + "loss": 0.1552, + "step": 2110, + "task_loss": 0.061604227870702744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0009910043789297608, + "compression/movement_sparsity/importance_threshold": -0.002621092018561318, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011034303485395966, + "compression/movement_sparsity/model_sparsity": 0.0008568483999520002, + "compression_loss": 0.2695717215538025, + "distillation_loss": 0.11719319969415665, + "epoch": 1.01, + "learning_rate": 1.5977197149643707e-05, + "loss": 0.3396, + "step": 2120, + "task_loss": 0.3348212242126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.001690789007294047, + "compression/movement_sparsity/importance_threshold": -0.0025836662382352064, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012307216388136105, + "compression/movement_sparsity/model_sparsity": 0.0009556940937862046, + "compression_loss": 0.4599255919456482, + "distillation_loss": 0.0449785441160202, + "epoch": 1.01, + "learning_rate": 1.5958194774346796e-05, + "loss": 0.5465, + "step": 2130, + "task_loss": 0.017805740237236023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0023838803858471425, + "compression/movement_sparsity/importance_threshold": -0.002546598425327852, + "compression/movement_sparsity/linear_layer_sparsity": 0.0014600153380006022, + "compression/movement_sparsity/model_sparsity": 0.0011337478690222034, + "compression_loss": 0.6484553217887878, + "distillation_loss": 0.08259381353855133, + "epoch": 1.02, + "learning_rate": 1.5939192399049882e-05, + "loss": 0.7333, + "step": 2140, + "task_loss": 0.034424424171447754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0030703106782113913, + "compression/movement_sparsity/importance_threshold": -0.0025098868596690545, + "compression/movement_sparsity/linear_layer_sparsity": 0.0017334528380006022, + "compression/movement_sparsity/model_sparsity": 0.001346080695169288, + "compression_loss": 0.8351697325706482, + "distillation_loss": 0.07959377765655518, + "epoch": 1.02, + "learning_rate": 1.5920190023752972e-05, + "loss": 0.9215, + "step": 2150, + "task_loss": 0.015001043677330017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0037501120480091413, + "compression/movement_sparsity/importance_threshold": -0.0024735298210886133, + "compression/movement_sparsity/linear_layer_sparsity": 0.0020618201407708522, + "compression/movement_sparsity/model_sparsity": 0.0016010682422742115, + "compression_loss": 1.0200759172439575, + "distillation_loss": 0.38012033700942993, + "epoch": 1.03, + "learning_rate": 1.5901187648456058e-05, + "loss": 1.1094, + "step": 2160, + "task_loss": 0.1993572860956192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004423316658862747, + "compression/movement_sparsity/importance_threshold": -0.0024375255894163272, + "compression/movement_sparsity/linear_layer_sparsity": 0.0023032407407407407, + "compression/movement_sparsity/model_sparsity": 0.0017885389377045428, + "compression_loss": 1.2031831741333008, + "distillation_loss": 0.09336289763450623, + "epoch": 1.03, + "learning_rate": 1.5882185273159144e-05, + "loss": 1.319, + "step": 2170, + "task_loss": 0.3724411129951477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005089956674394564, + "compression/movement_sparsity/importance_threshold": -0.0024018724444819957, + "compression/movement_sparsity/linear_layer_sparsity": 0.00264551104712436, + "compression/movement_sparsity/model_sparsity": 0.00205432260476068, + "compression_loss": 1.384499192237854, + "distillation_loss": 0.1010119616985321, + "epoch": 1.04, + "learning_rate": 1.5863182897862234e-05, + "loss": 1.4167, + "step": 2180, + "task_loss": 0.05338115245103836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0057500642582269225, + "compression/movement_sparsity/importance_threshold": -0.002366568666115419, + "compression/movement_sparsity/linear_layer_sparsity": 0.0030340894120746763, + "compression/movement_sparsity/model_sparsity": 0.0023560659370011876, + "compression_loss": 1.564030647277832, + "distillation_loss": 0.021691124886274338, + "epoch": 1.04, + "learning_rate": 1.5844180522565323e-05, + "loss": 1.6449, + "step": 2190, + "task_loss": 0.003468889743089676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006403671573982206, + "compression/movement_sparsity/importance_threshold": -0.002331612534146395, + "compression/movement_sparsity/linear_layer_sparsity": 0.0036809201859379706, + "compression/movement_sparsity/model_sparsity": 0.0028583503941561096, + "compression_loss": 1.741767406463623, + "distillation_loss": 0.07079476118087769, + "epoch": 1.05, + "learning_rate": 1.582517814726841e-05, + "loss": 1.7437, + "step": 2200, + "task_loss": 0.022576410323381424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007050810785282746, + "compression/movement_sparsity/importance_threshold": -0.0022970023284047234, + "compression/movement_sparsity/linear_layer_sparsity": 0.00432740985395965, + "compression/movement_sparsity/model_sparsity": 0.0033603699718875247, + "compression_loss": 1.9177247285842896, + "distillation_loss": 0.030747881159186363, + "epoch": 1.05, + "learning_rate": 1.5806175771971496e-05, + "loss": 1.9982, + "step": 2210, + "task_loss": 0.005105555057525635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007691514055750898, + "compression/movement_sparsity/importance_threshold": -0.002262736328720204, + "compression/movement_sparsity/linear_layer_sparsity": 0.0052953741342968985, + "compression/movement_sparsity/model_sparsity": 0.004112024705614379, + "compression_loss": 2.0919368267059326, + "distillation_loss": 0.2072967141866684, + "epoch": 1.05, + "learning_rate": 1.5787173396674585e-05, + "loss": 2.1582, + "step": 2220, + "task_loss": 0.33855485916137695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00832581354900901, + "compression/movement_sparsity/importance_threshold": -0.002228812814922636, + "compression/movement_sparsity/linear_layer_sparsity": 0.006365270249924722, + "compression/movement_sparsity/model_sparsity": 0.004942832718103681, + "compression_loss": 2.2644152641296387, + "distillation_loss": 0.13281983137130737, + "epoch": 1.06, + "learning_rate": 1.5768171021377675e-05, + "loss": 2.3957, + "step": 2230, + "task_loss": 0.10870643705129623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.00895374142867943, + "compression/movement_sparsity/importance_threshold": -0.002195230066841819, + "compression/movement_sparsity/linear_layer_sparsity": 0.007350924984944294, + "compression/movement_sparsity/model_sparsity": 0.005708224646759998, + "compression_loss": 2.43515682220459, + "distillation_loss": 0.023621466010808945, + "epoch": 1.06, + "learning_rate": 1.574916864608076e-05, + "loss": 2.5349, + "step": 2240, + "task_loss": 0.0038209035992622375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009575329858384518, + "compression/movement_sparsity/importance_threshold": -0.0021619863643075514, + "compression/movement_sparsity/linear_layer_sparsity": 0.008769878236976815, + "compression/movement_sparsity/model_sparsity": 0.006810086513455902, + "compression_loss": 2.6041481494903564, + "distillation_loss": 0.014014622196555138, + "epoch": 1.07, + "learning_rate": 1.5730166270783847e-05, + "loss": 2.6501, + "step": 2250, + "task_loss": 0.002429734915494919 + }, + { + "epoch": 1.07, + "eval_accuracy": 0.9254587155963303, + "eval_loss": 2.8218579292297363, + "eval_runtime": 30.6417, + "eval_samples_per_second": 28.458, + "eval_steps_per_second": 3.557, + "step": 2250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010190611001746603, + "compression/movement_sparsity/importance_threshold": -0.0021290799871496345, + "compression/movement_sparsity/linear_layer_sparsity": 0.010168882678410117, + "compression/movement_sparsity/model_sparsity": 0.007896457500763263, + "compression_loss": 2.7714102268218994, + "distillation_loss": 0.1411275863647461, + "epoch": 1.07, + "learning_rate": 1.5711163895486937e-05, + "loss": 2.8369, + "step": 2260, + "task_loss": 0.06266731023788452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010799617022388065, + "compression/movement_sparsity/importance_threshold": -0.0020965092151978655, + "compression/movement_sparsity/linear_layer_sparsity": 0.011874576558265583, + "compression/movement_sparsity/model_sparsity": 0.009220982491123023, + "compression_loss": 2.936958074569702, + "distillation_loss": 0.32265156507492065, + "epoch": 1.08, + "learning_rate": 1.5692161520190026e-05, + "loss": 3.0225, + "step": 2270, + "task_loss": 0.4202197194099426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01140238008393124, + "compression/movement_sparsity/importance_threshold": -0.0020642723282820446, + "compression/movement_sparsity/linear_layer_sparsity": 0.013452696853357423, + "compression/movement_sparsity/model_sparsity": 0.010446442577091193, + "compression_loss": 3.100813865661621, + "distillation_loss": 0.03656826913356781, + "epoch": 1.08, + "learning_rate": 1.5673159144893113e-05, + "loss": 3.1307, + "step": 2280, + "task_loss": 0.16597847640514374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011998932349998482, + "compression/movement_sparsity/importance_threshold": -0.002032367606231971, + "compression/movement_sparsity/linear_layer_sparsity": 0.015547780694820837, + "compression/movement_sparsity/model_sparsity": 0.01207334113004394, + "compression_loss": 3.262972354888916, + "distillation_loss": 0.3931387662887573, + "epoch": 1.09, + "learning_rate": 1.5654156769596202e-05, + "loss": 3.4253, + "step": 2290, + "task_loss": 0.39384859800338745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012589305984212141, + "compression/movement_sparsity/importance_threshold": -0.0020007933288774442, + "compression/movement_sparsity/linear_layer_sparsity": 0.018422667777024993, + "compression/movement_sparsity/model_sparsity": 0.01430578144645319, + "compression_loss": 3.4234325885772705, + "distillation_loss": 0.05559838190674782, + "epoch": 1.09, + "learning_rate": 1.5635154394299288e-05, + "loss": 3.4753, + "step": 2300, + "task_loss": 0.011821478605270386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013173533150194566, + "compression/movement_sparsity/importance_threshold": -0.0019695477760482637, + "compression/movement_sparsity/linear_layer_sparsity": 0.02154230418172237, + "compression/movement_sparsity/model_sparsity": 0.01672827731611527, + "compression_loss": 3.5822105407714844, + "distillation_loss": 0.10458941757678986, + "epoch": 1.1, + "learning_rate": 1.5616152019002378e-05, + "loss": 3.7453, + "step": 2310, + "task_loss": 0.33254674077033997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013751646011568098, + "compression/movement_sparsity/importance_threshold": -0.001938629227574229, + "compression/movement_sparsity/linear_layer_sparsity": 0.025347445705359833, + "compression/movement_sparsity/model_sparsity": 0.019683089489294023, + "compression_loss": 3.739313840866089, + "distillation_loss": 0.07036435604095459, + "epoch": 1.1, + "learning_rate": 1.5597149643705464e-05, + "loss": 3.7802, + "step": 2320, + "task_loss": 0.042275626212358475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01432367673195511, + "compression/movement_sparsity/importance_threshold": -0.0019080359632851387, + "compression/movement_sparsity/linear_layer_sparsity": 0.02890405045543511, + "compression/movement_sparsity/model_sparsity": 0.022444905034241694, + "compression_loss": 3.894759178161621, + "distillation_loss": 0.06921583414077759, + "epoch": 1.11, + "learning_rate": 1.5578147268408554e-05, + "loss": 3.984, + "step": 2330, + "task_loss": 0.2081681489944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014889657474977936, + "compression/movement_sparsity/importance_threshold": -0.0018777662630107927, + "compression/movement_sparsity/linear_layer_sparsity": 0.03373462671258657, + "compression/movement_sparsity/model_sparsity": 0.026195999557121603, + "compression_loss": 4.048542022705078, + "distillation_loss": 0.021231140941381454, + "epoch": 1.11, + "learning_rate": 1.555914489311164e-05, + "loss": 4.1113, + "step": 2340, + "task_loss": 0.00398905947804451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015449620404258941, + "compression/movement_sparsity/importance_threshold": -0.0018478184065809896, + "compression/movement_sparsity/linear_layer_sparsity": 0.04030603075127973, + "compression/movement_sparsity/model_sparsity": 0.03129890165098249, + "compression_loss": 4.2006354331970215, + "distillation_loss": 0.024290261790156364, + "epoch": 1.12, + "learning_rate": 1.554014251781473e-05, + "loss": 4.25, + "step": 2350, + "task_loss": 0.005856834352016449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016003597683420464, + "compression/movement_sparsity/importance_threshold": -0.0018181906738255296, + "compression/movement_sparsity/linear_layer_sparsity": 0.04683566696778079, + "compression/movement_sparsity/model_sparsity": 0.036369369716123603, + "compression_loss": 4.351089954376221, + "distillation_loss": 0.03239811956882477, + "epoch": 1.12, + "learning_rate": 1.5521140142517815e-05, + "loss": 4.4352, + "step": 2360, + "task_loss": 0.012328799813985825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.016551621476084855, + "compression/movement_sparsity/importance_threshold": -0.0017888813445742116, + "compression/movement_sparsity/linear_layer_sparsity": 0.05416729006699789, + "compression/movement_sparsity/model_sparsity": 0.0420626058410224, + "compression_loss": 4.499930381774902, + "distillation_loss": 0.12336177378892899, + "epoch": 1.13, + "learning_rate": 1.5502137767220905e-05, + "loss": 4.5803, + "step": 2370, + "task_loss": 0.06430968642234802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017093723945874474, + "compression/movement_sparsity/importance_threshold": -0.0017598886986568353, + "compression/movement_sparsity/linear_layer_sparsity": 0.06110632386705812, + "compression/movement_sparsity/model_sparsity": 0.04745098401701118, + "compression_loss": 4.647174835205078, + "distillation_loss": 0.030381930992007256, + "epoch": 1.13, + "learning_rate": 1.548313539192399e-05, + "loss": 4.7628, + "step": 2380, + "task_loss": 0.1719023883342743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01762993725641166, + "compression/movement_sparsity/importance_threshold": -0.0017312110159031996, + "compression/movement_sparsity/linear_layer_sparsity": 0.06815575833709726, + "compression/movement_sparsity/model_sparsity": 0.05292509178848395, + "compression_loss": 4.792819023132324, + "distillation_loss": 0.44239741563796997, + "epoch": 1.14, + "learning_rate": 1.5464133016627077e-05, + "loss": 4.888, + "step": 2390, + "task_loss": 0.2511236369609833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018160293571318775, + "compression/movement_sparsity/importance_threshold": -0.0017028465761431042, + "compression/movement_sparsity/linear_layer_sparsity": 0.07490678400707618, + "compression/movement_sparsity/model_sparsity": 0.058167475733254294, + "compression_loss": 4.936878681182861, + "distillation_loss": 0.07934065163135529, + "epoch": 1.14, + "learning_rate": 1.5445130641330167e-05, + "loss": 5.0129, + "step": 2400, + "task_loss": 0.19744229316711426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018684825054218146, + "compression/movement_sparsity/importance_threshold": -0.0016747936592063492, + "compression/movement_sparsity/linear_layer_sparsity": 0.0832007019722975, + "compression/movement_sparsity/model_sparsity": 0.06460796411318578, + "compression_loss": 5.079326152801514, + "distillation_loss": 0.08092580735683441, + "epoch": 1.14, + "learning_rate": 1.5426128266033256e-05, + "loss": 5.1946, + "step": 2410, + "task_loss": 0.020919568836688995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.019203563868732162, + "compression/movement_sparsity/importance_threshold": -0.001647050544922732, + "compression/movement_sparsity/linear_layer_sparsity": 0.09128394591237579, + "compression/movement_sparsity/model_sparsity": 0.07088485748089549, + "compression_loss": 5.220214366912842, + "distillation_loss": 0.1532871127128601, + "epoch": 1.15, + "learning_rate": 1.5407125890736343e-05, + "loss": 5.4019, + "step": 2420, + "task_loss": 0.10705733299255371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01971654217848315, + "compression/movement_sparsity/importance_threshold": -0.0016196155131220531, + "compression/movement_sparsity/linear_layer_sparsity": 0.09979940624059018, + "compression/movement_sparsity/model_sparsity": 0.07749738047950812, + "compression_loss": 5.3595380783081055, + "distillation_loss": 0.08937288820743561, + "epoch": 1.15, + "learning_rate": 1.538812351543943e-05, + "loss": 5.5179, + "step": 2430, + "task_loss": 0.2294204831123352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020223792147093467, + "compression/movement_sparsity/importance_threshold": -0.0015924868436341117, + "compression/movement_sparsity/linear_layer_sparsity": 0.10875641043736826, + "compression/movement_sparsity/model_sparsity": 0.08445277619119063, + "compression_loss": 5.49728536605835, + "distillation_loss": 0.039297595620155334, + "epoch": 1.16, + "learning_rate": 1.536912114014252e-05, + "loss": 5.5902, + "step": 2440, + "task_loss": 0.012466028332710266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020725345938185466, + "compression/movement_sparsity/importance_threshold": -0.0015656628162887072, + "compression/movement_sparsity/linear_layer_sparsity": 0.11579635275519422, + "compression/movement_sparsity/model_sparsity": 0.08991951300767133, + "compression_loss": 5.633523941040039, + "distillation_loss": 0.271115243434906, + "epoch": 1.16, + "learning_rate": 1.5350118764845608e-05, + "loss": 5.8822, + "step": 2450, + "task_loss": 0.06959662586450577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02122123571538147, + "compression/movement_sparsity/importance_threshold": -0.0015391417109156397, + "compression/movement_sparsity/linear_layer_sparsity": 0.1232250616342969, + "compression/movement_sparsity/model_sparsity": 0.09568813929676413, + "compression_loss": 5.76822566986084, + "distillation_loss": 0.6192089319229126, + "epoch": 1.17, + "learning_rate": 1.5331116389548694e-05, + "loss": 5.9777, + "step": 2460, + "task_loss": 0.36697810888290405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021711493642303875, + "compression/movement_sparsity/importance_threshold": -0.001512921807344707, + "compression/movement_sparsity/linear_layer_sparsity": 0.1321434620596206, + "compression/movement_sparsity/model_sparsity": 0.10261355796472424, + "compression_loss": 5.90135383605957, + "distillation_loss": 0.33043575286865234, + "epoch": 1.17, + "learning_rate": 1.5312114014251784e-05, + "loss": 6.1168, + "step": 2470, + "task_loss": 0.23188000917434692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022196151882575007, + "compression/movement_sparsity/importance_threshold": -0.0014870013854057092, + "compression/movement_sparsity/linear_layer_sparsity": 0.14109207975760313, + "compression/movement_sparsity/model_sparsity": 0.10956244129609777, + "compression_loss": 6.032968044281006, + "distillation_loss": 0.44384852051734924, + "epoch": 1.18, + "learning_rate": 1.5293111638954873e-05, + "loss": 6.2156, + "step": 2480, + "task_loss": 0.26120489835739136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022675242599817222, + "compression/movement_sparsity/importance_threshold": -0.0014613787249284456, + "compression/movement_sparsity/linear_layer_sparsity": 0.14897257744278833, + "compression/movement_sparsity/model_sparsity": 0.11568189581473888, + "compression_loss": 6.16309928894043, + "distillation_loss": 0.13425683975219727, + "epoch": 1.18, + "learning_rate": 1.527410926365796e-05, + "loss": 6.4394, + "step": 2490, + "task_loss": 0.04769594222307205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02314879795765287, + "compression/movement_sparsity/importance_threshold": -0.0014360521057427153, + "compression/movement_sparsity/linear_layer_sparsity": 0.1572927958446251, + "compression/movement_sparsity/model_sparsity": 0.12214280731160009, + "compression_loss": 6.291727542877197, + "distillation_loss": 0.3531065583229065, + "epoch": 1.19, + "learning_rate": 1.5255106888361047e-05, + "loss": 6.4768, + "step": 2500, + "task_loss": 0.4275900721549988 + }, + { + "epoch": 1.19, + "eval_accuracy": 0.8979357798165137, + "eval_loss": 6.5765380859375, + "eval_runtime": 26.114, + "eval_samples_per_second": 33.392, + "eval_steps_per_second": 4.174, + "step": 2500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023616850119704293, + "compression/movement_sparsity/importance_threshold": -0.0014110198076783182, + "compression/movement_sparsity/linear_layer_sparsity": 0.16596552949036436, + "compression/movement_sparsity/model_sparsity": 0.12887745799199585, + "compression_loss": 6.418839454650879, + "distillation_loss": 0.32202211022377014, + "epoch": 1.19, + "learning_rate": 1.5236104513064133e-05, + "loss": 6.6039, + "step": 2510, + "task_loss": 0.21520735323429108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024079431249593854, + "compression/movement_sparsity/importance_threshold": -0.0013862801105650534, + "compression/movement_sparsity/linear_layer_sparsity": 0.17616387665612768, + "compression/movement_sparsity/model_sparsity": 0.1367967955946862, + "compression_loss": 6.544415473937988, + "distillation_loss": 0.24521012604236603, + "epoch": 1.2, + "learning_rate": 1.5217102137767221e-05, + "loss": 6.6944, + "step": 2520, + "task_loss": 0.1956482082605362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0245365735109439, + "compression/movement_sparsity/importance_threshold": -0.00136183129423272, + "compression/movement_sparsity/linear_layer_sparsity": 0.18661719455736225, + "compression/movement_sparsity/model_sparsity": 0.14491412599956133, + "compression_loss": 6.668520450592041, + "distillation_loss": 0.3077329397201538, + "epoch": 1.2, + "learning_rate": 1.519809976247031e-05, + "loss": 6.8466, + "step": 2530, + "task_loss": 0.16484826803207397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02498830906737678, + "compression/movement_sparsity/importance_threshold": -0.0013376716385111176, + "compression/movement_sparsity/linear_layer_sparsity": 0.19720587266636555, + "compression/movement_sparsity/model_sparsity": 0.1531365678667026, + "compression_loss": 6.791137218475342, + "distillation_loss": 0.15260806679725647, + "epoch": 1.21, + "learning_rate": 1.5179097387173399e-05, + "loss": 6.9378, + "step": 2540, + "task_loss": 0.04942808300256729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025434670082514835, + "compression/movement_sparsity/importance_threshold": -0.001313799423230046, + "compression/movement_sparsity/linear_layer_sparsity": 0.20696387571514605, + "compression/movement_sparsity/model_sparsity": 0.16071396440119193, + "compression_loss": 6.912316799163818, + "distillation_loss": 0.1831030249595642, + "epoch": 1.21, + "learning_rate": 1.5160095011876485e-05, + "loss": 7.1445, + "step": 2550, + "task_loss": 0.0789690762758255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025875688719980434, + "compression/movement_sparsity/importance_threshold": -0.0012902129282193035, + "compression/movement_sparsity/linear_layer_sparsity": 0.21675474254742547, + "compression/movement_sparsity/model_sparsity": 0.1683168806980702, + "compression_loss": 7.032046794891357, + "distillation_loss": 0.14177289605140686, + "epoch": 1.22, + "learning_rate": 1.5141092636579573e-05, + "loss": 7.2325, + "step": 2560, + "task_loss": 0.2033880352973938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02631139714339592, + "compression/movement_sparsity/importance_threshold": -0.00126691043330869, + "compression/movement_sparsity/linear_layer_sparsity": 0.22645565153568203, + "compression/movement_sparsity/model_sparsity": 0.1758499418973284, + "compression_loss": 7.150337219238281, + "distillation_loss": 0.21663016080856323, + "epoch": 1.22, + "learning_rate": 1.5122090261282662e-05, + "loss": 7.3976, + "step": 2570, + "task_loss": 0.18229436874389648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02674182751638365, + "compression/movement_sparsity/importance_threshold": -0.0012438902183280046, + "compression/movement_sparsity/linear_layer_sparsity": 0.2362620798517013, + "compression/movement_sparsity/model_sparsity": 0.18346494217618248, + "compression_loss": 7.267183780670166, + "distillation_loss": 0.062164291739463806, + "epoch": 1.23, + "learning_rate": 1.510308788598575e-05, + "loss": 7.4626, + "step": 2580, + "task_loss": 0.006800137460231781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027167012002565962, + "compression/movement_sparsity/importance_threshold": -0.0012211505631070472, + "compression/movement_sparsity/linear_layer_sparsity": 0.24733038806082505, + "compression/movement_sparsity/model_sparsity": 0.19205983191409415, + "compression_loss": 7.382561683654785, + "distillation_loss": 0.05586311221122742, + "epoch": 1.23, + "learning_rate": 1.5084085510688838e-05, + "loss": 7.5546, + "step": 2590, + "task_loss": 0.01366850733757019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027586982765565204, + "compression/movement_sparsity/importance_threshold": -0.001198689747475617, + "compression/movement_sparsity/linear_layer_sparsity": 0.2569306825880759, + "compression/movement_sparsity/model_sparsity": 0.19951476281719105, + "compression_loss": 7.496555805206299, + "distillation_loss": 0.6351262331008911, + "epoch": 1.24, + "learning_rate": 1.5065083135391924e-05, + "loss": 7.888, + "step": 2600, + "task_loss": 0.46579158306121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028001771969003754, + "compression/movement_sparsity/importance_threshold": -0.0011765060512635124, + "compression/movement_sparsity/linear_layer_sparsity": 0.2665302008054803, + "compression/movement_sparsity/model_sparsity": 0.20696909089125515, + "compression_loss": 7.60915994644165, + "distillation_loss": 0.42842957377433777, + "epoch": 1.24, + "learning_rate": 1.5046080760095012e-05, + "loss": 7.9082, + "step": 2610, + "task_loss": 0.14085018634796143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02841141177650394, + "compression/movement_sparsity/importance_threshold": -0.0011545977543005338, + "compression/movement_sparsity/linear_layer_sparsity": 0.2762365557249323, + "compression/movement_sparsity/model_sparsity": 0.21450638102751624, + "compression_loss": 7.720366954803467, + "distillation_loss": 0.3830278515815735, + "epoch": 1.24, + "learning_rate": 1.5027078384798102e-05, + "loss": 8.14, + "step": 2620, + "task_loss": 0.28632909059524536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028815934351688118, + "compression/movement_sparsity/importance_threshold": -0.00113296313641648, + "compression/movement_sparsity/linear_layer_sparsity": 0.285795494109455, + "compression/movement_sparsity/model_sparsity": 0.22192919758395624, + "compression_loss": 7.830181121826172, + "distillation_loss": 0.2521060109138489, + "epoch": 1.25, + "learning_rate": 1.500807600950119e-05, + "loss": 8.1153, + "step": 2630, + "task_loss": 0.1563887596130371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029215371858178636, + "compression/movement_sparsity/importance_threshold": -0.0011116004774411505, + "compression/movement_sparsity/linear_layer_sparsity": 0.2953732638888889, + "compression/movement_sparsity/model_sparsity": 0.22936663731132845, + "compression_loss": 7.938580513000488, + "distillation_loss": 0.1190139651298523, + "epoch": 1.25, + "learning_rate": 1.4989073634204276e-05, + "loss": 8.1567, + "step": 2640, + "task_loss": 0.04846365749835968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029609756459597847, + "compression/movement_sparsity/importance_threshold": -0.0010905080572043448, + "compression/movement_sparsity/linear_layer_sparsity": 0.30575512599744054, + "compression/movement_sparsity/model_sparsity": 0.2374284800438655, + "compression_loss": 8.045588493347168, + "distillation_loss": 0.06826656311750412, + "epoch": 1.26, + "learning_rate": 1.4970071258907363e-05, + "loss": 8.2518, + "step": 2650, + "task_loss": 0.011266250163316727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02999912031956811, + "compression/movement_sparsity/importance_threshold": -0.001069684155535862, + "compression/movement_sparsity/linear_layer_sparsity": 0.31633204183604335, + "compression/movement_sparsity/model_sparsity": 0.2456417881377824, + "compression_loss": 8.151252746582031, + "distillation_loss": 0.1792048215866089, + "epoch": 1.26, + "learning_rate": 1.4951068883610453e-05, + "loss": 8.4222, + "step": 2660, + "task_loss": 0.06012868136167526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03038349560171176, + "compression/movement_sparsity/importance_threshold": -0.0010491270522655018, + "compression/movement_sparsity/linear_layer_sparsity": 0.32636890479147845, + "compression/movement_sparsity/model_sparsity": 0.25343572816787524, + "compression_loss": 8.255572319030762, + "distillation_loss": 0.0919712483882904, + "epoch": 1.27, + "learning_rate": 1.4932066508313541e-05, + "loss": 8.513, + "step": 2670, + "task_loss": 0.017085224390029907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.030762914469651154, + "compression/movement_sparsity/importance_threshold": -0.001028835027223063, + "compression/movement_sparsity/linear_layer_sparsity": 0.3357471276535682, + "compression/movement_sparsity/model_sparsity": 0.2607182134324959, + "compression_loss": 8.358521461486816, + "distillation_loss": 0.5716733336448669, + "epoch": 1.27, + "learning_rate": 1.4913064133016629e-05, + "loss": 8.648, + "step": 2680, + "task_loss": 0.34179773926734924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03113740908700865, + "compression/movement_sparsity/importance_threshold": -0.0010088063602383453, + "compression/movement_sparsity/linear_layer_sparsity": 0.3457659002371274, + "compression/movement_sparsity/model_sparsity": 0.26849810571936966, + "compression_loss": 8.460137367248535, + "distillation_loss": 0.3870583772659302, + "epoch": 1.28, + "learning_rate": 1.4894061757719715e-05, + "loss": 8.7722, + "step": 2690, + "task_loss": 0.2868567109107971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03150701161740658, + "compression/movement_sparsity/importance_threshold": -0.0009890393311411487, + "compression/movement_sparsity/linear_layer_sparsity": 0.35571594587473654, + "compression/movement_sparsity/model_sparsity": 0.27622462936929343, + "compression_loss": 8.560412406921387, + "distillation_loss": 0.5741435289382935, + "epoch": 1.28, + "learning_rate": 1.4875059382422804e-05, + "loss": 8.8839, + "step": 2700, + "task_loss": 0.5338079333305359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03187175422446732, + "compression/movement_sparsity/importance_threshold": -0.000969532219761271, + "compression/movement_sparsity/linear_layer_sparsity": 0.36540933877220716, + "compression/movement_sparsity/model_sparsity": 0.28375185408746123, + "compression_loss": 8.659358024597168, + "distillation_loss": 0.5666482448577881, + "epoch": 1.29, + "learning_rate": 1.4856057007125892e-05, + "loss": 9.0379, + "step": 2710, + "task_loss": 0.368877112865448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.032231669071813206, + "compression/movement_sparsity/importance_threshold": -0.0009502833059285125, + "compression/movement_sparsity/linear_layer_sparsity": 0.3747603790274014, + "compression/movement_sparsity/model_sparsity": 0.2910132312021604, + "compression_loss": 8.756999015808105, + "distillation_loss": 0.4417470097541809, + "epoch": 1.29, + "learning_rate": 1.483705463182898e-05, + "loss": 9.134, + "step": 2720, + "task_loss": 0.2064928412437439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.032586788323066586, + "compression/movement_sparsity/importance_threshold": -0.0009312908694726729, + "compression/movement_sparsity/linear_layer_sparsity": 0.38400508835817526, + "compression/movement_sparsity/model_sparsity": 0.29819203900691116, + "compression_loss": 8.85331916809082, + "distillation_loss": 0.38386473059654236, + "epoch": 1.3, + "learning_rate": 1.4818052256532068e-05, + "loss": 9.1598, + "step": 2730, + "task_loss": 0.21585097908973694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03293714414184983, + "compression/movement_sparsity/importance_threshold": -0.00091255319022355, + "compression/movement_sparsity/linear_layer_sparsity": 0.39283086090409514, + "compression/movement_sparsity/model_sparsity": 0.3050455292107293, + "compression_loss": 8.948369026184082, + "distillation_loss": 0.3611488938331604, + "epoch": 1.3, + "learning_rate": 1.4799049881235154e-05, + "loss": 9.3696, + "step": 2740, + "task_loss": 0.24724030494689941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033282768691785265, + "compression/movement_sparsity/importance_threshold": -0.0008940685480109448, + "compression/movement_sparsity/linear_layer_sparsity": 0.4017413100346281, + "compression/movement_sparsity/model_sparsity": 0.31196477344798973, + "compression_loss": 9.04212474822998, + "distillation_loss": 0.436295747756958, + "epoch": 1.31, + "learning_rate": 1.4780047505938244e-05, + "loss": 9.3594, + "step": 2750, + "task_loss": 0.18571683764457703 + }, + { + "epoch": 1.31, + "eval_accuracy": 0.8818807339449541, + "eval_loss": 9.464767456054688, + "eval_runtime": 25.44, + "eval_samples_per_second": 34.277, + "eval_steps_per_second": 4.285, + "step": 2750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.033623694136495255, + "compression/movement_sparsity/importance_threshold": -0.000875835222664656, + "compression/movement_sparsity/linear_layer_sparsity": 0.4111423987503764, + "compression/movement_sparsity/model_sparsity": 0.31926501476775854, + "compression_loss": 9.134596824645996, + "distillation_loss": 0.541549563407898, + "epoch": 1.31, + "learning_rate": 1.4761045130641332e-05, + "loss": 9.4946, + "step": 2760, + "task_loss": 0.27293768525123596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03395995263960215, + "compression/movement_sparsity/importance_threshold": -0.0008578514940144827, + "compression/movement_sparsity/linear_layer_sparsity": 0.4209716693955134, + "compression/movement_sparsity/model_sparsity": 0.3268977528342144, + "compression_loss": 9.225789070129395, + "distillation_loss": 0.18441905081272125, + "epoch": 1.32, + "learning_rate": 1.474204275534442e-05, + "loss": 9.5998, + "step": 2770, + "task_loss": 0.05545267462730408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03429157636472829, + "compression/movement_sparsity/importance_threshold": -0.0008401156418902246, + "compression/movement_sparsity/linear_layer_sparsity": 0.4294070404245709, + "compression/movement_sparsity/model_sparsity": 0.3334480839709423, + "compression_loss": 9.31571102142334, + "distillation_loss": 0.2552655339241028, + "epoch": 1.32, + "learning_rate": 1.4723040380047506e-05, + "loss": 9.5848, + "step": 2780, + "task_loss": 0.07465029507875443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03461859747549605, + "compression/movement_sparsity/importance_threshold": -0.0008226259461216808, + "compression/movement_sparsity/linear_layer_sparsity": 0.43871413683754895, + "compression/movement_sparsity/model_sparsity": 0.3406753373088751, + "compression_loss": 9.404358863830566, + "distillation_loss": 0.5486918687820435, + "epoch": 1.33, + "learning_rate": 1.4704038004750595e-05, + "loss": 9.6934, + "step": 2790, + "task_loss": 0.25401195883750916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03494104813552775, + "compression/movement_sparsity/importance_threshold": -0.0008053806865386509, + "compression/movement_sparsity/linear_layer_sparsity": 0.44789961137458595, + "compression/movement_sparsity/model_sparsity": 0.34780814743166794, + "compression_loss": 9.491792678833008, + "distillation_loss": 0.7868499755859375, + "epoch": 1.33, + "learning_rate": 1.4686935866983374e-05, + "loss": 9.9283, + "step": 2800, + "task_loss": 0.47319698333740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03525896050844576, + "compression/movement_sparsity/importance_threshold": -0.0007883781429709343, + "compression/movement_sparsity/linear_layer_sparsity": 0.4564576275971093, + "compression/movement_sparsity/model_sparsity": 0.35445371642180634, + "compression_loss": 9.577984809875488, + "distillation_loss": 0.3035008907318115, + "epoch": 1.33, + "learning_rate": 1.4667933491686462e-05, + "loss": 9.8715, + "step": 2810, + "task_loss": 0.44653627276420593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03557236675787243, + "compression/movement_sparsity/importance_threshold": -0.0007716165952483297, + "compression/movement_sparsity/linear_layer_sparsity": 0.4642448975271003, + "compression/movement_sparsity/model_sparsity": 0.36050077665387104, + "compression_loss": 9.66294002532959, + "distillation_loss": 0.10570189356803894, + "epoch": 1.34, + "learning_rate": 1.4648931116389552e-05, + "loss": 10.013, + "step": 2820, + "task_loss": 0.022750303149223328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03588129904743011, + "compression/movement_sparsity/importance_threshold": -0.0007550943232006373, + "compression/movement_sparsity/linear_layer_sparsity": 0.4726805626129178, + "compression/movement_sparsity/model_sparsity": 0.3670513361349295, + "compression_loss": 9.746687889099121, + "distillation_loss": 0.5314592123031616, + "epoch": 1.34, + "learning_rate": 1.4629928741092638e-05, + "loss": 10.1699, + "step": 2830, + "task_loss": 0.5152712464332581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036185789540741135, + "compression/movement_sparsity/importance_threshold": -0.0007388096066576562, + "compression/movement_sparsity/linear_layer_sparsity": 0.48106820234869013, + "compression/movement_sparsity/model_sparsity": 0.37356460241991285, + "compression_loss": 9.829238891601562, + "distillation_loss": 0.6711559295654297, + "epoch": 1.35, + "learning_rate": 1.4610926365795726e-05, + "loss": 10.3624, + "step": 2840, + "task_loss": 0.32968375086784363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.036485870401427874, + "compression/movement_sparsity/importance_threshold": -0.0007227607254491853, + "compression/movement_sparsity/linear_layer_sparsity": 0.4889538048592291, + "compression/movement_sparsity/model_sparsity": 0.3796880209961333, + "compression_loss": 9.910573959350586, + "distillation_loss": 0.2346329241991043, + "epoch": 1.35, + "learning_rate": 1.4591923990498813e-05, + "loss": 10.5236, + "step": 2850, + "task_loss": 0.04817202687263489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03678157379311268, + "compression/movement_sparsity/importance_threshold": -0.0007069459594050238, + "compression/movement_sparsity/linear_layer_sparsity": 0.4977513832429991, + "compression/movement_sparsity/model_sparsity": 0.38651961754553266, + "compression_loss": 9.990703582763672, + "distillation_loss": 0.2503522038459778, + "epoch": 1.36, + "learning_rate": 1.45729216152019e-05, + "loss": 10.4097, + "step": 2860, + "task_loss": 0.17465853691101074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03707293187941788, + "compression/movement_sparsity/importance_threshold": -0.0006913635883549723, + "compression/movement_sparsity/linear_layer_sparsity": 0.5060777297877146, + "compression/movement_sparsity/model_sparsity": 0.39298528773824376, + "compression_loss": 10.069620132446289, + "distillation_loss": 0.6797527074813843, + "epoch": 1.36, + "learning_rate": 1.455391923990499e-05, + "loss": 10.5071, + "step": 2870, + "task_loss": 0.306240975856781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03735997682396585, + "compression/movement_sparsity/importance_threshold": -0.0006760118921288289, + "compression/movement_sparsity/linear_layer_sparsity": 0.5141535282106293, + "compression/movement_sparsity/model_sparsity": 0.3992563994275024, + "compression_loss": 10.147372245788574, + "distillation_loss": 0.5875498056411743, + "epoch": 1.37, + "learning_rate": 1.4534916864608077e-05, + "loss": 10.567, + "step": 2880, + "task_loss": 0.4646558165550232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03764274079037894, + "compression/movement_sparsity/importance_threshold": -0.0006608891505563933, + "compression/movement_sparsity/linear_layer_sparsity": 0.5215901883845228, + "compression/movement_sparsity/model_sparsity": 0.4050312001472949, + "compression_loss": 10.223993301391602, + "distillation_loss": 0.8365118503570557, + "epoch": 1.37, + "learning_rate": 1.4515914489311165e-05, + "loss": 10.7409, + "step": 2890, + "task_loss": 0.357979416847229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03792125594227949, + "compression/movement_sparsity/importance_threshold": -0.0006459936434674647, + "compression/movement_sparsity/linear_layer_sparsity": 0.5287817816546221, + "compression/movement_sparsity/model_sparsity": 0.41061569870195747, + "compression_loss": 10.299442291259766, + "distillation_loss": 0.34872639179229736, + "epoch": 1.38, + "learning_rate": 1.4496912114014253e-05, + "loss": 10.6705, + "step": 2900, + "task_loss": 0.3244781494140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03819555444328984, + "compression/movement_sparsity/importance_threshold": -0.0006313236506918429, + "compression/movement_sparsity/linear_layer_sparsity": 0.5359220914257754, + "compression/movement_sparsity/model_sparsity": 0.41616037400536177, + "compression_loss": 10.373762130737305, + "distillation_loss": 0.966638445854187, + "epoch": 1.38, + "learning_rate": 1.4477909738717342e-05, + "loss": 10.9055, + "step": 2910, + "task_loss": 0.6029879450798035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03846566845703237, + "compression/movement_sparsity/importance_threshold": -0.0006168774520593267, + "compression/movement_sparsity/linear_layer_sparsity": 0.5422028027137911, + "compression/movement_sparsity/model_sparsity": 0.4210375440277554, + "compression_loss": 10.446945190429688, + "distillation_loss": 0.1921069324016571, + "epoch": 1.39, + "learning_rate": 1.4458907363420428e-05, + "loss": 10.8335, + "step": 2920, + "task_loss": 0.06273065507411957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03873163014712941, + "compression/movement_sparsity/importance_threshold": -0.0006026533273997157, + "compression/movement_sparsity/linear_layer_sparsity": 0.5490401163994278, + "compression/movement_sparsity/model_sparsity": 0.4263469333328998, + "compression_loss": 10.518943786621094, + "distillation_loss": 0.16988125443458557, + "epoch": 1.39, + "learning_rate": 1.4439904988123516e-05, + "loss": 10.808, + "step": 2930, + "task_loss": 0.1513996422290802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03899347167720332, + "compression/movement_sparsity/importance_threshold": -0.0005886495565428094, + "compression/movement_sparsity/linear_layer_sparsity": 0.5570446472259861, + "compression/movement_sparsity/model_sparsity": 0.43256270349019194, + "compression_loss": 10.589797973632812, + "distillation_loss": 0.29276514053344727, + "epoch": 1.4, + "learning_rate": 1.4420902612826604e-05, + "loss": 11.0576, + "step": 2940, + "task_loss": 0.39499735832214355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03925122521087644, + "compression/movement_sparsity/importance_threshold": -0.0005748644193184068, + "compression/movement_sparsity/linear_layer_sparsity": 0.5645892144685336, + "compression/movement_sparsity/model_sparsity": 0.43842129744554487, + "compression_loss": 10.659571647644043, + "distillation_loss": 0.4497772753238678, + "epoch": 1.4, + "learning_rate": 1.4401900237529694e-05, + "loss": 11.046, + "step": 2950, + "task_loss": 0.18338404595851898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03950492291177113, + "compression/movement_sparsity/importance_threshold": -0.0005612961955563074, + "compression/movement_sparsity/linear_layer_sparsity": 0.5716300271943692, + "compression/movement_sparsity/model_sparsity": 0.44388871016124415, + "compression_loss": 10.72823429107666, + "distillation_loss": 0.9414302110671997, + "epoch": 1.41, + "learning_rate": 1.4382897862232782e-05, + "loss": 11.2474, + "step": 2960, + "task_loss": 0.3824530839920044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03975459694350974, + "compression/movement_sparsity/importance_threshold": -0.0005479431650863105, + "compression/movement_sparsity/linear_layer_sparsity": 0.5780708112202649, + "compression/movement_sparsity/model_sparsity": 0.44889018170344835, + "compression_loss": 10.795801162719727, + "distillation_loss": 0.5067998170852661, + "epoch": 1.41, + "learning_rate": 1.4363895486935868e-05, + "loss": 11.2138, + "step": 2970, + "task_loss": 0.17481166124343872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04000027946971461, + "compression/movement_sparsity/importance_threshold": -0.000534803607738216, + "compression/movement_sparsity/linear_layer_sparsity": 0.5847961833785005, + "compression/movement_sparsity/model_sparsity": 0.4541126448888163, + "compression_loss": 10.862251281738281, + "distillation_loss": 0.47052228450775146, + "epoch": 1.42, + "learning_rate": 1.4344893111638956e-05, + "loss": 11.3225, + "step": 2980, + "task_loss": 0.4483351707458496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040242002654008104, + "compression/movement_sparsity/importance_threshold": -0.0005218758033418225, + "compression/movement_sparsity/linear_layer_sparsity": 0.5909785738482385, + "compression/movement_sparsity/model_sparsity": 0.45891346570082736, + "compression_loss": 10.92764663696289, + "distillation_loss": 0.43901222944259644, + "epoch": 1.42, + "learning_rate": 1.4325890736342044e-05, + "loss": 11.3835, + "step": 2990, + "task_loss": 0.24378883838653564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04047979866001257, + "compression/movement_sparsity/importance_threshold": -0.0005091580317269292, + "compression/movement_sparsity/linear_layer_sparsity": 0.5969155445460704, + "compression/movement_sparsity/model_sparsity": 0.46352371033451184, + "compression_loss": 10.991974830627441, + "distillation_loss": 0.6936439275741577, + "epoch": 1.43, + "learning_rate": 1.4306888361045133e-05, + "loss": 11.5481, + "step": 3000, + "task_loss": 0.26209497451782227 + }, + { + "epoch": 1.43, + "eval_accuracy": 0.856651376146789, + "eval_loss": 11.539105415344238, + "eval_runtime": 24.425, + "eval_samples_per_second": 35.701, + "eval_steps_per_second": 4.463, + "step": 3000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040713699651350355, + "compression/movement_sparsity/importance_threshold": -0.0004966485727233363, + "compression/movement_sparsity/linear_layer_sparsity": 0.6025890286246612, + "compression/movement_sparsity/model_sparsity": 0.46792934931419705, + "compression_loss": 11.055262565612793, + "distillation_loss": 0.39871829748153687, + "epoch": 1.43, + "learning_rate": 1.428788598574822e-05, + "loss": 11.5298, + "step": 3010, + "task_loss": 0.16510379314422607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.040943737791643814, + "compression/movement_sparsity/importance_threshold": -0.00048434570616084265, + "compression/movement_sparsity/linear_layer_sparsity": 0.6088090701219512, + "compression/movement_sparsity/model_sparsity": 0.47275940733429933, + "compression_loss": 11.11749267578125, + "distillation_loss": 0.4512562155723572, + "epoch": 1.43, + "learning_rate": 1.4268883610451307e-05, + "loss": 11.6419, + "step": 3020, + "task_loss": 0.2681746184825897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041169945244515296, + "compression/movement_sparsity/importance_threshold": -0.000472247711869247, + "compression/movement_sparsity/linear_layer_sparsity": 0.6145064551339958, + "compression/movement_sparsity/model_sparsity": 0.47718360614117644, + "compression_loss": 11.178674697875977, + "distillation_loss": 0.33271324634552, + "epoch": 1.44, + "learning_rate": 1.4249881235154395e-05, + "loss": 11.726, + "step": 3030, + "task_loss": 0.2389669120311737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04139235417358715, + "compression/movement_sparsity/importance_threshold": -0.00046035286967835, + "compression/movement_sparsity/linear_layer_sparsity": 0.6200651276912075, + "compression/movement_sparsity/model_sparsity": 0.4815000903604185, + "compression_loss": 11.238801002502441, + "distillation_loss": 1.2966899871826172, + "epoch": 1.44, + "learning_rate": 1.4230878859857485e-05, + "loss": 11.8034, + "step": 3040, + "task_loss": 0.7004615068435669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04161099674248173, + "compression/movement_sparsity/importance_threshold": -0.00044865945941794955, + "compression/movement_sparsity/linear_layer_sparsity": 0.6260913622591087, + "compression/movement_sparsity/model_sparsity": 0.4861796511991028, + "compression_loss": 11.297928810119629, + "distillation_loss": 0.4639695882797241, + "epoch": 1.45, + "learning_rate": 1.4211876484560572e-05, + "loss": 11.7174, + "step": 3050, + "task_loss": 0.23858779668807983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041825905114821385, + "compression/movement_sparsity/importance_threshold": -0.0004371657609178463, + "compression/movement_sparsity/linear_layer_sparsity": 0.630700890168624, + "compression/movement_sparsity/model_sparsity": 0.489759094721777, + "compression_loss": 11.356048583984375, + "distillation_loss": 0.2719360888004303, + "epoch": 1.45, + "learning_rate": 1.4192874109263659e-05, + "loss": 11.7773, + "step": 3060, + "task_loss": 0.21212808787822723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04203711145422847, + "compression/movement_sparsity/importance_threshold": -0.00042587005400783863, + "compression/movement_sparsity/linear_layer_sparsity": 0.6357438459801265, + "compression/movement_sparsity/model_sparsity": 0.4936751086539962, + "compression_loss": 11.413163185119629, + "distillation_loss": 0.750543475151062, + "epoch": 1.46, + "learning_rate": 1.4173871733966746e-05, + "loss": 12.0443, + "step": 3070, + "task_loss": 0.33162063360214233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04224464792432533, + "compression/movement_sparsity/importance_threshold": -0.00041477061851772625, + "compression/movement_sparsity/linear_layer_sparsity": 0.6405869490552545, + "compression/movement_sparsity/model_sparsity": 0.49743593064536007, + "compression_loss": 11.469284057617188, + "distillation_loss": 0.4959287941455841, + "epoch": 1.46, + "learning_rate": 1.4154869358669834e-05, + "loss": 11.8504, + "step": 3080, + "task_loss": 0.22281783819198608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04244854668873431, + "compression/movement_sparsity/importance_threshold": -0.0004038657342773089, + "compression/movement_sparsity/linear_layer_sparsity": 0.6453207100647396, + "compression/movement_sparsity/model_sparsity": 0.5011118450808302, + "compression_loss": 11.524415969848633, + "distillation_loss": 0.7054104804992676, + "epoch": 1.47, + "learning_rate": 1.4135866983372924e-05, + "loss": 12.0627, + "step": 3090, + "task_loss": 0.45865899324417114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04264883991107778, + "compression/movement_sparsity/importance_threshold": -0.0003931536811163849, + "compression/movement_sparsity/linear_layer_sparsity": 0.6499866733476363, + "compression/movement_sparsity/model_sparsity": 0.5047351124474349, + "compression_loss": 11.57857608795166, + "distillation_loss": 0.363597571849823, + "epoch": 1.47, + "learning_rate": 1.411686460807601e-05, + "loss": 11.9987, + "step": 3100, + "task_loss": 0.1337648332118988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.042845559754978065, + "compression/movement_sparsity/importance_threshold": -0.0003826327388647549, + "compression/movement_sparsity/linear_layer_sparsity": 0.6550466021153267, + "compression/movement_sparsity/model_sparsity": 0.508664306414417, + "compression_loss": 11.631749153137207, + "distillation_loss": 0.42896637320518494, + "epoch": 1.48, + "learning_rate": 1.4097862232779098e-05, + "loss": 12.1107, + "step": 3110, + "task_loss": 0.39144840836524963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04303873838405753, + "compression/movement_sparsity/importance_threshold": -0.0003723011873522173, + "compression/movement_sparsity/linear_layer_sparsity": 0.6597050610697079, + "compression/movement_sparsity/model_sparsity": 0.5122817464337045, + "compression_loss": 11.68390941619873, + "distillation_loss": 0.6013063192367554, + "epoch": 1.48, + "learning_rate": 1.4078859857482186e-05, + "loss": 12.2025, + "step": 3120, + "task_loss": 0.20251774787902832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04322840796193853, + "compression/movement_sparsity/importance_threshold": -0.0003621573064085718, + "compression/movement_sparsity/linear_layer_sparsity": 0.6641005862315568, + "compression/movement_sparsity/model_sparsity": 0.5156950100863344, + "compression_loss": 11.735136985778809, + "distillation_loss": 0.4356003701686859, + "epoch": 1.49, + "learning_rate": 1.4059857482185275e-05, + "loss": 12.1346, + "step": 3130, + "task_loss": 0.19159378111362457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043414600652243424, + "compression/movement_sparsity/importance_threshold": -0.00035219937586361723, + "compression/movement_sparsity/linear_layer_sparsity": 0.6691960135313159, + "compression/movement_sparsity/model_sparsity": 0.5196517697809077, + "compression_loss": 11.785431861877441, + "distillation_loss": 0.36588340997695923, + "epoch": 1.49, + "learning_rate": 1.4040855106888363e-05, + "loss": 12.2288, + "step": 3140, + "task_loss": 0.11971582472324371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04359734861859453, + "compression/movement_sparsity/importance_threshold": -0.00034242567554715374, + "compression/movement_sparsity/linear_layer_sparsity": 0.6736123226249624, + "compression/movement_sparsity/model_sparsity": 0.523081172810825, + "compression_loss": 11.834785461425781, + "distillation_loss": 0.27993667125701904, + "epoch": 1.5, + "learning_rate": 1.402185273159145e-05, + "loss": 12.3277, + "step": 3150, + "task_loss": 0.10686977207660675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04377668402461424, + "compression/movement_sparsity/importance_threshold": -0.00033283448528897974, + "compression/movement_sparsity/linear_layer_sparsity": 0.677556011931647, + "compression/movement_sparsity/model_sparsity": 0.5261435716988137, + "compression_loss": 11.883201599121094, + "distillation_loss": 0.22374649345874786, + "epoch": 1.5, + "learning_rate": 1.4002850356294537e-05, + "loss": 12.1793, + "step": 3160, + "task_loss": 0.11772890388965607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.043952639033924865, + "compression/movement_sparsity/importance_threshold": -0.0003234240849188958, + "compression/movement_sparsity/linear_layer_sparsity": 0.6809970641373081, + "compression/movement_sparsity/model_sparsity": 0.5288156570556051, + "compression_loss": 11.930685997009277, + "distillation_loss": 1.1999856233596802, + "epoch": 1.51, + "learning_rate": 1.3983847980997627e-05, + "loss": 12.4018, + "step": 3170, + "task_loss": 0.6132296323776245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04412524581014878, + "compression/movement_sparsity/importance_threshold": -0.0003141927542666999, + "compression/movement_sparsity/linear_layer_sparsity": 0.6850743257866606, + "compression/movement_sparsity/model_sparsity": 0.5319817790723298, + "compression_loss": 11.977304458618164, + "distillation_loss": 0.48509520292282104, + "epoch": 1.51, + "learning_rate": 1.3964845605700715e-05, + "loss": 12.4075, + "step": 3180, + "task_loss": 0.3772471845149994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04429453651690833, + "compression/movement_sparsity/importance_threshold": -0.0003051387731621926, + "compression/movement_sparsity/linear_layer_sparsity": 0.6893989597448058, + "compression/movement_sparsity/model_sparsity": 0.5353399934737364, + "compression_loss": 12.023012161254883, + "distillation_loss": 0.5859512090682983, + "epoch": 1.52, + "learning_rate": 1.39458432304038e-05, + "loss": 12.5605, + "step": 3190, + "task_loss": 0.14015838503837585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04446054331782587, + "compression/movement_sparsity/importance_threshold": -0.00029626042143517236, + "compression/movement_sparsity/linear_layer_sparsity": 0.6931833173366456, + "compression/movement_sparsity/model_sparsity": 0.5382786662696276, + "compression_loss": 12.067791938781738, + "distillation_loss": 0.3705664277076721, + "epoch": 1.52, + "learning_rate": 1.3926840855106889e-05, + "loss": 12.4846, + "step": 3200, + "task_loss": 0.1473858654499054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04462329837652374, + "compression/movement_sparsity/importance_threshold": -0.00028755597891543883, + "compression/movement_sparsity/linear_layer_sparsity": 0.6960933147959952, + "compression/movement_sparsity/model_sparsity": 0.5405383708991116, + "compression_loss": 12.111686706542969, + "distillation_loss": 0.826543927192688, + "epoch": 1.52, + "learning_rate": 1.3907838479809977e-05, + "loss": 12.5674, + "step": 3210, + "task_loss": 0.4986266791820526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04478283385662431, + "compression/movement_sparsity/importance_threshold": -0.0002790237254327913, + "compression/movement_sparsity/linear_layer_sparsity": 0.6998678391297802, + "compression/movement_sparsity/model_sparsity": 0.5434694078605874, + "compression_loss": 12.154720306396484, + "distillation_loss": 0.4464240074157715, + "epoch": 1.53, + "learning_rate": 1.3888836104513066e-05, + "loss": 12.4925, + "step": 3220, + "task_loss": 0.23257675766944885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04493918192174991, + "compression/movement_sparsity/importance_threshold": -0.00027066194081702905, + "compression/movement_sparsity/linear_layer_sparsity": 0.7031833526234568, + "compression/movement_sparsity/model_sparsity": 0.546044008455755, + "compression_loss": 12.196907043457031, + "distillation_loss": 0.6404677629470825, + "epoch": 1.53, + "learning_rate": 1.3869833729216154e-05, + "loss": 12.7082, + "step": 3230, + "task_loss": 0.3287440538406372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04509237473552289, + "compression/movement_sparsity/importance_threshold": -0.0002624689048979522, + "compression/movement_sparsity/linear_layer_sparsity": 0.707251851381361, + "compression/movement_sparsity/model_sparsity": 0.5492033258114275, + "compression_loss": 12.238248825073242, + "distillation_loss": 0.7356147766113281, + "epoch": 1.54, + "learning_rate": 1.385083135391924e-05, + "loss": 12.8188, + "step": 3240, + "task_loss": 0.4595518708229065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04524244446156562, + "compression/movement_sparsity/importance_threshold": -0.0002544428975053588, + "compression/movement_sparsity/linear_layer_sparsity": 0.7103969883882867, + "compression/movement_sparsity/model_sparsity": 0.55164562370144, + "compression_loss": 12.278731346130371, + "distillation_loss": 0.5563812851905823, + "epoch": 1.54, + "learning_rate": 1.3831828978622328e-05, + "loss": 12.7541, + "step": 3250, + "task_loss": 0.2265515923500061 + }, + { + "epoch": 1.54, + "eval_accuracy": 0.8577981651376146, + "eval_loss": 12.835886001586914, + "eval_runtime": 24.5855, + "eval_samples_per_second": 35.468, + "eval_steps_per_second": 4.434, + "step": 3250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045389423263500435, + "compression/movement_sparsity/importance_threshold": -0.0002465821984690493, + "compression/movement_sparsity/linear_layer_sparsity": 0.7141112898976212, + "compression/movement_sparsity/model_sparsity": 0.5545298957440069, + "compression_loss": 12.318343162536621, + "distillation_loss": 0.7919510006904602, + "epoch": 1.55, + "learning_rate": 1.3812826603325418e-05, + "loss": 12.8273, + "step": 3260, + "task_loss": 0.3617420196533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0455333433049497, + "compression/movement_sparsity/importance_threshold": -0.0002388850876188218, + "compression/movement_sparsity/linear_layer_sparsity": 0.7169433387910268, + "compression/movement_sparsity/model_sparsity": 0.556729070858333, + "compression_loss": 12.357135772705078, + "distillation_loss": 0.6720787286758423, + "epoch": 1.55, + "learning_rate": 1.3793824228028505e-05, + "loss": 12.7631, + "step": 3270, + "task_loss": 0.34426239132881165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045674236749535746, + "compression/movement_sparsity/importance_threshold": -0.00023134984478447723, + "compression/movement_sparsity/linear_layer_sparsity": 0.71958969672162, + "compression/movement_sparsity/model_sparsity": 0.5587840510947658, + "compression_loss": 12.395111083984375, + "distillation_loss": 0.5282790064811707, + "epoch": 1.56, + "learning_rate": 1.3774821852731593e-05, + "loss": 12.8532, + "step": 3280, + "task_loss": 0.2643144726753235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045812135760880945, + "compression/movement_sparsity/importance_threshold": -0.0002239747497958136, + "compression/movement_sparsity/linear_layer_sparsity": 0.7223523129328515, + "compression/movement_sparsity/model_sparsity": 0.5609293095457484, + "compression_loss": 12.432284355163574, + "distillation_loss": 0.40331676602363586, + "epoch": 1.56, + "learning_rate": 1.375581947743468e-05, + "loss": 12.9658, + "step": 3290, + "task_loss": 0.1825210452079773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.045947072502607635, + "compression/movement_sparsity/importance_threshold": -0.0002167580824826306, + "compression/movement_sparsity/linear_layer_sparsity": 0.725434474744053, + "compression/movement_sparsity/model_sparsity": 0.5633227052139176, + "compression_loss": 12.468629837036133, + "distillation_loss": 0.4709409475326538, + "epoch": 1.57, + "learning_rate": 1.3736817102137769e-05, + "loss": 12.9347, + "step": 3300, + "task_loss": 0.2354767620563507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04607907913833817, + "compression/movement_sparsity/importance_threshold": -0.00020969812267472836, + "compression/movement_sparsity/linear_layer_sparsity": 0.7282369768142126, + "compression/movement_sparsity/model_sparsity": 0.5654989362899041, + "compression_loss": 12.504148483276367, + "distillation_loss": 0.44413477182388306, + "epoch": 1.57, + "learning_rate": 1.3717814726840857e-05, + "loss": 12.9811, + "step": 3310, + "task_loss": 0.25892922282218933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04620818783169489, + "compression/movement_sparsity/importance_threshold": -0.0002027931502019049, + "compression/movement_sparsity/linear_layer_sparsity": 0.73153312961081, + "compression/movement_sparsity/model_sparsity": 0.5680585026943443, + "compression_loss": 12.538874626159668, + "distillation_loss": 0.2548993229866028, + "epoch": 1.58, + "learning_rate": 1.3698812351543945e-05, + "loss": 12.9981, + "step": 3320, + "task_loss": 0.06697467714548111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04633443074630017, + "compression/movement_sparsity/importance_threshold": -0.0001960414448939603, + "compression/movement_sparsity/linear_layer_sparsity": 0.7338615886592894, + "compression/movement_sparsity/model_sparsity": 0.569866624441842, + "compression_loss": 12.572813034057617, + "distillation_loss": 1.0099992752075195, + "epoch": 1.58, + "learning_rate": 1.3679809976247031e-05, + "loss": 13.0474, + "step": 3330, + "task_loss": 0.6512800455093384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04645784004577634, + "compression/movement_sparsity/importance_threshold": -0.00018944128658069433, + "compression/movement_sparsity/linear_layer_sparsity": 0.7365036533611864, + "compression/movement_sparsity/model_sparsity": 0.571918270851048, + "compression_loss": 12.606016159057617, + "distillation_loss": 0.48587775230407715, + "epoch": 1.59, + "learning_rate": 1.3660807600950119e-05, + "loss": 13.2127, + "step": 3340, + "task_loss": 0.2523800730705261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04657844789374576, + "compression/movement_sparsity/importance_threshold": -0.0001829909550919058, + "compression/movement_sparsity/linear_layer_sparsity": 0.739569489141072, + "compression/movement_sparsity/model_sparsity": 0.5742989888419817, + "compression_loss": 12.638442993164062, + "distillation_loss": 0.8291702270507812, + "epoch": 1.59, + "learning_rate": 1.3641805225653208e-05, + "loss": 13.1407, + "step": 3350, + "task_loss": 0.37619519233703613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04669628645383078, + "compression/movement_sparsity/importance_threshold": -0.00017668873025739355, + "compression/movement_sparsity/linear_layer_sparsity": 0.7418803871198434, + "compression/movement_sparsity/model_sparsity": 0.5760934738660553, + "compression_loss": 12.670111656188965, + "distillation_loss": 0.5576849579811096, + "epoch": 1.6, + "learning_rate": 1.3622802850356296e-05, + "loss": 13.1378, + "step": 3360, + "task_loss": 0.2429836392402649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04681138788965375, + "compression/movement_sparsity/importance_threshold": -0.00017053289190695773, + "compression/movement_sparsity/linear_layer_sparsity": 0.7444494081225534, + "compression/movement_sparsity/model_sparsity": 0.5780883995435379, + "compression_loss": 12.701035499572754, + "distillation_loss": 0.9135178327560425, + "epoch": 1.6, + "learning_rate": 1.3603800475059384e-05, + "loss": 13.1637, + "step": 3370, + "task_loss": 0.5078893899917603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04692378436483701, + "compression/movement_sparsity/importance_threshold": -0.0001645217198703976, + "compression/movement_sparsity/linear_layer_sparsity": 0.7467615764265282, + "compression/movement_sparsity/model_sparsity": 0.5798838710151197, + "compression_loss": 12.731249809265137, + "distillation_loss": 0.5046910047531128, + "epoch": 1.61, + "learning_rate": 1.358479809976247e-05, + "loss": 13.1697, + "step": 3380, + "task_loss": 0.35627833008766174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04703350804300293, + "compression/movement_sparsity/importance_threshold": -0.00015865349397751203, + "compression/movement_sparsity/linear_layer_sparsity": 0.7494276361600422, + "compression/movement_sparsity/model_sparsity": 0.5819541503217034, + "compression_loss": 12.760735511779785, + "distillation_loss": 0.8998797535896301, + "epoch": 1.61, + "learning_rate": 1.356579572446556e-05, + "loss": 13.1898, + "step": 3390, + "task_loss": 0.5463681221008301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047140591087773846, + "compression/movement_sparsity/importance_threshold": -0.0001529264940581007, + "compression/movement_sparsity/linear_layer_sparsity": 0.7521114686841313, + "compression/movement_sparsity/model_sparsity": 0.584038230759629, + "compression_loss": 12.789498329162598, + "distillation_loss": 0.8819484114646912, + "epoch": 1.62, + "learning_rate": 1.3546793349168648e-05, + "loss": 13.3173, + "step": 3400, + "task_loss": 0.5092288851737976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04724506566277212, + "compression/movement_sparsity/importance_threshold": -0.0001473389999419629, + "compression/movement_sparsity/linear_layer_sparsity": 0.7540012185712135, + "compression/movement_sparsity/model_sparsity": 0.5855056810334038, + "compression_loss": 12.8175630569458, + "distillation_loss": 0.6930016875267029, + "epoch": 1.62, + "learning_rate": 1.3527790973871735e-05, + "loss": 13.311, + "step": 3410, + "task_loss": 0.25591135025024414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047346963931620085, + "compression/movement_sparsity/importance_threshold": -0.00014188929145889833, + "compression/movement_sparsity/linear_layer_sparsity": 0.7560711429162903, + "compression/movement_sparsity/model_sparsity": 0.5871130424454297, + "compression_loss": 12.844893455505371, + "distillation_loss": 0.5201950073242188, + "epoch": 1.62, + "learning_rate": 1.3508788598574822e-05, + "loss": 13.2855, + "step": 3420, + "task_loss": 0.2590838670730591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04744631805794011, + "compression/movement_sparsity/importance_threshold": -0.0001365756484387054, + "compression/movement_sparsity/linear_layer_sparsity": 0.7579267822192111, + "compression/movement_sparsity/model_sparsity": 0.5885540047768538, + "compression_loss": 12.871529579162598, + "distillation_loss": 0.5567278861999512, + "epoch": 1.63, + "learning_rate": 1.348978622327791e-05, + "loss": 13.2248, + "step": 3430, + "task_loss": 0.2669626772403717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04754316020535455, + "compression/movement_sparsity/importance_threshold": -0.00013139635071118426, + "compression/movement_sparsity/linear_layer_sparsity": 0.7606231180367359, + "compression/movement_sparsity/model_sparsity": 0.590647794415717, + "compression_loss": 12.897520065307617, + "distillation_loss": 0.45216840505599976, + "epoch": 1.63, + "learning_rate": 1.3470783847980999e-05, + "loss": 13.4032, + "step": 3440, + "task_loss": 0.242957204580307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04763752253748573, + "compression/movement_sparsity/importance_threshold": -0.00012634967810613417, + "compression/movement_sparsity/linear_layer_sparsity": 0.7631808825466727, + "compression/movement_sparsity/model_sparsity": 0.5926339790722238, + "compression_loss": 12.922788619995117, + "distillation_loss": 0.6922011375427246, + "epoch": 1.64, + "learning_rate": 1.3451781472684087e-05, + "loss": 13.3809, + "step": 3450, + "task_loss": 0.4704532027244568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04772943721795603, + "compression/movement_sparsity/importance_threshold": -0.00012143391045335399, + "compression/movement_sparsity/linear_layer_sparsity": 0.7656033927092744, + "compression/movement_sparsity/model_sparsity": 0.5945151344704236, + "compression_loss": 12.947429656982422, + "distillation_loss": 0.35389190912246704, + "epoch": 1.64, + "learning_rate": 1.3432779097387175e-05, + "loss": 13.5379, + "step": 3460, + "task_loss": 0.2909751236438751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04781893641038778, + "compression/movement_sparsity/importance_threshold": -0.00011664732758264297, + "compression/movement_sparsity/linear_layer_sparsity": 0.7674263564250225, + "compression/movement_sparsity/model_sparsity": 0.5959307231798303, + "compression_loss": 12.971384048461914, + "distillation_loss": 0.40384596586227417, + "epoch": 1.65, + "learning_rate": 1.3413776722090261e-05, + "loss": 13.4134, + "step": 3470, + "task_loss": 0.23214919865131378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04790605227840333, + "compression/movement_sparsity/importance_threshold": -0.00011198820932380171, + "compression/movement_sparsity/linear_layer_sparsity": 0.7692812899917194, + "compression/movement_sparsity/model_sparsity": 0.597371137484861, + "compression_loss": 12.994702339172363, + "distillation_loss": 0.8970961570739746, + "epoch": 1.65, + "learning_rate": 1.339477434679335e-05, + "loss": 13.5349, + "step": 3480, + "task_loss": 0.45820656418800354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.047990816985625045, + "compression/movement_sparsity/importance_threshold": -0.00010745483550662818, + "compression/movement_sparsity/linear_layer_sparsity": 0.7712226997704005, + "compression/movement_sparsity/model_sparsity": 0.5988787032906373, + "compression_loss": 13.017374992370605, + "distillation_loss": 0.43547579646110535, + "epoch": 1.66, + "learning_rate": 1.3375771971496438e-05, + "loss": 13.4468, + "step": 3490, + "task_loss": 0.22628280520439148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04807326269567527, + "compression/movement_sparsity/importance_threshold": -0.00010304548596092207, + "compression/movement_sparsity/linear_layer_sparsity": 0.773500263474857, + "compression/movement_sparsity/model_sparsity": 0.600647303201393, + "compression_loss": 13.039436340332031, + "distillation_loss": 0.4127658009529114, + "epoch": 1.66, + "learning_rate": 1.3356769596199526e-05, + "loss": 13.6184, + "step": 3500, + "task_loss": 0.13857224583625793 + }, + { + "epoch": 1.66, + "eval_accuracy": 0.8428899082568807, + "eval_loss": 13.651920318603516, + "eval_runtime": 32.6189, + "eval_samples_per_second": 26.733, + "eval_steps_per_second": 3.342, + "step": 3500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048153421572176364, + "compression/movement_sparsity/importance_threshold": -9.87584405164831e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7749152881285757, + "compression/movement_sparsity/model_sparsity": 0.601746114387832, + "compression_loss": 13.060900688171387, + "distillation_loss": 0.3880394995212555, + "epoch": 1.67, + "learning_rate": 1.3337767220902612e-05, + "loss": 13.5323, + "step": 3510, + "task_loss": 0.09369392693042755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04823132577875065, + "compression/movement_sparsity/importance_threshold": -9.459197900311099e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7763604947681422, + "compression/movement_sparsity/model_sparsity": 0.6028683628363647, + "compression_loss": 13.081767082214355, + "distillation_loss": 0.6340326070785522, + "epoch": 1.67, + "learning_rate": 1.3318764845605704e-05, + "loss": 13.5637, + "step": 3520, + "task_loss": 0.42461642622947693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04830700747902051, + "compression/movement_sparsity/importance_threshold": -9.05443812506037e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7780264674608551, + "compression/movement_sparsity/model_sparsity": 0.6041620430745457, + "compression_loss": 13.102035522460938, + "distillation_loss": 0.25822052359580994, + "epoch": 1.68, + "learning_rate": 1.329976247030879e-05, + "loss": 13.6241, + "step": 3530, + "task_loss": 0.06908401101827621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04838049883660828, + "compression/movement_sparsity/importance_threshold": -8.66139270887618e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7795130772922313, + "compression/movement_sparsity/model_sparsity": 0.6053164424048282, + "compression_loss": 13.121685981750488, + "distillation_loss": 0.9944248795509338, + "epoch": 1.68, + "learning_rate": 1.3280760095011878e-05, + "loss": 13.6799, + "step": 3540, + "task_loss": 0.43491220474243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04845183201513631, + "compression/movement_sparsity/importance_threshold": -8.279889634738372e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7810252935862692, + "compression/movement_sparsity/model_sparsity": 0.6064907259594202, + "compression_loss": 13.140732765197754, + "distillation_loss": 0.4648146629333496, + "epoch": 1.69, + "learning_rate": 1.3261757719714966e-05, + "loss": 13.6035, + "step": 3550, + "task_loss": 0.22887027263641357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048521039178226956, + "compression/movement_sparsity/importance_threshold": -7.909756885626958e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7832119702273411, + "compression/movement_sparsity/model_sparsity": 0.6081887492044712, + "compression_loss": 13.159188270568848, + "distillation_loss": 0.7744244933128357, + "epoch": 1.69, + "learning_rate": 1.3242755344418052e-05, + "loss": 13.6462, + "step": 3560, + "task_loss": 0.3696683943271637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04858815248950257, + "compression/movement_sparsity/importance_threshold": -7.550822444521823e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7850979091388136, + "compression/movement_sparsity/model_sparsity": 0.6096532401357213, + "compression_loss": 13.177058219909668, + "distillation_loss": 0.3203733563423157, + "epoch": 1.7, + "learning_rate": 1.3223752969121141e-05, + "loss": 13.6384, + "step": 3570, + "task_loss": 0.19227594137191772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048653204112585495, + "compression/movement_sparsity/importance_threshold": -7.202914294402937e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7865801081187895, + "compression/movement_sparsity/model_sparsity": 0.6108042143010447, + "compression_loss": 13.194385528564453, + "distillation_loss": 0.5051361918449402, + "epoch": 1.7, + "learning_rate": 1.3204750593824229e-05, + "loss": 13.6339, + "step": 3580, + "task_loss": 0.25856292247772217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048716226211098085, + "compression/movement_sparsity/importance_threshold": -6.865860418250229e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.78752878227567, + "compression/movement_sparsity/model_sparsity": 0.6115408896466835, + "compression_loss": 13.211155891418457, + "distillation_loss": 0.5065594911575317, + "epoch": 1.71, + "learning_rate": 1.3185748218527317e-05, + "loss": 13.7948, + "step": 3590, + "task_loss": 0.31428200006484985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0487772509486627, + "compression/movement_sparsity/importance_threshold": -6.539488799043583e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7893345260275519, + "compression/movement_sparsity/model_sparsity": 0.6129431065120898, + "compression_loss": 13.227370262145996, + "distillation_loss": 0.44985029101371765, + "epoch": 1.71, + "learning_rate": 1.3166745843230405e-05, + "loss": 13.7212, + "step": 3600, + "task_loss": 0.16183635592460632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04883631048890167, + "compression/movement_sparsity/importance_threshold": -6.223627419762968e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7903761574074074, + "compression/movement_sparsity/model_sparsity": 0.613751966067521, + "compression_loss": 13.2430419921875, + "distillation_loss": 0.8156948089599609, + "epoch": 1.71, + "learning_rate": 1.3147743467933494e-05, + "loss": 13.6668, + "step": 3610, + "task_loss": 0.43693339824676514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04889343699543736, + "compression/movement_sparsity/importance_threshold": -5.918104263388357e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7916715480088828, + "compression/movement_sparsity/model_sparsity": 0.6147578776464948, + "compression_loss": 13.258223533630371, + "distillation_loss": 0.9145314693450928, + "epoch": 1.72, + "learning_rate": 1.312874109263658e-05, + "loss": 13.7678, + "step": 3620, + "task_loss": 0.4340613782405853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.048948662631892126, + "compression/movement_sparsity/importance_threshold": -5.622747312899589e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7927108857460102, + "compression/movement_sparsity/model_sparsity": 0.6155649561161471, + "compression_loss": 13.272916793823242, + "distillation_loss": 0.8170522451400757, + "epoch": 1.72, + "learning_rate": 1.3109738717339668e-05, + "loss": 13.7102, + "step": 3630, + "task_loss": 0.38500523567199707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049002019561888314, + "compression/movement_sparsity/importance_threshold": -5.3373845512766794e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7941787935674496, + "compression/movement_sparsity/model_sparsity": 0.6167048327470029, + "compression_loss": 13.287137031555176, + "distillation_loss": 0.32002827525138855, + "epoch": 1.73, + "learning_rate": 1.3090736342042756e-05, + "loss": 13.8523, + "step": 3640, + "task_loss": 0.15141043066978455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049053539949048264, + "compression/movement_sparsity/importance_threshold": -5.061843961499555e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.795318228225685, + "compression/movement_sparsity/model_sparsity": 0.6175896396267948, + "compression_loss": 13.300834655761719, + "distillation_loss": 0.7697837352752686, + "epoch": 1.73, + "learning_rate": 1.3071733966745846e-05, + "loss": 13.8336, + "step": 3650, + "task_loss": 0.4357471466064453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04910325595699434, + "compression/movement_sparsity/importance_threshold": -4.795953526548144e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7962184535907859, + "compression/movement_sparsity/model_sparsity": 0.6182886929605225, + "compression_loss": 13.313998222351074, + "distillation_loss": 0.6824323534965515, + "epoch": 1.74, + "learning_rate": 1.3052731591448932e-05, + "loss": 13.8227, + "step": 3660, + "task_loss": 0.34603098034858704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049151199749348895, + "compression/movement_sparsity/importance_threshold": -4.53954122940233e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7974448937631737, + "compression/movement_sparsity/model_sparsity": 0.619241062360855, + "compression_loss": 13.326610565185547, + "distillation_loss": 0.08792783319950104, + "epoch": 1.74, + "learning_rate": 1.303372921615202e-05, + "loss": 13.6874, + "step": 3670, + "task_loss": 0.2505362629890442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04919740348973427, + "compression/movement_sparsity/importance_threshold": -4.2924350530420836e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7981858697493225, + "compression/movement_sparsity/model_sparsity": 0.6198164535388978, + "compression_loss": 13.338798522949219, + "distillation_loss": 1.1169078350067139, + "epoch": 1.75, + "learning_rate": 1.3014726840855108e-05, + "loss": 13.9032, + "step": 3680, + "task_loss": 0.44423192739486694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04924189934177281, + "compression/movement_sparsity/importance_threshold": -4.054462980447376e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.7994761437631738, + "compression/movement_sparsity/model_sparsity": 0.620818391926519, + "compression_loss": 13.350573539733887, + "distillation_loss": 0.7253522872924805, + "epoch": 1.75, + "learning_rate": 1.2995724465558196e-05, + "loss": 13.7283, + "step": 3690, + "task_loss": 0.4123845100402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049284719469086885, + "compression/movement_sparsity/importance_threshold": -3.8254529945980914e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8006924801452876, + "compression/movement_sparsity/model_sparsity": 0.6217629154156518, + "compression_loss": 13.361876487731934, + "distillation_loss": 0.7416278719902039, + "epoch": 1.76, + "learning_rate": 1.2976722090261285e-05, + "loss": 13.7298, + "step": 3700, + "task_loss": 0.4112977981567383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04932589603529883, + "compression/movement_sparsity/importance_threshold": -3.605233078474157e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.801716456357272, + "compression/movement_sparsity/model_sparsity": 0.6225580651774731, + "compression_loss": 13.372750282287598, + "distillation_loss": 0.42862361669540405, + "epoch": 1.76, + "learning_rate": 1.2957719714964371e-05, + "loss": 13.8126, + "step": 3710, + "task_loss": 0.1545902043581009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049365461204031, + "compression/movement_sparsity/importance_threshold": -3.393631215055587e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8031718373607347, + "compression/movement_sparsity/model_sparsity": 0.6236882143398449, + "compression_loss": 13.383188247680664, + "distillation_loss": 0.1796613186597824, + "epoch": 1.77, + "learning_rate": 1.293871733966746e-05, + "loss": 13.9012, + "step": 3720, + "task_loss": 0.04331651329994202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04940344713890575, + "compression/movement_sparsity/importance_threshold": -3.190475387322223e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8042020240514905, + "compression/movement_sparsity/model_sparsity": 0.6244881867339287, + "compression_loss": 13.39314079284668, + "distillation_loss": 0.6083955764770508, + "epoch": 1.77, + "learning_rate": 1.2919714964370547e-05, + "loss": 13.7336, + "step": 3730, + "task_loss": 0.5452806353569031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04943988600354543, + "compression/movement_sparsity/importance_threshold": -2.995593578253991e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.805487898976212, + "compression/movement_sparsity/model_sparsity": 0.6254867090903641, + "compression_loss": 13.402615547180176, + "distillation_loss": 0.35754120349884033, + "epoch": 1.78, + "learning_rate": 1.2900712589073637e-05, + "loss": 13.7667, + "step": 3740, + "task_loss": 0.12456899136304855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049474809961572386, + "compression/movement_sparsity/importance_threshold": -2.8088137708309063e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8067680691997893, + "compression/movement_sparsity/model_sparsity": 0.6264808015667855, + "compression_loss": 13.411738395690918, + "distillation_loss": 0.689171552658081, + "epoch": 1.78, + "learning_rate": 1.2881710213776723e-05, + "loss": 13.9171, + "step": 3750, + "task_loss": 0.5145508646965027 + }, + { + "epoch": 1.78, + "eval_accuracy": 0.8474770642201835, + "eval_loss": 14.073362350463867, + "eval_runtime": 23.1584, + "eval_samples_per_second": 37.654, + "eval_steps_per_second": 4.707, + "step": 3750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04950825117660897, + "compression/movement_sparsity/importance_threshold": -2.629963948032896e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8076983001166818, + "compression/movement_sparsity/model_sparsity": 0.6272031551560086, + "compression_loss": 13.420463562011719, + "distillation_loss": 0.3995903730392456, + "epoch": 1.79, + "learning_rate": 1.286270783847981e-05, + "loss": 13.7818, + "step": 3760, + "task_loss": 0.16869285702705383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049540241812277536, + "compression/movement_sparsity/importance_threshold": -2.458872092839801e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8085055094474556, + "compression/movement_sparsity/model_sparsity": 0.6278299786110779, + "compression_loss": 13.42878532409668, + "distillation_loss": 0.34937620162963867, + "epoch": 1.79, + "learning_rate": 1.2843705463182899e-05, + "loss": 13.9115, + "step": 3770, + "task_loss": 0.16115997731685638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04957081403220043, + "compression/movement_sparsity/importance_threshold": -2.2953661882316786e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.809237898976212, + "compression/movement_sparsity/model_sparsity": 0.628398702134667, + "compression_loss": 13.43675708770752, + "distillation_loss": 0.7227451801300049, + "epoch": 1.8, + "learning_rate": 1.2824703087885986e-05, + "loss": 13.9422, + "step": 3780, + "task_loss": 0.4984316825866699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049600000000000005, + "compression/movement_sparsity/importance_threshold": -2.1392742171883698e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8102044517841012, + "compression/movement_sparsity/model_sparsity": 0.6291492608156068, + "compression_loss": 13.444360733032227, + "distillation_loss": 0.44791698455810547, + "epoch": 1.8, + "learning_rate": 1.2805700712589076e-05, + "loss": 13.9218, + "step": 3790, + "task_loss": 0.32737138867378235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04962783187929862, + "compression/movement_sparsity/importance_threshold": -1.990424162689802e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8111833902627221, + "compression/movement_sparsity/model_sparsity": 0.629909437359752, + "compression_loss": 13.45160961151123, + "distillation_loss": 0.26000645756721497, + "epoch": 1.81, + "learning_rate": 1.2786698337292162e-05, + "loss": 13.9993, + "step": 3800, + "task_loss": 0.18172568082809448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049654341833718606, + "compression/movement_sparsity/importance_threshold": -1.8486440077159893e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8120328261442337, + "compression/movement_sparsity/model_sparsity": 0.6305690510606968, + "compression_loss": 13.458487510681152, + "distillation_loss": 0.5715627670288086, + "epoch": 1.81, + "learning_rate": 1.276769596199525e-05, + "loss": 13.9063, + "step": 3810, + "task_loss": 0.3135397434234619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04967956202688233, + "compression/movement_sparsity/importance_threshold": -1.713761735246816e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8128433641975309, + "compression/movement_sparsity/model_sparsity": 0.6311984593735886, + "compression_loss": 13.46501636505127, + "distillation_loss": 0.36794915795326233, + "epoch": 1.81, + "learning_rate": 1.2748693586698338e-05, + "loss": 13.9184, + "step": 3820, + "task_loss": 0.3162195384502411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04970352462241214, + "compression/movement_sparsity/importance_threshold": -1.5856053282622528e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8135937852868113, + "compression/movement_sparsity/model_sparsity": 0.6317811849715307, + "compression_loss": 13.471232414245605, + "distillation_loss": 0.7687112092971802, + "epoch": 1.82, + "learning_rate": 1.2729691211401427e-05, + "loss": 13.9925, + "step": 3830, + "task_loss": 0.4176146686077118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04972626178393039, + "compression/movement_sparsity/importance_threshold": -1.4640027697421405e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8145009386291779, + "compression/movement_sparsity/model_sparsity": 0.6324856180976874, + "compression_loss": 13.477070808410645, + "distillation_loss": 0.18658028542995453, + "epoch": 1.82, + "learning_rate": 1.2710688836104515e-05, + "loss": 13.8647, + "step": 3840, + "task_loss": 0.1433614045381546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049747805675059424, + "compression/movement_sparsity/importance_threshold": -1.3487820426664934e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8151889255871725, + "compression/movement_sparsity/model_sparsity": 0.6330198616273545, + "compression_loss": 13.482598304748535, + "distillation_loss": 0.5786043405532837, + "epoch": 1.83, + "learning_rate": 1.2691686460807601e-05, + "loss": 13.9974, + "step": 3850, + "task_loss": 0.24868422746658325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049768188459421596, + "compression/movement_sparsity/importance_threshold": -1.2397711300152388e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8155505330660946, + "compression/movement_sparsity/model_sparsity": 0.6333006612175915, + "compression_loss": 13.487833023071289, + "distillation_loss": 0.2891767621040344, + "epoch": 1.83, + "learning_rate": 1.267268408551069e-05, + "loss": 13.8999, + "step": 3860, + "task_loss": 0.17670656740665436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04978744230063925, + "compression/movement_sparsity/importance_threshold": -1.1367980147683476e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8161176156466425, + "compression/movement_sparsity/model_sparsity": 0.6337410186922852, + "compression_loss": 13.4927339553833, + "distillation_loss": 0.9970003366470337, + "epoch": 1.84, + "learning_rate": 1.2653681710213779e-05, + "loss": 14.0161, + "step": 3870, + "task_loss": 0.44373542070388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04980559936233475, + "compression/movement_sparsity/importance_threshold": -1.0396906799056173e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.8173062989310449, + "compression/movement_sparsity/model_sparsity": 0.6346640686805675, + "compression_loss": 13.497337341308594, + "distillation_loss": 0.15605174005031586, + "epoch": 1.84, + "learning_rate": 1.2634679334916867e-05, + "loss": 13.7718, + "step": 3880, + "task_loss": 0.025827720761299133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04982269180813043, + "compression/movement_sparsity/importance_threshold": -9.482771084071487e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8178974588791027, + "compression/movement_sparsity/model_sparsity": 0.6351231229890514, + "compression_loss": 13.501633644104004, + "distillation_loss": 0.5499266982078552, + "epoch": 1.85, + "learning_rate": 1.2615676959619953e-05, + "loss": 13.9453, + "step": 3890, + "task_loss": 0.355935275554657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04983875180164866, + "compression/movement_sparsity/importance_threshold": -8.623852832527392e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8187233819820837, + "compression/movement_sparsity/model_sparsity": 0.6357644782773206, + "compression_loss": 13.50567626953125, + "distillation_loss": 0.44997888803482056, + "epoch": 1.85, + "learning_rate": 1.259667458432304e-05, + "loss": 13.9764, + "step": 3900, + "task_loss": 0.2622376084327698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049853811506511775, + "compression/movement_sparsity/importance_threshold": -7.81843187422403e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8195478700880758, + "compression/movement_sparsity/model_sparsity": 0.6364047192452565, + "compression_loss": 13.509462356567383, + "distillation_loss": 0.7038769721984863, + "epoch": 1.86, + "learning_rate": 1.2577672209026129e-05, + "loss": 14.057, + "step": 3910, + "task_loss": 0.33292850852012634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04986790308634213, + "compression/movement_sparsity/importance_threshold": -7.064788038961111e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8199815449977417, + "compression/movement_sparsity/model_sparsity": 0.6367414814640393, + "compression_loss": 13.513002395629883, + "distillation_loss": 0.3087387681007385, + "epoch": 1.86, + "learning_rate": 1.2558669833729218e-05, + "loss": 13.9792, + "step": 3920, + "task_loss": 0.10697836428880692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04988105870476209, + "compression/movement_sparsity/importance_threshold": -6.361201156536607e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8203553617133393, + "compression/movement_sparsity/model_sparsity": 0.6370317619108833, + "compression_loss": 13.516273498535156, + "distillation_loss": 0.9965323805809021, + "epoch": 1.87, + "learning_rate": 1.2539667458432306e-05, + "loss": 13.9582, + "step": 3930, + "task_loss": 0.5111120343208313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049893310525393975, + "compression/movement_sparsity/importance_threshold": -5.705951056751094e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8209573899992472, + "compression/movement_sparsity/model_sparsity": 0.6374992558258266, + "compression_loss": 13.519289016723633, + "distillation_loss": 0.6441047787666321, + "epoch": 1.87, + "learning_rate": 1.2520665083135392e-05, + "loss": 13.9644, + "step": 3940, + "task_loss": 0.35741135478019714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04990469071186017, + "compression/movement_sparsity/importance_threshold": -5.09731756940298e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8216001980766335, + "compression/movement_sparsity/model_sparsity": 0.6379984165325387, + "compression_loss": 13.522068977355957, + "distillation_loss": 0.3542700409889221, + "epoch": 1.88, + "learning_rate": 1.250166270783848e-05, + "loss": 13.9309, + "step": 3950, + "task_loss": 0.16567806899547577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049915231427783, + "compression/movement_sparsity/importance_threshold": -4.533580524292407e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8219789667080699, + "compression/movement_sparsity/model_sparsity": 0.6382925422979101, + "compression_loss": 13.524628639221191, + "distillation_loss": 0.6030027866363525, + "epoch": 1.88, + "learning_rate": 1.248266033254157e-05, + "loss": 14.0541, + "step": 3960, + "task_loss": 0.4497295618057251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04992496483678483, + "compression/movement_sparsity/importance_threshold": -4.013019751218216e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8225840296597411, + "compression/movement_sparsity/model_sparsity": 0.6387623927263453, + "compression_loss": 13.52697467803955, + "distillation_loss": 0.28092095255851746, + "epoch": 1.89, + "learning_rate": 1.2463657957244657e-05, + "loss": 13.9522, + "step": 3970, + "task_loss": 0.09975674748420715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04993392310248801, + "compression/movement_sparsity/importance_threshold": -3.533915079980115e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.823128846262421, + "compression/movement_sparsity/model_sparsity": 0.6391854599683252, + "compression_loss": 13.529121398925781, + "distillation_loss": 0.1360899955034256, + "epoch": 1.89, + "learning_rate": 1.2444655581947744e-05, + "loss": 14.0078, + "step": 3980, + "task_loss": 0.031911686062812805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04994213838851488, + "compression/movement_sparsity/importance_threshold": -3.094546340377379e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8237104552469136, + "compression/movement_sparsity/model_sparsity": 0.639637097652951, + "compression_loss": 13.53105640411377, + "distillation_loss": 0.8466547727584839, + "epoch": 1.9, + "learning_rate": 1.2425653206650832e-05, + "loss": 14.0804, + "step": 3990, + "task_loss": 0.4057835340499878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0499496428584878, + "compression/movement_sparsity/importance_threshold": -2.6931933622088497e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8239907736750979, + "compression/movement_sparsity/model_sparsity": 0.6398547737364343, + "compression_loss": 13.532774925231934, + "distillation_loss": 0.13539861142635345, + "epoch": 1.9, + "learning_rate": 1.2406650831353921e-05, + "loss": 13.9601, + "step": 4000, + "task_loss": 0.03517580032348633 + }, + { + "epoch": 1.9, + "eval_accuracy": 0.8577981651376146, + "eval_loss": 14.102431297302246, + "eval_runtime": 23.3122, + "eval_samples_per_second": 37.405, + "eval_steps_per_second": 4.676, + "step": 4000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04995646867602912, + "compression/movement_sparsity/importance_threshold": -2.3281359752746686e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8247015323885878, + "compression/movement_sparsity/model_sparsity": 0.6404067002510637, + "compression_loss": 13.5343017578125, + "distillation_loss": 0.37791934609413147, + "epoch": 1.9, + "learning_rate": 1.2387648456057009e-05, + "loss": 14.0972, + "step": 4010, + "task_loss": 0.31983357667922974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04996264800476118, + "compression/movement_sparsity/importance_threshold": -1.997654009373677e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8250938629177959, + "compression/movement_sparsity/model_sparsity": 0.6407113572569627, + "compression_loss": 13.535682678222656, + "distillation_loss": 0.20223000645637512, + "epoch": 1.91, + "learning_rate": 1.2368646080760097e-05, + "loss": 13.8255, + "step": 4020, + "task_loss": 0.0476217158138752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04996821300830635, + "compression/movement_sparsity/importance_threshold": -1.7000272943051495e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8256924801452875, + "compression/movement_sparsity/model_sparsity": 0.6411762023776709, + "compression_loss": 13.536922454833984, + "distillation_loss": 0.5841785669326782, + "epoch": 1.91, + "learning_rate": 1.2349643705463183e-05, + "loss": 14.0384, + "step": 4030, + "task_loss": 0.35522913932800293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04997319585028697, + "compression/movement_sparsity/importance_threshold": -1.4335356598687947e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8266022917607648, + "compression/movement_sparsity/model_sparsity": 0.6418826997365765, + "compression_loss": 13.537969589233398, + "distillation_loss": 0.47049567103385925, + "epoch": 1.92, + "learning_rate": 1.233064133016627e-05, + "loss": 13.9148, + "step": 4040, + "task_loss": 0.39884519577026367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04997762869432539, + "compression/movement_sparsity/importance_threshold": -1.1964589358634536e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.8273503133468835, + "compression/movement_sparsity/model_sparsity": 0.6424635620447807, + "compression_loss": 13.538784980773926, + "distillation_loss": 0.41415804624557495, + "epoch": 1.92, + "learning_rate": 1.231163895486936e-05, + "loss": 13.8998, + "step": 4050, + "task_loss": 0.3223814368247986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049981543704043965, + "compression/movement_sparsity/importance_threshold": -9.870769520888348e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8279402735433604, + "compression/movement_sparsity/model_sparsity": 0.6429216847083958, + "compression_loss": 13.53950023651123, + "distillation_loss": 0.19833412766456604, + "epoch": 1.93, + "learning_rate": 1.2292636579572448e-05, + "loss": 14.0509, + "step": 4060, + "task_loss": 0.10071046650409698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04998497304306504, + "compression/movement_sparsity/importance_threshold": -8.036695383442129e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8284314541930141, + "compression/movement_sparsity/model_sparsity": 0.6433031019444725, + "compression_loss": 13.540120124816895, + "distillation_loss": 0.48713254928588867, + "epoch": 1.93, + "learning_rate": 1.2273634204275534e-05, + "loss": 14.023, + "step": 4070, + "task_loss": 0.2855875492095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04998794887501098, + "compression/movement_sparsity/importance_threshold": -6.44516524428429e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8293861388700693, + "compression/movement_sparsity/model_sparsity": 0.6440444446482291, + "compression_loss": 13.540621757507324, + "distillation_loss": 0.48493221402168274, + "epoch": 1.94, + "learning_rate": 1.2254631828978622e-05, + "loss": 14.0355, + "step": 4080, + "task_loss": 0.24878904223442078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999050336350411, + "compression/movement_sparsity/importance_threshold": -5.078977401416253e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8297654603282144, + "compression/movement_sparsity/model_sparsity": 0.644338999700942, + "compression_loss": 13.540968894958496, + "distillation_loss": 0.4940585196018219, + "epoch": 1.94, + "learning_rate": 1.2235629453681712e-05, + "loss": 13.8999, + "step": 4090, + "task_loss": 0.3463954031467438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0499926686721668, + "compression/movement_sparsity/importance_threshold": -3.920930152830765e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8303199808039747, + "compression/movement_sparsity/model_sparsity": 0.644769602305832, + "compression_loss": 13.541226387023926, + "distillation_loss": 0.3753039836883545, + "epoch": 1.95, + "learning_rate": 1.22166270783848e-05, + "loss": 13.9771, + "step": 4100, + "task_loss": 0.5592837333679199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0499944769646214, + "compression/movement_sparsity/importance_threshold": -2.953821796516237e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8311270254629629, + "compression/movement_sparsity/model_sparsity": 0.6453962978880762, + "compression_loss": 13.541411399841309, + "distillation_loss": 0.4341853857040405, + "epoch": 1.95, + "learning_rate": 1.2197624703087888e-05, + "loss": 14.01, + "step": 4110, + "task_loss": 0.2506285309791565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999596040449025, + "compression/movement_sparsity/importance_threshold": -2.160450630469754e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8317775143029208, + "compression/movement_sparsity/model_sparsity": 0.6459014229487039, + "compression_loss": 13.541484832763672, + "distillation_loss": 0.3095904588699341, + "epoch": 1.96, + "learning_rate": 1.2178622327790974e-05, + "loss": 13.9163, + "step": 4120, + "task_loss": 0.12216134369373322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049997151155395714, + "compression/movement_sparsity/importance_threshold": -1.5236149526797263e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8322562739950317, + "compression/movement_sparsity/model_sparsity": 0.6462731949202557, + "compression_loss": 13.541465759277344, + "distillation_loss": 0.4424591064453125, + "epoch": 1.96, + "learning_rate": 1.2159619952494062e-05, + "loss": 13.915, + "step": 4130, + "task_loss": 0.1971740871667862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999808138096014, + "compression/movement_sparsity/importance_threshold": -1.0261130611475752e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.8326778572907256, + "compression/movement_sparsity/model_sparsity": 0.6466005676201638, + "compression_loss": 13.541352272033691, + "distillation_loss": 0.8569298982620239, + "epoch": 1.97, + "learning_rate": 1.2140617577197151e-05, + "loss": 13.9558, + "step": 4140, + "task_loss": 0.45095348358154297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999878324480587, + "compression/movement_sparsity/importance_threshold": -6.507432538617117e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.8331807531616983, + "compression/movement_sparsity/model_sparsity": 0.6469910820943721, + "compression_loss": 13.541196823120117, + "distillation_loss": 0.5633354187011719, + "epoch": 1.97, + "learning_rate": 1.2121615201900239e-05, + "loss": 13.8935, + "step": 4150, + "task_loss": 0.26475971937179565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999928891055526, + "compression/movement_sparsity/importance_threshold": -3.803038288148833e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.8333070799457994, + "compression/movement_sparsity/model_sparsity": 0.6470891788188018, + "compression_loss": 13.540977478027344, + "distillation_loss": 0.5583696365356445, + "epoch": 1.98, + "learning_rate": 1.2102612826603327e-05, + "loss": 13.9072, + "step": 4160, + "task_loss": 0.3018655776977539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999963054183066, + "compression/movement_sparsity/importance_threshold": -1.975930839998377e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.834074744523487, + "compression/movement_sparsity/model_sparsity": 0.6476852945282907, + "compression_loss": 13.540740013122559, + "distillation_loss": 0.480516254901886, + "epoch": 1.98, + "learning_rate": 1.2083610451306413e-05, + "loss": 14.0023, + "step": 4170, + "task_loss": 0.21734555065631866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999984030225443, + "compression/movement_sparsity/importance_threshold": -8.540931741365942e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.8346077047576032, + "compression/movement_sparsity/model_sparsity": 0.6480991549268605, + "compression_loss": 13.540478706359863, + "distillation_loss": 0.5318132638931274, + "epoch": 1.99, + "learning_rate": 1.2064608076009503e-05, + "loss": 13.8838, + "step": 4180, + "task_loss": 0.33174723386764526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04999995035544891, + "compression/movement_sparsity/importance_threshold": -2.655082704475925e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.8350744198848239, + "compression/movement_sparsity/model_sparsity": 0.6484615739146306, + "compression_loss": 13.54019546508789, + "distillation_loss": 0.5515092611312866, + "epoch": 1.99, + "learning_rate": 1.204560570071259e-05, + "loss": 13.9953, + "step": 4190, + "task_loss": 0.2831692099571228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049999992865036445, + "compression/movement_sparsity/importance_threshold": -3.815910894558461e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.8356860344211081, + "compression/movement_sparsity/model_sparsity": 0.6489365118547517, + "compression_loss": 13.539885520935059, + "distillation_loss": 0.5041601061820984, + "epoch": 2.0, + "learning_rate": 1.2026603325415678e-05, + "loss": 13.9405, + "step": 4200, + "task_loss": 0.24462735652923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.049999999994639395, + "compression/movement_sparsity/importance_threshold": -2.8669514678947294e-13, + "compression/movement_sparsity/linear_layer_sparsity": 0.836323984680819, + "compression/movement_sparsity/model_sparsity": 0.6494319003131221, + "compression_loss": 13.539533615112305, + "distillation_loss": 0.37183666229248047, + "epoch": 2.0, + "learning_rate": 1.2007600950118764e-05, + "loss": 13.9897, + "step": 4210, + "task_loss": 0.1697111278772354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.6137279272079468, + "epoch": 2.0, + "learning_rate": 1.1988598574821854e-05, + "loss": 0.7213, + "step": 4220, + "task_loss": 0.35716748237609863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.24926382303237915, + "epoch": 2.01, + "learning_rate": 1.1969596199524942e-05, + "loss": 0.2347, + "step": 4230, + "task_loss": 0.09346465766429901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11653564870357513, + "epoch": 2.01, + "learning_rate": 1.195059382422803e-05, + "loss": 0.1942, + "step": 4240, + "task_loss": 0.07350118458271027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4009395241737366, + "epoch": 2.02, + "learning_rate": 1.1931591448931118e-05, + "loss": 0.2701, + "step": 4250, + "task_loss": 0.47706255316734314 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.9048165137614679, + "eval_loss": 0.33537691831588745, + "eval_runtime": 22.0839, + "eval_samples_per_second": 39.486, + "eval_steps_per_second": 4.936, + "step": 4250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.260000079870224, + "epoch": 2.02, + "learning_rate": 1.1912589073634204e-05, + "loss": 0.2824, + "step": 4260, + "task_loss": 0.33928802609443665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.5958622097969055, + "epoch": 2.03, + "learning_rate": 1.1893586698337293e-05, + "loss": 0.2131, + "step": 4270, + "task_loss": 0.2858772873878479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.28239572048187256, + "epoch": 2.03, + "learning_rate": 1.1874584323040381e-05, + "loss": 0.3669, + "step": 4280, + "task_loss": 0.3893040120601654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09593847393989563, + "epoch": 2.04, + "learning_rate": 1.1855581947743469e-05, + "loss": 0.2448, + "step": 4290, + "task_loss": 0.20446573197841644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017060134559869766, + "epoch": 2.04, + "learning_rate": 1.1836579572446555e-05, + "loss": 0.1788, + "step": 4300, + "task_loss": 0.0036827102303504944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1649906486272812, + "epoch": 2.05, + "learning_rate": 1.1817577197149645e-05, + "loss": 0.1344, + "step": 4310, + "task_loss": 0.07797713577747345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.36799219250679016, + "epoch": 2.05, + "learning_rate": 1.1798574821852733e-05, + "loss": 0.1798, + "step": 4320, + "task_loss": 0.19989198446273804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2834874391555786, + "epoch": 2.06, + "learning_rate": 1.177957244655582e-05, + "loss": 0.0852, + "step": 4330, + "task_loss": 0.11706365644931793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4932941794395447, + "epoch": 2.06, + "learning_rate": 1.1760570071258908e-05, + "loss": 0.3161, + "step": 4340, + "task_loss": 0.25535914301872253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4178961515426636, + "epoch": 2.07, + "learning_rate": 1.1741567695961998e-05, + "loss": 0.2081, + "step": 4350, + "task_loss": 0.10093335807323456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3657403588294983, + "epoch": 2.07, + "learning_rate": 1.1722565320665084e-05, + "loss": 0.2158, + "step": 4360, + "task_loss": 0.25138112902641296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02273857593536377, + "epoch": 2.08, + "learning_rate": 1.1703562945368172e-05, + "loss": 0.1983, + "step": 4370, + "task_loss": 0.006311226636171341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17630484700202942, + "epoch": 2.08, + "learning_rate": 1.168456057007126e-05, + "loss": 0.2566, + "step": 4380, + "task_loss": 0.505398154258728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.031373463571071625, + "epoch": 2.09, + "learning_rate": 1.1665558194774346e-05, + "loss": 0.2201, + "step": 4390, + "task_loss": 0.002458591014146805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.33634716272354126, + "epoch": 2.09, + "learning_rate": 1.1646555819477436e-05, + "loss": 0.1559, + "step": 4400, + "task_loss": 0.2244289070367813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2994489073753357, + "epoch": 2.1, + "learning_rate": 1.1627553444180523e-05, + "loss": 0.248, + "step": 4410, + "task_loss": 0.2632516026496887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.060996197164058685, + "epoch": 2.1, + "learning_rate": 1.1608551068883611e-05, + "loss": 0.1764, + "step": 4420, + "task_loss": 0.022328753024339676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0799054354429245, + "epoch": 2.1, + "learning_rate": 1.15895486935867e-05, + "loss": 0.2472, + "step": 4430, + "task_loss": 0.028038904070854187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4037141799926758, + "epoch": 2.11, + "learning_rate": 1.1570546318289789e-05, + "loss": 0.2217, + "step": 4440, + "task_loss": 0.40060561895370483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.29084426164627075, + "epoch": 2.11, + "learning_rate": 1.1551543942992875e-05, + "loss": 0.2478, + "step": 4450, + "task_loss": 0.22699187695980072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.022717095911502838, + "epoch": 2.12, + "learning_rate": 1.1532541567695963e-05, + "loss": 0.1458, + "step": 4460, + "task_loss": 0.007930740714073181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05607360601425171, + "epoch": 2.12, + "learning_rate": 1.151353919239905e-05, + "loss": 0.2155, + "step": 4470, + "task_loss": 0.23636063933372498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0458001047372818, + "epoch": 2.13, + "learning_rate": 1.1494536817102138e-05, + "loss": 0.2193, + "step": 4480, + "task_loss": 0.03247682377696037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.27424195408821106, + "epoch": 2.13, + "learning_rate": 1.1475534441805228e-05, + "loss": 0.2339, + "step": 4490, + "task_loss": 0.16137200593948364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.33258911967277527, + "epoch": 2.14, + "learning_rate": 1.1456532066508314e-05, + "loss": 0.2689, + "step": 4500, + "task_loss": 0.34519919753074646 + }, + { + "epoch": 2.14, + "eval_accuracy": 0.9048165137614679, + "eval_loss": 0.3319544792175293, + "eval_runtime": 22.0326, + "eval_samples_per_second": 39.578, + "eval_steps_per_second": 4.947, + "step": 4500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06233971193432808, + "epoch": 2.14, + "learning_rate": 1.1437529691211402e-05, + "loss": 0.2959, + "step": 4510, + "task_loss": 0.008978258818387985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19504857063293457, + "epoch": 2.15, + "learning_rate": 1.141852731591449e-05, + "loss": 0.16, + "step": 4520, + "task_loss": 0.12643758952617645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02659631334245205, + "epoch": 2.15, + "learning_rate": 1.139952494061758e-05, + "loss": 0.2226, + "step": 4530, + "task_loss": 0.003300584852695465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05742825195193291, + "epoch": 2.16, + "learning_rate": 1.1380522565320666e-05, + "loss": 0.1707, + "step": 4540, + "task_loss": 0.27017584443092346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.197635218501091, + "epoch": 2.16, + "learning_rate": 1.1361520190023754e-05, + "loss": 0.2152, + "step": 4550, + "task_loss": 0.17246709764003754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13968348503112793, + "epoch": 2.17, + "learning_rate": 1.1342517814726841e-05, + "loss": 0.2187, + "step": 4560, + "task_loss": 0.3724798858165741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2040019929409027, + "epoch": 2.17, + "learning_rate": 1.1323515439429931e-05, + "loss": 0.1535, + "step": 4570, + "task_loss": 0.049664292484521866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.570436954498291, + "epoch": 2.18, + "learning_rate": 1.1304513064133019e-05, + "loss": 0.1904, + "step": 4580, + "task_loss": 0.3329172134399414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09249302744865417, + "epoch": 2.18, + "learning_rate": 1.1285510688836105e-05, + "loss": 0.1101, + "step": 4590, + "task_loss": 0.21237826347351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03505774587392807, + "epoch": 2.19, + "learning_rate": 1.1266508313539193e-05, + "loss": 0.1319, + "step": 4600, + "task_loss": 0.4041575789451599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3652401864528656, + "epoch": 2.19, + "learning_rate": 1.124750593824228e-05, + "loss": 0.1775, + "step": 4610, + "task_loss": 0.18814264237880707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16492074728012085, + "epoch": 2.19, + "learning_rate": 1.122850356294537e-05, + "loss": 0.2287, + "step": 4620, + "task_loss": 0.16214123368263245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17321833968162537, + "epoch": 2.2, + "learning_rate": 1.1209501187648456e-05, + "loss": 0.1745, + "step": 4630, + "task_loss": 0.1337524801492691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04696015268564224, + "epoch": 2.2, + "learning_rate": 1.1190498812351544e-05, + "loss": 0.1773, + "step": 4640, + "task_loss": 0.011036917567253113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3878232538700104, + "epoch": 2.21, + "learning_rate": 1.1171496437054632e-05, + "loss": 0.2254, + "step": 4650, + "task_loss": 0.28115496039390564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09996585547924042, + "epoch": 2.21, + "learning_rate": 1.1152494061757722e-05, + "loss": 0.1769, + "step": 4660, + "task_loss": 0.1222931444644928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4124844968318939, + "epoch": 2.22, + "learning_rate": 1.113349168646081e-05, + "loss": 0.2081, + "step": 4670, + "task_loss": 0.3749869763851166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13545894622802734, + "epoch": 2.22, + "learning_rate": 1.1114489311163896e-05, + "loss": 0.0857, + "step": 4680, + "task_loss": 0.05755548179149628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1826428771018982, + "epoch": 2.23, + "learning_rate": 1.1095486935866984e-05, + "loss": 0.2862, + "step": 4690, + "task_loss": 0.01588393747806549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1444108933210373, + "epoch": 2.23, + "learning_rate": 1.1076484560570073e-05, + "loss": 0.1427, + "step": 4700, + "task_loss": 0.26151856780052185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1410968005657196, + "epoch": 2.24, + "learning_rate": 1.1057482185273161e-05, + "loss": 0.1974, + "step": 4710, + "task_loss": 0.07411689311265945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1891230046749115, + "epoch": 2.24, + "learning_rate": 1.1038479809976247e-05, + "loss": 0.1835, + "step": 4720, + "task_loss": 0.11419402062892914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10753890872001648, + "epoch": 2.25, + "learning_rate": 1.1019477434679335e-05, + "loss": 0.201, + "step": 4730, + "task_loss": 0.05918397009372711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12443285435438156, + "epoch": 2.25, + "learning_rate": 1.1000475059382423e-05, + "loss": 0.1185, + "step": 4740, + "task_loss": 0.06015586480498314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.39146068692207336, + "epoch": 2.26, + "learning_rate": 1.0981472684085512e-05, + "loss": 0.1775, + "step": 4750, + "task_loss": 0.406578928232193 + }, + { + "epoch": 2.26, + "eval_accuracy": 0.9162844036697247, + "eval_loss": 0.28384512662887573, + "eval_runtime": 22.1283, + "eval_samples_per_second": 39.407, + "eval_steps_per_second": 4.926, + "step": 4750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.019335411489009857, + "epoch": 2.26, + "learning_rate": 1.09624703087886e-05, + "loss": 0.166, + "step": 4760, + "task_loss": 0.16980049014091492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05398799106478691, + "epoch": 2.27, + "learning_rate": 1.0943467933491686e-05, + "loss": 0.1554, + "step": 4770, + "task_loss": 0.007133938372135162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018566645681858063, + "epoch": 2.27, + "learning_rate": 1.0924465558194774e-05, + "loss": 0.2266, + "step": 4780, + "task_loss": 0.005496695637702942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09009189158678055, + "epoch": 2.28, + "learning_rate": 1.0905463182897864e-05, + "loss": 0.1098, + "step": 4790, + "task_loss": 0.04330487921833992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.29310229420661926, + "epoch": 2.28, + "learning_rate": 1.0886460807600952e-05, + "loss": 0.2037, + "step": 4800, + "task_loss": 0.21032559871673584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.32484009861946106, + "epoch": 2.29, + "learning_rate": 1.086745843230404e-05, + "loss": 0.1728, + "step": 4810, + "task_loss": 0.23763622343540192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.32907170057296753, + "epoch": 2.29, + "learning_rate": 1.0848456057007126e-05, + "loss": 0.1479, + "step": 4820, + "task_loss": 0.27866876125335693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015447848476469517, + "epoch": 2.29, + "learning_rate": 1.0829453681710214e-05, + "loss": 0.1612, + "step": 4830, + "task_loss": 0.003272462636232376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07381051778793335, + "epoch": 2.3, + "learning_rate": 1.0810451306413303e-05, + "loss": 0.0796, + "step": 4840, + "task_loss": 0.13187864422798157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11486229300498962, + "epoch": 2.3, + "learning_rate": 1.0791448931116391e-05, + "loss": 0.1633, + "step": 4850, + "task_loss": 0.17579086124897003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2775013744831085, + "epoch": 2.31, + "learning_rate": 1.0772446555819477e-05, + "loss": 0.1521, + "step": 4860, + "task_loss": 0.34235501289367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07822778820991516, + "epoch": 2.31, + "learning_rate": 1.0753444180522565e-05, + "loss": 0.1217, + "step": 4870, + "task_loss": 0.017286375164985657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08122527599334717, + "epoch": 2.32, + "learning_rate": 1.0734441805225655e-05, + "loss": 0.1756, + "step": 4880, + "task_loss": 0.24036413431167603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.8243784308433533, + "epoch": 2.32, + "learning_rate": 1.0715439429928743e-05, + "loss": 0.2231, + "step": 4890, + "task_loss": 0.5445951819419861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.061645179986953735, + "epoch": 2.33, + "learning_rate": 1.069643705463183e-05, + "loss": 0.1871, + "step": 4900, + "task_loss": 0.023713212460279465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11187595874071121, + "epoch": 2.33, + "learning_rate": 1.0677434679334917e-05, + "loss": 0.1132, + "step": 4910, + "task_loss": 0.023900482803583145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026393216103315353, + "epoch": 2.34, + "learning_rate": 1.0658432304038006e-05, + "loss": 0.1716, + "step": 4920, + "task_loss": 0.1366603672504425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01934751495718956, + "epoch": 2.34, + "learning_rate": 1.0639429928741094e-05, + "loss": 0.1488, + "step": 4930, + "task_loss": 0.004630662500858307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.019772972911596298, + "epoch": 2.35, + "learning_rate": 1.0620427553444182e-05, + "loss": 0.123, + "step": 4940, + "task_loss": 0.004192207008600235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07974082976579666, + "epoch": 2.35, + "learning_rate": 1.0601425178147268e-05, + "loss": 0.2069, + "step": 4950, + "task_loss": 0.02976549044251442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4843176603317261, + "epoch": 2.36, + "learning_rate": 1.0582422802850356e-05, + "loss": 0.2589, + "step": 4960, + "task_loss": 0.5428990125656128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11596342921257019, + "epoch": 2.36, + "learning_rate": 1.0563420427553445e-05, + "loss": 0.1505, + "step": 4970, + "task_loss": 0.40985506772994995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.31906628608703613, + "epoch": 2.37, + "learning_rate": 1.0544418052256533e-05, + "loss": 0.2339, + "step": 4980, + "task_loss": 0.16161520779132843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08732592314481735, + "epoch": 2.37, + "learning_rate": 1.0525415676959621e-05, + "loss": 0.1856, + "step": 4990, + "task_loss": 0.03401995077729225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16932491958141327, + "epoch": 2.38, + "learning_rate": 1.0506413301662707e-05, + "loss": 0.1648, + "step": 5000, + "task_loss": 0.1186380535364151 + }, + { + "epoch": 2.38, + "eval_accuracy": 0.9128440366972477, + "eval_loss": 0.2842116057872772, + "eval_runtime": 22.1968, + "eval_samples_per_second": 39.285, + "eval_steps_per_second": 4.911, + "step": 5000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4252215325832367, + "epoch": 2.38, + "learning_rate": 1.0487410926365797e-05, + "loss": 0.1908, + "step": 5010, + "task_loss": 0.26106417179107666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.4758025109767914, + "epoch": 2.38, + "learning_rate": 1.0468408551068885e-05, + "loss": 0.2279, + "step": 5020, + "task_loss": 0.26795437932014465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11485397070646286, + "epoch": 2.39, + "learning_rate": 1.0449406175771973e-05, + "loss": 0.1344, + "step": 5030, + "task_loss": 0.10242946445941925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07752517610788345, + "epoch": 2.39, + "learning_rate": 1.0430403800475059e-05, + "loss": 0.1472, + "step": 5040, + "task_loss": 0.23853403329849243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23056337237358093, + "epoch": 2.4, + "learning_rate": 1.041140142517815e-05, + "loss": 0.111, + "step": 5050, + "task_loss": 0.1433607041835785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.46395134925842285, + "epoch": 2.4, + "learning_rate": 1.0392399049881236e-05, + "loss": 0.1743, + "step": 5060, + "task_loss": 0.2823677659034729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16598868370056152, + "epoch": 2.41, + "learning_rate": 1.0373396674584324e-05, + "loss": 0.1343, + "step": 5070, + "task_loss": 0.03318723663687706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016996556892991066, + "epoch": 2.41, + "learning_rate": 1.0354394299287412e-05, + "loss": 0.1427, + "step": 5080, + "task_loss": 0.006868541240692139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.6260548830032349, + "epoch": 2.42, + "learning_rate": 1.0335391923990498e-05, + "loss": 0.2121, + "step": 5090, + "task_loss": 0.34184902906417847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0207875557243824, + "epoch": 2.42, + "learning_rate": 1.0316389548693588e-05, + "loss": 0.1736, + "step": 5100, + "task_loss": 0.10779790580272675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11162707954645157, + "epoch": 2.43, + "learning_rate": 1.0297387173396676e-05, + "loss": 0.1683, + "step": 5110, + "task_loss": 0.17393611371517181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10660862922668457, + "epoch": 2.43, + "learning_rate": 1.0278384798099763e-05, + "loss": 0.2287, + "step": 5120, + "task_loss": 0.013046719133853912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11832073330879211, + "epoch": 2.44, + "learning_rate": 1.0259382422802851e-05, + "loss": 0.1859, + "step": 5130, + "task_loss": 0.03974215313792229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19762274622917175, + "epoch": 2.44, + "learning_rate": 1.024038004750594e-05, + "loss": 0.158, + "step": 5140, + "task_loss": 0.09981634467840195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.22084400057792664, + "epoch": 2.45, + "learning_rate": 1.0221377672209027e-05, + "loss": 0.1586, + "step": 5150, + "task_loss": 0.10561450570821762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.42767268419265747, + "epoch": 2.45, + "learning_rate": 1.0202375296912115e-05, + "loss": 0.1494, + "step": 5160, + "task_loss": 0.2538478970527649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03990669548511505, + "epoch": 2.46, + "learning_rate": 1.0183372921615203e-05, + "loss": 0.1552, + "step": 5170, + "task_loss": 0.012294076383113861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0684843510389328, + "epoch": 2.46, + "learning_rate": 1.0164370546318289e-05, + "loss": 0.1114, + "step": 5180, + "task_loss": 0.04257909581065178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10727253556251526, + "epoch": 2.47, + "learning_rate": 1.0145368171021378e-05, + "loss": 0.1408, + "step": 5190, + "task_loss": 0.0835094004869461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1803852915763855, + "epoch": 2.47, + "learning_rate": 1.0126365795724466e-05, + "loss": 0.1463, + "step": 5200, + "task_loss": 0.10350771993398666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.21467408537864685, + "epoch": 2.48, + "learning_rate": 1.0107363420427554e-05, + "loss": 0.1776, + "step": 5210, + "task_loss": 0.12939737737178802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.49049556255340576, + "epoch": 2.48, + "learning_rate": 1.0088361045130642e-05, + "loss": 0.187, + "step": 5220, + "task_loss": 0.4802893400192261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.18884505331516266, + "epoch": 2.48, + "learning_rate": 1.0069358669833732e-05, + "loss": 0.1488, + "step": 5230, + "task_loss": 0.09134702384471893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17404451966285706, + "epoch": 2.49, + "learning_rate": 1.0050356294536818e-05, + "loss": 0.1596, + "step": 5240, + "task_loss": 0.08957971632480621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012248549610376358, + "epoch": 2.49, + "learning_rate": 1.0031353919239906e-05, + "loss": 0.1316, + "step": 5250, + "task_loss": 0.004954520612955093 + }, + { + "epoch": 2.49, + "eval_accuracy": 0.9162844036697247, + "eval_loss": 0.2750292420387268, + "eval_runtime": 22.048, + "eval_samples_per_second": 39.55, + "eval_steps_per_second": 4.944, + "step": 5250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08270278573036194, + "epoch": 2.5, + "learning_rate": 1.0012351543942993e-05, + "loss": 0.1146, + "step": 5260, + "task_loss": 0.04688364639878273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10440254956483841, + "epoch": 2.5, + "learning_rate": 9.993349168646081e-06, + "loss": 0.1711, + "step": 5270, + "task_loss": 0.06673218309879303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.34925076365470886, + "epoch": 2.51, + "learning_rate": 9.97434679334917e-06, + "loss": 0.2062, + "step": 5280, + "task_loss": 0.16814163327217102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.44582313299179077, + "epoch": 2.51, + "learning_rate": 9.955344418052257e-06, + "loss": 0.2331, + "step": 5290, + "task_loss": 0.35081130266189575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02439703233540058, + "epoch": 2.52, + "learning_rate": 9.936342042755345e-06, + "loss": 0.1357, + "step": 5300, + "task_loss": 0.003609389066696167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0913199633359909, + "epoch": 2.52, + "learning_rate": 9.917339667458433e-06, + "loss": 0.1696, + "step": 5310, + "task_loss": 0.3426245152950287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02632327377796173, + "epoch": 2.53, + "learning_rate": 9.89833729216152e-06, + "loss": 0.1167, + "step": 5320, + "task_loss": 0.005381196737289429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1984187364578247, + "epoch": 2.53, + "learning_rate": 9.879334916864608e-06, + "loss": 0.1597, + "step": 5330, + "task_loss": 0.12277568876743317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16098986566066742, + "epoch": 2.54, + "learning_rate": 9.860332541567696e-06, + "loss": 0.1252, + "step": 5340, + "task_loss": 0.081682950258255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.5197485685348511, + "epoch": 2.54, + "learning_rate": 9.841330166270784e-06, + "loss": 0.1671, + "step": 5350, + "task_loss": 0.3930453062057495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0463409461081028, + "epoch": 2.55, + "learning_rate": 9.822327790973872e-06, + "loss": 0.1617, + "step": 5360, + "task_loss": 0.01061389222741127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.042029425501823425, + "epoch": 2.55, + "learning_rate": 9.803325415676962e-06, + "loss": 0.1519, + "step": 5370, + "task_loss": 0.027968235313892365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3101033866405487, + "epoch": 2.56, + "learning_rate": 9.784323040380048e-06, + "loss": 0.193, + "step": 5380, + "task_loss": 0.16425858438014984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.7263270020484924, + "epoch": 2.56, + "learning_rate": 9.765320665083137e-06, + "loss": 0.2224, + "step": 5390, + "task_loss": 0.7286182641983032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.040765728801488876, + "epoch": 2.57, + "learning_rate": 9.746318289786224e-06, + "loss": 0.1389, + "step": 5400, + "task_loss": 0.011140488088130951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1607925444841385, + "epoch": 2.57, + "learning_rate": 9.727315914489311e-06, + "loss": 0.1126, + "step": 5410, + "task_loss": 0.11116501688957214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0224788635969162, + "epoch": 2.57, + "learning_rate": 9.7083135391924e-06, + "loss": 0.1352, + "step": 5420, + "task_loss": 0.005074281245470047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0271650031208992, + "epoch": 2.58, + "learning_rate": 9.689311163895487e-06, + "loss": 0.105, + "step": 5430, + "task_loss": 0.012768540531396866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02819114923477173, + "epoch": 2.58, + "learning_rate": 9.670308788598575e-06, + "loss": 0.2538, + "step": 5440, + "task_loss": 0.00484645739197731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.18920451402664185, + "epoch": 2.59, + "learning_rate": 9.651306413301663e-06, + "loss": 0.1027, + "step": 5450, + "task_loss": 0.24752211570739746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11157466471195221, + "epoch": 2.59, + "learning_rate": 9.632304038004752e-06, + "loss": 0.162, + "step": 5460, + "task_loss": 0.1878063678741455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.26885926723480225, + "epoch": 2.6, + "learning_rate": 9.613301662707839e-06, + "loss": 0.2035, + "step": 5470, + "task_loss": 0.058163829147815704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1357247531414032, + "epoch": 2.6, + "learning_rate": 9.594299287410928e-06, + "loss": 0.2073, + "step": 5480, + "task_loss": 0.1269286572933197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05651300773024559, + "epoch": 2.61, + "learning_rate": 9.575296912114014e-06, + "loss": 0.1478, + "step": 5490, + "task_loss": 0.014481060206890106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11179080605506897, + "epoch": 2.61, + "learning_rate": 9.556294536817104e-06, + "loss": 0.2349, + "step": 5500, + "task_loss": 0.13668608665466309 + }, + { + "epoch": 2.61, + "eval_accuracy": 0.9231651376146789, + "eval_loss": 0.24054142832756042, + "eval_runtime": 22.021, + "eval_samples_per_second": 39.599, + "eval_steps_per_second": 4.95, + "step": 5500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02737560123205185, + "epoch": 2.62, + "learning_rate": 9.53729216152019e-06, + "loss": 0.1493, + "step": 5510, + "task_loss": 0.10180087387561798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12310200929641724, + "epoch": 2.62, + "learning_rate": 9.518289786223278e-06, + "loss": 0.1098, + "step": 5520, + "task_loss": 0.058783046901226044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13475853204727173, + "epoch": 2.63, + "learning_rate": 9.499287410926367e-06, + "loss": 0.1139, + "step": 5530, + "task_loss": 0.05447866767644882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17890390753746033, + "epoch": 2.63, + "learning_rate": 9.480285035629454e-06, + "loss": 0.1686, + "step": 5540, + "task_loss": 0.22207100689411163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05488044023513794, + "epoch": 2.64, + "learning_rate": 9.461282660332543e-06, + "loss": 0.1037, + "step": 5550, + "task_loss": 0.2198052704334259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07350677251815796, + "epoch": 2.64, + "learning_rate": 9.44228028503563e-06, + "loss": 0.1472, + "step": 5560, + "task_loss": 0.15250588953495026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23793400824069977, + "epoch": 2.65, + "learning_rate": 9.423277909738719e-06, + "loss": 0.187, + "step": 5570, + "task_loss": 0.17216888070106506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3636675775051117, + "epoch": 2.65, + "learning_rate": 9.404275534441805e-06, + "loss": 0.1792, + "step": 5580, + "task_loss": 0.21685606241226196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05343058705329895, + "epoch": 2.66, + "learning_rate": 9.385273159144895e-06, + "loss": 0.1529, + "step": 5590, + "task_loss": 0.012824393808841705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.21820136904716492, + "epoch": 2.66, + "learning_rate": 9.36627078384798e-06, + "loss": 0.2066, + "step": 5600, + "task_loss": 0.10303452610969543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02510383166372776, + "epoch": 2.67, + "learning_rate": 9.34726840855107e-06, + "loss": 0.0728, + "step": 5610, + "task_loss": 0.004608385264873505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2674543261528015, + "epoch": 2.67, + "learning_rate": 9.328266033254158e-06, + "loss": 0.146, + "step": 5620, + "task_loss": 0.2036900818347931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2999550402164459, + "epoch": 2.67, + "learning_rate": 9.309263657957246e-06, + "loss": 0.1655, + "step": 5630, + "task_loss": 0.1722106784582138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08812573552131653, + "epoch": 2.68, + "learning_rate": 9.290261282660334e-06, + "loss": 0.1105, + "step": 5640, + "task_loss": 0.041734665632247925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014322986826300621, + "epoch": 2.68, + "learning_rate": 9.27125890736342e-06, + "loss": 0.0986, + "step": 5650, + "task_loss": 0.11941255629062653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12577423453330994, + "epoch": 2.69, + "learning_rate": 9.25225653206651e-06, + "loss": 0.1373, + "step": 5660, + "task_loss": 0.3789353370666504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0945727676153183, + "epoch": 2.69, + "learning_rate": 9.233254156769596e-06, + "loss": 0.1489, + "step": 5670, + "task_loss": 0.14805974066257477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.27903738617897034, + "epoch": 2.7, + "learning_rate": 9.214251781472685e-06, + "loss": 0.1515, + "step": 5680, + "task_loss": 0.14870142936706543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1626128852367401, + "epoch": 2.7, + "learning_rate": 9.195249406175773e-06, + "loss": 0.149, + "step": 5690, + "task_loss": 0.08122530579566956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.055356357246637344, + "epoch": 2.71, + "learning_rate": 9.176247030878861e-06, + "loss": 0.1217, + "step": 5700, + "task_loss": 0.16854631900787354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1662861853837967, + "epoch": 2.71, + "learning_rate": 9.157244655581949e-06, + "loss": 0.1675, + "step": 5710, + "task_loss": 0.013219501823186874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.20596551895141602, + "epoch": 2.72, + "learning_rate": 9.138242280285037e-06, + "loss": 0.2061, + "step": 5720, + "task_loss": 0.1961277425289154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2168876677751541, + "epoch": 2.72, + "learning_rate": 9.119239904988125e-06, + "loss": 0.1496, + "step": 5730, + "task_loss": 0.2229800522327423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.044798918068408966, + "epoch": 2.73, + "learning_rate": 9.100237529691213e-06, + "loss": 0.114, + "step": 5740, + "task_loss": 0.005338538438081741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13803905248641968, + "epoch": 2.73, + "learning_rate": 9.0812351543943e-06, + "loss": 0.066, + "step": 5750, + "task_loss": 0.06024959683418274 + }, + { + "epoch": 2.73, + "eval_accuracy": 0.9174311926605505, + "eval_loss": 0.26952359080314636, + "eval_runtime": 22.0206, + "eval_samples_per_second": 39.599, + "eval_steps_per_second": 4.95, + "step": 5750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.009917501360177994, + "epoch": 2.74, + "learning_rate": 9.062232779097387e-06, + "loss": 0.1051, + "step": 5760, + "task_loss": 0.0061325803399086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08573028445243835, + "epoch": 2.74, + "learning_rate": 9.043230403800476e-06, + "loss": 0.1055, + "step": 5770, + "task_loss": 0.05740174278616905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07769911736249924, + "epoch": 2.75, + "learning_rate": 9.024228028503564e-06, + "loss": 0.1586, + "step": 5780, + "task_loss": 0.019947297871112823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04432743787765503, + "epoch": 2.75, + "learning_rate": 9.005225653206652e-06, + "loss": 0.1275, + "step": 5790, + "task_loss": 0.17999230325222015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1737172156572342, + "epoch": 2.76, + "learning_rate": 8.98622327790974e-06, + "loss": 0.2693, + "step": 5800, + "task_loss": 0.18654870986938477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021758923307061195, + "epoch": 2.76, + "learning_rate": 8.967220902612828e-06, + "loss": 0.119, + "step": 5810, + "task_loss": 0.0038488097488880157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.011074014008045197, + "epoch": 2.76, + "learning_rate": 8.948218527315915e-06, + "loss": 0.1163, + "step": 5820, + "task_loss": 0.0033141709864139557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23885256052017212, + "epoch": 2.77, + "learning_rate": 8.929216152019003e-06, + "loss": 0.1698, + "step": 5830, + "task_loss": 0.5643174052238464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.046549778431653976, + "epoch": 2.77, + "learning_rate": 8.910213776722091e-06, + "loss": 0.0801, + "step": 5840, + "task_loss": 0.00456850603222847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09816709160804749, + "epoch": 2.78, + "learning_rate": 8.891211401425179e-06, + "loss": 0.1631, + "step": 5850, + "task_loss": 0.12905465066432953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08407769352197647, + "epoch": 2.78, + "learning_rate": 8.872209026128267e-06, + "loss": 0.1879, + "step": 5860, + "task_loss": 0.03167784959077835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2762501537799835, + "epoch": 2.79, + "learning_rate": 8.853206650831355e-06, + "loss": 0.1614, + "step": 5870, + "task_loss": 0.16979815065860748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12890750169754028, + "epoch": 2.79, + "learning_rate": 8.834204275534443e-06, + "loss": 0.1169, + "step": 5880, + "task_loss": 0.04023109748959541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02156493254005909, + "epoch": 2.8, + "learning_rate": 8.81520190023753e-06, + "loss": 0.0952, + "step": 5890, + "task_loss": 0.0034607164561748505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.007742735557258129, + "epoch": 2.8, + "learning_rate": 8.796199524940618e-06, + "loss": 0.1524, + "step": 5900, + "task_loss": 0.0028364397585392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1480652093887329, + "epoch": 2.81, + "learning_rate": 8.777197149643706e-06, + "loss": 0.1092, + "step": 5910, + "task_loss": 0.29450657963752747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17767339944839478, + "epoch": 2.81, + "learning_rate": 8.758194774346794e-06, + "loss": 0.2108, + "step": 5920, + "task_loss": 0.10835998505353928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.26279133558273315, + "epoch": 2.82, + "learning_rate": 8.739192399049882e-06, + "loss": 0.169, + "step": 5930, + "task_loss": 0.23328514397144318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015537131577730179, + "epoch": 2.82, + "learning_rate": 8.72019002375297e-06, + "loss": 0.1353, + "step": 5940, + "task_loss": 0.02001919597387314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07597079128026962, + "epoch": 2.83, + "learning_rate": 8.701187648456058e-06, + "loss": 0.0961, + "step": 5950, + "task_loss": 0.11477569490671158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11929579079151154, + "epoch": 2.83, + "learning_rate": 8.682185273159146e-06, + "loss": 0.1904, + "step": 5960, + "task_loss": 0.04400225356221199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.24032776057720184, + "epoch": 2.84, + "learning_rate": 8.663182897862233e-06, + "loss": 0.1713, + "step": 5970, + "task_loss": 0.16706398129463196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.039569027721881866, + "epoch": 2.84, + "learning_rate": 8.644180522565321e-06, + "loss": 0.079, + "step": 5980, + "task_loss": 0.009293723851442337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.14112095534801483, + "epoch": 2.85, + "learning_rate": 8.625178147268409e-06, + "loss": 0.1912, + "step": 5990, + "task_loss": 0.12100718915462494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2663877308368683, + "epoch": 2.85, + "learning_rate": 8.606175771971497e-06, + "loss": 0.1285, + "step": 6000, + "task_loss": 0.1642446219921112 + }, + { + "epoch": 2.85, + "eval_accuracy": 0.9094036697247706, + "eval_loss": 0.3016970455646515, + "eval_runtime": 21.9287, + "eval_samples_per_second": 39.765, + "eval_steps_per_second": 4.971, + "step": 6000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02504299208521843, + "epoch": 2.86, + "learning_rate": 8.587173396674585e-06, + "loss": 0.0738, + "step": 6010, + "task_loss": 0.004753179848194122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05768204480409622, + "epoch": 2.86, + "learning_rate": 8.570071258907364e-06, + "loss": 0.1474, + "step": 6020, + "task_loss": 0.03982783481478691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04381980746984482, + "epoch": 2.86, + "learning_rate": 8.551068883610452e-06, + "loss": 0.0811, + "step": 6030, + "task_loss": 0.014950472861528397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10156150907278061, + "epoch": 2.87, + "learning_rate": 8.53206650831354e-06, + "loss": 0.0831, + "step": 6040, + "task_loss": 0.10873029381036758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07243537902832031, + "epoch": 2.87, + "learning_rate": 8.513064133016627e-06, + "loss": 0.146, + "step": 6050, + "task_loss": 0.029440071433782578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.27121058106422424, + "epoch": 2.88, + "learning_rate": 8.494061757719715e-06, + "loss": 0.1812, + "step": 6060, + "task_loss": 0.30982720851898193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23871323466300964, + "epoch": 2.88, + "learning_rate": 8.475059382422803e-06, + "loss": 0.1833, + "step": 6070, + "task_loss": 0.17489267885684967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12207458168268204, + "epoch": 2.89, + "learning_rate": 8.456057007125893e-06, + "loss": 0.1351, + "step": 6080, + "task_loss": 0.008778557181358337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09064328670501709, + "epoch": 2.89, + "learning_rate": 8.437054631828979e-06, + "loss": 0.0597, + "step": 6090, + "task_loss": 0.026089288294315338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.28841766715049744, + "epoch": 2.9, + "learning_rate": 8.418052256532068e-06, + "loss": 0.162, + "step": 6100, + "task_loss": 0.1540592610836029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.049679264426231384, + "epoch": 2.9, + "learning_rate": 8.399049881235155e-06, + "loss": 0.0938, + "step": 6110, + "task_loss": 0.012325655668973923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.031312961131334305, + "epoch": 2.91, + "learning_rate": 8.380047505938242e-06, + "loss": 0.1657, + "step": 6120, + "task_loss": 0.003661230206489563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0725235864520073, + "epoch": 2.91, + "learning_rate": 8.36104513064133e-06, + "loss": 0.142, + "step": 6130, + "task_loss": 0.029958881437778473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19128333032131195, + "epoch": 2.92, + "learning_rate": 8.342042755344418e-06, + "loss": 0.1463, + "step": 6140, + "task_loss": 0.11310219764709473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024800153449177742, + "epoch": 2.92, + "learning_rate": 8.323040380047506e-06, + "loss": 0.1464, + "step": 6150, + "task_loss": 0.007242865860462189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0960264801979065, + "epoch": 2.93, + "learning_rate": 8.304038004750594e-06, + "loss": 0.1254, + "step": 6160, + "task_loss": 0.10413940250873566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.029338371008634567, + "epoch": 2.93, + "learning_rate": 8.285035629453683e-06, + "loss": 0.1295, + "step": 6170, + "task_loss": 0.010850280523300171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.40849167108535767, + "epoch": 2.94, + "learning_rate": 8.26603325415677e-06, + "loss": 0.1832, + "step": 6180, + "task_loss": 0.32399341464042664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017780784517526627, + "epoch": 2.94, + "learning_rate": 8.247030878859859e-06, + "loss": 0.1762, + "step": 6190, + "task_loss": 0.004788093268871307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.20082998275756836, + "epoch": 2.95, + "learning_rate": 8.228028503562945e-06, + "loss": 0.1738, + "step": 6200, + "task_loss": 0.2769722044467926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09029912948608398, + "epoch": 2.95, + "learning_rate": 8.209026128266035e-06, + "loss": 0.1472, + "step": 6210, + "task_loss": 0.22967661917209625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03564874455332756, + "epoch": 2.95, + "learning_rate": 8.190023752969121e-06, + "loss": 0.1332, + "step": 6220, + "task_loss": 0.011339064687490463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06550465524196625, + "epoch": 2.96, + "learning_rate": 8.171021377672209e-06, + "loss": 0.0727, + "step": 6230, + "task_loss": 0.03282541409134865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021643269807100296, + "epoch": 2.96, + "learning_rate": 8.152019002375298e-06, + "loss": 0.1414, + "step": 6240, + "task_loss": 0.00811653584241867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2023080289363861, + "epoch": 2.97, + "learning_rate": 8.133016627078385e-06, + "loss": 0.1813, + "step": 6250, + "task_loss": 0.1566222906112671 + }, + { + "epoch": 2.97, + "eval_accuracy": 0.9105504587155964, + "eval_loss": 0.347153902053833, + "eval_runtime": 22.058, + "eval_samples_per_second": 39.532, + "eval_steps_per_second": 4.942, + "step": 6250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.40393316745758057, + "epoch": 2.97, + "learning_rate": 8.114014251781474e-06, + "loss": 0.2437, + "step": 6260, + "task_loss": 0.2824239134788513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04513658583164215, + "epoch": 2.98, + "learning_rate": 8.09501187648456e-06, + "loss": 0.1042, + "step": 6270, + "task_loss": 0.007071588188409805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01414411049336195, + "epoch": 2.98, + "learning_rate": 8.07600950118765e-06, + "loss": 0.0815, + "step": 6280, + "task_loss": 0.010898426175117493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.27671492099761963, + "epoch": 2.99, + "learning_rate": 8.057007125890736e-06, + "loss": 0.1583, + "step": 6290, + "task_loss": 0.4247187376022339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04388013109564781, + "epoch": 2.99, + "learning_rate": 8.038004750593826e-06, + "loss": 0.1534, + "step": 6300, + "task_loss": 0.009620524942874908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04090433940291405, + "epoch": 3.0, + "learning_rate": 8.019002375296912e-06, + "loss": 0.1315, + "step": 6310, + "task_loss": 0.004594910889863968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026270296424627304, + "epoch": 3.0, + "learning_rate": 8.000000000000001e-06, + "loss": 0.1069, + "step": 6320, + "task_loss": 0.0037337057292461395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16657724976539612, + "epoch": 3.01, + "learning_rate": 7.98099762470309e-06, + "loss": 0.0828, + "step": 6330, + "task_loss": 0.17973756790161133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.022658517584204674, + "epoch": 3.01, + "learning_rate": 7.961995249406177e-06, + "loss": 0.095, + "step": 6340, + "task_loss": 0.008285708725452423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03448961302638054, + "epoch": 3.02, + "learning_rate": 7.942992874109265e-06, + "loss": 0.0677, + "step": 6350, + "task_loss": 0.02081955224275589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020961280912160873, + "epoch": 3.02, + "learning_rate": 7.923990498812351e-06, + "loss": 0.0523, + "step": 6360, + "task_loss": 0.24772392213344574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09194082766771317, + "epoch": 3.03, + "learning_rate": 7.90498812351544e-06, + "loss": 0.0895, + "step": 6370, + "task_loss": 0.3988223373889923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.15075421333312988, + "epoch": 3.03, + "learning_rate": 7.885985748218527e-06, + "loss": 0.1081, + "step": 6380, + "task_loss": 0.06303508579730988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012541974894702435, + "epoch": 3.04, + "learning_rate": 7.866983372921616e-06, + "loss": 0.0863, + "step": 6390, + "task_loss": 0.004561152309179306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021147001534700394, + "epoch": 3.04, + "learning_rate": 7.847980997624704e-06, + "loss": 0.0674, + "step": 6400, + "task_loss": 0.0043773651123046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016194012016057968, + "epoch": 3.05, + "learning_rate": 7.828978622327792e-06, + "loss": 0.1634, + "step": 6410, + "task_loss": 0.0029691122472286224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04040956124663353, + "epoch": 3.05, + "learning_rate": 7.80997624703088e-06, + "loss": 0.0932, + "step": 6420, + "task_loss": 0.006615336984395981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1302649974822998, + "epoch": 3.05, + "learning_rate": 7.790973871733968e-06, + "loss": 0.0852, + "step": 6430, + "task_loss": 0.08950361609458923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026604020968079567, + "epoch": 3.06, + "learning_rate": 7.771971496437056e-06, + "loss": 0.0547, + "step": 6440, + "task_loss": 0.19832749664783478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1625809371471405, + "epoch": 3.06, + "learning_rate": 7.752969121140144e-06, + "loss": 0.0843, + "step": 6450, + "task_loss": 0.07263679802417755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3386070132255554, + "epoch": 3.07, + "learning_rate": 7.733966745843231e-06, + "loss": 0.1032, + "step": 6460, + "task_loss": 0.3932771682739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02958713099360466, + "epoch": 3.07, + "learning_rate": 7.714964370546318e-06, + "loss": 0.0915, + "step": 6470, + "task_loss": 0.002492375671863556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03094402514398098, + "epoch": 3.08, + "learning_rate": 7.695961995249407e-06, + "loss": 0.1083, + "step": 6480, + "task_loss": 0.006007764488458633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020859267562627792, + "epoch": 3.08, + "learning_rate": 7.676959619952495e-06, + "loss": 0.0465, + "step": 6490, + "task_loss": 0.00389765202999115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17978915572166443, + "epoch": 3.09, + "learning_rate": 7.657957244655583e-06, + "loss": 0.078, + "step": 6500, + "task_loss": 0.09985024482011795 + }, + { + "epoch": 3.09, + "eval_accuracy": 0.9139908256880734, + "eval_loss": 0.2914510667324066, + "eval_runtime": 22.4643, + "eval_samples_per_second": 38.817, + "eval_steps_per_second": 4.852, + "step": 6500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07285353541374207, + "epoch": 3.09, + "learning_rate": 7.63895486935867e-06, + "loss": 0.0599, + "step": 6510, + "task_loss": 0.21172893047332764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016873065382242203, + "epoch": 3.1, + "learning_rate": 7.619952494061759e-06, + "loss": 0.0994, + "step": 6520, + "task_loss": 0.1662091165781021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.15915945172309875, + "epoch": 3.1, + "learning_rate": 7.600950118764846e-06, + "loss": 0.1219, + "step": 6530, + "task_loss": 0.2617415487766266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16674910485744476, + "epoch": 3.11, + "learning_rate": 7.581947743467934e-06, + "loss": 0.0909, + "step": 6540, + "task_loss": 0.12426068633794785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0346527174115181, + "epoch": 3.11, + "learning_rate": 7.562945368171022e-06, + "loss": 0.0854, + "step": 6550, + "task_loss": 0.012002792209386826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08293355256319046, + "epoch": 3.12, + "learning_rate": 7.54394299287411e-06, + "loss": 0.0816, + "step": 6560, + "task_loss": 0.08816082775592804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.054979730397462845, + "epoch": 3.12, + "learning_rate": 7.524940617577198e-06, + "loss": 0.0739, + "step": 6570, + "task_loss": 0.5423688292503357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021432993933558464, + "epoch": 3.13, + "learning_rate": 7.505938242280285e-06, + "loss": 0.0644, + "step": 6580, + "task_loss": 0.005201835185289383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11038654297590256, + "epoch": 3.13, + "learning_rate": 7.486935866983374e-06, + "loss": 0.0679, + "step": 6590, + "task_loss": 0.10889068990945816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.011515101417899132, + "epoch": 3.14, + "learning_rate": 7.467933491686461e-06, + "loss": 0.1239, + "step": 6600, + "task_loss": 0.0029702894389629364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.027960218489170074, + "epoch": 3.14, + "learning_rate": 7.448931116389549e-06, + "loss": 0.0788, + "step": 6610, + "task_loss": 0.005036883056163788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018560441210865974, + "epoch": 3.14, + "learning_rate": 7.429928741092637e-06, + "loss": 0.0824, + "step": 6620, + "task_loss": 0.009363945573568344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08277207612991333, + "epoch": 3.15, + "learning_rate": 7.410926365795725e-06, + "loss": 0.145, + "step": 6630, + "task_loss": 0.23960661888122559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024465510621666908, + "epoch": 3.15, + "learning_rate": 7.391923990498813e-06, + "loss": 0.0744, + "step": 6640, + "task_loss": 0.09361692517995834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.28370052576065063, + "epoch": 3.16, + "learning_rate": 7.372921615201901e-06, + "loss": 0.0718, + "step": 6650, + "task_loss": 0.18601828813552856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1300351619720459, + "epoch": 3.16, + "learning_rate": 7.353919239904989e-06, + "loss": 0.1288, + "step": 6660, + "task_loss": 0.06523597240447998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04163939133286476, + "epoch": 3.17, + "learning_rate": 7.334916864608077e-06, + "loss": 0.0974, + "step": 6670, + "task_loss": 0.17136383056640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018006887286901474, + "epoch": 3.17, + "learning_rate": 7.315914489311164e-06, + "loss": 0.0903, + "step": 6680, + "task_loss": 0.005319155752658844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.24133111536502838, + "epoch": 3.18, + "learning_rate": 7.296912114014253e-06, + "loss": 0.1113, + "step": 6690, + "task_loss": 0.2049761414527893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.18800024688243866, + "epoch": 3.18, + "learning_rate": 7.27790973871734e-06, + "loss": 0.0828, + "step": 6700, + "task_loss": 0.10774320363998413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05912996828556061, + "epoch": 3.19, + "learning_rate": 7.258907363420428e-06, + "loss": 0.1459, + "step": 6710, + "task_loss": 0.024521011859178543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.042738813906908035, + "epoch": 3.19, + "learning_rate": 7.239904988123516e-06, + "loss": 0.0925, + "step": 6720, + "task_loss": 0.00665607675909996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08050870150327682, + "epoch": 3.2, + "learning_rate": 7.220902612826604e-06, + "loss": 0.0643, + "step": 6730, + "task_loss": 0.22327980399131775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23609286546707153, + "epoch": 3.2, + "learning_rate": 7.201900237529692e-06, + "loss": 0.1105, + "step": 6740, + "task_loss": 0.15705671906471252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.00854767207056284, + "epoch": 3.21, + "learning_rate": 7.1828978622327794e-06, + "loss": 0.0886, + "step": 6750, + "task_loss": 0.003407653421163559 + }, + { + "epoch": 3.21, + "eval_accuracy": 0.9151376146788991, + "eval_loss": 0.28525349497795105, + "eval_runtime": 22.3514, + "eval_samples_per_second": 39.013, + "eval_steps_per_second": 4.877, + "step": 6750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014828482642769814, + "epoch": 3.21, + "learning_rate": 7.163895486935868e-06, + "loss": 0.0819, + "step": 6760, + "task_loss": 0.22838379442691803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05267515778541565, + "epoch": 3.22, + "learning_rate": 7.144893111638955e-06, + "loss": 0.0694, + "step": 6770, + "task_loss": 0.17177042365074158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11514155566692352, + "epoch": 3.22, + "learning_rate": 7.125890736342044e-06, + "loss": 0.1022, + "step": 6780, + "task_loss": 0.04417066648602486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10625005513429642, + "epoch": 3.23, + "learning_rate": 7.106888361045131e-06, + "loss": 0.0949, + "step": 6790, + "task_loss": 0.06190282851457596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23833660781383514, + "epoch": 3.23, + "learning_rate": 7.08788598574822e-06, + "loss": 0.1509, + "step": 6800, + "task_loss": 0.3238193392753601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.060429543256759644, + "epoch": 3.24, + "learning_rate": 7.068883610451307e-06, + "loss": 0.0693, + "step": 6810, + "task_loss": 0.3467991352081299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05774849280714989, + "epoch": 3.24, + "learning_rate": 7.0498812351543945e-06, + "loss": 0.0708, + "step": 6820, + "task_loss": 0.02291129156947136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018570270389318466, + "epoch": 3.24, + "learning_rate": 7.030878859857483e-06, + "loss": 0.066, + "step": 6830, + "task_loss": 0.005066726356744766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06323438882827759, + "epoch": 3.25, + "learning_rate": 7.01187648456057e-06, + "loss": 0.0893, + "step": 6840, + "task_loss": 0.04271535202860832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02259252592921257, + "epoch": 3.25, + "learning_rate": 6.992874109263659e-06, + "loss": 0.0495, + "step": 6850, + "task_loss": 0.08561189472675323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03591727092862129, + "epoch": 3.26, + "learning_rate": 6.973871733966746e-06, + "loss": 0.0788, + "step": 6860, + "task_loss": 0.005202621221542358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03306454047560692, + "epoch": 3.26, + "learning_rate": 6.954869358669835e-06, + "loss": 0.1167, + "step": 6870, + "task_loss": 0.0036557093262672424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10268350690603256, + "epoch": 3.27, + "learning_rate": 6.935866983372922e-06, + "loss": 0.0842, + "step": 6880, + "task_loss": 0.11851201951503754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03656313568353653, + "epoch": 3.27, + "learning_rate": 6.91686460807601e-06, + "loss": 0.0654, + "step": 6890, + "task_loss": 0.009406276047229767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07688532024621964, + "epoch": 3.28, + "learning_rate": 6.897862232779098e-06, + "loss": 0.0885, + "step": 6900, + "task_loss": 0.09606030583381653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021331768482923508, + "epoch": 3.28, + "learning_rate": 6.878859857482186e-06, + "loss": 0.1026, + "step": 6910, + "task_loss": 0.003994014114141464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03601570054888725, + "epoch": 3.29, + "learning_rate": 6.859857482185274e-06, + "loss": 0.1132, + "step": 6920, + "task_loss": 0.1425406038761139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.054355375468730927, + "epoch": 3.29, + "learning_rate": 6.840855106888361e-06, + "loss": 0.1151, + "step": 6930, + "task_loss": 0.1992948204278946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026305008679628372, + "epoch": 3.3, + "learning_rate": 6.82185273159145e-06, + "loss": 0.0999, + "step": 6940, + "task_loss": 0.008063357323408127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.14803074300289154, + "epoch": 3.3, + "learning_rate": 6.802850356294537e-06, + "loss": 0.0867, + "step": 6950, + "task_loss": 0.21300971508026123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10545548796653748, + "epoch": 3.31, + "learning_rate": 6.783847980997625e-06, + "loss": 0.0741, + "step": 6960, + "task_loss": 0.06187749281525612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2063077986240387, + "epoch": 3.31, + "learning_rate": 6.764845605700712e-06, + "loss": 0.1095, + "step": 6970, + "task_loss": 0.27536579966545105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012730289250612259, + "epoch": 3.32, + "learning_rate": 6.745843230403801e-06, + "loss": 0.1128, + "step": 6980, + "task_loss": 0.005988124758005142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024963170289993286, + "epoch": 3.32, + "learning_rate": 6.726840855106889e-06, + "loss": 0.1089, + "step": 6990, + "task_loss": 0.006193935871124268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1125224307179451, + "epoch": 3.33, + "learning_rate": 6.707838479809977e-06, + "loss": 0.117, + "step": 7000, + "task_loss": 0.25286245346069336 + }, + { + "epoch": 3.33, + "eval_accuracy": 0.9185779816513762, + "eval_loss": 0.2689138948917389, + "eval_runtime": 22.4717, + "eval_samples_per_second": 38.804, + "eval_steps_per_second": 4.851, + "step": 7000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07371003925800323, + "epoch": 3.33, + "learning_rate": 6.688836104513065e-06, + "loss": 0.0961, + "step": 7010, + "task_loss": 0.0624442957341671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08843027800321579, + "epoch": 3.33, + "learning_rate": 6.669833729216153e-06, + "loss": 0.053, + "step": 7020, + "task_loss": 0.22307521104812622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01580330729484558, + "epoch": 3.34, + "learning_rate": 6.6508313539192404e-06, + "loss": 0.0582, + "step": 7030, + "task_loss": 0.021607249975204468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3695874810218811, + "epoch": 3.34, + "learning_rate": 6.631828978622329e-06, + "loss": 0.1369, + "step": 7040, + "task_loss": 0.4205509424209595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.025944426655769348, + "epoch": 3.35, + "learning_rate": 6.612826603325416e-06, + "loss": 0.054, + "step": 7050, + "task_loss": 0.18624553084373474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.26268765330314636, + "epoch": 3.35, + "learning_rate": 6.593824228028504e-06, + "loss": 0.1181, + "step": 7060, + "task_loss": 0.2383035570383072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018553506582975388, + "epoch": 3.36, + "learning_rate": 6.574821852731592e-06, + "loss": 0.1264, + "step": 7070, + "task_loss": 0.002540022134780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.30001404881477356, + "epoch": 3.36, + "learning_rate": 6.55581947743468e-06, + "loss": 0.1015, + "step": 7080, + "task_loss": 0.4227195382118225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017469489946961403, + "epoch": 3.37, + "learning_rate": 6.536817102137768e-06, + "loss": 0.0563, + "step": 7090, + "task_loss": 0.005384139716625214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2690996825695038, + "epoch": 3.37, + "learning_rate": 6.5178147268408555e-06, + "loss": 0.0829, + "step": 7100, + "task_loss": 0.19411098957061768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012529075145721436, + "epoch": 3.38, + "learning_rate": 6.498812351543944e-06, + "loss": 0.0957, + "step": 7110, + "task_loss": 0.00231257826089859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015447025187313557, + "epoch": 3.38, + "learning_rate": 6.479809976247031e-06, + "loss": 0.0599, + "step": 7120, + "task_loss": 0.0031574219465255737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016959063708782196, + "epoch": 3.39, + "learning_rate": 6.46080760095012e-06, + "loss": 0.0663, + "step": 7130, + "task_loss": 0.15422838926315308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015068082138895988, + "epoch": 3.39, + "learning_rate": 6.441805225653207e-06, + "loss": 0.0898, + "step": 7140, + "task_loss": 0.004132535308599472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2646453082561493, + "epoch": 3.4, + "learning_rate": 6.422802850356296e-06, + "loss": 0.0993, + "step": 7150, + "task_loss": 0.17641893029212952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09765015542507172, + "epoch": 3.4, + "learning_rate": 6.403800475059383e-06, + "loss": 0.0545, + "step": 7160, + "task_loss": 0.0811493992805481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.042697787284851074, + "epoch": 3.41, + "learning_rate": 6.3847980997624705e-06, + "loss": 0.0704, + "step": 7170, + "task_loss": 0.21445339918136597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23853538930416107, + "epoch": 3.41, + "learning_rate": 6.365795724465559e-06, + "loss": 0.0793, + "step": 7180, + "task_loss": 0.15609657764434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09077732264995575, + "epoch": 3.42, + "learning_rate": 6.346793349168646e-06, + "loss": 0.1064, + "step": 7190, + "task_loss": 0.3665331304073334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021392345428466797, + "epoch": 3.42, + "learning_rate": 6.327790973871735e-06, + "loss": 0.1384, + "step": 7200, + "task_loss": 0.18095025420188904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2543436586856842, + "epoch": 3.43, + "learning_rate": 6.308788598574822e-06, + "loss": 0.1032, + "step": 7210, + "task_loss": 0.13427653908729553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02127661183476448, + "epoch": 3.43, + "learning_rate": 6.289786223277911e-06, + "loss": 0.1165, + "step": 7220, + "task_loss": 0.004494883120059967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.029238495975732803, + "epoch": 3.43, + "learning_rate": 6.270783847980998e-06, + "loss": 0.0929, + "step": 7230, + "task_loss": 0.0037034451961517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0373212993144989, + "epoch": 3.44, + "learning_rate": 6.251781472684086e-06, + "loss": 0.1337, + "step": 7240, + "task_loss": 0.005556315183639526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23062750697135925, + "epoch": 3.44, + "learning_rate": 6.232779097387173e-06, + "loss": 0.0894, + "step": 7250, + "task_loss": 0.34356802701950073 + }, + { + "epoch": 3.44, + "eval_accuracy": 0.9174311926605505, + "eval_loss": 0.27475783228874207, + "eval_runtime": 22.2988, + "eval_samples_per_second": 39.105, + "eval_steps_per_second": 4.888, + "step": 7250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12996672093868256, + "epoch": 3.45, + "learning_rate": 6.213776722090262e-06, + "loss": 0.0485, + "step": 7260, + "task_loss": 0.1863107681274414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19036176800727844, + "epoch": 3.45, + "learning_rate": 6.19477434679335e-06, + "loss": 0.0722, + "step": 7270, + "task_loss": 0.15313661098480225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.24250520765781403, + "epoch": 3.46, + "learning_rate": 6.175771971496437e-06, + "loss": 0.087, + "step": 7280, + "task_loss": 0.34011194109916687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13349878787994385, + "epoch": 3.46, + "learning_rate": 6.156769596199526e-06, + "loss": 0.0665, + "step": 7290, + "task_loss": 0.2456045001745224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03384882211685181, + "epoch": 3.47, + "learning_rate": 6.137767220902613e-06, + "loss": 0.1172, + "step": 7300, + "task_loss": 0.0046829357743263245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02070469781756401, + "epoch": 3.47, + "learning_rate": 6.1187648456057014e-06, + "loss": 0.1301, + "step": 7310, + "task_loss": 0.005317840725183487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06323584169149399, + "epoch": 3.48, + "learning_rate": 6.0997624703087884e-06, + "loss": 0.0884, + "step": 7320, + "task_loss": 0.01452043280005455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.059165108948946, + "epoch": 3.48, + "learning_rate": 6.080760095011877e-06, + "loss": 0.0623, + "step": 7330, + "task_loss": 0.02391085773706436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11576695740222931, + "epoch": 3.49, + "learning_rate": 6.061757719714965e-06, + "loss": 0.0891, + "step": 7340, + "task_loss": 0.19836050271987915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012924795970320702, + "epoch": 3.49, + "learning_rate": 6.042755344418053e-06, + "loss": 0.0451, + "step": 7350, + "task_loss": 0.13838058710098267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.029066815972328186, + "epoch": 3.5, + "learning_rate": 6.023752969121141e-06, + "loss": 0.1052, + "step": 7360, + "task_loss": 0.01864166557788849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13578568398952484, + "epoch": 3.5, + "learning_rate": 6.004750593824229e-06, + "loss": 0.1078, + "step": 7370, + "task_loss": 0.08328451216220856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.23648998141288757, + "epoch": 3.51, + "learning_rate": 5.9857482185273165e-06, + "loss": 0.0746, + "step": 7380, + "task_loss": 0.22922304272651672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09257731586694717, + "epoch": 3.51, + "learning_rate": 5.9667458432304035e-06, + "loss": 0.0938, + "step": 7390, + "task_loss": 0.09933258593082428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17856569588184357, + "epoch": 3.52, + "learning_rate": 5.947743467933492e-06, + "loss": 0.1055, + "step": 7400, + "task_loss": 0.15133805572986603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.009972669184207916, + "epoch": 3.52, + "learning_rate": 5.928741092636579e-06, + "loss": 0.0526, + "step": 7410, + "task_loss": 0.003201443701982498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08634195476770401, + "epoch": 3.52, + "learning_rate": 5.909738717339668e-06, + "loss": 0.1139, + "step": 7420, + "task_loss": 0.1812535524368286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013718021102249622, + "epoch": 3.53, + "learning_rate": 5.890736342042756e-06, + "loss": 0.0834, + "step": 7430, + "task_loss": 0.0068436190485954285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015075819566845894, + "epoch": 3.53, + "learning_rate": 5.871733966745844e-06, + "loss": 0.0571, + "step": 7440, + "task_loss": 0.13153530657291412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1445593535900116, + "epoch": 3.54, + "learning_rate": 5.8527315914489315e-06, + "loss": 0.0546, + "step": 7450, + "task_loss": 0.07850364595651627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1473163664340973, + "epoch": 3.54, + "learning_rate": 5.83372921615202e-06, + "loss": 0.1104, + "step": 7460, + "task_loss": 0.15125451982021332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06551199406385422, + "epoch": 3.55, + "learning_rate": 5.814726840855107e-06, + "loss": 0.0888, + "step": 7470, + "task_loss": 0.034045103937387466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.21904179453849792, + "epoch": 3.55, + "learning_rate": 5.795724465558196e-06, + "loss": 0.098, + "step": 7480, + "task_loss": 0.2412600815296173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.031937625259160995, + "epoch": 3.56, + "learning_rate": 5.776722090261283e-06, + "loss": 0.0702, + "step": 7490, + "task_loss": 0.015268594026565552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06982429325580597, + "epoch": 3.56, + "learning_rate": 5.757719714964372e-06, + "loss": 0.1023, + "step": 7500, + "task_loss": 0.3576207160949707 + }, + { + "epoch": 3.56, + "eval_accuracy": 0.9094036697247706, + "eval_loss": 0.32788407802581787, + "eval_runtime": 21.9451, + "eval_samples_per_second": 39.736, + "eval_steps_per_second": 4.967, + "step": 7500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.029036687687039375, + "epoch": 3.57, + "learning_rate": 5.738717339667459e-06, + "loss": 0.1211, + "step": 7510, + "task_loss": 0.014738757163286209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021804381161928177, + "epoch": 3.57, + "learning_rate": 5.7197149643705466e-06, + "loss": 0.0494, + "step": 7520, + "task_loss": 0.005628753453493118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017943602055311203, + "epoch": 3.58, + "learning_rate": 5.700712589073634e-06, + "loss": 0.0556, + "step": 7530, + "task_loss": 0.17601385712623596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04630360007286072, + "epoch": 3.58, + "learning_rate": 5.681710213776722e-06, + "loss": 0.092, + "step": 7540, + "task_loss": 0.07351444661617279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19516371190547943, + "epoch": 3.59, + "learning_rate": 5.662707838479811e-06, + "loss": 0.102, + "step": 7550, + "task_loss": 0.11753221601247787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.075057253241539, + "epoch": 3.59, + "learning_rate": 5.643705463182898e-06, + "loss": 0.0556, + "step": 7560, + "task_loss": 0.004980906844139099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04576558992266655, + "epoch": 3.6, + "learning_rate": 5.624703087885987e-06, + "loss": 0.1244, + "step": 7570, + "task_loss": 0.022598903626203537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.008466158993542194, + "epoch": 3.6, + "learning_rate": 5.605700712589074e-06, + "loss": 0.0988, + "step": 7580, + "task_loss": 0.0024762973189353943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02449963614344597, + "epoch": 3.61, + "learning_rate": 5.5866983372921624e-06, + "loss": 0.1019, + "step": 7590, + "task_loss": 0.003134731203317642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026412509381771088, + "epoch": 3.61, + "learning_rate": 5.5676959619952495e-06, + "loss": 0.0813, + "step": 7600, + "task_loss": 0.10514649748802185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09414191544055939, + "epoch": 3.62, + "learning_rate": 5.548693586698338e-06, + "loss": 0.1004, + "step": 7610, + "task_loss": 0.15864884853363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16137491166591644, + "epoch": 3.62, + "learning_rate": 5.529691211401426e-06, + "loss": 0.1594, + "step": 7620, + "task_loss": 0.06678696721792221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.081720732152462, + "epoch": 3.62, + "learning_rate": 5.510688836104513e-06, + "loss": 0.0898, + "step": 7630, + "task_loss": 0.13728320598602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09084869921207428, + "epoch": 3.63, + "learning_rate": 5.491686460807602e-06, + "loss": 0.0892, + "step": 7640, + "task_loss": 0.03533271327614784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.008733304217457771, + "epoch": 3.63, + "learning_rate": 5.472684085510689e-06, + "loss": 0.0742, + "step": 7650, + "task_loss": 0.006262246519327164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024746078997850418, + "epoch": 3.64, + "learning_rate": 5.4536817102137775e-06, + "loss": 0.0666, + "step": 7660, + "task_loss": 0.1450251340866089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.17060193419456482, + "epoch": 3.64, + "learning_rate": 5.4346793349168645e-06, + "loss": 0.1122, + "step": 7670, + "task_loss": 0.4199802875518799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07124300301074982, + "epoch": 3.65, + "learning_rate": 5.415676959619953e-06, + "loss": 0.0768, + "step": 7680, + "task_loss": 0.3081812262535095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04165620356798172, + "epoch": 3.65, + "learning_rate": 5.39667458432304e-06, + "loss": 0.0542, + "step": 7690, + "task_loss": 0.008864354342222214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12530671060085297, + "epoch": 3.66, + "learning_rate": 5.377672209026129e-06, + "loss": 0.065, + "step": 7700, + "task_loss": 0.007050979882478714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02896895259618759, + "epoch": 3.66, + "learning_rate": 5.358669833729217e-06, + "loss": 0.0978, + "step": 7710, + "task_loss": 0.008473467081785202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02197858691215515, + "epoch": 3.67, + "learning_rate": 5.339667458432305e-06, + "loss": 0.0708, + "step": 7720, + "task_loss": 0.13779109716415405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.14627870917320251, + "epoch": 3.67, + "learning_rate": 5.3206650831353925e-06, + "loss": 0.0796, + "step": 7730, + "task_loss": 0.11915778368711472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.009426005184650421, + "epoch": 3.68, + "learning_rate": 5.3016627078384795e-06, + "loss": 0.0713, + "step": 7740, + "task_loss": 0.0036111027002334595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026764120906591415, + "epoch": 3.68, + "learning_rate": 5.282660332541568e-06, + "loss": 0.0495, + "step": 7750, + "task_loss": 0.17961205542087555 + }, + { + "epoch": 3.68, + "eval_accuracy": 0.9151376146788991, + "eval_loss": 0.2988388240337372, + "eval_runtime": 21.9907, + "eval_samples_per_second": 39.653, + "eval_steps_per_second": 4.957, + "step": 7750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02125394716858864, + "epoch": 3.69, + "learning_rate": 5.263657957244655e-06, + "loss": 0.0592, + "step": 7760, + "task_loss": 0.13330192863941193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.023334577679634094, + "epoch": 3.69, + "learning_rate": 5.244655581947744e-06, + "loss": 0.1014, + "step": 7770, + "task_loss": 0.2334352731704712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07482799887657166, + "epoch": 3.7, + "learning_rate": 5.225653206650832e-06, + "loss": 0.0642, + "step": 7780, + "task_loss": 0.03224996477365494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06566259264945984, + "epoch": 3.7, + "learning_rate": 5.20665083135392e-06, + "loss": 0.1021, + "step": 7790, + "task_loss": 0.03293333947658539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.20237162709236145, + "epoch": 3.71, + "learning_rate": 5.1876484560570076e-06, + "loss": 0.0681, + "step": 7800, + "task_loss": 0.11539559811353683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020512059330940247, + "epoch": 3.71, + "learning_rate": 5.168646080760095e-06, + "loss": 0.1384, + "step": 7810, + "task_loss": 0.14106139540672302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.033618003129959106, + "epoch": 3.71, + "learning_rate": 5.149643705463183e-06, + "loss": 0.0839, + "step": 7820, + "task_loss": 0.2362319827079773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01834789663553238, + "epoch": 3.72, + "learning_rate": 5.130641330166272e-06, + "loss": 0.0992, + "step": 7830, + "task_loss": 0.06447672098875046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08836167305707932, + "epoch": 3.72, + "learning_rate": 5.111638954869359e-06, + "loss": 0.0935, + "step": 7840, + "task_loss": 0.08710360527038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04010258615016937, + "epoch": 3.73, + "learning_rate": 5.092636579572448e-06, + "loss": 0.1045, + "step": 7850, + "task_loss": 0.14512062072753906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3869872987270355, + "epoch": 3.73, + "learning_rate": 5.073634204275535e-06, + "loss": 0.0924, + "step": 7860, + "task_loss": 0.2303229570388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19745223224163055, + "epoch": 3.74, + "learning_rate": 5.054631828978623e-06, + "loss": 0.0985, + "step": 7870, + "task_loss": 0.14754478633403778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12212712317705154, + "epoch": 3.74, + "learning_rate": 5.0356294536817105e-06, + "loss": 0.1248, + "step": 7880, + "task_loss": 0.09426712989807129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04913119226694107, + "epoch": 3.75, + "learning_rate": 5.016627078384798e-06, + "loss": 0.1346, + "step": 7890, + "task_loss": 0.09092673659324646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12528668344020844, + "epoch": 3.75, + "learning_rate": 4.997624703087887e-06, + "loss": 0.1229, + "step": 7900, + "task_loss": 0.07792092859745026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0339021235704422, + "epoch": 3.76, + "learning_rate": 4.978622327790975e-06, + "loss": 0.0752, + "step": 7910, + "task_loss": 0.015460405498743057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024016963317990303, + "epoch": 3.76, + "learning_rate": 4.959619952494062e-06, + "loss": 0.0512, + "step": 7920, + "task_loss": 0.006217729300260544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04985890910029411, + "epoch": 3.77, + "learning_rate": 4.94061757719715e-06, + "loss": 0.0897, + "step": 7930, + "task_loss": 0.20359504222869873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05362790822982788, + "epoch": 3.77, + "learning_rate": 4.921615201900238e-06, + "loss": 0.0983, + "step": 7940, + "task_loss": 0.017056990414857864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03559091314673424, + "epoch": 3.78, + "learning_rate": 4.9026128266033255e-06, + "loss": 0.1052, + "step": 7950, + "task_loss": 0.020256590098142624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012951750308275223, + "epoch": 3.78, + "learning_rate": 4.883610451306413e-06, + "loss": 0.0661, + "step": 7960, + "task_loss": 0.002787817269563675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018247317522764206, + "epoch": 3.79, + "learning_rate": 4.864608076009501e-06, + "loss": 0.0567, + "step": 7970, + "task_loss": 0.005940131843090057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01746300235390663, + "epoch": 3.79, + "learning_rate": 4.84560570071259e-06, + "loss": 0.0532, + "step": 7980, + "task_loss": 0.14214222133159637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06765209138393402, + "epoch": 3.8, + "learning_rate": 4.826603325415678e-06, + "loss": 0.1438, + "step": 7990, + "task_loss": 0.02867637202143669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08068070560693741, + "epoch": 3.8, + "learning_rate": 4.807600950118766e-06, + "loss": 0.0899, + "step": 8000, + "task_loss": 0.22640740871429443 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.9174311926605505, + "eval_loss": 0.2796386182308197, + "eval_runtime": 22.3683, + "eval_samples_per_second": 38.984, + "eval_steps_per_second": 4.873, + "step": 8000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.025517662987113, + "epoch": 3.81, + "learning_rate": 4.7885985748218535e-06, + "loss": 0.0794, + "step": 8010, + "task_loss": 0.009171344339847565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.030752386897802353, + "epoch": 3.81, + "learning_rate": 4.769596199524941e-06, + "loss": 0.0868, + "step": 8020, + "task_loss": 0.004918545484542847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024365507066249847, + "epoch": 3.81, + "learning_rate": 4.750593824228028e-06, + "loss": 0.0529, + "step": 8030, + "task_loss": 0.05997316166758537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1693132221698761, + "epoch": 3.82, + "learning_rate": 4.731591448931116e-06, + "loss": 0.1007, + "step": 8040, + "task_loss": 0.2038179188966751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.030011361464858055, + "epoch": 3.82, + "learning_rate": 4.712589073634204e-06, + "loss": 0.1164, + "step": 8050, + "task_loss": 0.016198869794607162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03136039152741432, + "epoch": 3.83, + "learning_rate": 4.693586698337293e-06, + "loss": 0.0714, + "step": 8060, + "task_loss": 0.012641217559576035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026943452656269073, + "epoch": 3.83, + "learning_rate": 4.674584323040381e-06, + "loss": 0.0604, + "step": 8070, + "task_loss": 0.003778461366891861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0963515192270279, + "epoch": 3.84, + "learning_rate": 4.6555819477434686e-06, + "loss": 0.0641, + "step": 8080, + "task_loss": 0.1955173909664154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06787940859794617, + "epoch": 3.84, + "learning_rate": 4.636579572446556e-06, + "loss": 0.07, + "step": 8090, + "task_loss": 0.029893912374973297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.18741311132907867, + "epoch": 3.85, + "learning_rate": 4.617577197149644e-06, + "loss": 0.0773, + "step": 8100, + "task_loss": 0.12163101881742477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07996465265750885, + "epoch": 3.85, + "learning_rate": 4.598574821852732e-06, + "loss": 0.1448, + "step": 8110, + "task_loss": 0.04173227399587631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04726814478635788, + "epoch": 3.86, + "learning_rate": 4.57957244655582e-06, + "loss": 0.0407, + "step": 8120, + "task_loss": 0.0074256956577301025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1402328610420227, + "epoch": 3.86, + "learning_rate": 4.560570071258908e-06, + "loss": 0.0503, + "step": 8130, + "task_loss": 0.07385687530040741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2221362441778183, + "epoch": 3.87, + "learning_rate": 4.541567695961996e-06, + "loss": 0.1239, + "step": 8140, + "task_loss": 0.16662901639938354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.027472279965877533, + "epoch": 3.87, + "learning_rate": 4.522565320665084e-06, + "loss": 0.0452, + "step": 8150, + "task_loss": 0.007453102618455887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04676855728030205, + "epoch": 3.88, + "learning_rate": 4.5035629453681715e-06, + "loss": 0.1059, + "step": 8160, + "task_loss": 0.15876612067222595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.025045206770300865, + "epoch": 3.88, + "learning_rate": 4.484560570071259e-06, + "loss": 0.1289, + "step": 8170, + "task_loss": 0.004179120063781738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.011216237209737301, + "epoch": 3.89, + "learning_rate": 4.465558194774347e-06, + "loss": 0.1117, + "step": 8180, + "task_loss": 0.0025112181901931763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013475890271365643, + "epoch": 3.89, + "learning_rate": 4.446555819477435e-06, + "loss": 0.0861, + "step": 8190, + "task_loss": 0.0036102384328842163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.029119327664375305, + "epoch": 3.9, + "learning_rate": 4.427553444180523e-06, + "loss": 0.0486, + "step": 8200, + "task_loss": 0.0045392923057079315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013042573817074299, + "epoch": 3.9, + "learning_rate": 4.408551068883611e-06, + "loss": 0.0941, + "step": 8210, + "task_loss": 0.11527097970247269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02264486812055111, + "epoch": 3.9, + "learning_rate": 4.389548693586699e-06, + "loss": 0.0687, + "step": 8220, + "task_loss": 0.04810720682144165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.007877323776483536, + "epoch": 3.91, + "learning_rate": 4.3705463182897865e-06, + "loss": 0.0728, + "step": 8230, + "task_loss": 0.00500917062163353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.030022092163562775, + "epoch": 3.91, + "learning_rate": 4.351543942992874e-06, + "loss": 0.0973, + "step": 8240, + "task_loss": 0.02026822790503502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.20027883350849152, + "epoch": 3.92, + "learning_rate": 4.332541567695962e-06, + "loss": 0.1102, + "step": 8250, + "task_loss": 0.12834565341472626 + }, + { + "epoch": 3.92, + "eval_accuracy": 0.9162844036697247, + "eval_loss": 0.26672619581222534, + "eval_runtime": 22.291, + "eval_samples_per_second": 39.119, + "eval_steps_per_second": 4.89, + "step": 8250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12070262432098389, + "epoch": 3.92, + "learning_rate": 4.31353919239905e-06, + "loss": 0.1236, + "step": 8260, + "task_loss": 0.2388094961643219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.15468746423721313, + "epoch": 3.93, + "learning_rate": 4.294536817102138e-06, + "loss": 0.1083, + "step": 8270, + "task_loss": 0.23200759291648865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.18423910439014435, + "epoch": 3.93, + "learning_rate": 4.275534441805226e-06, + "loss": 0.0713, + "step": 8280, + "task_loss": 0.09041578322649002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021810609847307205, + "epoch": 3.94, + "learning_rate": 4.256532066508314e-06, + "loss": 0.0409, + "step": 8290, + "task_loss": 0.0035596080124378204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11239130049943924, + "epoch": 3.94, + "learning_rate": 4.2375296912114015e-06, + "loss": 0.0715, + "step": 8300, + "task_loss": 0.18248476088047028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08229388296604156, + "epoch": 3.95, + "learning_rate": 4.218527315914489e-06, + "loss": 0.1268, + "step": 8310, + "task_loss": 0.062466811388731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020841067656874657, + "epoch": 3.95, + "learning_rate": 4.199524940617577e-06, + "loss": 0.0958, + "step": 8320, + "task_loss": 0.01168619841337204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07428955286741257, + "epoch": 3.96, + "learning_rate": 4.180522565320665e-06, + "loss": 0.1132, + "step": 8330, + "task_loss": 0.020049307495355606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.14601218700408936, + "epoch": 3.96, + "learning_rate": 4.161520190023753e-06, + "loss": 0.1099, + "step": 8340, + "task_loss": 0.06591594219207764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014185711741447449, + "epoch": 3.97, + "learning_rate": 4.142517814726842e-06, + "loss": 0.0869, + "step": 8350, + "task_loss": 0.11826062202453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.003288624342530966, + "epoch": 3.97, + "learning_rate": 4.1235154394299296e-06, + "loss": 0.0704, + "step": 8360, + "task_loss": 0.008070297539234161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0797647014260292, + "epoch": 3.98, + "learning_rate": 4.104513064133017e-06, + "loss": 0.0894, + "step": 8370, + "task_loss": 0.10876837372779846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04628456011414528, + "epoch": 3.98, + "learning_rate": 4.0855106888361044e-06, + "loss": 0.1141, + "step": 8380, + "task_loss": 0.2866939604282379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012273245491087437, + "epoch": 3.99, + "learning_rate": 4.066508313539192e-06, + "loss": 0.1052, + "step": 8390, + "task_loss": 0.14446358382701874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06594362109899521, + "epoch": 3.99, + "learning_rate": 4.04750593824228e-06, + "loss": 0.052, + "step": 8400, + "task_loss": 0.02517566829919815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017545923590660095, + "epoch": 4.0, + "learning_rate": 4.028503562945368e-06, + "loss": 0.0642, + "step": 8410, + "task_loss": 0.006249226629734039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04165567085146904, + "epoch": 4.0, + "learning_rate": 4.009501187648456e-06, + "loss": 0.062, + "step": 8420, + "task_loss": 0.008884565904736519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06637102365493774, + "epoch": 4.0, + "learning_rate": 3.990498812351545e-06, + "loss": 0.0575, + "step": 8430, + "task_loss": 0.11581560969352722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03454139828681946, + "epoch": 4.01, + "learning_rate": 3.9714964370546325e-06, + "loss": 0.0857, + "step": 8440, + "task_loss": 0.025920506566762924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12666280567646027, + "epoch": 4.01, + "learning_rate": 3.95249406175772e-06, + "loss": 0.052, + "step": 8450, + "task_loss": 0.22069977223873138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12481103837490082, + "epoch": 4.02, + "learning_rate": 3.933491686460808e-06, + "loss": 0.0905, + "step": 8460, + "task_loss": 0.34771737456321716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.027316780760884285, + "epoch": 4.02, + "learning_rate": 3.914489311163896e-06, + "loss": 0.045, + "step": 8470, + "task_loss": 0.008249737322330475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.019640466198325157, + "epoch": 4.03, + "learning_rate": 3.895486935866984e-06, + "loss": 0.0346, + "step": 8480, + "task_loss": 0.0024937279522418976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02247186377644539, + "epoch": 4.03, + "learning_rate": 3.876484560570072e-06, + "loss": 0.0312, + "step": 8490, + "task_loss": 0.014447018504142761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.010977246798574924, + "epoch": 4.04, + "learning_rate": 3.857482185273159e-06, + "loss": 0.061, + "step": 8500, + "task_loss": 0.03449002653360367 + }, + { + "epoch": 4.04, + "eval_accuracy": 0.9174311926605505, + "eval_loss": 0.283713161945343, + "eval_runtime": 22.1383, + "eval_samples_per_second": 39.389, + "eval_steps_per_second": 4.924, + "step": 8500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03942285478115082, + "epoch": 4.04, + "learning_rate": 3.8384798099762475e-06, + "loss": 0.0416, + "step": 8510, + "task_loss": 0.005950760096311569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014301864430308342, + "epoch": 4.05, + "learning_rate": 3.819477434679335e-06, + "loss": 0.069, + "step": 8520, + "task_loss": 0.0029358714818954468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.039006754755973816, + "epoch": 4.05, + "learning_rate": 3.800475059382423e-06, + "loss": 0.0629, + "step": 8530, + "task_loss": 0.19067448377609253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07345309853553772, + "epoch": 4.06, + "learning_rate": 3.781472684085511e-06, + "loss": 0.0515, + "step": 8540, + "task_loss": 0.18440312147140503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0816812664270401, + "epoch": 4.06, + "learning_rate": 3.762470308788599e-06, + "loss": 0.0785, + "step": 8550, + "task_loss": 0.1739380657672882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07522659748792648, + "epoch": 4.07, + "learning_rate": 3.743467933491687e-06, + "loss": 0.089, + "step": 8560, + "task_loss": 0.027755912393331528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2582024037837982, + "epoch": 4.07, + "learning_rate": 3.7244655581947747e-06, + "loss": 0.1013, + "step": 8570, + "task_loss": 0.27483993768692017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.011575591750442982, + "epoch": 4.08, + "learning_rate": 3.7054631828978625e-06, + "loss": 0.051, + "step": 8580, + "task_loss": 0.004382755607366562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.043528296053409576, + "epoch": 4.08, + "learning_rate": 3.6864608076009504e-06, + "loss": 0.0395, + "step": 8590, + "task_loss": 0.01687121018767357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06470053642988205, + "epoch": 4.09, + "learning_rate": 3.6674584323040387e-06, + "loss": 0.0767, + "step": 8600, + "task_loss": 0.12226305902004242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.16258491575717926, + "epoch": 4.09, + "learning_rate": 3.6484560570071265e-06, + "loss": 0.0875, + "step": 8610, + "task_loss": 0.12856589257717133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012949327006936073, + "epoch": 4.1, + "learning_rate": 3.629453681710214e-06, + "loss": 0.0215, + "step": 8620, + "task_loss": 0.006088566035032272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.22730335593223572, + "epoch": 4.1, + "learning_rate": 3.610451306413302e-06, + "loss": 0.0768, + "step": 8630, + "task_loss": 0.3216843008995056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02113342471420765, + "epoch": 4.1, + "learning_rate": 3.5914489311163897e-06, + "loss": 0.0908, + "step": 8640, + "task_loss": 0.3219309449195862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.3414075970649719, + "epoch": 4.11, + "learning_rate": 3.5724465558194776e-06, + "loss": 0.1158, + "step": 8650, + "task_loss": 0.39104947447776794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017360147088766098, + "epoch": 4.11, + "learning_rate": 3.5534441805225654e-06, + "loss": 0.1378, + "step": 8660, + "task_loss": 0.005871061235666275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03779000788927078, + "epoch": 4.12, + "learning_rate": 3.5344418052256533e-06, + "loss": 0.0691, + "step": 8670, + "task_loss": 0.0107310451567173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2806292772293091, + "epoch": 4.12, + "learning_rate": 3.5154394299287416e-06, + "loss": 0.1005, + "step": 8680, + "task_loss": 0.3259883522987366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06188252195715904, + "epoch": 4.13, + "learning_rate": 3.4964370546318295e-06, + "loss": 0.0653, + "step": 8690, + "task_loss": 0.12273290753364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09528631716966629, + "epoch": 4.13, + "learning_rate": 3.4774346793349173e-06, + "loss": 0.065, + "step": 8700, + "task_loss": 0.18623818457126617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.041278135031461716, + "epoch": 4.14, + "learning_rate": 3.458432304038005e-06, + "loss": 0.0581, + "step": 8710, + "task_loss": 0.02173343300819397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012133522890508175, + "epoch": 4.14, + "learning_rate": 3.439429928741093e-06, + "loss": 0.0752, + "step": 8720, + "task_loss": 0.3635708689689636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02899017184972763, + "epoch": 4.15, + "learning_rate": 3.4204275534441805e-06, + "loss": 0.0563, + "step": 8730, + "task_loss": 0.11488357186317444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.18777720630168915, + "epoch": 4.15, + "learning_rate": 3.4014251781472683e-06, + "loss": 0.0686, + "step": 8740, + "task_loss": 0.11035171151161194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1590900421142578, + "epoch": 4.16, + "learning_rate": 3.382422802850356e-06, + "loss": 0.0594, + "step": 8750, + "task_loss": 0.05561506748199463 + }, + { + "epoch": 4.16, + "eval_accuracy": 0.9151376146788991, + "eval_loss": 0.2766323983669281, + "eval_runtime": 22.3282, + "eval_samples_per_second": 39.054, + "eval_steps_per_second": 4.882, + "step": 8750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06593011319637299, + "epoch": 4.16, + "learning_rate": 3.3634204275534445e-06, + "loss": 0.0356, + "step": 8760, + "task_loss": 0.0075419507920742035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.019471261650323868, + "epoch": 4.17, + "learning_rate": 3.3444180522565324e-06, + "loss": 0.0346, + "step": 8770, + "task_loss": 0.004301343113183975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13416732847690582, + "epoch": 4.17, + "learning_rate": 3.3254156769596202e-06, + "loss": 0.0982, + "step": 8780, + "task_loss": 0.23774561285972595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.006739257834851742, + "epoch": 4.18, + "learning_rate": 3.306413301662708e-06, + "loss": 0.0593, + "step": 8790, + "task_loss": 0.0027306489646434784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04961520433425903, + "epoch": 4.18, + "learning_rate": 3.287410926365796e-06, + "loss": 0.0887, + "step": 8800, + "task_loss": 0.040518369525671005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017031384631991386, + "epoch": 4.19, + "learning_rate": 3.268408551068884e-06, + "loss": 0.0779, + "step": 8810, + "task_loss": 0.006137076765298843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.010000055655837059, + "epoch": 4.19, + "learning_rate": 3.249406175771972e-06, + "loss": 0.0801, + "step": 8820, + "task_loss": 0.006373908370733261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08995574712753296, + "epoch": 4.19, + "learning_rate": 3.23040380047506e-06, + "loss": 0.0486, + "step": 8830, + "task_loss": 0.13840673863887787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018883461132645607, + "epoch": 4.2, + "learning_rate": 3.211401425178148e-06, + "loss": 0.0651, + "step": 8840, + "task_loss": 0.002827569842338562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.038998525589704514, + "epoch": 4.2, + "learning_rate": 3.1923990498812353e-06, + "loss": 0.0606, + "step": 8850, + "task_loss": 0.014568600803613663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.026058971881866455, + "epoch": 4.21, + "learning_rate": 3.173396674584323e-06, + "loss": 0.0357, + "step": 8860, + "task_loss": 0.029363825917243958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024625074118375778, + "epoch": 4.21, + "learning_rate": 3.154394299287411e-06, + "loss": 0.0547, + "step": 8870, + "task_loss": 0.004386581480503082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020818475633859634, + "epoch": 4.22, + "learning_rate": 3.135391923990499e-06, + "loss": 0.0566, + "step": 8880, + "task_loss": 0.0970916673541069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015941135585308075, + "epoch": 4.22, + "learning_rate": 3.1163895486935867e-06, + "loss": 0.0339, + "step": 8890, + "task_loss": 0.003059886395931244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020430579781532288, + "epoch": 4.23, + "learning_rate": 3.097387173396675e-06, + "loss": 0.035, + "step": 8900, + "task_loss": 0.0052056461572647095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.15293776988983154, + "epoch": 4.23, + "learning_rate": 3.078384798099763e-06, + "loss": 0.0453, + "step": 8910, + "task_loss": 0.09499054402112961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.15551890432834625, + "epoch": 4.24, + "learning_rate": 3.0593824228028507e-06, + "loss": 0.0701, + "step": 8920, + "task_loss": 0.10638581216335297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11212066560983658, + "epoch": 4.24, + "learning_rate": 3.0403800475059386e-06, + "loss": 0.0544, + "step": 8930, + "task_loss": 0.19491392374038696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016818024218082428, + "epoch": 4.25, + "learning_rate": 3.0213776722090264e-06, + "loss": 0.0488, + "step": 8940, + "task_loss": 0.2066863775253296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0049272989854216576, + "epoch": 4.25, + "learning_rate": 3.0023752969121143e-06, + "loss": 0.0446, + "step": 8950, + "task_loss": 0.007535018026828766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03029637783765793, + "epoch": 4.26, + "learning_rate": 2.9833729216152017e-06, + "loss": 0.0593, + "step": 8960, + "task_loss": 0.3232007324695587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07497206330299377, + "epoch": 4.26, + "learning_rate": 2.9643705463182896e-06, + "loss": 0.087, + "step": 8970, + "task_loss": 0.18494026362895966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04392876848578453, + "epoch": 4.27, + "learning_rate": 2.945368171021378e-06, + "loss": 0.0731, + "step": 8980, + "task_loss": 0.1489235907793045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01495037879794836, + "epoch": 4.27, + "learning_rate": 2.9263657957244658e-06, + "loss": 0.0548, + "step": 8990, + "task_loss": 0.0053473226726055145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08316938579082489, + "epoch": 4.28, + "learning_rate": 2.9073634204275536e-06, + "loss": 0.1062, + "step": 9000, + "task_loss": 0.029360707849264145 + }, + { + "epoch": 4.28, + "eval_accuracy": 0.9139908256880734, + "eval_loss": 0.2777394950389862, + "eval_runtime": 22.1404, + "eval_samples_per_second": 39.385, + "eval_steps_per_second": 4.923, + "step": 9000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.023107042536139488, + "epoch": 4.28, + "learning_rate": 2.8883610451306415e-06, + "loss": 0.0757, + "step": 9010, + "task_loss": 0.00649937242269516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.030949320644140244, + "epoch": 4.29, + "learning_rate": 2.8693586698337293e-06, + "loss": 0.0622, + "step": 9020, + "task_loss": 0.017877578735351562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02196587808430195, + "epoch": 4.29, + "learning_rate": 2.850356294536817e-06, + "loss": 0.0572, + "step": 9030, + "task_loss": 0.014159291982650757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05502418801188469, + "epoch": 4.29, + "learning_rate": 2.8313539192399055e-06, + "loss": 0.0318, + "step": 9040, + "task_loss": 0.20975813269615173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013882439583539963, + "epoch": 4.3, + "learning_rate": 2.8123515439429934e-06, + "loss": 0.0576, + "step": 9050, + "task_loss": 0.09990142285823822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.010699973441660404, + "epoch": 4.3, + "learning_rate": 2.7933491686460812e-06, + "loss": 0.0913, + "step": 9060, + "task_loss": 0.0033752471208572388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01082450058311224, + "epoch": 4.31, + "learning_rate": 2.774346793349169e-06, + "loss": 0.0409, + "step": 9070, + "task_loss": 0.004309527575969696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016541698947548866, + "epoch": 4.31, + "learning_rate": 2.7553444180522565e-06, + "loss": 0.0631, + "step": 9080, + "task_loss": 0.002968382090330124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.006014951970428228, + "epoch": 4.32, + "learning_rate": 2.7363420427553444e-06, + "loss": 0.0573, + "step": 9090, + "task_loss": 0.1005413755774498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.22611913084983826, + "epoch": 4.32, + "learning_rate": 2.7173396674584322e-06, + "loss": 0.1065, + "step": 9100, + "task_loss": 0.13456928730010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03514198213815689, + "epoch": 4.33, + "learning_rate": 2.69833729216152e-06, + "loss": 0.0453, + "step": 9110, + "task_loss": 0.0037595927715301514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016409728676080704, + "epoch": 4.33, + "learning_rate": 2.6793349168646084e-06, + "loss": 0.0601, + "step": 9120, + "task_loss": 0.005627848207950592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.016438759863376617, + "epoch": 4.34, + "learning_rate": 2.6603325415676963e-06, + "loss": 0.088, + "step": 9130, + "task_loss": 0.004075001925230026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.048099249601364136, + "epoch": 4.34, + "learning_rate": 2.641330166270784e-06, + "loss": 0.0575, + "step": 9140, + "task_loss": 0.021824389696121216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03359050303697586, + "epoch": 4.35, + "learning_rate": 2.622327790973872e-06, + "loss": 0.0541, + "step": 9150, + "task_loss": 0.010654143989086151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.35756510496139526, + "epoch": 4.35, + "learning_rate": 2.60332541567696e-06, + "loss": 0.0756, + "step": 9160, + "task_loss": 0.1754165142774582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015034861862659454, + "epoch": 4.36, + "learning_rate": 2.5843230403800477e-06, + "loss": 0.1075, + "step": 9170, + "task_loss": 0.008607205003499985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.26236093044281006, + "epoch": 4.36, + "learning_rate": 2.565320665083136e-06, + "loss": 0.121, + "step": 9180, + "task_loss": 0.14051243662834167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024604659527540207, + "epoch": 4.37, + "learning_rate": 2.546318289786224e-06, + "loss": 0.0556, + "step": 9190, + "task_loss": 0.12805365025997162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.041992854326963425, + "epoch": 4.37, + "learning_rate": 2.5273159144893113e-06, + "loss": 0.1061, + "step": 9200, + "task_loss": 0.007956545799970627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.15274813771247864, + "epoch": 4.38, + "learning_rate": 2.508313539192399e-06, + "loss": 0.0563, + "step": 9210, + "task_loss": 0.06382400542497635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1668039709329605, + "epoch": 4.38, + "learning_rate": 2.4893111638954874e-06, + "loss": 0.0911, + "step": 9220, + "task_loss": 0.32040756940841675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08066344261169434, + "epoch": 4.38, + "learning_rate": 2.470308788598575e-06, + "loss": 0.0624, + "step": 9230, + "task_loss": 0.06633038818836212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021942077204585075, + "epoch": 4.39, + "learning_rate": 2.4513064133016627e-06, + "loss": 0.0486, + "step": 9240, + "task_loss": 0.00539054349064827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021917153149843216, + "epoch": 4.39, + "learning_rate": 2.4323040380047506e-06, + "loss": 0.0751, + "step": 9250, + "task_loss": 0.03953142464160919 + }, + { + "epoch": 4.39, + "eval_accuracy": 0.9220183486238532, + "eval_loss": 0.2689874768257141, + "eval_runtime": 22.1397, + "eval_samples_per_second": 39.386, + "eval_steps_per_second": 4.923, + "step": 9250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013196494430303574, + "epoch": 4.4, + "learning_rate": 2.413301662707839e-06, + "loss": 0.0473, + "step": 9260, + "task_loss": 0.004326473921537399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04993726685643196, + "epoch": 4.4, + "learning_rate": 2.3942992874109268e-06, + "loss": 0.0502, + "step": 9270, + "task_loss": 0.03252362832427025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06098417192697525, + "epoch": 4.41, + "learning_rate": 2.375296912114014e-06, + "loss": 0.0519, + "step": 9280, + "task_loss": 0.04169990494847298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.030414171516895294, + "epoch": 4.41, + "learning_rate": 2.356294536817102e-06, + "loss": 0.0838, + "step": 9290, + "task_loss": 0.008283041417598724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04094374552369118, + "epoch": 4.42, + "learning_rate": 2.3372921615201903e-06, + "loss": 0.0802, + "step": 9300, + "task_loss": 0.01785259321331978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.022956877946853638, + "epoch": 4.42, + "learning_rate": 2.318289786223278e-06, + "loss": 0.0409, + "step": 9310, + "task_loss": 0.1291673481464386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.42349928617477417, + "epoch": 4.43, + "learning_rate": 2.299287410926366e-06, + "loss": 0.1074, + "step": 9320, + "task_loss": 0.4811912178993225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021385155618190765, + "epoch": 4.43, + "learning_rate": 2.280285035629454e-06, + "loss": 0.0496, + "step": 9330, + "task_loss": 0.006091751158237457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013296818360686302, + "epoch": 4.44, + "learning_rate": 2.261282660332542e-06, + "loss": 0.0448, + "step": 9340, + "task_loss": 0.007342435419559479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.0073758745566010475, + "epoch": 4.44, + "learning_rate": 2.2422802850356297e-06, + "loss": 0.074, + "step": 9350, + "task_loss": 0.0043178461492061615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.017247337847948074, + "epoch": 4.45, + "learning_rate": 2.2232779097387175e-06, + "loss": 0.0686, + "step": 9360, + "task_loss": 0.002873547375202179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13869813084602356, + "epoch": 4.45, + "learning_rate": 2.2042755344418054e-06, + "loss": 0.0623, + "step": 9370, + "task_loss": 0.12998461723327637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.037029024213552475, + "epoch": 4.46, + "learning_rate": 2.1852731591448932e-06, + "loss": 0.0671, + "step": 9380, + "task_loss": 0.21687030792236328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01134589221328497, + "epoch": 4.46, + "learning_rate": 2.166270783847981e-06, + "loss": 0.057, + "step": 9390, + "task_loss": 0.19388972222805023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06970459967851639, + "epoch": 4.47, + "learning_rate": 2.147268408551069e-06, + "loss": 0.0584, + "step": 9400, + "task_loss": 0.02260136976838112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.009460528381168842, + "epoch": 4.47, + "learning_rate": 2.128266033254157e-06, + "loss": 0.045, + "step": 9410, + "task_loss": 0.12310618162155151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11489342153072357, + "epoch": 4.48, + "learning_rate": 2.1092636579572447e-06, + "loss": 0.0371, + "step": 9420, + "task_loss": 0.08696582913398743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07559200376272202, + "epoch": 4.48, + "learning_rate": 2.0902612826603326e-06, + "loss": 0.0618, + "step": 9430, + "task_loss": 0.3284105062484741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07968532294034958, + "epoch": 4.48, + "learning_rate": 2.071258907363421e-06, + "loss": 0.1117, + "step": 9440, + "task_loss": 0.07459443807601929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.28972363471984863, + "epoch": 4.49, + "learning_rate": 2.0522565320665087e-06, + "loss": 0.0473, + "step": 9450, + "task_loss": 0.16974028944969177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02968553639948368, + "epoch": 4.49, + "learning_rate": 2.033254156769596e-06, + "loss": 0.0684, + "step": 9460, + "task_loss": 0.2634113132953644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.030829662457108498, + "epoch": 4.5, + "learning_rate": 2.014251781472684e-06, + "loss": 0.0682, + "step": 9470, + "task_loss": 0.03290770202875137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03441692143678665, + "epoch": 4.5, + "learning_rate": 1.9952494061757723e-06, + "loss": 0.108, + "step": 9480, + "task_loss": 0.011277075856924057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08928455412387848, + "epoch": 4.51, + "learning_rate": 1.97624703087886e-06, + "loss": 0.0582, + "step": 9490, + "task_loss": 0.07190164923667908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01250946894288063, + "epoch": 4.51, + "learning_rate": 1.957244655581948e-06, + "loss": 0.0386, + "step": 9500, + "task_loss": 0.009386200457811356 + }, + { + "epoch": 4.51, + "eval_accuracy": 0.9162844036697247, + "eval_loss": 0.2667511999607086, + "eval_runtime": 22.1403, + "eval_samples_per_second": 39.385, + "eval_steps_per_second": 4.923, + "step": 9500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020871058106422424, + "epoch": 4.52, + "learning_rate": 1.938242280285036e-06, + "loss": 0.0491, + "step": 9510, + "task_loss": 0.005145017057657242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03983244299888611, + "epoch": 4.52, + "learning_rate": 1.9192399049881237e-06, + "loss": 0.05, + "step": 9520, + "task_loss": 0.24316395819187164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10421483218669891, + "epoch": 4.53, + "learning_rate": 1.9002375296912114e-06, + "loss": 0.0836, + "step": 9530, + "task_loss": 0.06570681929588318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.043953992426395416, + "epoch": 4.53, + "learning_rate": 1.8812351543942995e-06, + "loss": 0.0585, + "step": 9540, + "task_loss": 0.008459363132715225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.058616213500499725, + "epoch": 4.54, + "learning_rate": 1.8622327790973873e-06, + "loss": 0.0522, + "step": 9550, + "task_loss": 0.26781898736953735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2032776176929474, + "epoch": 4.54, + "learning_rate": 1.8432304038004752e-06, + "loss": 0.0885, + "step": 9560, + "task_loss": 0.3326171636581421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.041910551488399506, + "epoch": 4.55, + "learning_rate": 1.8242280285035633e-06, + "loss": 0.0669, + "step": 9570, + "task_loss": 0.004698578268289566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02482573315501213, + "epoch": 4.55, + "learning_rate": 1.805225653206651e-06, + "loss": 0.0582, + "step": 9580, + "task_loss": 0.005337722599506378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2271367311477661, + "epoch": 4.56, + "learning_rate": 1.7862232779097388e-06, + "loss": 0.0514, + "step": 9590, + "task_loss": 0.1770418882369995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10083962976932526, + "epoch": 4.56, + "learning_rate": 1.7672209026128267e-06, + "loss": 0.0755, + "step": 9600, + "task_loss": 0.20249593257904053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03125529736280441, + "epoch": 4.57, + "learning_rate": 1.7482185273159147e-06, + "loss": 0.0481, + "step": 9610, + "task_loss": 0.012315116822719574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03914954140782356, + "epoch": 4.57, + "learning_rate": 1.7292161520190026e-06, + "loss": 0.0612, + "step": 9620, + "task_loss": 0.1674419790506363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01954154670238495, + "epoch": 4.57, + "learning_rate": 1.7102137767220902e-06, + "loss": 0.0653, + "step": 9630, + "task_loss": 0.06889810413122177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05788855999708176, + "epoch": 4.58, + "learning_rate": 1.691211401425178e-06, + "loss": 0.0253, + "step": 9640, + "task_loss": 0.14915083348751068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01645825058221817, + "epoch": 4.58, + "learning_rate": 1.6722090261282662e-06, + "loss": 0.0519, + "step": 9650, + "task_loss": 0.12134365737438202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04989754408597946, + "epoch": 4.59, + "learning_rate": 1.653206650831354e-06, + "loss": 0.1004, + "step": 9660, + "task_loss": 0.1625884622335434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.019783230498433113, + "epoch": 4.59, + "learning_rate": 1.634204275534442e-06, + "loss": 0.0567, + "step": 9670, + "task_loss": 0.004505373537540436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.19092217087745667, + "epoch": 4.6, + "learning_rate": 1.61520190023753e-06, + "loss": 0.0458, + "step": 9680, + "task_loss": 0.2721264362335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10719658434391022, + "epoch": 4.6, + "learning_rate": 1.5961995249406176e-06, + "loss": 0.0513, + "step": 9690, + "task_loss": 0.03760954737663269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05233538895845413, + "epoch": 4.61, + "learning_rate": 1.5771971496437055e-06, + "loss": 0.0688, + "step": 9700, + "task_loss": 0.12094153463840485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014219870790839195, + "epoch": 4.61, + "learning_rate": 1.5581947743467934e-06, + "loss": 0.0581, + "step": 9710, + "task_loss": 0.0061318278312683105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012800279073417187, + "epoch": 4.62, + "learning_rate": 1.5391923990498814e-06, + "loss": 0.0268, + "step": 9720, + "task_loss": 0.0036646276712417603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12216310203075409, + "epoch": 4.62, + "learning_rate": 1.5201900237529693e-06, + "loss": 0.1461, + "step": 9730, + "task_loss": 0.13348951935768127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07808056473731995, + "epoch": 4.63, + "learning_rate": 1.5011876484560572e-06, + "loss": 0.0822, + "step": 9740, + "task_loss": 0.22806300222873688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.010630585253238678, + "epoch": 4.63, + "learning_rate": 1.4821852731591448e-06, + "loss": 0.0284, + "step": 9750, + "task_loss": 0.002479270100593567 + }, + { + "epoch": 4.63, + "eval_accuracy": 0.9185779816513762, + "eval_loss": 0.2812165319919586, + "eval_runtime": 22.1715, + "eval_samples_per_second": 39.33, + "eval_steps_per_second": 4.916, + "step": 9750 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.022376172244548798, + "epoch": 4.64, + "learning_rate": 1.4631828978622329e-06, + "loss": 0.0637, + "step": 9760, + "task_loss": 0.0058107636868953705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.009340550750494003, + "epoch": 4.64, + "learning_rate": 1.4441805225653207e-06, + "loss": 0.0344, + "step": 9770, + "task_loss": 0.0033394992351531982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.010714657604694366, + "epoch": 4.65, + "learning_rate": 1.4251781472684086e-06, + "loss": 0.0541, + "step": 9780, + "task_loss": 0.004770912230014801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024335071444511414, + "epoch": 4.65, + "learning_rate": 1.4061757719714967e-06, + "loss": 0.0614, + "step": 9790, + "task_loss": 0.006472300738096237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07776258885860443, + "epoch": 4.66, + "learning_rate": 1.3871733966745845e-06, + "loss": 0.075, + "step": 9800, + "task_loss": 0.039640650153160095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04833105206489563, + "epoch": 4.66, + "learning_rate": 1.3681710213776722e-06, + "loss": 0.0351, + "step": 9810, + "task_loss": 0.0052606649696826935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06957106292247772, + "epoch": 4.67, + "learning_rate": 1.34916864608076e-06, + "loss": 0.0566, + "step": 9820, + "task_loss": 0.012664098292589188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.018420187756419182, + "epoch": 4.67, + "learning_rate": 1.3301662707838481e-06, + "loss": 0.0743, + "step": 9830, + "task_loss": 0.012570954859256744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.022069044411182404, + "epoch": 4.67, + "learning_rate": 1.311163895486936e-06, + "loss": 0.1181, + "step": 9840, + "task_loss": 0.2193067967891693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.10804280638694763, + "epoch": 4.68, + "learning_rate": 1.2921615201900239e-06, + "loss": 0.0684, + "step": 9850, + "task_loss": 0.1666574627161026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021793734282255173, + "epoch": 4.68, + "learning_rate": 1.273159144893112e-06, + "loss": 0.0713, + "step": 9860, + "task_loss": 0.1198580265045166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013394506648182869, + "epoch": 4.69, + "learning_rate": 1.2541567695961996e-06, + "loss": 0.077, + "step": 9870, + "task_loss": 0.003841262310743332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.024098927155137062, + "epoch": 4.69, + "learning_rate": 1.2351543942992874e-06, + "loss": 0.0668, + "step": 9880, + "task_loss": 0.11605469882488251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012392224743962288, + "epoch": 4.7, + "learning_rate": 1.2161520190023753e-06, + "loss": 0.0472, + "step": 9890, + "task_loss": 0.003138858824968338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05335499346256256, + "epoch": 4.7, + "learning_rate": 1.1971496437054634e-06, + "loss": 0.0593, + "step": 9900, + "task_loss": 0.18697652220726013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.028973210602998734, + "epoch": 4.71, + "learning_rate": 1.178147268408551e-06, + "loss": 0.0279, + "step": 9910, + "task_loss": 0.11546307057142258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09816677868366241, + "epoch": 4.71, + "learning_rate": 1.159144893111639e-06, + "loss": 0.0397, + "step": 9920, + "task_loss": 0.0736415684223175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.011303352192044258, + "epoch": 4.72, + "learning_rate": 1.140142517814727e-06, + "loss": 0.0569, + "step": 9930, + "task_loss": 0.004318550229072571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01750335283577442, + "epoch": 4.72, + "learning_rate": 1.1211401425178148e-06, + "loss": 0.0446, + "step": 9940, + "task_loss": 0.19570045173168182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.046896446496248245, + "epoch": 4.73, + "learning_rate": 1.1021377672209027e-06, + "loss": 0.0291, + "step": 9950, + "task_loss": 0.07542459666728973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.2460804581642151, + "epoch": 4.73, + "learning_rate": 1.0831353919239906e-06, + "loss": 0.061, + "step": 9960, + "task_loss": 0.16217932105064392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012438797391951084, + "epoch": 4.74, + "learning_rate": 1.0641330166270784e-06, + "loss": 0.0274, + "step": 9970, + "task_loss": 0.0034538879990577698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012422207742929459, + "epoch": 4.74, + "learning_rate": 1.0451306413301663e-06, + "loss": 0.0365, + "step": 9980, + "task_loss": 0.003980562090873718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.09234738349914551, + "epoch": 4.75, + "learning_rate": 1.0261282660332544e-06, + "loss": 0.0777, + "step": 9990, + "task_loss": 0.07327043265104294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12707777321338654, + "epoch": 4.75, + "learning_rate": 1.007125890736342e-06, + "loss": 0.1016, + "step": 10000, + "task_loss": 0.08033004403114319 + }, + { + "epoch": 4.75, + "eval_accuracy": 0.9162844036697247, + "eval_loss": 0.2825167179107666, + "eval_runtime": 22.1651, + "eval_samples_per_second": 39.341, + "eval_steps_per_second": 4.918, + "step": 10000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07169970124959946, + "epoch": 4.76, + "learning_rate": 9.8812351543943e-07, + "loss": 0.0671, + "step": 10010, + "task_loss": 0.06848999857902527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.013409988954663277, + "epoch": 4.76, + "learning_rate": 9.69121140142518e-07, + "loss": 0.0377, + "step": 10020, + "task_loss": 0.09568732976913452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07166978716850281, + "epoch": 4.76, + "learning_rate": 9.501187648456057e-07, + "loss": 0.0294, + "step": 10030, + "task_loss": 0.0640857145190239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.04987531155347824, + "epoch": 4.77, + "learning_rate": 9.311163895486937e-07, + "loss": 0.0656, + "step": 10040, + "task_loss": 0.021407999098300934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.007782880216836929, + "epoch": 4.77, + "learning_rate": 9.121140142517816e-07, + "loss": 0.0498, + "step": 10050, + "task_loss": 0.002929363399744034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021184280514717102, + "epoch": 4.78, + "learning_rate": 8.931116389548694e-07, + "loss": 0.0508, + "step": 10060, + "task_loss": 0.004369787871837616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.012340977787971497, + "epoch": 4.78, + "learning_rate": 8.741092636579574e-07, + "loss": 0.0516, + "step": 10070, + "task_loss": 0.006207786500453949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1317601352930069, + "epoch": 4.79, + "learning_rate": 8.551068883610451e-07, + "loss": 0.0475, + "step": 10080, + "task_loss": 0.004766244441270828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.023309551179409027, + "epoch": 4.79, + "learning_rate": 8.361045130641331e-07, + "loss": 0.0457, + "step": 10090, + "task_loss": 0.008698023855686188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.046007562428712845, + "epoch": 4.8, + "learning_rate": 8.17102137767221e-07, + "loss": 0.1062, + "step": 10100, + "task_loss": 0.05794458091259003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01069045439362526, + "epoch": 4.8, + "learning_rate": 7.980997624703088e-07, + "loss": 0.0411, + "step": 10110, + "task_loss": 0.002656955271959305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.021371502429246902, + "epoch": 4.81, + "learning_rate": 7.790973871733967e-07, + "loss": 0.0504, + "step": 10120, + "task_loss": 0.005056999623775482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.060870636254549026, + "epoch": 4.81, + "learning_rate": 7.600950118764846e-07, + "loss": 0.0759, + "step": 10130, + "task_loss": 0.097642682492733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.019857440143823624, + "epoch": 4.82, + "learning_rate": 7.410926365795724e-07, + "loss": 0.0455, + "step": 10140, + "task_loss": 0.005811773240566254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.05780591070652008, + "epoch": 4.82, + "learning_rate": 7.220902612826604e-07, + "loss": 0.0547, + "step": 10150, + "task_loss": 0.032336119562387466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.043222397565841675, + "epoch": 4.83, + "learning_rate": 7.030878859857483e-07, + "loss": 0.0618, + "step": 10160, + "task_loss": 0.002787157893180847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01754024438560009, + "epoch": 4.83, + "learning_rate": 6.840855106888361e-07, + "loss": 0.0529, + "step": 10170, + "task_loss": 0.0054728202521800995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.007267419248819351, + "epoch": 4.84, + "learning_rate": 6.650831353919241e-07, + "loss": 0.0394, + "step": 10180, + "task_loss": 0.004029855132102966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.060019608587026596, + "epoch": 4.84, + "learning_rate": 6.460807600950119e-07, + "loss": 0.0454, + "step": 10190, + "task_loss": 0.005614615976810455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06279192864894867, + "epoch": 4.85, + "learning_rate": 6.270783847980998e-07, + "loss": 0.0804, + "step": 10200, + "task_loss": 0.22416365146636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01789834164083004, + "epoch": 4.85, + "learning_rate": 6.080760095011877e-07, + "loss": 0.0704, + "step": 10210, + "task_loss": 0.0029133372008800507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.023510048165917397, + "epoch": 4.86, + "learning_rate": 5.890736342042755e-07, + "loss": 0.0306, + "step": 10220, + "task_loss": 0.008717145770788193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06903427839279175, + "epoch": 4.86, + "learning_rate": 5.700712589073635e-07, + "loss": 0.0636, + "step": 10230, + "task_loss": 0.02681458741426468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.11212094873189926, + "epoch": 4.86, + "learning_rate": 5.510688836104513e-07, + "loss": 0.0766, + "step": 10240, + "task_loss": 0.05283607542514801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.020517665892839432, + "epoch": 4.87, + "learning_rate": 5.320665083135392e-07, + "loss": 0.0507, + "step": 10250, + "task_loss": 0.006865642964839935 + }, + { + "epoch": 4.87, + "eval_accuracy": 0.9139908256880734, + "eval_loss": 0.280513197183609, + "eval_runtime": 22.1726, + "eval_samples_per_second": 39.328, + "eval_steps_per_second": 4.916, + "step": 10250 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.06565746665000916, + "epoch": 4.87, + "learning_rate": 5.130641330166272e-07, + "loss": 0.084, + "step": 10260, + "task_loss": 0.08022348582744598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.041272662580013275, + "epoch": 4.88, + "learning_rate": 4.94061757719715e-07, + "loss": 0.0807, + "step": 10270, + "task_loss": 0.044755056500434875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02130916342139244, + "epoch": 4.88, + "learning_rate": 4.7505938242280285e-07, + "loss": 0.0456, + "step": 10280, + "task_loss": 0.005585514008998871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.008105727843940258, + "epoch": 4.89, + "learning_rate": 4.560570071258908e-07, + "loss": 0.0662, + "step": 10290, + "task_loss": 0.007067546248435974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02983175590634346, + "epoch": 4.89, + "learning_rate": 4.370546318289787e-07, + "loss": 0.0701, + "step": 10300, + "task_loss": 0.22576290369033813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.009511064738035202, + "epoch": 4.9, + "learning_rate": 4.1805225653206654e-07, + "loss": 0.085, + "step": 10310, + "task_loss": 0.007220160216093063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.037107907235622406, + "epoch": 4.9, + "learning_rate": 3.990498812351544e-07, + "loss": 0.0351, + "step": 10320, + "task_loss": 0.1071058064699173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.007235179655253887, + "epoch": 4.91, + "learning_rate": 3.800475059382423e-07, + "loss": 0.0385, + "step": 10330, + "task_loss": 0.002338908612728119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014286589808762074, + "epoch": 4.91, + "learning_rate": 3.610451306413302e-07, + "loss": 0.0366, + "step": 10340, + "task_loss": 0.0025917217135429382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.02622096985578537, + "epoch": 4.92, + "learning_rate": 3.4204275534441805e-07, + "loss": 0.0292, + "step": 10350, + "task_loss": 0.0064817629754543304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.011151728220283985, + "epoch": 4.92, + "learning_rate": 3.2304038004750596e-07, + "loss": 0.0683, + "step": 10360, + "task_loss": 0.003448858857154846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015970010310411453, + "epoch": 4.93, + "learning_rate": 3.040380047505938e-07, + "loss": 0.0584, + "step": 10370, + "task_loss": 0.005513232201337814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.1370335817337036, + "epoch": 4.93, + "learning_rate": 2.8503562945368174e-07, + "loss": 0.0771, + "step": 10380, + "task_loss": 0.008187536150217056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.022085029631853104, + "epoch": 4.94, + "learning_rate": 2.660332541567696e-07, + "loss": 0.0691, + "step": 10390, + "task_loss": 0.009103760123252869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.029131721705198288, + "epoch": 4.94, + "learning_rate": 2.470308788598575e-07, + "loss": 0.0614, + "step": 10400, + "task_loss": 0.008749555796384811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01142764464020729, + "epoch": 4.95, + "learning_rate": 2.280285035629454e-07, + "loss": 0.0506, + "step": 10410, + "task_loss": 0.003860827535390854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.22063679993152618, + "epoch": 4.95, + "learning_rate": 2.0902612826603327e-07, + "loss": 0.1186, + "step": 10420, + "task_loss": 0.411041259765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.00713141867890954, + "epoch": 4.95, + "learning_rate": 1.9002375296912116e-07, + "loss": 0.0519, + "step": 10430, + "task_loss": 0.0033237673342227936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.08448931574821472, + "epoch": 4.96, + "learning_rate": 1.7102137767220902e-07, + "loss": 0.053, + "step": 10440, + "task_loss": 0.19629473984241486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.014177798293530941, + "epoch": 4.96, + "learning_rate": 1.520190023752969e-07, + "loss": 0.0672, + "step": 10450, + "task_loss": 0.009178481996059418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13920198380947113, + "epoch": 4.97, + "learning_rate": 1.330166270783848e-07, + "loss": 0.0836, + "step": 10460, + "task_loss": 0.05482466146349907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.01523562241345644, + "epoch": 4.97, + "learning_rate": 1.140142517814727e-07, + "loss": 0.0335, + "step": 10470, + "task_loss": 0.0879075825214386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.015354710631072521, + "epoch": 4.98, + "learning_rate": 9.501187648456058e-08, + "loss": 0.0834, + "step": 10480, + "task_loss": 0.12236123532056808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.12151747941970825, + "epoch": 4.98, + "learning_rate": 7.600950118764846e-08, + "loss": 0.0548, + "step": 10490, + "task_loss": 0.1053650826215744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.07341088354587555, + "epoch": 4.99, + "learning_rate": 5.700712589073635e-08, + "loss": 0.0709, + "step": 10500, + "task_loss": 0.12648561596870422 + }, + { + "epoch": 4.99, + "eval_accuracy": 0.9139908256880734, + "eval_loss": 0.2854968011379242, + "eval_runtime": 21.9579, + "eval_samples_per_second": 39.712, + "eval_steps_per_second": 4.964, + "step": 10500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.13635969161987305, + "epoch": 4.99, + "learning_rate": 3.800475059382423e-08, + "loss": 0.0549, + "step": 10510, + "task_loss": 0.1342453956604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.4047220222636254, + "compression/movement_sparsity/model_sparsity": 0.3142793903220987, + "compression_loss": 0.0, + "distillation_loss": 0.03788226097822189, + "epoch": 5.0, + "learning_rate": 1.9002375296912114e-08, + "loss": 0.0381, + "step": 10520, + "task_loss": 0.05685931071639061 + }, + { + "epoch": 5.0, + "step": 10525, + "total_flos": 2.220815486243328e+16, + "train_loss": 2.258239375927669, + "train_runtime": 6578.4214, + "train_samples_per_second": 51.189, + "train_steps_per_second": 1.6 } ], - "max_steps": 2105, - "num_train_epochs": 1, - "total_flos": 4441630972486656.0, + "max_steps": 10525, + "num_train_epochs": 5, + "total_flos": 2.220815486243328e+16, "trial_name": null, "trial_params": null }