{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 9129, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.129295660236616, "learning_rate": 2.190580503833516e-08, "loss": 3.0196, "step": 1 }, { "epoch": 0.0, "grad_norm": 6.603884158736625, "learning_rate": 1.095290251916758e-07, "loss": 3.0488, "step": 5 }, { "epoch": 0.0, "grad_norm": 7.078081397709763, "learning_rate": 2.190580503833516e-07, "loss": 3.0906, "step": 10 }, { "epoch": 0.0, "grad_norm": 7.190488710464048, "learning_rate": 3.285870755750274e-07, "loss": 3.1082, "step": 15 }, { "epoch": 0.0, "grad_norm": 8.063570583994066, "learning_rate": 4.381161007667032e-07, "loss": 3.0465, "step": 20 }, { "epoch": 0.0, "grad_norm": 6.5434241805747435, "learning_rate": 5.47645125958379e-07, "loss": 3.109, "step": 25 }, { "epoch": 0.0, "grad_norm": 6.906242906571553, "learning_rate": 6.571741511500548e-07, "loss": 3.0349, "step": 30 }, { "epoch": 0.0, "grad_norm": 6.87422439190803, "learning_rate": 7.667031763417306e-07, "loss": 3.078, "step": 35 }, { "epoch": 0.0, "grad_norm": 6.006044898892149, "learning_rate": 8.762322015334064e-07, "loss": 3.0473, "step": 40 }, { "epoch": 0.0, "grad_norm": 6.825241497527427, "learning_rate": 9.857612267250823e-07, "loss": 3.0841, "step": 45 }, { "epoch": 0.01, "grad_norm": 6.084236629094488, "learning_rate": 1.095290251916758e-06, "loss": 3.0791, "step": 50 }, { "epoch": 0.01, "grad_norm": 6.693160784385476, "learning_rate": 1.2048192771084338e-06, "loss": 3.0447, "step": 55 }, { "epoch": 0.01, "grad_norm": 5.899676436477525, "learning_rate": 1.3143483023001096e-06, "loss": 3.0171, "step": 60 }, { "epoch": 0.01, "grad_norm": 4.87817742266469, "learning_rate": 1.4238773274917855e-06, "loss": 2.9311, "step": 65 }, { "epoch": 0.01, "grad_norm": 5.398627564345958, "learning_rate": 1.5334063526834611e-06, "loss": 2.9708, "step": 70 }, { "epoch": 0.01, "grad_norm": 5.184622536680821, "learning_rate": 1.642935377875137e-06, "loss": 2.9471, "step": 75 }, { "epoch": 0.01, "grad_norm": 4.368721643383131, "learning_rate": 1.7524644030668128e-06, "loss": 2.8996, "step": 80 }, { "epoch": 0.01, "grad_norm": 3.9158747056688603, "learning_rate": 1.8619934282584886e-06, "loss": 2.834, "step": 85 }, { "epoch": 0.01, "grad_norm": 3.4162080404793227, "learning_rate": 1.9715224534501647e-06, "loss": 2.7924, "step": 90 }, { "epoch": 0.01, "grad_norm": 3.3996646503205312, "learning_rate": 2.0810514786418403e-06, "loss": 2.7018, "step": 95 }, { "epoch": 0.01, "grad_norm": 2.8562612237923584, "learning_rate": 2.190580503833516e-06, "loss": 2.76, "step": 100 }, { "epoch": 0.01, "grad_norm": 2.71964875430513, "learning_rate": 2.300109529025192e-06, "loss": 2.7916, "step": 105 }, { "epoch": 0.01, "grad_norm": 3.0536705194304705, "learning_rate": 2.4096385542168676e-06, "loss": 2.7119, "step": 110 }, { "epoch": 0.01, "grad_norm": 2.4357248439595134, "learning_rate": 2.5191675794085437e-06, "loss": 2.7412, "step": 115 }, { "epoch": 0.01, "grad_norm": 2.5089045854099323, "learning_rate": 2.6286966046002193e-06, "loss": 2.7103, "step": 120 }, { "epoch": 0.01, "grad_norm": 2.2776584857143405, "learning_rate": 2.7382256297918953e-06, "loss": 2.6926, "step": 125 }, { "epoch": 0.01, "grad_norm": 2.1400456110900836, "learning_rate": 2.847754654983571e-06, "loss": 2.6838, "step": 130 }, { "epoch": 0.01, "grad_norm": 1.912565012736398, "learning_rate": 2.957283680175247e-06, "loss": 2.6769, "step": 135 }, { "epoch": 0.02, "grad_norm": 1.96313903094818, "learning_rate": 3.0668127053669222e-06, "loss": 2.686, "step": 140 }, { "epoch": 0.02, "grad_norm": 1.8942216602250406, "learning_rate": 3.1763417305585983e-06, "loss": 2.6631, "step": 145 }, { "epoch": 0.02, "grad_norm": 2.0895762412535923, "learning_rate": 3.285870755750274e-06, "loss": 2.6533, "step": 150 }, { "epoch": 0.02, "grad_norm": 1.8852269553587158, "learning_rate": 3.39539978094195e-06, "loss": 2.6068, "step": 155 }, { "epoch": 0.02, "grad_norm": 1.8636969796673521, "learning_rate": 3.5049288061336256e-06, "loss": 2.6033, "step": 160 }, { "epoch": 0.02, "grad_norm": 1.8214927998124102, "learning_rate": 3.6144578313253016e-06, "loss": 2.5811, "step": 165 }, { "epoch": 0.02, "grad_norm": 1.6731156161336724, "learning_rate": 3.7239868565169773e-06, "loss": 2.568, "step": 170 }, { "epoch": 0.02, "grad_norm": 1.508537478640596, "learning_rate": 3.833515881708653e-06, "loss": 2.5823, "step": 175 }, { "epoch": 0.02, "grad_norm": 1.7027893770483755, "learning_rate": 3.943044906900329e-06, "loss": 2.5144, "step": 180 }, { "epoch": 0.02, "grad_norm": 1.527746180939285, "learning_rate": 4.0525739320920046e-06, "loss": 2.5149, "step": 185 }, { "epoch": 0.02, "grad_norm": 1.8028113419482, "learning_rate": 4.162102957283681e-06, "loss": 2.5207, "step": 190 }, { "epoch": 0.02, "grad_norm": 1.5495785585582262, "learning_rate": 4.271631982475356e-06, "loss": 2.5384, "step": 195 }, { "epoch": 0.02, "grad_norm": 1.5805093411104003, "learning_rate": 4.381161007667032e-06, "loss": 2.4929, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.546068214291057, "learning_rate": 4.490690032858708e-06, "loss": 2.4379, "step": 205 }, { "epoch": 0.02, "grad_norm": 1.526259522669331, "learning_rate": 4.600219058050384e-06, "loss": 2.5033, "step": 210 }, { "epoch": 0.02, "grad_norm": 1.4983647260437638, "learning_rate": 4.709748083242059e-06, "loss": 2.4799, "step": 215 }, { "epoch": 0.02, "grad_norm": 1.4501146786210302, "learning_rate": 4.819277108433735e-06, "loss": 2.4922, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.3753375691200371, "learning_rate": 4.928806133625411e-06, "loss": 2.3949, "step": 225 }, { "epoch": 0.03, "grad_norm": 1.4941885757003668, "learning_rate": 5.038335158817087e-06, "loss": 2.4824, "step": 230 }, { "epoch": 0.03, "grad_norm": 1.3525123554979424, "learning_rate": 5.1478641840087625e-06, "loss": 2.4488, "step": 235 }, { "epoch": 0.03, "grad_norm": 1.7632931672650867, "learning_rate": 5.257393209200439e-06, "loss": 2.4233, "step": 240 }, { "epoch": 0.03, "grad_norm": 1.3826577571745955, "learning_rate": 5.366922234392115e-06, "loss": 2.4595, "step": 245 }, { "epoch": 0.03, "grad_norm": 1.37020520828531, "learning_rate": 5.476451259583791e-06, "loss": 2.4478, "step": 250 }, { "epoch": 0.03, "grad_norm": 1.302423167305176, "learning_rate": 5.585980284775466e-06, "loss": 2.4417, "step": 255 }, { "epoch": 0.03, "grad_norm": 1.3891887269302, "learning_rate": 5.695509309967142e-06, "loss": 2.4052, "step": 260 }, { "epoch": 0.03, "grad_norm": 1.254356284730965, "learning_rate": 5.805038335158818e-06, "loss": 2.4374, "step": 265 }, { "epoch": 0.03, "grad_norm": 1.396133329832306, "learning_rate": 5.914567360350494e-06, "loss": 2.3901, "step": 270 }, { "epoch": 0.03, "grad_norm": 1.6788786778754499, "learning_rate": 6.02409638554217e-06, "loss": 2.4609, "step": 275 }, { "epoch": 0.03, "grad_norm": 2.204893856229964, "learning_rate": 6.1336254107338444e-06, "loss": 2.388, "step": 280 }, { "epoch": 0.03, "grad_norm": 1.347394897183825, "learning_rate": 6.2431544359255205e-06, "loss": 2.4207, "step": 285 }, { "epoch": 0.03, "grad_norm": 1.4537239043833885, "learning_rate": 6.3526834611171965e-06, "loss": 2.3549, "step": 290 }, { "epoch": 0.03, "grad_norm": 1.29154484216092, "learning_rate": 6.462212486308872e-06, "loss": 2.3873, "step": 295 }, { "epoch": 0.03, "grad_norm": 1.3364520261018606, "learning_rate": 6.571741511500548e-06, "loss": 2.4244, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.385397566153177, "learning_rate": 6.681270536692224e-06, "loss": 2.4029, "step": 305 }, { "epoch": 0.03, "grad_norm": 1.276681908890578, "learning_rate": 6.7907995618839e-06, "loss": 2.3793, "step": 310 }, { "epoch": 0.03, "grad_norm": 1.3440691577655948, "learning_rate": 6.900328587075575e-06, "loss": 2.3867, "step": 315 }, { "epoch": 0.04, "grad_norm": 1.254731296188651, "learning_rate": 7.009857612267251e-06, "loss": 2.3609, "step": 320 }, { "epoch": 0.04, "grad_norm": 1.2270048860463718, "learning_rate": 7.119386637458927e-06, "loss": 2.3446, "step": 325 }, { "epoch": 0.04, "grad_norm": 1.272228997347141, "learning_rate": 7.228915662650603e-06, "loss": 2.4158, "step": 330 }, { "epoch": 0.04, "grad_norm": 1.3248132885411372, "learning_rate": 7.3384446878422785e-06, "loss": 2.3849, "step": 335 }, { "epoch": 0.04, "grad_norm": 1.2774271548581821, "learning_rate": 7.4479737130339545e-06, "loss": 2.2849, "step": 340 }, { "epoch": 0.04, "grad_norm": 1.4438394735295585, "learning_rate": 7.5575027382256306e-06, "loss": 2.3671, "step": 345 }, { "epoch": 0.04, "grad_norm": 1.3078992964071001, "learning_rate": 7.667031763417307e-06, "loss": 2.3224, "step": 350 }, { "epoch": 0.04, "grad_norm": 1.2418056712490717, "learning_rate": 7.776560788608982e-06, "loss": 2.331, "step": 355 }, { "epoch": 0.04, "grad_norm": 1.2739024742482792, "learning_rate": 7.886089813800659e-06, "loss": 2.379, "step": 360 }, { "epoch": 0.04, "grad_norm": 1.378117121319382, "learning_rate": 7.995618838992334e-06, "loss": 2.3123, "step": 365 }, { "epoch": 0.04, "grad_norm": 1.2729811834998477, "learning_rate": 8.105147864184009e-06, "loss": 2.3451, "step": 370 }, { "epoch": 0.04, "grad_norm": 1.3501888141820928, "learning_rate": 8.214676889375686e-06, "loss": 2.2961, "step": 375 }, { "epoch": 0.04, "grad_norm": 1.4074564892839918, "learning_rate": 8.324205914567361e-06, "loss": 2.3326, "step": 380 }, { "epoch": 0.04, "grad_norm": 1.3052438668613935, "learning_rate": 8.433734939759038e-06, "loss": 2.3027, "step": 385 }, { "epoch": 0.04, "grad_norm": 1.1556604923049043, "learning_rate": 8.543263964950712e-06, "loss": 2.3748, "step": 390 }, { "epoch": 0.04, "grad_norm": 1.1502106207245197, "learning_rate": 8.652792990142389e-06, "loss": 2.3194, "step": 395 }, { "epoch": 0.04, "grad_norm": 1.1604539897045514, "learning_rate": 8.762322015334064e-06, "loss": 2.3761, "step": 400 }, { "epoch": 0.04, "grad_norm": 1.2050644882899062, "learning_rate": 8.871851040525739e-06, "loss": 2.2999, "step": 405 }, { "epoch": 0.04, "grad_norm": 1.2811336775760953, "learning_rate": 8.981380065717416e-06, "loss": 2.3367, "step": 410 }, { "epoch": 0.05, "grad_norm": 1.3533519220504746, "learning_rate": 9.090909090909091e-06, "loss": 2.3471, "step": 415 }, { "epoch": 0.05, "grad_norm": 1.1861655908209485, "learning_rate": 9.200438116100768e-06, "loss": 2.3082, "step": 420 }, { "epoch": 0.05, "grad_norm": 1.166638097757976, "learning_rate": 9.309967141292443e-06, "loss": 2.2954, "step": 425 }, { "epoch": 0.05, "grad_norm": 1.1469550897764293, "learning_rate": 9.419496166484118e-06, "loss": 2.2939, "step": 430 }, { "epoch": 0.05, "grad_norm": 1.298631821591701, "learning_rate": 9.529025191675795e-06, "loss": 2.3486, "step": 435 }, { "epoch": 0.05, "grad_norm": 1.1379611355407062, "learning_rate": 9.63855421686747e-06, "loss": 2.3188, "step": 440 }, { "epoch": 0.05, "grad_norm": 1.2768202103444983, "learning_rate": 9.748083242059146e-06, "loss": 2.3026, "step": 445 }, { "epoch": 0.05, "grad_norm": 1.3002416418219096, "learning_rate": 9.857612267250823e-06, "loss": 2.3285, "step": 450 }, { "epoch": 0.05, "grad_norm": 1.2160121799545927, "learning_rate": 9.967141292442498e-06, "loss": 2.2804, "step": 455 }, { "epoch": 0.05, "grad_norm": 1.1637665623033808, "learning_rate": 1.0076670317634175e-05, "loss": 2.2729, "step": 460 }, { "epoch": 0.05, "grad_norm": 1.257198008187511, "learning_rate": 1.0186199342825848e-05, "loss": 2.2672, "step": 465 }, { "epoch": 0.05, "grad_norm": 1.4462164660488996, "learning_rate": 1.0295728368017525e-05, "loss": 2.2614, "step": 470 }, { "epoch": 0.05, "grad_norm": 1.2081812756640942, "learning_rate": 1.04052573932092e-05, "loss": 2.2646, "step": 475 }, { "epoch": 0.05, "grad_norm": 1.3382972102104431, "learning_rate": 1.0514786418400877e-05, "loss": 2.2885, "step": 480 }, { "epoch": 0.05, "grad_norm": 1.3543985316024243, "learning_rate": 1.0624315443592552e-05, "loss": 2.239, "step": 485 }, { "epoch": 0.05, "grad_norm": 1.2574897482117309, "learning_rate": 1.073384446878423e-05, "loss": 2.3434, "step": 490 }, { "epoch": 0.05, "grad_norm": 1.5073999356604233, "learning_rate": 1.0843373493975904e-05, "loss": 2.2746, "step": 495 }, { "epoch": 0.05, "grad_norm": 1.360021720178913, "learning_rate": 1.0952902519167581e-05, "loss": 2.299, "step": 500 }, { "epoch": 0.06, "grad_norm": 1.1751546963778623, "learning_rate": 1.1062431544359255e-05, "loss": 2.2651, "step": 505 }, { "epoch": 0.06, "grad_norm": 1.2247251071988299, "learning_rate": 1.1171960569550932e-05, "loss": 2.2382, "step": 510 }, { "epoch": 0.06, "grad_norm": 1.4271504803939052, "learning_rate": 1.1281489594742607e-05, "loss": 2.2715, "step": 515 }, { "epoch": 0.06, "grad_norm": 1.3000497904587276, "learning_rate": 1.1391018619934284e-05, "loss": 2.2373, "step": 520 }, { "epoch": 0.06, "grad_norm": 1.1458106227790528, "learning_rate": 1.1500547645125959e-05, "loss": 2.3209, "step": 525 }, { "epoch": 0.06, "grad_norm": 1.1564261753147838, "learning_rate": 1.1610076670317636e-05, "loss": 2.305, "step": 530 }, { "epoch": 0.06, "grad_norm": 1.1925870954005313, "learning_rate": 1.1719605695509311e-05, "loss": 2.2334, "step": 535 }, { "epoch": 0.06, "grad_norm": 1.362918903671762, "learning_rate": 1.1829134720700988e-05, "loss": 2.2907, "step": 540 }, { "epoch": 0.06, "grad_norm": 1.1485771721265563, "learning_rate": 1.1938663745892662e-05, "loss": 2.2568, "step": 545 }, { "epoch": 0.06, "grad_norm": 1.189575319547226, "learning_rate": 1.204819277108434e-05, "loss": 2.2249, "step": 550 }, { "epoch": 0.06, "grad_norm": 1.2144032944339294, "learning_rate": 1.2157721796276014e-05, "loss": 2.3105, "step": 555 }, { "epoch": 0.06, "grad_norm": 1.1634756914704985, "learning_rate": 1.2267250821467689e-05, "loss": 2.2402, "step": 560 }, { "epoch": 0.06, "grad_norm": 1.1830718714990502, "learning_rate": 1.2376779846659366e-05, "loss": 2.2855, "step": 565 }, { "epoch": 0.06, "grad_norm": 1.1248654002565333, "learning_rate": 1.2486308871851041e-05, "loss": 2.2701, "step": 570 }, { "epoch": 0.06, "grad_norm": 1.1340544458008102, "learning_rate": 1.2595837897042718e-05, "loss": 2.2597, "step": 575 }, { "epoch": 0.06, "grad_norm": 1.3464735680621647, "learning_rate": 1.2705366922234393e-05, "loss": 2.2595, "step": 580 }, { "epoch": 0.06, "grad_norm": 1.201259618294619, "learning_rate": 1.281489594742607e-05, "loss": 2.2659, "step": 585 }, { "epoch": 0.06, "grad_norm": 1.1474577728722304, "learning_rate": 1.2924424972617743e-05, "loss": 2.335, "step": 590 }, { "epoch": 0.07, "grad_norm": 1.2748140428929302, "learning_rate": 1.303395399780942e-05, "loss": 2.2838, "step": 595 }, { "epoch": 0.07, "grad_norm": 1.2300405090588766, "learning_rate": 1.3143483023001096e-05, "loss": 2.2868, "step": 600 }, { "epoch": 0.07, "grad_norm": 1.55416503784747, "learning_rate": 1.3253012048192772e-05, "loss": 2.305, "step": 605 }, { "epoch": 0.07, "grad_norm": 1.1618363380121337, "learning_rate": 1.3362541073384448e-05, "loss": 2.256, "step": 610 }, { "epoch": 0.07, "grad_norm": 1.1753631165071854, "learning_rate": 1.3472070098576125e-05, "loss": 2.2245, "step": 615 }, { "epoch": 0.07, "grad_norm": 1.1516961099300926, "learning_rate": 1.35815991237678e-05, "loss": 2.2525, "step": 620 }, { "epoch": 0.07, "grad_norm": 1.1481446018522314, "learning_rate": 1.3691128148959477e-05, "loss": 2.3051, "step": 625 }, { "epoch": 0.07, "grad_norm": 1.2426514524452208, "learning_rate": 1.380065717415115e-05, "loss": 2.2337, "step": 630 }, { "epoch": 0.07, "grad_norm": 1.166245972934172, "learning_rate": 1.3910186199342827e-05, "loss": 2.275, "step": 635 }, { "epoch": 0.07, "grad_norm": 1.2716507755808424, "learning_rate": 1.4019715224534502e-05, "loss": 2.2426, "step": 640 }, { "epoch": 0.07, "grad_norm": 1.2375643814413653, "learning_rate": 1.412924424972618e-05, "loss": 2.2509, "step": 645 }, { "epoch": 0.07, "grad_norm": 1.1745031898000593, "learning_rate": 1.4238773274917854e-05, "loss": 2.3079, "step": 650 }, { "epoch": 0.07, "grad_norm": 1.1131412471977316, "learning_rate": 1.4348302300109531e-05, "loss": 2.2223, "step": 655 }, { "epoch": 0.07, "grad_norm": 1.1876619662923074, "learning_rate": 1.4457831325301207e-05, "loss": 2.2289, "step": 660 }, { "epoch": 0.07, "grad_norm": 1.4454106689019526, "learning_rate": 1.4567360350492883e-05, "loss": 2.2544, "step": 665 }, { "epoch": 0.07, "grad_norm": 1.267098532468749, "learning_rate": 1.4676889375684557e-05, "loss": 2.236, "step": 670 }, { "epoch": 0.07, "grad_norm": 1.1284165057754538, "learning_rate": 1.4786418400876232e-05, "loss": 2.2616, "step": 675 }, { "epoch": 0.07, "grad_norm": 1.2981514282479578, "learning_rate": 1.4895947426067909e-05, "loss": 2.2385, "step": 680 }, { "epoch": 0.08, "grad_norm": 1.3060533521682849, "learning_rate": 1.5005476451259584e-05, "loss": 2.2671, "step": 685 }, { "epoch": 0.08, "grad_norm": 1.1553026185915214, "learning_rate": 1.5115005476451261e-05, "loss": 2.2567, "step": 690 }, { "epoch": 0.08, "grad_norm": 1.16657202455599, "learning_rate": 1.5224534501642936e-05, "loss": 2.2823, "step": 695 }, { "epoch": 0.08, "grad_norm": 1.4468255386565563, "learning_rate": 1.5334063526834613e-05, "loss": 2.21, "step": 700 }, { "epoch": 0.08, "grad_norm": 1.1537008143491212, "learning_rate": 1.5443592552026287e-05, "loss": 2.2251, "step": 705 }, { "epoch": 0.08, "grad_norm": 1.2511272697408526, "learning_rate": 1.5553121577217964e-05, "loss": 2.304, "step": 710 }, { "epoch": 0.08, "grad_norm": 1.1782312599092204, "learning_rate": 1.566265060240964e-05, "loss": 2.2637, "step": 715 }, { "epoch": 0.08, "grad_norm": 1.1719573626771, "learning_rate": 1.5772179627601317e-05, "loss": 2.2471, "step": 720 }, { "epoch": 0.08, "grad_norm": 1.2758530323597597, "learning_rate": 1.588170865279299e-05, "loss": 2.2537, "step": 725 }, { "epoch": 0.08, "grad_norm": 1.2102102523114282, "learning_rate": 1.5991237677984668e-05, "loss": 2.2223, "step": 730 }, { "epoch": 0.08, "grad_norm": 1.2763910458268635, "learning_rate": 1.610076670317634e-05, "loss": 2.2548, "step": 735 }, { "epoch": 0.08, "grad_norm": 1.2457262503933173, "learning_rate": 1.6210295728368018e-05, "loss": 2.2393, "step": 740 }, { "epoch": 0.08, "grad_norm": 1.193830237256192, "learning_rate": 1.6319824753559695e-05, "loss": 2.1955, "step": 745 }, { "epoch": 0.08, "grad_norm": 1.4183987890961753, "learning_rate": 1.6429353778751372e-05, "loss": 2.2183, "step": 750 }, { "epoch": 0.08, "grad_norm": 1.2816704626777247, "learning_rate": 1.6538882803943046e-05, "loss": 2.249, "step": 755 }, { "epoch": 0.08, "grad_norm": 1.1595928509438225, "learning_rate": 1.6648411829134722e-05, "loss": 2.2589, "step": 760 }, { "epoch": 0.08, "grad_norm": 1.2840009884205277, "learning_rate": 1.67579408543264e-05, "loss": 2.2514, "step": 765 }, { "epoch": 0.08, "grad_norm": 2.03150431501068, "learning_rate": 1.6867469879518076e-05, "loss": 2.2249, "step": 770 }, { "epoch": 0.08, "grad_norm": 1.16489971988164, "learning_rate": 1.697699890470975e-05, "loss": 2.2059, "step": 775 }, { "epoch": 0.09, "grad_norm": 1.2193641387800866, "learning_rate": 1.7086527929901423e-05, "loss": 2.2994, "step": 780 }, { "epoch": 0.09, "grad_norm": 1.1257161143895573, "learning_rate": 1.71960569550931e-05, "loss": 2.2882, "step": 785 }, { "epoch": 0.09, "grad_norm": 1.1477618256029176, "learning_rate": 1.7305585980284777e-05, "loss": 2.2131, "step": 790 }, { "epoch": 0.09, "grad_norm": 1.2026381186201662, "learning_rate": 1.7415115005476454e-05, "loss": 2.299, "step": 795 }, { "epoch": 0.09, "grad_norm": 1.234175602784883, "learning_rate": 1.7524644030668127e-05, "loss": 2.2433, "step": 800 }, { "epoch": 0.09, "grad_norm": 1.0843009897002622, "learning_rate": 1.7634173055859804e-05, "loss": 2.2015, "step": 805 }, { "epoch": 0.09, "grad_norm": 1.1140449548949825, "learning_rate": 1.7743702081051478e-05, "loss": 2.2543, "step": 810 }, { "epoch": 0.09, "grad_norm": 1.2420230250878992, "learning_rate": 1.7853231106243155e-05, "loss": 2.2425, "step": 815 }, { "epoch": 0.09, "grad_norm": 1.5252747293594222, "learning_rate": 1.796276013143483e-05, "loss": 2.1847, "step": 820 }, { "epoch": 0.09, "grad_norm": 1.2089005088247822, "learning_rate": 1.807228915662651e-05, "loss": 2.2589, "step": 825 }, { "epoch": 0.09, "grad_norm": 1.1638188133313738, "learning_rate": 1.8181818181818182e-05, "loss": 2.2142, "step": 830 }, { "epoch": 0.09, "grad_norm": 1.1625892772432, "learning_rate": 1.829134720700986e-05, "loss": 2.1924, "step": 835 }, { "epoch": 0.09, "grad_norm": 1.240385575559641, "learning_rate": 1.8400876232201536e-05, "loss": 2.1984, "step": 840 }, { "epoch": 0.09, "grad_norm": 1.2085963838298415, "learning_rate": 1.8510405257393213e-05, "loss": 2.2342, "step": 845 }, { "epoch": 0.09, "grad_norm": 1.206402555369941, "learning_rate": 1.8619934282584886e-05, "loss": 2.26, "step": 850 }, { "epoch": 0.09, "grad_norm": 1.1113251756677591, "learning_rate": 1.8729463307776563e-05, "loss": 2.2206, "step": 855 }, { "epoch": 0.09, "grad_norm": 1.236608319460818, "learning_rate": 1.8838992332968237e-05, "loss": 2.2279, "step": 860 }, { "epoch": 0.09, "grad_norm": 1.1471395956118593, "learning_rate": 1.8948521358159914e-05, "loss": 2.2353, "step": 865 }, { "epoch": 0.1, "grad_norm": 1.2296954546043617, "learning_rate": 1.905805038335159e-05, "loss": 2.2185, "step": 870 }, { "epoch": 0.1, "grad_norm": 1.2273521611996157, "learning_rate": 1.9167579408543267e-05, "loss": 2.2194, "step": 875 }, { "epoch": 0.1, "grad_norm": 1.161107911806466, "learning_rate": 1.927710843373494e-05, "loss": 2.2218, "step": 880 }, { "epoch": 0.1, "grad_norm": 1.1287646059276044, "learning_rate": 1.9386637458926618e-05, "loss": 2.2285, "step": 885 }, { "epoch": 0.1, "grad_norm": 1.2389246894301895, "learning_rate": 1.949616648411829e-05, "loss": 2.1679, "step": 890 }, { "epoch": 0.1, "grad_norm": 1.2487497358643318, "learning_rate": 1.9605695509309968e-05, "loss": 2.2246, "step": 895 }, { "epoch": 0.1, "grad_norm": 1.140649811628765, "learning_rate": 1.9715224534501645e-05, "loss": 2.2121, "step": 900 }, { "epoch": 0.1, "grad_norm": 1.1616800411236698, "learning_rate": 1.982475355969332e-05, "loss": 2.252, "step": 905 }, { "epoch": 0.1, "grad_norm": 1.2068982590836104, "learning_rate": 1.9934282584884995e-05, "loss": 2.2147, "step": 910 }, { "epoch": 0.1, "grad_norm": 1.3493246641935688, "learning_rate": 1.9999997075787978e-05, "loss": 2.188, "step": 915 }, { "epoch": 0.1, "grad_norm": 1.2457350236781515, "learning_rate": 1.9999964178422337e-05, "loss": 2.2527, "step": 920 }, { "epoch": 0.1, "grad_norm": 1.1475402540069277, "learning_rate": 1.999989472854667e-05, "loss": 2.1865, "step": 925 }, { "epoch": 0.1, "grad_norm": 1.1517184531044913, "learning_rate": 1.9999788726414837e-05, "loss": 2.1932, "step": 930 }, { "epoch": 0.1, "grad_norm": 1.2875202122842755, "learning_rate": 1.9999646172414305e-05, "loss": 2.2018, "step": 935 }, { "epoch": 0.1, "grad_norm": 1.193816329696385, "learning_rate": 1.9999467067066145e-05, "loss": 2.2064, "step": 940 }, { "epoch": 0.1, "grad_norm": 1.2339257221118765, "learning_rate": 1.9999251411025034e-05, "loss": 2.1811, "step": 945 }, { "epoch": 0.1, "grad_norm": 1.3551157911644447, "learning_rate": 1.9998999205079253e-05, "loss": 2.2042, "step": 950 }, { "epoch": 0.1, "grad_norm": 1.2539479508523428, "learning_rate": 1.999871045015068e-05, "loss": 2.2418, "step": 955 }, { "epoch": 0.11, "grad_norm": 1.2644180392910132, "learning_rate": 1.999838514729479e-05, "loss": 2.2081, "step": 960 }, { "epoch": 0.11, "grad_norm": 1.1629519842008387, "learning_rate": 1.9998023297700656e-05, "loss": 2.1935, "step": 965 }, { "epoch": 0.11, "grad_norm": 1.1228451183580812, "learning_rate": 1.999762490269093e-05, "loss": 2.1883, "step": 970 }, { "epoch": 0.11, "grad_norm": 1.2736740918380058, "learning_rate": 1.9997189963721845e-05, "loss": 2.2326, "step": 975 }, { "epoch": 0.11, "grad_norm": 1.1313355437549668, "learning_rate": 1.999671848238323e-05, "loss": 2.194, "step": 980 }, { "epoch": 0.11, "grad_norm": 1.1568079603438357, "learning_rate": 1.9996210460398464e-05, "loss": 2.1612, "step": 985 }, { "epoch": 0.11, "grad_norm": 1.160840723365268, "learning_rate": 1.9995665899624505e-05, "loss": 2.1654, "step": 990 }, { "epoch": 0.11, "grad_norm": 1.1253356054725323, "learning_rate": 1.9995084802051866e-05, "loss": 2.1942, "step": 995 }, { "epoch": 0.11, "grad_norm": 1.2131059936469133, "learning_rate": 1.9994467169804613e-05, "loss": 2.2077, "step": 1000 }, { "epoch": 0.11, "grad_norm": 1.3236738278500673, "learning_rate": 1.9993813005140355e-05, "loss": 2.2317, "step": 1005 }, { "epoch": 0.11, "grad_norm": 1.340753968715426, "learning_rate": 1.9993122310450237e-05, "loss": 2.199, "step": 1010 }, { "epoch": 0.11, "grad_norm": 1.178739785509154, "learning_rate": 1.999239508825893e-05, "loss": 2.2273, "step": 1015 }, { "epoch": 0.11, "grad_norm": 1.1793847697492794, "learning_rate": 1.999163134122462e-05, "loss": 2.1904, "step": 1020 }, { "epoch": 0.11, "grad_norm": 1.2148673126306246, "learning_rate": 1.9990831072139008e-05, "loss": 2.1747, "step": 1025 }, { "epoch": 0.11, "grad_norm": 1.1559661051994823, "learning_rate": 1.9989994283927287e-05, "loss": 2.1984, "step": 1030 }, { "epoch": 0.11, "grad_norm": 1.2862265073331878, "learning_rate": 1.998912097964814e-05, "loss": 2.1959, "step": 1035 }, { "epoch": 0.11, "grad_norm": 1.2280755658043683, "learning_rate": 1.9988211162493726e-05, "loss": 2.1771, "step": 1040 }, { "epoch": 0.11, "grad_norm": 1.2191888826436497, "learning_rate": 1.9987264835789665e-05, "loss": 2.195, "step": 1045 }, { "epoch": 0.12, "grad_norm": 1.2363385392991006, "learning_rate": 1.998628200299503e-05, "loss": 2.2165, "step": 1050 }, { "epoch": 0.12, "grad_norm": 1.1435286897705585, "learning_rate": 1.9985262667702336e-05, "loss": 2.1768, "step": 1055 }, { "epoch": 0.12, "grad_norm": 1.17647172890328, "learning_rate": 1.998420683363752e-05, "loss": 2.2194, "step": 1060 }, { "epoch": 0.12, "grad_norm": 1.137748781528574, "learning_rate": 1.9983114504659943e-05, "loss": 2.2248, "step": 1065 }, { "epoch": 0.12, "grad_norm": 1.387682828701235, "learning_rate": 1.9981985684762344e-05, "loss": 2.1904, "step": 1070 }, { "epoch": 0.12, "grad_norm": 1.2609906857987274, "learning_rate": 1.9980820378070867e-05, "loss": 2.2058, "step": 1075 }, { "epoch": 0.12, "grad_norm": 1.1731722774711626, "learning_rate": 1.997961858884501e-05, "loss": 2.2528, "step": 1080 }, { "epoch": 0.12, "grad_norm": 1.262110440110577, "learning_rate": 1.9978380321477634e-05, "loss": 2.22, "step": 1085 }, { "epoch": 0.12, "grad_norm": 1.1322184236114696, "learning_rate": 1.997710558049493e-05, "loss": 2.2231, "step": 1090 }, { "epoch": 0.12, "grad_norm": 1.1989904161125964, "learning_rate": 1.997579437055642e-05, "loss": 2.1633, "step": 1095 }, { "epoch": 0.12, "grad_norm": 1.2917506346027867, "learning_rate": 1.997444669645491e-05, "loss": 2.1932, "step": 1100 }, { "epoch": 0.12, "grad_norm": 1.1282675601218313, "learning_rate": 1.9973062563116515e-05, "loss": 2.1939, "step": 1105 }, { "epoch": 0.12, "grad_norm": 1.1709774900349155, "learning_rate": 1.9971641975600608e-05, "loss": 2.1311, "step": 1110 }, { "epoch": 0.12, "grad_norm": 1.238025894658575, "learning_rate": 1.9970184939099805e-05, "loss": 2.1988, "step": 1115 }, { "epoch": 0.12, "grad_norm": 1.3054601511645478, "learning_rate": 1.996869145893996e-05, "loss": 2.1569, "step": 1120 }, { "epoch": 0.12, "grad_norm": 1.201320053200845, "learning_rate": 1.996716154058014e-05, "loss": 2.1794, "step": 1125 }, { "epoch": 0.12, "grad_norm": 1.2336062408126236, "learning_rate": 1.9965595189612605e-05, "loss": 2.2154, "step": 1130 }, { "epoch": 0.12, "grad_norm": 1.1369086873456473, "learning_rate": 1.9963992411762775e-05, "loss": 2.2207, "step": 1135 }, { "epoch": 0.12, "grad_norm": 1.1311000579179211, "learning_rate": 1.9962353212889228e-05, "loss": 2.1515, "step": 1140 }, { "epoch": 0.13, "grad_norm": 1.2476444447168977, "learning_rate": 1.9960677598983672e-05, "loss": 2.1992, "step": 1145 }, { "epoch": 0.13, "grad_norm": 1.1667988181907132, "learning_rate": 1.995896557617091e-05, "loss": 2.2049, "step": 1150 }, { "epoch": 0.13, "grad_norm": 1.1186101088344749, "learning_rate": 1.9957217150708848e-05, "loss": 2.1981, "step": 1155 }, { "epoch": 0.13, "grad_norm": 1.1123034627401065, "learning_rate": 1.9955432328988437e-05, "loss": 2.1627, "step": 1160 }, { "epoch": 0.13, "grad_norm": 1.0629340078477847, "learning_rate": 1.995361111753367e-05, "loss": 2.1334, "step": 1165 }, { "epoch": 0.13, "grad_norm": 1.2583910273678192, "learning_rate": 1.995175352300156e-05, "loss": 2.1798, "step": 1170 }, { "epoch": 0.13, "grad_norm": 1.284356193628631, "learning_rate": 1.99498595521821e-05, "loss": 2.2186, "step": 1175 }, { "epoch": 0.13, "grad_norm": 1.1789269202753125, "learning_rate": 1.9947929211998264e-05, "loss": 2.2193, "step": 1180 }, { "epoch": 0.13, "grad_norm": 1.1430022576477439, "learning_rate": 1.994596250950595e-05, "loss": 2.1227, "step": 1185 }, { "epoch": 0.13, "grad_norm": 1.0937770271959615, "learning_rate": 1.9943959451893967e-05, "loss": 2.1826, "step": 1190 }, { "epoch": 0.13, "grad_norm": 1.2010469373663781, "learning_rate": 1.994192004648403e-05, "loss": 2.2309, "step": 1195 }, { "epoch": 0.13, "grad_norm": 1.194392726794883, "learning_rate": 1.99398443007307e-05, "loss": 2.196, "step": 1200 }, { "epoch": 0.13, "grad_norm": 1.193466684072325, "learning_rate": 1.993773222222138e-05, "loss": 2.1268, "step": 1205 }, { "epoch": 0.13, "grad_norm": 1.0996309822249255, "learning_rate": 1.9935583818676266e-05, "loss": 2.1557, "step": 1210 }, { "epoch": 0.13, "grad_norm": 1.1239291837592627, "learning_rate": 1.9933399097948344e-05, "loss": 2.1073, "step": 1215 }, { "epoch": 0.13, "grad_norm": 1.183838115309417, "learning_rate": 1.9931178068023352e-05, "loss": 2.1279, "step": 1220 }, { "epoch": 0.13, "grad_norm": 1.289612523815178, "learning_rate": 1.9928920737019735e-05, "loss": 2.1494, "step": 1225 }, { "epoch": 0.13, "grad_norm": 1.1086584288788845, "learning_rate": 1.9926627113188634e-05, "loss": 2.1678, "step": 1230 }, { "epoch": 0.14, "grad_norm": 1.2028663399011625, "learning_rate": 1.9924297204913855e-05, "loss": 2.1938, "step": 1235 }, { "epoch": 0.14, "grad_norm": 1.1143994581435561, "learning_rate": 1.9921931020711817e-05, "loss": 2.1948, "step": 1240 }, { "epoch": 0.14, "grad_norm": 1.2921446413570086, "learning_rate": 1.9919528569231556e-05, "loss": 2.1101, "step": 1245 }, { "epoch": 0.14, "grad_norm": 1.1188679219474233, "learning_rate": 1.9917089859254666e-05, "loss": 2.1999, "step": 1250 }, { "epoch": 0.14, "grad_norm": 1.2924658851186663, "learning_rate": 1.991461489969528e-05, "loss": 2.1603, "step": 1255 }, { "epoch": 0.14, "grad_norm": 1.3457737070114664, "learning_rate": 1.991210369960002e-05, "loss": 2.1132, "step": 1260 }, { "epoch": 0.14, "grad_norm": 1.2267118386921139, "learning_rate": 1.9909556268147995e-05, "loss": 2.176, "step": 1265 }, { "epoch": 0.14, "grad_norm": 1.127196867448862, "learning_rate": 1.9906972614650726e-05, "loss": 2.157, "step": 1270 }, { "epoch": 0.14, "grad_norm": 1.1997155604413043, "learning_rate": 1.9904352748552164e-05, "loss": 2.1724, "step": 1275 }, { "epoch": 0.14, "grad_norm": 1.0768356741509026, "learning_rate": 1.9901696679428605e-05, "loss": 2.1848, "step": 1280 }, { "epoch": 0.14, "grad_norm": 1.1099807859967414, "learning_rate": 1.989900441698868e-05, "loss": 2.2003, "step": 1285 }, { "epoch": 0.14, "grad_norm": 1.1234539371421566, "learning_rate": 1.9896275971073326e-05, "loss": 2.2091, "step": 1290 }, { "epoch": 0.14, "grad_norm": 1.1662264084574396, "learning_rate": 1.989351135165573e-05, "loss": 2.1647, "step": 1295 }, { "epoch": 0.14, "grad_norm": 1.205972800658048, "learning_rate": 1.98907105688413e-05, "loss": 2.183, "step": 1300 }, { "epoch": 0.14, "grad_norm": 1.1063162837515004, "learning_rate": 1.9887873632867645e-05, "loss": 2.1176, "step": 1305 }, { "epoch": 0.14, "grad_norm": 1.1710193243788574, "learning_rate": 1.9885000554104516e-05, "loss": 2.1737, "step": 1310 }, { "epoch": 0.14, "grad_norm": 1.196407773137755, "learning_rate": 1.9882091343053767e-05, "loss": 2.1623, "step": 1315 }, { "epoch": 0.14, "grad_norm": 1.1130495770243871, "learning_rate": 1.987914601034934e-05, "loss": 2.1747, "step": 1320 }, { "epoch": 0.15, "grad_norm": 1.1044326697880014, "learning_rate": 1.98761645667572e-05, "loss": 2.1683, "step": 1325 }, { "epoch": 0.15, "grad_norm": 1.1702303526350486, "learning_rate": 1.9873147023175316e-05, "loss": 2.1745, "step": 1330 }, { "epoch": 0.15, "grad_norm": 1.1820618824490956, "learning_rate": 1.9870093390633596e-05, "loss": 2.1285, "step": 1335 }, { "epoch": 0.15, "grad_norm": 1.2848572725783973, "learning_rate": 1.9867003680293884e-05, "loss": 2.1895, "step": 1340 }, { "epoch": 0.15, "grad_norm": 1.271030270872849, "learning_rate": 1.9863877903449883e-05, "loss": 2.2113, "step": 1345 }, { "epoch": 0.15, "grad_norm": 1.079763040913528, "learning_rate": 1.9860716071527136e-05, "loss": 2.127, "step": 1350 }, { "epoch": 0.15, "grad_norm": 1.1212847647023692, "learning_rate": 1.9857518196082964e-05, "loss": 2.153, "step": 1355 }, { "epoch": 0.15, "grad_norm": 1.2373030055616905, "learning_rate": 1.9854284288806458e-05, "loss": 2.1643, "step": 1360 }, { "epoch": 0.15, "grad_norm": 1.134200013865174, "learning_rate": 1.9851014361518397e-05, "loss": 2.165, "step": 1365 }, { "epoch": 0.15, "grad_norm": 1.13885587297294, "learning_rate": 1.984770842617123e-05, "loss": 2.1706, "step": 1370 }, { "epoch": 0.15, "grad_norm": 1.1740613176717045, "learning_rate": 1.9844366494849024e-05, "loss": 2.1996, "step": 1375 }, { "epoch": 0.15, "grad_norm": 1.3370979179123226, "learning_rate": 1.984098857976742e-05, "loss": 2.1335, "step": 1380 }, { "epoch": 0.15, "grad_norm": 1.0860533889922857, "learning_rate": 1.983757469327359e-05, "loss": 2.1118, "step": 1385 }, { "epoch": 0.15, "grad_norm": 1.1497174725225545, "learning_rate": 1.9834124847846193e-05, "loss": 2.1487, "step": 1390 }, { "epoch": 0.15, "grad_norm": 1.1295396446583834, "learning_rate": 1.983063905609532e-05, "loss": 2.1261, "step": 1395 }, { "epoch": 0.15, "grad_norm": 1.1247153235692318, "learning_rate": 1.9827117330762465e-05, "loss": 2.1525, "step": 1400 }, { "epoch": 0.15, "grad_norm": 1.3223969115771739, "learning_rate": 1.9823559684720464e-05, "loss": 2.1929, "step": 1405 }, { "epoch": 0.15, "grad_norm": 1.1883781069399215, "learning_rate": 1.9819966130973446e-05, "loss": 2.1187, "step": 1410 }, { "epoch": 0.16, "grad_norm": 1.2033188937516397, "learning_rate": 1.9816336682656812e-05, "loss": 2.11, "step": 1415 }, { "epoch": 0.16, "grad_norm": 1.1811276276064537, "learning_rate": 1.981267135303714e-05, "loss": 2.1481, "step": 1420 }, { "epoch": 0.16, "grad_norm": 1.150883283131275, "learning_rate": 1.9808970155512187e-05, "loss": 2.1937, "step": 1425 }, { "epoch": 0.16, "grad_norm": 1.2414110963662213, "learning_rate": 1.9805233103610807e-05, "loss": 2.131, "step": 1430 }, { "epoch": 0.16, "grad_norm": 1.1147754864185486, "learning_rate": 1.9801460210992906e-05, "loss": 2.1363, "step": 1435 }, { "epoch": 0.16, "grad_norm": 1.164702631621878, "learning_rate": 1.9797651491449408e-05, "loss": 2.1707, "step": 1440 }, { "epoch": 0.16, "grad_norm": 1.2062967804036584, "learning_rate": 1.9793806958902188e-05, "loss": 2.2183, "step": 1445 }, { "epoch": 0.16, "grad_norm": 1.0900002315075563, "learning_rate": 1.9789926627404022e-05, "loss": 2.1713, "step": 1450 }, { "epoch": 0.16, "grad_norm": 1.2195505001027207, "learning_rate": 1.9786010511138556e-05, "loss": 2.1673, "step": 1455 }, { "epoch": 0.16, "grad_norm": 1.2877836043676791, "learning_rate": 1.9782058624420224e-05, "loss": 2.1987, "step": 1460 }, { "epoch": 0.16, "grad_norm": 1.143092498391585, "learning_rate": 1.9778070981694216e-05, "loss": 2.1353, "step": 1465 }, { "epoch": 0.16, "grad_norm": 1.1703405060999144, "learning_rate": 1.9774047597536417e-05, "loss": 2.1222, "step": 1470 }, { "epoch": 0.16, "grad_norm": 1.132210649496107, "learning_rate": 1.9769988486653363e-05, "loss": 2.1752, "step": 1475 }, { "epoch": 0.16, "grad_norm": 1.2428013203329145, "learning_rate": 1.9765893663882175e-05, "loss": 2.1364, "step": 1480 }, { "epoch": 0.16, "grad_norm": 1.1411141271082412, "learning_rate": 1.976176314419051e-05, "loss": 2.144, "step": 1485 }, { "epoch": 0.16, "grad_norm": 1.2828907027436938, "learning_rate": 1.975759694267651e-05, "loss": 2.0953, "step": 1490 }, { "epoch": 0.16, "grad_norm": 1.1569075673521425, "learning_rate": 1.975339507456874e-05, "loss": 2.1958, "step": 1495 }, { "epoch": 0.16, "grad_norm": 1.2119519079479597, "learning_rate": 1.974915755522614e-05, "loss": 2.1997, "step": 1500 }, { "epoch": 0.16, "grad_norm": 1.1512583248064905, "learning_rate": 1.974488440013796e-05, "loss": 2.2144, "step": 1505 }, { "epoch": 0.17, "grad_norm": 1.1283001713623226, "learning_rate": 1.9740575624923714e-05, "loss": 2.1446, "step": 1510 }, { "epoch": 0.17, "grad_norm": 1.1886296144695103, "learning_rate": 1.9736231245333106e-05, "loss": 2.1526, "step": 1515 }, { "epoch": 0.17, "grad_norm": 1.1010780391996788, "learning_rate": 1.9731851277246e-05, "loss": 2.1248, "step": 1520 }, { "epoch": 0.17, "grad_norm": 1.1372039963599885, "learning_rate": 1.972743573667233e-05, "loss": 2.0866, "step": 1525 }, { "epoch": 0.17, "grad_norm": 1.1513798932127701, "learning_rate": 1.9722984639752063e-05, "loss": 2.0999, "step": 1530 }, { "epoch": 0.17, "grad_norm": 1.252655552559543, "learning_rate": 1.9718498002755136e-05, "loss": 2.1258, "step": 1535 }, { "epoch": 0.17, "grad_norm": 1.1291972680147382, "learning_rate": 1.9713975842081392e-05, "loss": 2.1398, "step": 1540 }, { "epoch": 0.17, "grad_norm": 1.3086217790740327, "learning_rate": 1.9709418174260523e-05, "loss": 2.0917, "step": 1545 }, { "epoch": 0.17, "grad_norm": 1.2232447925144458, "learning_rate": 1.9704825015952005e-05, "loss": 2.1566, "step": 1550 }, { "epoch": 0.17, "grad_norm": 1.1927086992976819, "learning_rate": 1.970019638394505e-05, "loss": 2.1288, "step": 1555 }, { "epoch": 0.17, "grad_norm": 1.094895962631647, "learning_rate": 1.9695532295158526e-05, "loss": 2.1523, "step": 1560 }, { "epoch": 0.17, "grad_norm": 1.1695988967937028, "learning_rate": 1.969083276664091e-05, "loss": 2.2153, "step": 1565 }, { "epoch": 0.17, "grad_norm": 1.1067474180445056, "learning_rate": 1.9686097815570223e-05, "loss": 2.186, "step": 1570 }, { "epoch": 0.17, "grad_norm": 1.1166995659576118, "learning_rate": 1.9681327459253957e-05, "loss": 2.1264, "step": 1575 }, { "epoch": 0.17, "grad_norm": 1.1496695628819085, "learning_rate": 1.9676521715129027e-05, "loss": 2.172, "step": 1580 }, { "epoch": 0.17, "grad_norm": 1.1380162014727886, "learning_rate": 1.9671680600761694e-05, "loss": 2.1532, "step": 1585 }, { "epoch": 0.17, "grad_norm": 1.2423652595025285, "learning_rate": 1.9666804133847517e-05, "loss": 2.101, "step": 1590 }, { "epoch": 0.17, "grad_norm": 1.0961305645996358, "learning_rate": 1.9661892332211266e-05, "loss": 2.1025, "step": 1595 }, { "epoch": 0.18, "grad_norm": 1.1037295694438627, "learning_rate": 1.965694521380687e-05, "loss": 2.1198, "step": 1600 }, { "epoch": 0.18, "grad_norm": 1.1308568400113361, "learning_rate": 1.9651962796717354e-05, "loss": 2.1481, "step": 1605 }, { "epoch": 0.18, "grad_norm": 1.1240200456927807, "learning_rate": 1.9646945099154774e-05, "loss": 2.131, "step": 1610 }, { "epoch": 0.18, "grad_norm": 1.1062978856891295, "learning_rate": 1.9641892139460133e-05, "loss": 2.1363, "step": 1615 }, { "epoch": 0.18, "grad_norm": 1.2299542940303656, "learning_rate": 1.9636803936103333e-05, "loss": 2.1712, "step": 1620 }, { "epoch": 0.18, "grad_norm": 1.2994488849289871, "learning_rate": 1.963168050768311e-05, "loss": 2.1109, "step": 1625 }, { "epoch": 0.18, "grad_norm": 1.1925534029209723, "learning_rate": 1.962652187292694e-05, "loss": 2.1485, "step": 1630 }, { "epoch": 0.18, "grad_norm": 1.2705165031291825, "learning_rate": 1.9621328050691e-05, "loss": 2.1139, "step": 1635 }, { "epoch": 0.18, "grad_norm": 1.1287055867909601, "learning_rate": 1.9616099059960077e-05, "loss": 2.1781, "step": 1640 }, { "epoch": 0.18, "grad_norm": 1.09737698264014, "learning_rate": 1.9610834919847513e-05, "loss": 2.084, "step": 1645 }, { "epoch": 0.18, "grad_norm": 1.1432990314427203, "learning_rate": 1.9605535649595137e-05, "loss": 2.1502, "step": 1650 }, { "epoch": 0.18, "grad_norm": 1.135333987140668, "learning_rate": 1.9600201268573177e-05, "loss": 2.1238, "step": 1655 }, { "epoch": 0.18, "grad_norm": 1.114528249483071, "learning_rate": 1.9594831796280202e-05, "loss": 2.112, "step": 1660 }, { "epoch": 0.18, "grad_norm": 1.091993418592747, "learning_rate": 1.9589427252343054e-05, "loss": 2.121, "step": 1665 }, { "epoch": 0.18, "grad_norm": 1.1248064219465874, "learning_rate": 1.958398765651677e-05, "loss": 2.1422, "step": 1670 }, { "epoch": 0.18, "grad_norm": 1.1103240742794478, "learning_rate": 1.9578513028684508e-05, "loss": 2.1725, "step": 1675 }, { "epoch": 0.18, "grad_norm": 1.1425138081769493, "learning_rate": 1.9573003388857476e-05, "loss": 2.2014, "step": 1680 }, { "epoch": 0.18, "grad_norm": 1.1549445112466585, "learning_rate": 1.9567458757174865e-05, "loss": 2.0555, "step": 1685 }, { "epoch": 0.19, "grad_norm": 1.1849870951831714, "learning_rate": 1.9561879153903768e-05, "loss": 2.1726, "step": 1690 }, { "epoch": 0.19, "grad_norm": 1.1670209546352508, "learning_rate": 1.9556264599439107e-05, "loss": 2.1479, "step": 1695 }, { "epoch": 0.19, "grad_norm": 1.1118382496915291, "learning_rate": 1.9550615114303566e-05, "loss": 2.1416, "step": 1700 }, { "epoch": 0.19, "grad_norm": 1.1023506226464763, "learning_rate": 1.95449307191475e-05, "loss": 2.214, "step": 1705 }, { "epoch": 0.19, "grad_norm": 1.180992657784109, "learning_rate": 1.953921143474888e-05, "loss": 2.1353, "step": 1710 }, { "epoch": 0.19, "grad_norm": 1.1215565828732512, "learning_rate": 1.9533457282013193e-05, "loss": 2.1167, "step": 1715 }, { "epoch": 0.19, "grad_norm": 1.0474066992062003, "learning_rate": 1.9527668281973394e-05, "loss": 2.1326, "step": 1720 }, { "epoch": 0.19, "grad_norm": 1.1148958861099467, "learning_rate": 1.95218444557898e-05, "loss": 2.1402, "step": 1725 }, { "epoch": 0.19, "grad_norm": 1.081892843448702, "learning_rate": 1.951598582475004e-05, "loss": 2.1501, "step": 1730 }, { "epoch": 0.19, "grad_norm": 1.1846740771185376, "learning_rate": 1.951009241026896e-05, "loss": 2.1355, "step": 1735 }, { "epoch": 0.19, "grad_norm": 1.042686529886763, "learning_rate": 1.9504164233888535e-05, "loss": 2.108, "step": 1740 }, { "epoch": 0.19, "grad_norm": 1.1931750331078463, "learning_rate": 1.949820131727783e-05, "loss": 2.1524, "step": 1745 }, { "epoch": 0.19, "grad_norm": 1.176365319726279, "learning_rate": 1.9492203682232866e-05, "loss": 2.1418, "step": 1750 }, { "epoch": 0.19, "grad_norm": 1.204171818440874, "learning_rate": 1.9486171350676594e-05, "loss": 2.1193, "step": 1755 }, { "epoch": 0.19, "grad_norm": 1.1413376872729701, "learning_rate": 1.948010434465877e-05, "loss": 2.1614, "step": 1760 }, { "epoch": 0.19, "grad_norm": 1.1745394195863148, "learning_rate": 1.947400268635591e-05, "loss": 2.1944, "step": 1765 }, { "epoch": 0.19, "grad_norm": 1.1049785833241939, "learning_rate": 1.9467866398071185e-05, "loss": 2.193, "step": 1770 }, { "epoch": 0.19, "grad_norm": 1.1692992636706039, "learning_rate": 1.9461695502234346e-05, "loss": 2.1002, "step": 1775 }, { "epoch": 0.19, "grad_norm": 1.2535842868561684, "learning_rate": 1.945549002140165e-05, "loss": 2.1094, "step": 1780 }, { "epoch": 0.2, "grad_norm": 1.2928161755410037, "learning_rate": 1.944924997825577e-05, "loss": 2.1235, "step": 1785 }, { "epoch": 0.2, "grad_norm": 1.179969391824463, "learning_rate": 1.9442975395605706e-05, "loss": 2.1634, "step": 1790 }, { "epoch": 0.2, "grad_norm": 1.1125158291897195, "learning_rate": 1.9436666296386718e-05, "loss": 2.1635, "step": 1795 }, { "epoch": 0.2, "grad_norm": 1.1404588546413061, "learning_rate": 1.943032270366023e-05, "loss": 2.1505, "step": 1800 }, { "epoch": 0.2, "grad_norm": 1.2542460643581737, "learning_rate": 1.942394464061375e-05, "loss": 2.1385, "step": 1805 }, { "epoch": 0.2, "grad_norm": 1.198560735785274, "learning_rate": 1.9417532130560784e-05, "loss": 2.1764, "step": 1810 }, { "epoch": 0.2, "grad_norm": 1.1580507590203666, "learning_rate": 1.9411085196940744e-05, "loss": 2.1333, "step": 1815 }, { "epoch": 0.2, "grad_norm": 1.1623695755952121, "learning_rate": 1.940460386331888e-05, "loss": 2.1211, "step": 1820 }, { "epoch": 0.2, "grad_norm": 1.1403881272396452, "learning_rate": 1.9398088153386175e-05, "loss": 2.1528, "step": 1825 }, { "epoch": 0.2, "grad_norm": 1.092466841155982, "learning_rate": 1.939153809095927e-05, "loss": 2.0924, "step": 1830 }, { "epoch": 0.2, "grad_norm": 1.0693788058346385, "learning_rate": 1.9384953699980373e-05, "loss": 2.0863, "step": 1835 }, { "epoch": 0.2, "grad_norm": 1.1486353509142142, "learning_rate": 1.9378335004517175e-05, "loss": 2.1202, "step": 1840 }, { "epoch": 0.2, "grad_norm": 1.1862082223973176, "learning_rate": 1.9371682028762752e-05, "loss": 2.1244, "step": 1845 }, { "epoch": 0.2, "grad_norm": 1.3228211906963443, "learning_rate": 1.936499479703549e-05, "loss": 2.1058, "step": 1850 }, { "epoch": 0.2, "grad_norm": 1.252087326867817, "learning_rate": 1.9358273333778988e-05, "loss": 2.1004, "step": 1855 }, { "epoch": 0.2, "grad_norm": 1.0895089741627582, "learning_rate": 1.9351517663561964e-05, "loss": 2.1697, "step": 1860 }, { "epoch": 0.2, "grad_norm": 1.2218561685590001, "learning_rate": 1.9344727811078183e-05, "loss": 2.0952, "step": 1865 }, { "epoch": 0.2, "grad_norm": 1.093594383727882, "learning_rate": 1.933790380114635e-05, "loss": 2.1269, "step": 1870 }, { "epoch": 0.21, "grad_norm": 1.3439872266899549, "learning_rate": 1.933104565871001e-05, "loss": 2.1496, "step": 1875 }, { "epoch": 0.21, "grad_norm": 1.1803082152312334, "learning_rate": 1.93241534088375e-05, "loss": 2.1286, "step": 1880 }, { "epoch": 0.21, "grad_norm": 1.1528175280602568, "learning_rate": 1.9317227076721807e-05, "loss": 2.1364, "step": 1885 }, { "epoch": 0.21, "grad_norm": 1.2670164430069262, "learning_rate": 1.9310266687680504e-05, "loss": 2.1542, "step": 1890 }, { "epoch": 0.21, "grad_norm": 1.1550941795525151, "learning_rate": 1.9303272267155657e-05, "loss": 2.1404, "step": 1895 }, { "epoch": 0.21, "grad_norm": 1.129242610039014, "learning_rate": 1.9296243840713713e-05, "loss": 2.1791, "step": 1900 }, { "epoch": 0.21, "grad_norm": 1.2546435551178852, "learning_rate": 1.9289181434045428e-05, "loss": 2.1138, "step": 1905 }, { "epoch": 0.21, "grad_norm": 1.1163295614909128, "learning_rate": 1.9282085072965765e-05, "loss": 2.1286, "step": 1910 }, { "epoch": 0.21, "grad_norm": 1.1718732284308668, "learning_rate": 1.92749547834138e-05, "loss": 2.1928, "step": 1915 }, { "epoch": 0.21, "grad_norm": 1.1396461785239915, "learning_rate": 1.926779059145262e-05, "loss": 2.1216, "step": 1920 }, { "epoch": 0.21, "grad_norm": 1.1199191588467758, "learning_rate": 1.9260592523269245e-05, "loss": 2.1474, "step": 1925 }, { "epoch": 0.21, "grad_norm": 1.1272449181337518, "learning_rate": 1.9253360605174505e-05, "loss": 2.0978, "step": 1930 }, { "epoch": 0.21, "grad_norm": 1.1559815863647187, "learning_rate": 1.924609486360298e-05, "loss": 2.1181, "step": 1935 }, { "epoch": 0.21, "grad_norm": 1.189226164353155, "learning_rate": 1.9238795325112867e-05, "loss": 2.1593, "step": 1940 }, { "epoch": 0.21, "grad_norm": 1.2383844883782773, "learning_rate": 1.9231462016385917e-05, "loss": 2.1586, "step": 1945 }, { "epoch": 0.21, "grad_norm": 1.1432012465479817, "learning_rate": 1.92240949642273e-05, "loss": 2.1241, "step": 1950 }, { "epoch": 0.21, "grad_norm": 1.181736234596286, "learning_rate": 1.921669419556554e-05, "loss": 2.1077, "step": 1955 }, { "epoch": 0.21, "grad_norm": 1.118768199280119, "learning_rate": 1.9209259737452407e-05, "loss": 2.1234, "step": 1960 }, { "epoch": 0.22, "grad_norm": 1.218726227477995, "learning_rate": 1.92017916170628e-05, "loss": 2.1419, "step": 1965 }, { "epoch": 0.22, "grad_norm": 1.0932754009432817, "learning_rate": 1.9194289861694674e-05, "loss": 2.0897, "step": 1970 }, { "epoch": 0.22, "grad_norm": 1.2012163874801354, "learning_rate": 1.9186754498768932e-05, "loss": 2.0861, "step": 1975 }, { "epoch": 0.22, "grad_norm": 1.1616200650319204, "learning_rate": 1.9179185555829308e-05, "loss": 2.1775, "step": 1980 }, { "epoch": 0.22, "grad_norm": 1.1285963840420035, "learning_rate": 1.9171583060542288e-05, "loss": 2.0827, "step": 1985 }, { "epoch": 0.22, "grad_norm": 1.2126042954483498, "learning_rate": 1.9163947040697006e-05, "loss": 2.1688, "step": 1990 }, { "epoch": 0.22, "grad_norm": 1.1330659367868618, "learning_rate": 1.9156277524205125e-05, "loss": 2.1293, "step": 1995 }, { "epoch": 0.22, "grad_norm": 1.0927014421196581, "learning_rate": 1.914857453910076e-05, "loss": 2.1612, "step": 2000 }, { "epoch": 0.22, "grad_norm": 1.1920487120297107, "learning_rate": 1.9140838113540347e-05, "loss": 2.1296, "step": 2005 }, { "epoch": 0.22, "grad_norm": 1.166161841291995, "learning_rate": 1.9133068275802575e-05, "loss": 2.1176, "step": 2010 }, { "epoch": 0.22, "grad_norm": 1.0789326573726574, "learning_rate": 1.912526505428824e-05, "loss": 2.1073, "step": 2015 }, { "epoch": 0.22, "grad_norm": 1.0388561300524752, "learning_rate": 1.911742847752019e-05, "loss": 2.1197, "step": 2020 }, { "epoch": 0.22, "grad_norm": 1.1863728854849205, "learning_rate": 1.9109558574143173e-05, "loss": 2.1351, "step": 2025 }, { "epoch": 0.22, "grad_norm": 1.0941604821519817, "learning_rate": 1.9101655372923772e-05, "loss": 2.1023, "step": 2030 }, { "epoch": 0.22, "grad_norm": 1.1549005373606414, "learning_rate": 1.909371890275027e-05, "loss": 2.1476, "step": 2035 }, { "epoch": 0.22, "grad_norm": 1.1131212169930467, "learning_rate": 1.908574919263256e-05, "loss": 2.1031, "step": 2040 }, { "epoch": 0.22, "grad_norm": 1.0678944767945526, "learning_rate": 1.907774627170204e-05, "loss": 2.1399, "step": 2045 }, { "epoch": 0.22, "grad_norm": 1.070767882878616, "learning_rate": 1.9069710169211503e-05, "loss": 2.1137, "step": 2050 }, { "epoch": 0.23, "grad_norm": 1.1018720104191884, "learning_rate": 1.9061640914535022e-05, "loss": 2.0953, "step": 2055 }, { "epoch": 0.23, "grad_norm": 1.2158145192753356, "learning_rate": 1.905353853716785e-05, "loss": 2.1588, "step": 2060 }, { "epoch": 0.23, "grad_norm": 1.1475939009679483, "learning_rate": 1.9045403066726325e-05, "loss": 2.1178, "step": 2065 }, { "epoch": 0.23, "grad_norm": 1.177353298617741, "learning_rate": 1.9037234532947737e-05, "loss": 2.1454, "step": 2070 }, { "epoch": 0.23, "grad_norm": 1.06635711983871, "learning_rate": 1.902903296569023e-05, "loss": 2.0752, "step": 2075 }, { "epoch": 0.23, "grad_norm": 1.2524346927327616, "learning_rate": 1.90207983949327e-05, "loss": 2.0823, "step": 2080 }, { "epoch": 0.23, "grad_norm": 1.2634741268049525, "learning_rate": 1.9012530850774678e-05, "loss": 2.1105, "step": 2085 }, { "epoch": 0.23, "grad_norm": 1.149695459455429, "learning_rate": 1.9004230363436226e-05, "loss": 2.1328, "step": 2090 }, { "epoch": 0.23, "grad_norm": 1.0846058120749036, "learning_rate": 1.899589696325781e-05, "loss": 2.0999, "step": 2095 }, { "epoch": 0.23, "grad_norm": 1.1597736224074298, "learning_rate": 1.898753068070021e-05, "loss": 2.1323, "step": 2100 }, { "epoch": 0.23, "grad_norm": 1.1113436604930973, "learning_rate": 1.8979131546344404e-05, "loss": 2.1627, "step": 2105 }, { "epoch": 0.23, "grad_norm": 1.2372661995144008, "learning_rate": 1.8970699590891443e-05, "loss": 2.1373, "step": 2110 }, { "epoch": 0.23, "grad_norm": 1.1760521442769696, "learning_rate": 1.896223484516235e-05, "loss": 2.0827, "step": 2115 }, { "epoch": 0.23, "grad_norm": 1.0766347096958273, "learning_rate": 1.8953737340098005e-05, "loss": 2.1181, "step": 2120 }, { "epoch": 0.23, "grad_norm": 1.1250504259250924, "learning_rate": 1.8945207106759032e-05, "loss": 2.1315, "step": 2125 }, { "epoch": 0.23, "grad_norm": 1.0810927678881945, "learning_rate": 1.8936644176325695e-05, "loss": 2.1555, "step": 2130 }, { "epoch": 0.23, "grad_norm": 1.0827482529616157, "learning_rate": 1.8928048580097758e-05, "loss": 2.1411, "step": 2135 }, { "epoch": 0.23, "grad_norm": 1.1706049059113375, "learning_rate": 1.8919420349494395e-05, "loss": 2.125, "step": 2140 }, { "epoch": 0.23, "grad_norm": 1.138534700244326, "learning_rate": 1.8910759516054074e-05, "loss": 2.1112, "step": 2145 }, { "epoch": 0.24, "grad_norm": 1.062386775633157, "learning_rate": 1.890206611143442e-05, "loss": 2.127, "step": 2150 }, { "epoch": 0.24, "grad_norm": 1.2446352368372686, "learning_rate": 1.8893340167412135e-05, "loss": 2.0958, "step": 2155 }, { "epoch": 0.24, "grad_norm": 1.1037101991351739, "learning_rate": 1.8884581715882838e-05, "loss": 2.1106, "step": 2160 }, { "epoch": 0.24, "grad_norm": 1.1170010238594863, "learning_rate": 1.8875790788860987e-05, "loss": 2.1392, "step": 2165 }, { "epoch": 0.24, "grad_norm": 1.1688334137079393, "learning_rate": 1.8866967418479742e-05, "loss": 2.0793, "step": 2170 }, { "epoch": 0.24, "grad_norm": 1.0872357060702926, "learning_rate": 1.8858111636990845e-05, "loss": 2.1557, "step": 2175 }, { "epoch": 0.24, "grad_norm": 1.1592785013700209, "learning_rate": 1.8849223476764528e-05, "loss": 2.062, "step": 2180 }, { "epoch": 0.24, "grad_norm": 1.1696054807448215, "learning_rate": 1.884030297028936e-05, "loss": 2.1361, "step": 2185 }, { "epoch": 0.24, "grad_norm": 1.1345878797165292, "learning_rate": 1.883135015017214e-05, "loss": 2.1163, "step": 2190 }, { "epoch": 0.24, "grad_norm": 1.114817037784154, "learning_rate": 1.8822365049137796e-05, "loss": 2.1159, "step": 2195 }, { "epoch": 0.24, "grad_norm": 1.0480021604320415, "learning_rate": 1.8813347700029244e-05, "loss": 2.1267, "step": 2200 }, { "epoch": 0.24, "grad_norm": 1.167032746611577, "learning_rate": 1.8804298135807283e-05, "loss": 2.0868, "step": 2205 }, { "epoch": 0.24, "grad_norm": 1.1216916859320072, "learning_rate": 1.8795216389550452e-05, "loss": 2.0786, "step": 2210 }, { "epoch": 0.24, "grad_norm": 1.1550566220005496, "learning_rate": 1.8786102494454936e-05, "loss": 2.1713, "step": 2215 }, { "epoch": 0.24, "grad_norm": 1.2312311516068628, "learning_rate": 1.8776956483834425e-05, "loss": 2.1091, "step": 2220 }, { "epoch": 0.24, "grad_norm": 1.1491883938798069, "learning_rate": 1.8767778391120008e-05, "loss": 2.0842, "step": 2225 }, { "epoch": 0.24, "grad_norm": 1.120983194953082, "learning_rate": 1.8758568249860035e-05, "loss": 2.1457, "step": 2230 }, { "epoch": 0.24, "grad_norm": 1.155683351931537, "learning_rate": 1.8749326093720002e-05, "loss": 2.1085, "step": 2235 }, { "epoch": 0.25, "grad_norm": 1.0547565359436817, "learning_rate": 1.8740051956482435e-05, "loss": 2.107, "step": 2240 }, { "epoch": 0.25, "grad_norm": 1.1891448777593032, "learning_rate": 1.8730745872046755e-05, "loss": 2.1198, "step": 2245 }, { "epoch": 0.25, "grad_norm": 1.2037560164398111, "learning_rate": 1.8721407874429153e-05, "loss": 2.1194, "step": 2250 }, { "epoch": 0.25, "grad_norm": 1.1935090203626975, "learning_rate": 1.8712037997762485e-05, "loss": 2.0971, "step": 2255 }, { "epoch": 0.25, "grad_norm": 1.1672120855442747, "learning_rate": 1.8702636276296114e-05, "loss": 2.0988, "step": 2260 }, { "epoch": 0.25, "grad_norm": 1.162490138451964, "learning_rate": 1.869320274439583e-05, "loss": 2.0656, "step": 2265 }, { "epoch": 0.25, "grad_norm": 1.077132726296442, "learning_rate": 1.8683737436543664e-05, "loss": 2.0945, "step": 2270 }, { "epoch": 0.25, "grad_norm": 1.1342667609898403, "learning_rate": 1.867424038733783e-05, "loss": 2.1266, "step": 2275 }, { "epoch": 0.25, "grad_norm": 1.2246386903387072, "learning_rate": 1.866471163149255e-05, "loss": 2.1185, "step": 2280 }, { "epoch": 0.25, "grad_norm": 1.1963105459764347, "learning_rate": 1.8655151203837936e-05, "loss": 2.0951, "step": 2285 }, { "epoch": 0.25, "grad_norm": 1.1538691678313253, "learning_rate": 1.8645559139319882e-05, "loss": 2.1157, "step": 2290 }, { "epoch": 0.25, "grad_norm": 1.215794872787627, "learning_rate": 1.8635935472999916e-05, "loss": 2.0422, "step": 2295 }, { "epoch": 0.25, "grad_norm": 1.0870636472917075, "learning_rate": 1.862628024005508e-05, "loss": 2.1331, "step": 2300 }, { "epoch": 0.25, "grad_norm": 1.1572440559963386, "learning_rate": 1.8616593475777795e-05, "loss": 2.1176, "step": 2305 }, { "epoch": 0.25, "grad_norm": 1.0656958693711773, "learning_rate": 1.860687521557575e-05, "loss": 2.0845, "step": 2310 }, { "epoch": 0.25, "grad_norm": 1.1366346096151212, "learning_rate": 1.8597125494971753e-05, "loss": 2.1608, "step": 2315 }, { "epoch": 0.25, "grad_norm": 1.0585753282845514, "learning_rate": 1.85873443496036e-05, "loss": 2.1026, "step": 2320 }, { "epoch": 0.25, "grad_norm": 1.0661646986438136, "learning_rate": 1.8577531815223964e-05, "loss": 2.14, "step": 2325 }, { "epoch": 0.26, "grad_norm": 1.0330273844073252, "learning_rate": 1.8567687927700255e-05, "loss": 2.056, "step": 2330 }, { "epoch": 0.26, "grad_norm": 1.3127890427549427, "learning_rate": 1.8557812723014476e-05, "loss": 2.16, "step": 2335 }, { "epoch": 0.26, "grad_norm": 1.1259013710243255, "learning_rate": 1.854790623726311e-05, "loss": 2.1321, "step": 2340 }, { "epoch": 0.26, "grad_norm": 1.2024471052534482, "learning_rate": 1.8537968506656976e-05, "loss": 2.0844, "step": 2345 }, { "epoch": 0.26, "grad_norm": 1.3687828874090977, "learning_rate": 1.8527999567521107e-05, "loss": 2.1183, "step": 2350 }, { "epoch": 0.26, "grad_norm": 1.1486497463714087, "learning_rate": 1.8517999456294608e-05, "loss": 2.1245, "step": 2355 }, { "epoch": 0.26, "grad_norm": 1.0692892920989003, "learning_rate": 1.8507968209530522e-05, "loss": 2.1264, "step": 2360 }, { "epoch": 0.26, "grad_norm": 1.1318455195164525, "learning_rate": 1.8497905863895715e-05, "loss": 2.1263, "step": 2365 }, { "epoch": 0.26, "grad_norm": 1.0784398384331952, "learning_rate": 1.8487812456170707e-05, "loss": 2.1209, "step": 2370 }, { "epoch": 0.26, "grad_norm": 1.1803937843886272, "learning_rate": 1.8477688023249572e-05, "loss": 2.0745, "step": 2375 }, { "epoch": 0.26, "grad_norm": 1.129358165629013, "learning_rate": 1.846753260213979e-05, "loss": 2.112, "step": 2380 }, { "epoch": 0.26, "grad_norm": 1.0801806225083181, "learning_rate": 1.8457346229962106e-05, "loss": 2.1021, "step": 2385 }, { "epoch": 0.26, "grad_norm": 1.1574659529925975, "learning_rate": 1.8447128943950398e-05, "loss": 2.1394, "step": 2390 }, { "epoch": 0.26, "grad_norm": 1.1528918707857776, "learning_rate": 1.8436880781451545e-05, "loss": 2.1327, "step": 2395 }, { "epoch": 0.26, "grad_norm": 1.0830043193631544, "learning_rate": 1.8426601779925288e-05, "loss": 2.0948, "step": 2400 }, { "epoch": 0.26, "grad_norm": 1.2479415101462448, "learning_rate": 1.8416291976944095e-05, "loss": 2.114, "step": 2405 }, { "epoch": 0.26, "grad_norm": 1.2103533011070207, "learning_rate": 1.8405951410193014e-05, "loss": 2.1411, "step": 2410 }, { "epoch": 0.26, "grad_norm": 1.1695070489979742, "learning_rate": 1.8395580117469554e-05, "loss": 2.1169, "step": 2415 }, { "epoch": 0.27, "grad_norm": 1.11195868518463, "learning_rate": 1.838517813668352e-05, "loss": 2.1108, "step": 2420 }, { "epoch": 0.27, "grad_norm": 1.158928510346745, "learning_rate": 1.8374745505856904e-05, "loss": 2.1203, "step": 2425 }, { "epoch": 0.27, "grad_norm": 1.1216081238407842, "learning_rate": 1.8364282263123728e-05, "loss": 2.0896, "step": 2430 }, { "epoch": 0.27, "grad_norm": 1.077765506589036, "learning_rate": 1.8353788446729907e-05, "loss": 2.1295, "step": 2435 }, { "epoch": 0.27, "grad_norm": 1.1276051343982811, "learning_rate": 1.8343264095033108e-05, "loss": 2.1054, "step": 2440 }, { "epoch": 0.27, "grad_norm": 1.0951532870128755, "learning_rate": 1.8332709246502616e-05, "loss": 2.1203, "step": 2445 }, { "epoch": 0.27, "grad_norm": 1.1276543502408611, "learning_rate": 1.8322123939719197e-05, "loss": 2.1571, "step": 2450 }, { "epoch": 0.27, "grad_norm": 1.088856989783588, "learning_rate": 1.8311508213374932e-05, "loss": 2.0716, "step": 2455 }, { "epoch": 0.27, "grad_norm": 1.1034684467899738, "learning_rate": 1.8300862106273113e-05, "loss": 2.1023, "step": 2460 }, { "epoch": 0.27, "grad_norm": 1.1329651224327326, "learning_rate": 1.8290185657328073e-05, "loss": 2.1035, "step": 2465 }, { "epoch": 0.27, "grad_norm": 1.1210353174650303, "learning_rate": 1.8279478905565048e-05, "loss": 2.1148, "step": 2470 }, { "epoch": 0.27, "grad_norm": 1.1471406803174353, "learning_rate": 1.8268741890120042e-05, "loss": 2.0964, "step": 2475 }, { "epoch": 0.27, "grad_norm": 1.120301684467795, "learning_rate": 1.8257974650239688e-05, "loss": 2.0755, "step": 2480 }, { "epoch": 0.27, "grad_norm": 1.0748158768211187, "learning_rate": 1.8247177225281088e-05, "loss": 2.0897, "step": 2485 }, { "epoch": 0.27, "grad_norm": 1.0861574508929397, "learning_rate": 1.823634965471168e-05, "loss": 2.1411, "step": 2490 }, { "epoch": 0.27, "grad_norm": 1.135442139626173, "learning_rate": 1.82254919781091e-05, "loss": 2.1045, "step": 2495 }, { "epoch": 0.27, "grad_norm": 1.092167433000726, "learning_rate": 1.8214604235161016e-05, "loss": 2.1174, "step": 2500 }, { "epoch": 0.27, "grad_norm": 1.19194626730115, "learning_rate": 1.820368646566501e-05, "loss": 2.1085, "step": 2505 }, { "epoch": 0.27, "grad_norm": 1.1804756013544928, "learning_rate": 1.819273870952841e-05, "loss": 2.1451, "step": 2510 }, { "epoch": 0.28, "grad_norm": 1.1080761475238445, "learning_rate": 1.8181761006768155e-05, "loss": 2.166, "step": 2515 }, { "epoch": 0.28, "grad_norm": 1.3203422941281007, "learning_rate": 1.8170753397510648e-05, "loss": 2.1237, "step": 2520 }, { "epoch": 0.28, "grad_norm": 1.1059412830377124, "learning_rate": 1.8159715921991612e-05, "loss": 2.137, "step": 2525 }, { "epoch": 0.28, "grad_norm": 1.1920043697088563, "learning_rate": 1.814864862055593e-05, "loss": 2.1573, "step": 2530 }, { "epoch": 0.28, "grad_norm": 1.1840285397143258, "learning_rate": 1.8137551533657513e-05, "loss": 2.0798, "step": 2535 }, { "epoch": 0.28, "grad_norm": 1.1439420126020277, "learning_rate": 1.8126424701859146e-05, "loss": 2.1126, "step": 2540 }, { "epoch": 0.28, "grad_norm": 1.096358579786206, "learning_rate": 1.8115268165832336e-05, "loss": 2.1272, "step": 2545 }, { "epoch": 0.28, "grad_norm": 1.115607538300524, "learning_rate": 1.810408196635717e-05, "loss": 2.0727, "step": 2550 }, { "epoch": 0.28, "grad_norm": 1.1467875251543764, "learning_rate": 1.809286614432216e-05, "loss": 2.0833, "step": 2555 }, { "epoch": 0.28, "grad_norm": 1.130474846785207, "learning_rate": 1.8081620740724096e-05, "loss": 2.0804, "step": 2560 }, { "epoch": 0.28, "grad_norm": 1.1681042572098064, "learning_rate": 1.8070345796667903e-05, "loss": 2.0544, "step": 2565 }, { "epoch": 0.28, "grad_norm": 1.1707902773084513, "learning_rate": 1.8059041353366472e-05, "loss": 2.1279, "step": 2570 }, { "epoch": 0.28, "grad_norm": 1.1802353201997353, "learning_rate": 1.804770745214054e-05, "loss": 2.1601, "step": 2575 }, { "epoch": 0.28, "grad_norm": 1.1448050665012182, "learning_rate": 1.8036344134418497e-05, "loss": 2.0771, "step": 2580 }, { "epoch": 0.28, "grad_norm": 1.113708984714782, "learning_rate": 1.8024951441736275e-05, "loss": 2.1056, "step": 2585 }, { "epoch": 0.28, "grad_norm": 1.074520934754304, "learning_rate": 1.801352941573718e-05, "loss": 2.1316, "step": 2590 }, { "epoch": 0.28, "grad_norm": 1.1297174012559132, "learning_rate": 1.8002078098171723e-05, "loss": 2.1256, "step": 2595 }, { "epoch": 0.28, "grad_norm": 1.1365120228484074, "learning_rate": 1.7990597530897502e-05, "loss": 2.1174, "step": 2600 }, { "epoch": 0.29, "grad_norm": 1.3646690087355797, "learning_rate": 1.797908775587902e-05, "loss": 2.1533, "step": 2605 }, { "epoch": 0.29, "grad_norm": 1.1398652693063138, "learning_rate": 1.7967548815187542e-05, "loss": 2.1027, "step": 2610 }, { "epoch": 0.29, "grad_norm": 1.0597491123371798, "learning_rate": 1.7955980751000947e-05, "loss": 2.1495, "step": 2615 }, { "epoch": 0.29, "grad_norm": 1.329054040263678, "learning_rate": 1.7944383605603567e-05, "loss": 2.0784, "step": 2620 }, { "epoch": 0.29, "grad_norm": 1.1265441658188498, "learning_rate": 1.793275742138602e-05, "loss": 2.0932, "step": 2625 }, { "epoch": 0.29, "grad_norm": 1.1118789365157977, "learning_rate": 1.7921102240845097e-05, "loss": 2.0664, "step": 2630 }, { "epoch": 0.29, "grad_norm": 1.1956768558124202, "learning_rate": 1.7909418106583546e-05, "loss": 2.1296, "step": 2635 }, { "epoch": 0.29, "grad_norm": 1.1145283325697752, "learning_rate": 1.7897705061309973e-05, "loss": 2.1036, "step": 2640 }, { "epoch": 0.29, "grad_norm": 1.1417336275496874, "learning_rate": 1.788596314783864e-05, "loss": 2.1065, "step": 2645 }, { "epoch": 0.29, "grad_norm": 1.1050729062011972, "learning_rate": 1.7874192409089355e-05, "loss": 2.0757, "step": 2650 }, { "epoch": 0.29, "grad_norm": 1.094468946498002, "learning_rate": 1.786239288808727e-05, "loss": 2.0825, "step": 2655 }, { "epoch": 0.29, "grad_norm": 1.1687286435786433, "learning_rate": 1.7850564627962752e-05, "loss": 2.1096, "step": 2660 }, { "epoch": 0.29, "grad_norm": 1.0922461828654444, "learning_rate": 1.7838707671951215e-05, "loss": 2.1066, "step": 2665 }, { "epoch": 0.29, "grad_norm": 1.1220904172021324, "learning_rate": 1.7826822063392963e-05, "loss": 2.1069, "step": 2670 }, { "epoch": 0.29, "grad_norm": 1.1720478017155251, "learning_rate": 1.7814907845733037e-05, "loss": 2.0978, "step": 2675 }, { "epoch": 0.29, "grad_norm": 1.151049240168988, "learning_rate": 1.780296506252105e-05, "loss": 2.0735, "step": 2680 }, { "epoch": 0.29, "grad_norm": 1.124287027242074, "learning_rate": 1.7790993757411024e-05, "loss": 2.07, "step": 2685 }, { "epoch": 0.29, "grad_norm": 1.0735458896829055, "learning_rate": 1.7778993974161247e-05, "loss": 2.1021, "step": 2690 }, { "epoch": 0.3, "grad_norm": 1.093201332891808, "learning_rate": 1.776696575663409e-05, "loss": 2.0628, "step": 2695 }, { "epoch": 0.3, "grad_norm": 1.1196139617991079, "learning_rate": 1.775490914879587e-05, "loss": 2.0636, "step": 2700 }, { "epoch": 0.3, "grad_norm": 1.2133368730110112, "learning_rate": 1.7742824194716664e-05, "loss": 2.1392, "step": 2705 }, { "epoch": 0.3, "grad_norm": 1.1316547216061348, "learning_rate": 1.7730710938570182e-05, "loss": 2.1401, "step": 2710 }, { "epoch": 0.3, "grad_norm": 1.0774098852891354, "learning_rate": 1.7718569424633566e-05, "loss": 2.1187, "step": 2715 }, { "epoch": 0.3, "grad_norm": 1.1718102488535178, "learning_rate": 1.770639969728726e-05, "loss": 2.0685, "step": 2720 }, { "epoch": 0.3, "grad_norm": 1.1168563808620742, "learning_rate": 1.769420180101483e-05, "loss": 2.1058, "step": 2725 }, { "epoch": 0.3, "grad_norm": 1.1651766554582779, "learning_rate": 1.7681975780402807e-05, "loss": 2.0993, "step": 2730 }, { "epoch": 0.3, "grad_norm": 1.1050180571132542, "learning_rate": 1.7669721680140526e-05, "loss": 2.1138, "step": 2735 }, { "epoch": 0.3, "grad_norm": 1.0722099710251072, "learning_rate": 1.7657439545019963e-05, "loss": 2.1362, "step": 2740 }, { "epoch": 0.3, "grad_norm": 1.1631476035734578, "learning_rate": 1.7645129419935565e-05, "loss": 2.138, "step": 2745 }, { "epoch": 0.3, "grad_norm": 1.087406894894766, "learning_rate": 1.7632791349884083e-05, "loss": 2.1415, "step": 2750 }, { "epoch": 0.3, "grad_norm": 1.0604590729082244, "learning_rate": 1.762042537996443e-05, "loss": 2.1232, "step": 2755 }, { "epoch": 0.3, "grad_norm": 1.2078455539293027, "learning_rate": 1.7608031555377487e-05, "loss": 2.1044, "step": 2760 }, { "epoch": 0.3, "grad_norm": 1.1696495927133321, "learning_rate": 1.759560992142596e-05, "loss": 2.0989, "step": 2765 }, { "epoch": 0.3, "grad_norm": 1.0654450598640384, "learning_rate": 1.7583160523514197e-05, "loss": 2.1069, "step": 2770 }, { "epoch": 0.3, "grad_norm": 1.107023981464564, "learning_rate": 1.7570683407148037e-05, "loss": 2.0444, "step": 2775 }, { "epoch": 0.3, "grad_norm": 1.0633632965018371, "learning_rate": 1.755817861793464e-05, "loss": 2.1178, "step": 2780 }, { "epoch": 0.31, "grad_norm": 1.1234331190153795, "learning_rate": 1.7545646201582304e-05, "loss": 2.0517, "step": 2785 }, { "epoch": 0.31, "grad_norm": 1.263481865384053, "learning_rate": 1.7533086203900324e-05, "loss": 2.0867, "step": 2790 }, { "epoch": 0.31, "grad_norm": 1.1707081266246808, "learning_rate": 1.7520498670798812e-05, "loss": 2.1002, "step": 2795 }, { "epoch": 0.31, "grad_norm": 1.268810545460621, "learning_rate": 1.7507883648288527e-05, "loss": 2.0745, "step": 2800 }, { "epoch": 0.31, "grad_norm": 1.0834113537758923, "learning_rate": 1.7495241182480703e-05, "loss": 2.1219, "step": 2805 }, { "epoch": 0.31, "grad_norm": 1.1373178119039804, "learning_rate": 1.748257131958689e-05, "loss": 2.1174, "step": 2810 }, { "epoch": 0.31, "grad_norm": 1.088974876393026, "learning_rate": 1.7469874105918793e-05, "loss": 2.1236, "step": 2815 }, { "epoch": 0.31, "grad_norm": 1.1393813323529787, "learning_rate": 1.7457149587888065e-05, "loss": 2.1082, "step": 2820 }, { "epoch": 0.31, "grad_norm": 1.1178096029451159, "learning_rate": 1.7444397812006194e-05, "loss": 2.1379, "step": 2825 }, { "epoch": 0.31, "grad_norm": 1.09187933332392, "learning_rate": 1.7431618824884283e-05, "loss": 2.1337, "step": 2830 }, { "epoch": 0.31, "grad_norm": 1.0764329838681872, "learning_rate": 1.741881267323291e-05, "loss": 2.0589, "step": 2835 }, { "epoch": 0.31, "grad_norm": 1.1929828098910324, "learning_rate": 1.740597940386193e-05, "loss": 2.1273, "step": 2840 }, { "epoch": 0.31, "grad_norm": 1.0559080267276388, "learning_rate": 1.739311906368034e-05, "loss": 2.0811, "step": 2845 }, { "epoch": 0.31, "grad_norm": 1.1253521982263932, "learning_rate": 1.738023169969608e-05, "loss": 2.0695, "step": 2850 }, { "epoch": 0.31, "grad_norm": 1.1235872191579686, "learning_rate": 1.736731735901587e-05, "loss": 2.0602, "step": 2855 }, { "epoch": 0.31, "grad_norm": 1.0832885298216302, "learning_rate": 1.7354376088845027e-05, "loss": 2.1145, "step": 2860 }, { "epoch": 0.31, "grad_norm": 1.0776252181759327, "learning_rate": 1.7341407936487316e-05, "loss": 2.0743, "step": 2865 }, { "epoch": 0.31, "grad_norm": 1.1301096597701228, "learning_rate": 1.7328412949344767e-05, "loss": 2.1276, "step": 2870 }, { "epoch": 0.31, "grad_norm": 1.0772595499989182, "learning_rate": 1.7315391174917476e-05, "loss": 2.1246, "step": 2875 }, { "epoch": 0.32, "grad_norm": 1.210914460662658, "learning_rate": 1.730234266080348e-05, "loss": 2.1396, "step": 2880 }, { "epoch": 0.32, "grad_norm": 1.0939104687593169, "learning_rate": 1.728926745469853e-05, "loss": 2.1368, "step": 2885 }, { "epoch": 0.32, "grad_norm": 2.75472924109131, "learning_rate": 1.7276165604395975e-05, "loss": 2.1402, "step": 2890 }, { "epoch": 0.32, "grad_norm": 1.1093010651226822, "learning_rate": 1.7263037157786526e-05, "loss": 2.0924, "step": 2895 }, { "epoch": 0.32, "grad_norm": 1.1324517363268187, "learning_rate": 1.7249882162858127e-05, "loss": 2.0807, "step": 2900 }, { "epoch": 0.32, "grad_norm": 1.0729848020560875, "learning_rate": 1.7236700667695754e-05, "loss": 2.0903, "step": 2905 }, { "epoch": 0.32, "grad_norm": 1.068302709822653, "learning_rate": 1.7223492720481255e-05, "loss": 2.1017, "step": 2910 }, { "epoch": 0.32, "grad_norm": 1.1429308141751306, "learning_rate": 1.721025836949317e-05, "loss": 2.1579, "step": 2915 }, { "epoch": 0.32, "grad_norm": 1.1927812202822, "learning_rate": 1.719699766310654e-05, "loss": 2.0937, "step": 2920 }, { "epoch": 0.32, "grad_norm": 1.1469469602198863, "learning_rate": 1.7183710649792754e-05, "loss": 2.0705, "step": 2925 }, { "epoch": 0.32, "grad_norm": 1.1625188280312655, "learning_rate": 1.7170397378119356e-05, "loss": 2.1262, "step": 2930 }, { "epoch": 0.32, "grad_norm": 1.1782950051913783, "learning_rate": 1.7157057896749862e-05, "loss": 2.1568, "step": 2935 }, { "epoch": 0.32, "grad_norm": 1.1475473295065628, "learning_rate": 1.7143692254443606e-05, "loss": 2.0763, "step": 2940 }, { "epoch": 0.32, "grad_norm": 1.1664423447042471, "learning_rate": 1.7130300500055537e-05, "loss": 2.0767, "step": 2945 }, { "epoch": 0.32, "grad_norm": 1.1218527595561445, "learning_rate": 1.7116882682536062e-05, "loss": 2.0699, "step": 2950 }, { "epoch": 0.32, "grad_norm": 1.1732851303495795, "learning_rate": 1.7103438850930837e-05, "loss": 2.0738, "step": 2955 }, { "epoch": 0.32, "grad_norm": 1.0522861961632755, "learning_rate": 1.708996905438062e-05, "loss": 2.1126, "step": 2960 }, { "epoch": 0.32, "grad_norm": 1.165862269862746, "learning_rate": 1.7076473342121077e-05, "loss": 2.0707, "step": 2965 }, { "epoch": 0.33, "grad_norm": 1.1329357733977503, "learning_rate": 1.7062951763482598e-05, "loss": 2.0336, "step": 2970 }, { "epoch": 0.33, "grad_norm": 1.169670602494436, "learning_rate": 1.7049404367890127e-05, "loss": 2.1211, "step": 2975 }, { "epoch": 0.33, "grad_norm": 1.0980138817373712, "learning_rate": 1.703583120486297e-05, "loss": 2.1035, "step": 2980 }, { "epoch": 0.33, "grad_norm": 1.1381156204273948, "learning_rate": 1.7022232324014628e-05, "loss": 2.0891, "step": 2985 }, { "epoch": 0.33, "grad_norm": 1.099997155415494, "learning_rate": 1.7008607775052593e-05, "loss": 2.1101, "step": 2990 }, { "epoch": 0.33, "grad_norm": 1.1846003395771518, "learning_rate": 1.6994957607778196e-05, "loss": 2.0542, "step": 2995 }, { "epoch": 0.33, "grad_norm": 1.1018891824594454, "learning_rate": 1.69812818720864e-05, "loss": 2.0587, "step": 3000 }, { "epoch": 0.33, "grad_norm": 1.108564527237041, "learning_rate": 1.6967580617965635e-05, "loss": 2.0442, "step": 3005 }, { "epoch": 0.33, "grad_norm": 1.0256985618256775, "learning_rate": 1.6953853895497596e-05, "loss": 2.0313, "step": 3010 }, { "epoch": 0.33, "grad_norm": 1.117266057301956, "learning_rate": 1.6940101754857087e-05, "loss": 2.0653, "step": 3015 }, { "epoch": 0.33, "grad_norm": 1.1054664584306193, "learning_rate": 1.6926324246311807e-05, "loss": 2.0701, "step": 3020 }, { "epoch": 0.33, "grad_norm": 1.1388831343598398, "learning_rate": 1.691252142022219e-05, "loss": 2.0758, "step": 3025 }, { "epoch": 0.33, "grad_norm": 1.1499026035275537, "learning_rate": 1.689869332704122e-05, "loss": 2.0387, "step": 3030 }, { "epoch": 0.33, "grad_norm": 1.1486721772962458, "learning_rate": 1.6884840017314215e-05, "loss": 2.0122, "step": 3035 }, { "epoch": 0.33, "grad_norm": 1.1946594748908477, "learning_rate": 1.6870961541678686e-05, "loss": 2.0562, "step": 3040 }, { "epoch": 0.33, "grad_norm": 1.1081200912171763, "learning_rate": 1.6857057950864134e-05, "loss": 2.1097, "step": 3045 }, { "epoch": 0.33, "grad_norm": 1.1560175876314496, "learning_rate": 1.6843129295691847e-05, "loss": 2.0653, "step": 3050 }, { "epoch": 0.33, "grad_norm": 1.1686473646775395, "learning_rate": 1.682917562707474e-05, "loss": 2.0671, "step": 3055 }, { "epoch": 0.34, "grad_norm": 1.181145730787233, "learning_rate": 1.6815196996017156e-05, "loss": 2.0685, "step": 3060 }, { "epoch": 0.34, "grad_norm": 1.1776535575038598, "learning_rate": 1.6801193453614683e-05, "loss": 2.051, "step": 3065 }, { "epoch": 0.34, "grad_norm": 1.0995302191315015, "learning_rate": 1.6787165051053974e-05, "loss": 2.109, "step": 3070 }, { "epoch": 0.34, "grad_norm": 1.1118446617101214, "learning_rate": 1.6773111839612536e-05, "loss": 2.0615, "step": 3075 }, { "epoch": 0.34, "grad_norm": 1.085478550441025, "learning_rate": 1.675903387065857e-05, "loss": 2.0914, "step": 3080 }, { "epoch": 0.34, "grad_norm": 1.112841492779926, "learning_rate": 1.6744931195650775e-05, "loss": 2.0448, "step": 3085 }, { "epoch": 0.34, "grad_norm": 1.0808904399167643, "learning_rate": 1.673080386613815e-05, "loss": 2.0438, "step": 3090 }, { "epoch": 0.34, "grad_norm": 1.1593574153382304, "learning_rate": 1.6716651933759812e-05, "loss": 2.0639, "step": 3095 }, { "epoch": 0.34, "grad_norm": 1.1341186674186763, "learning_rate": 1.6702475450244818e-05, "loss": 2.0498, "step": 3100 }, { "epoch": 0.34, "grad_norm": 1.1141606971142795, "learning_rate": 1.6688274467411953e-05, "loss": 2.0679, "step": 3105 }, { "epoch": 0.34, "grad_norm": 1.078157679287869, "learning_rate": 1.6674049037169565e-05, "loss": 2.0334, "step": 3110 }, { "epoch": 0.34, "grad_norm": 1.1814910193753863, "learning_rate": 1.6659799211515352e-05, "loss": 2.1517, "step": 3115 }, { "epoch": 0.34, "grad_norm": 1.0557518379444377, "learning_rate": 1.6645525042536196e-05, "loss": 2.0646, "step": 3120 }, { "epoch": 0.34, "grad_norm": 1.142501323832221, "learning_rate": 1.6631226582407954e-05, "loss": 2.1155, "step": 3125 }, { "epoch": 0.34, "grad_norm": 1.226982659855433, "learning_rate": 1.661690388339527e-05, "loss": 2.1405, "step": 3130 }, { "epoch": 0.34, "grad_norm": 1.154900585410977, "learning_rate": 1.6602556997851394e-05, "loss": 2.0184, "step": 3135 }, { "epoch": 0.34, "grad_norm": 1.0953597499817394, "learning_rate": 1.6588185978217982e-05, "loss": 2.0445, "step": 3140 }, { "epoch": 0.34, "grad_norm": 1.1593667618843306, "learning_rate": 1.6573790877024903e-05, "loss": 2.0953, "step": 3145 }, { "epoch": 0.35, "grad_norm": 1.1047654778720604, "learning_rate": 1.6559371746890058e-05, "loss": 2.0795, "step": 3150 }, { "epoch": 0.35, "grad_norm": 1.1155683409276855, "learning_rate": 1.6544928640519174e-05, "loss": 2.1239, "step": 3155 }, { "epoch": 0.35, "grad_norm": 1.075924274275993, "learning_rate": 1.6530461610705616e-05, "loss": 2.0886, "step": 3160 }, { "epoch": 0.35, "grad_norm": 1.0971475981064696, "learning_rate": 1.6515970710330205e-05, "loss": 2.1294, "step": 3165 }, { "epoch": 0.35, "grad_norm": 1.151469103275728, "learning_rate": 1.6501455992361004e-05, "loss": 2.0852, "step": 3170 }, { "epoch": 0.35, "grad_norm": 1.1719863245615771, "learning_rate": 1.648691750985314e-05, "loss": 2.1035, "step": 3175 }, { "epoch": 0.35, "grad_norm": 1.1347414969545695, "learning_rate": 1.647235531594861e-05, "loss": 2.0641, "step": 3180 }, { "epoch": 0.35, "grad_norm": 1.1204301067311855, "learning_rate": 1.6457769463876078e-05, "loss": 2.1022, "step": 3185 }, { "epoch": 0.35, "grad_norm": 1.0621609202944586, "learning_rate": 1.6443160006950677e-05, "loss": 2.0138, "step": 3190 }, { "epoch": 0.35, "grad_norm": 1.1143841796175546, "learning_rate": 1.642852699857384e-05, "loss": 2.0444, "step": 3195 }, { "epoch": 0.35, "grad_norm": 1.1087328004977834, "learning_rate": 1.641387049223308e-05, "loss": 2.0643, "step": 3200 }, { "epoch": 0.35, "grad_norm": 1.1160589414657458, "learning_rate": 1.6399190541501786e-05, "loss": 2.0875, "step": 3205 }, { "epoch": 0.35, "grad_norm": 1.0903246954037675, "learning_rate": 1.638448720003906e-05, "loss": 2.0424, "step": 3210 }, { "epoch": 0.35, "grad_norm": 1.0639848909825909, "learning_rate": 1.6369760521589503e-05, "loss": 2.0874, "step": 3215 }, { "epoch": 0.35, "grad_norm": 1.1579622295086673, "learning_rate": 1.6355010559983015e-05, "loss": 2.1284, "step": 3220 }, { "epoch": 0.35, "grad_norm": 1.1089790582659134, "learning_rate": 1.634023736913459e-05, "loss": 2.0748, "step": 3225 }, { "epoch": 0.35, "grad_norm": 1.0548875665998114, "learning_rate": 1.6325441003044152e-05, "loss": 2.0418, "step": 3230 }, { "epoch": 0.35, "grad_norm": 1.0821418808700585, "learning_rate": 1.6310621515796322e-05, "loss": 2.0953, "step": 3235 }, { "epoch": 0.35, "grad_norm": 1.11775098391777, "learning_rate": 1.6295778961560242e-05, "loss": 2.0948, "step": 3240 }, { "epoch": 0.36, "grad_norm": 1.1423595116236458, "learning_rate": 1.6280913394589368e-05, "loss": 2.1392, "step": 3245 }, { "epoch": 0.36, "grad_norm": 1.1814195834094339, "learning_rate": 1.6266024869221272e-05, "loss": 2.0918, "step": 3250 }, { "epoch": 0.36, "grad_norm": 1.076912872102532, "learning_rate": 1.625111343987744e-05, "loss": 2.0365, "step": 3255 }, { "epoch": 0.36, "grad_norm": 1.0904616741975421, "learning_rate": 1.62361791610631e-05, "loss": 2.0696, "step": 3260 }, { "epoch": 0.36, "grad_norm": 1.1714475385414609, "learning_rate": 1.622122208736697e-05, "loss": 2.0693, "step": 3265 }, { "epoch": 0.36, "grad_norm": 1.051664932872804, "learning_rate": 1.6206242273461106e-05, "loss": 2.0716, "step": 3270 }, { "epoch": 0.36, "grad_norm": 1.0727716949190738, "learning_rate": 1.61912397741007e-05, "loss": 2.1372, "step": 3275 }, { "epoch": 0.36, "grad_norm": 1.1782261822482705, "learning_rate": 1.6176214644123827e-05, "loss": 2.0347, "step": 3280 }, { "epoch": 0.36, "grad_norm": 1.1260983455437332, "learning_rate": 1.616116693845132e-05, "loss": 2.1197, "step": 3285 }, { "epoch": 0.36, "grad_norm": 1.1098287950745347, "learning_rate": 1.614609671208651e-05, "loss": 2.081, "step": 3290 }, { "epoch": 0.36, "grad_norm": 1.0436061993117904, "learning_rate": 1.613100402011506e-05, "loss": 2.0708, "step": 3295 }, { "epoch": 0.36, "grad_norm": 1.159205538125202, "learning_rate": 1.611588891770474e-05, "loss": 2.1424, "step": 3300 }, { "epoch": 0.36, "grad_norm": 1.1104093341621577, "learning_rate": 1.6100751460105244e-05, "loss": 2.1372, "step": 3305 }, { "epoch": 0.36, "grad_norm": 1.109910079296386, "learning_rate": 1.6085591702647978e-05, "loss": 2.0627, "step": 3310 }, { "epoch": 0.36, "grad_norm": 1.0957645231598219, "learning_rate": 1.6070409700745857e-05, "loss": 2.0675, "step": 3315 }, { "epoch": 0.36, "grad_norm": 1.1051430756484568, "learning_rate": 1.6055205509893108e-05, "loss": 2.1144, "step": 3320 }, { "epoch": 0.36, "grad_norm": 1.0619960453826602, "learning_rate": 1.6039979185665063e-05, "loss": 2.0783, "step": 3325 }, { "epoch": 0.36, "grad_norm": 1.1699535308022313, "learning_rate": 1.602473078371796e-05, "loss": 2.0318, "step": 3330 }, { "epoch": 0.37, "grad_norm": 1.1781679212702045, "learning_rate": 1.6009460359788734e-05, "loss": 2.1088, "step": 3335 }, { "epoch": 0.37, "grad_norm": 1.1316858954470768, "learning_rate": 1.5994167969694824e-05, "loss": 2.106, "step": 3340 }, { "epoch": 0.37, "grad_norm": 1.170556549875735, "learning_rate": 1.5978853669333938e-05, "loss": 2.0538, "step": 3345 }, { "epoch": 0.37, "grad_norm": 1.2787336396825792, "learning_rate": 1.5963517514683907e-05, "loss": 2.1129, "step": 3350 }, { "epoch": 0.37, "grad_norm": 1.20467298894606, "learning_rate": 1.5948159561802414e-05, "loss": 2.1254, "step": 3355 }, { "epoch": 0.37, "grad_norm": 1.2213357389574577, "learning_rate": 1.5932779866826837e-05, "loss": 2.095, "step": 3360 }, { "epoch": 0.37, "grad_norm": 1.161156677939486, "learning_rate": 1.591737848597402e-05, "loss": 2.0792, "step": 3365 }, { "epoch": 0.37, "grad_norm": 1.0722652533527763, "learning_rate": 1.5901955475540087e-05, "loss": 2.067, "step": 3370 }, { "epoch": 0.37, "grad_norm": 1.1238612742515977, "learning_rate": 1.5886510891900203e-05, "loss": 2.0947, "step": 3375 }, { "epoch": 0.37, "grad_norm": 1.1058085194150948, "learning_rate": 1.5871044791508404e-05, "loss": 2.1234, "step": 3380 }, { "epoch": 0.37, "grad_norm": 1.1876966864138578, "learning_rate": 1.5855557230897373e-05, "loss": 2.0857, "step": 3385 }, { "epoch": 0.37, "grad_norm": 1.1175726566859054, "learning_rate": 1.5840048266678236e-05, "loss": 2.0675, "step": 3390 }, { "epoch": 0.37, "grad_norm": 1.125131372985968, "learning_rate": 1.5824517955540345e-05, "loss": 2.0288, "step": 3395 }, { "epoch": 0.37, "grad_norm": 1.1814108073306664, "learning_rate": 1.5808966354251097e-05, "loss": 2.0774, "step": 3400 }, { "epoch": 0.37, "grad_norm": 1.0961162850855288, "learning_rate": 1.5793393519655697e-05, "loss": 2.0712, "step": 3405 }, { "epoch": 0.37, "grad_norm": 1.1105888920743134, "learning_rate": 1.5777799508676974e-05, "loss": 2.0696, "step": 3410 }, { "epoch": 0.37, "grad_norm": 1.0813238176581508, "learning_rate": 1.5762184378315146e-05, "loss": 2.103, "step": 3415 }, { "epoch": 0.37, "grad_norm": 1.1310690814233393, "learning_rate": 1.574654818564765e-05, "loss": 2.0857, "step": 3420 }, { "epoch": 0.38, "grad_norm": 1.087258952335332, "learning_rate": 1.5730890987828893e-05, "loss": 2.1125, "step": 3425 }, { "epoch": 0.38, "grad_norm": 1.100945556914244, "learning_rate": 1.5715212842090067e-05, "loss": 2.0811, "step": 3430 }, { "epoch": 0.38, "grad_norm": 1.0967502479063236, "learning_rate": 1.5699513805738942e-05, "loss": 2.1128, "step": 3435 }, { "epoch": 0.38, "grad_norm": 1.0860599522102894, "learning_rate": 1.5683793936159636e-05, "loss": 2.1082, "step": 3440 }, { "epoch": 0.38, "grad_norm": 1.1660577133517571, "learning_rate": 1.5668053290812423e-05, "loss": 2.1144, "step": 3445 }, { "epoch": 0.38, "grad_norm": 1.0952329396542488, "learning_rate": 1.5652291927233525e-05, "loss": 2.0985, "step": 3450 }, { "epoch": 0.38, "grad_norm": 1.1669788039742652, "learning_rate": 1.5636509903034883e-05, "loss": 2.0941, "step": 3455 }, { "epoch": 0.38, "grad_norm": 1.0717864359194949, "learning_rate": 1.5620707275903964e-05, "loss": 2.0809, "step": 3460 }, { "epoch": 0.38, "grad_norm": 1.1321977241167134, "learning_rate": 1.5604884103603547e-05, "loss": 2.1175, "step": 3465 }, { "epoch": 0.38, "grad_norm": 1.1675603769757348, "learning_rate": 1.55890404439715e-05, "loss": 2.1318, "step": 3470 }, { "epoch": 0.38, "grad_norm": 1.139951361254572, "learning_rate": 1.557317635492059e-05, "loss": 2.0722, "step": 3475 }, { "epoch": 0.38, "grad_norm": 1.166249801007756, "learning_rate": 1.5557291894438247e-05, "loss": 2.1117, "step": 3480 }, { "epoch": 0.38, "grad_norm": 1.1156535413976312, "learning_rate": 1.554138712058637e-05, "loss": 2.0973, "step": 3485 }, { "epoch": 0.38, "grad_norm": 1.0542724637556116, "learning_rate": 1.5525462091501112e-05, "loss": 2.0815, "step": 3490 }, { "epoch": 0.38, "grad_norm": 1.265467268410671, "learning_rate": 1.5509516865392652e-05, "loss": 2.0588, "step": 3495 }, { "epoch": 0.38, "grad_norm": 1.0858454017839732, "learning_rate": 1.549355150054501e-05, "loss": 2.1544, "step": 3500 }, { "epoch": 0.38, "grad_norm": 1.136883207680999, "learning_rate": 1.5477566055315808e-05, "loss": 2.1408, "step": 3505 }, { "epoch": 0.38, "grad_norm": 1.191558766752858, "learning_rate": 1.5461560588136065e-05, "loss": 2.1196, "step": 3510 }, { "epoch": 0.39, "grad_norm": 1.1337199882399849, "learning_rate": 1.544553515751e-05, "loss": 2.1004, "step": 3515 }, { "epoch": 0.39, "grad_norm": 1.0591226943304792, "learning_rate": 1.542948982201479e-05, "loss": 2.1151, "step": 3520 }, { "epoch": 0.39, "grad_norm": 1.1144487670493992, "learning_rate": 1.541342464030037e-05, "loss": 2.0882, "step": 3525 }, { "epoch": 0.39, "grad_norm": 1.1266101348133806, "learning_rate": 1.5397339671089228e-05, "loss": 2.0846, "step": 3530 }, { "epoch": 0.39, "grad_norm": 1.0891799084214837, "learning_rate": 1.538123497317617e-05, "loss": 2.1033, "step": 3535 }, { "epoch": 0.39, "grad_norm": 1.2158818215911102, "learning_rate": 1.5365110605428123e-05, "loss": 2.0774, "step": 3540 }, { "epoch": 0.39, "grad_norm": 1.0804614391587901, "learning_rate": 1.534896662678391e-05, "loss": 2.1054, "step": 3545 }, { "epoch": 0.39, "grad_norm": 1.1694795760579961, "learning_rate": 1.533280309625403e-05, "loss": 2.0963, "step": 3550 }, { "epoch": 0.39, "grad_norm": 1.127851288105375, "learning_rate": 1.5316620072920466e-05, "loss": 2.0531, "step": 3555 }, { "epoch": 0.39, "grad_norm": 1.127283843628519, "learning_rate": 1.5300417615936433e-05, "loss": 2.0998, "step": 3560 }, { "epoch": 0.39, "grad_norm": 1.1083263568479134, "learning_rate": 1.5284195784526196e-05, "loss": 2.0805, "step": 3565 }, { "epoch": 0.39, "grad_norm": 1.3210020329085226, "learning_rate": 1.526795463798483e-05, "loss": 2.1146, "step": 3570 }, { "epoch": 0.39, "grad_norm": 1.122053829044529, "learning_rate": 1.5251694235678019e-05, "loss": 2.111, "step": 3575 }, { "epoch": 0.39, "grad_norm": 1.1256673445291332, "learning_rate": 1.5235414637041821e-05, "loss": 2.065, "step": 3580 }, { "epoch": 0.39, "grad_norm": 1.167672037196411, "learning_rate": 1.5219115901582471e-05, "loss": 2.1056, "step": 3585 }, { "epoch": 0.39, "grad_norm": 1.1637880813595944, "learning_rate": 1.5202798088876156e-05, "loss": 2.0396, "step": 3590 }, { "epoch": 0.39, "grad_norm": 1.0952989421704529, "learning_rate": 1.5186461258568785e-05, "loss": 2.1448, "step": 3595 }, { "epoch": 0.39, "grad_norm": 1.1913586739315074, "learning_rate": 1.517010547037579e-05, "loss": 2.1237, "step": 3600 }, { "epoch": 0.39, "grad_norm": 1.1788964458889817, "learning_rate": 1.5153730784081896e-05, "loss": 2.0312, "step": 3605 }, { "epoch": 0.4, "grad_norm": 1.1714549101983995, "learning_rate": 1.5137337259540908e-05, "loss": 2.1149, "step": 3610 }, { "epoch": 0.4, "grad_norm": 1.1185074932370724, "learning_rate": 1.5120924956675484e-05, "loss": 2.109, "step": 3615 }, { "epoch": 0.4, "grad_norm": 1.0822409000309015, "learning_rate": 1.5104493935476928e-05, "loss": 2.1246, "step": 3620 }, { "epoch": 0.4, "grad_norm": 1.0719732905251764, "learning_rate": 1.5088044256004958e-05, "loss": 2.0973, "step": 3625 }, { "epoch": 0.4, "grad_norm": 1.1147862632370846, "learning_rate": 1.5071575978387505e-05, "loss": 2.0029, "step": 3630 }, { "epoch": 0.4, "grad_norm": 1.1214805206217422, "learning_rate": 1.5055089162820462e-05, "loss": 2.0712, "step": 3635 }, { "epoch": 0.4, "grad_norm": 1.0961622390174324, "learning_rate": 1.50385838695675e-05, "loss": 2.0925, "step": 3640 }, { "epoch": 0.4, "grad_norm": 1.159483603396113, "learning_rate": 1.5022060158959827e-05, "loss": 2.0942, "step": 3645 }, { "epoch": 0.4, "grad_norm": 1.0886186360856882, "learning_rate": 1.5005518091395964e-05, "loss": 2.0584, "step": 3650 }, { "epoch": 0.4, "grad_norm": 1.1498450278965668, "learning_rate": 1.4988957727341543e-05, "loss": 2.1052, "step": 3655 }, { "epoch": 0.4, "grad_norm": 1.1058614538694598, "learning_rate": 1.4972379127329062e-05, "loss": 2.0958, "step": 3660 }, { "epoch": 0.4, "grad_norm": 1.0561366380450348, "learning_rate": 1.4955782351957681e-05, "loss": 2.0689, "step": 3665 }, { "epoch": 0.4, "grad_norm": 1.091056741003288, "learning_rate": 1.4939167461893008e-05, "loss": 2.1448, "step": 3670 }, { "epoch": 0.4, "grad_norm": 1.1886667736960905, "learning_rate": 1.4922534517866843e-05, "loss": 2.0749, "step": 3675 }, { "epoch": 0.4, "grad_norm": 1.0715246633883568, "learning_rate": 1.490588358067699e-05, "loss": 2.1101, "step": 3680 }, { "epoch": 0.4, "grad_norm": 1.0328636556908009, "learning_rate": 1.4889214711187028e-05, "loss": 2.037, "step": 3685 }, { "epoch": 0.4, "grad_norm": 1.0849100761912411, "learning_rate": 1.4872527970326075e-05, "loss": 2.0678, "step": 3690 }, { "epoch": 0.4, "grad_norm": 1.1622731100294095, "learning_rate": 1.4855823419088576e-05, "loss": 2.0807, "step": 3695 }, { "epoch": 0.41, "grad_norm": 1.1206975337016192, "learning_rate": 1.4839101118534071e-05, "loss": 2.0903, "step": 3700 }, { "epoch": 0.41, "grad_norm": 1.0790502106351736, "learning_rate": 1.4822361129786992e-05, "loss": 2.0805, "step": 3705 }, { "epoch": 0.41, "grad_norm": 1.0479285271048226, "learning_rate": 1.4805603514036417e-05, "loss": 2.0738, "step": 3710 }, { "epoch": 0.41, "grad_norm": 1.187457262970017, "learning_rate": 1.4788828332535858e-05, "loss": 2.0813, "step": 3715 }, { "epoch": 0.41, "grad_norm": 1.0794199580435586, "learning_rate": 1.4772035646603032e-05, "loss": 2.1254, "step": 3720 }, { "epoch": 0.41, "grad_norm": 1.1121311041684154, "learning_rate": 1.4755225517619643e-05, "loss": 2.0389, "step": 3725 }, { "epoch": 0.41, "grad_norm": 1.087947336936164, "learning_rate": 1.473839800703115e-05, "loss": 2.138, "step": 3730 }, { "epoch": 0.41, "grad_norm": 1.1539857727763527, "learning_rate": 1.4721553176346545e-05, "loss": 2.1032, "step": 3735 }, { "epoch": 0.41, "grad_norm": 1.084853723076532, "learning_rate": 1.470469108713814e-05, "loss": 2.0913, "step": 3740 }, { "epoch": 0.41, "grad_norm": 1.0714906715102737, "learning_rate": 1.4687811801041323e-05, "loss": 2.0927, "step": 3745 }, { "epoch": 0.41, "grad_norm": 1.1527776763594195, "learning_rate": 1.4670915379754338e-05, "loss": 2.0009, "step": 3750 }, { "epoch": 0.41, "grad_norm": 1.1309711867263936, "learning_rate": 1.4654001885038069e-05, "loss": 2.078, "step": 3755 }, { "epoch": 0.41, "grad_norm": 1.1231479019999449, "learning_rate": 1.4637071378715807e-05, "loss": 2.1049, "step": 3760 }, { "epoch": 0.41, "grad_norm": 1.1992515421717667, "learning_rate": 1.4620123922673024e-05, "loss": 2.0469, "step": 3765 }, { "epoch": 0.41, "grad_norm": 1.1286379014618428, "learning_rate": 1.4603159578857145e-05, "loss": 2.0335, "step": 3770 }, { "epoch": 0.41, "grad_norm": 1.1591268200153777, "learning_rate": 1.4586178409277327e-05, "loss": 2.1104, "step": 3775 }, { "epoch": 0.41, "grad_norm": 1.0969867832144846, "learning_rate": 1.4569180476004237e-05, "loss": 2.0728, "step": 3780 }, { "epoch": 0.41, "grad_norm": 1.2343285234599615, "learning_rate": 1.45521658411698e-05, "loss": 2.0639, "step": 3785 }, { "epoch": 0.42, "grad_norm": 1.095710051754578, "learning_rate": 1.4535134566967004e-05, "loss": 2.1368, "step": 3790 }, { "epoch": 0.42, "grad_norm": 1.1197749989074302, "learning_rate": 1.4518086715649657e-05, "loss": 2.0225, "step": 3795 }, { "epoch": 0.42, "grad_norm": 1.1368512560043975, "learning_rate": 1.4501022349532155e-05, "loss": 2.0371, "step": 3800 }, { "epoch": 0.42, "grad_norm": 1.141573219660765, "learning_rate": 1.4483941530989263e-05, "loss": 2.0585, "step": 3805 }, { "epoch": 0.42, "grad_norm": 1.0863385111202175, "learning_rate": 1.4466844322455882e-05, "loss": 2.0552, "step": 3810 }, { "epoch": 0.42, "grad_norm": 1.0343538202663156, "learning_rate": 1.4449730786426825e-05, "loss": 2.0857, "step": 3815 }, { "epoch": 0.42, "grad_norm": 1.066372133750889, "learning_rate": 1.4432600985456592e-05, "loss": 2.0739, "step": 3820 }, { "epoch": 0.42, "grad_norm": 1.0790717301212016, "learning_rate": 1.4415454982159121e-05, "loss": 2.0689, "step": 3825 }, { "epoch": 0.42, "grad_norm": 1.1073696078806285, "learning_rate": 1.4398292839207583e-05, "loss": 2.0991, "step": 3830 }, { "epoch": 0.42, "grad_norm": 1.0777170822803703, "learning_rate": 1.438111461933415e-05, "loss": 2.0712, "step": 3835 }, { "epoch": 0.42, "grad_norm": 1.1087453789934494, "learning_rate": 1.4363920385329748e-05, "loss": 2.0489, "step": 3840 }, { "epoch": 0.42, "grad_norm": 1.0737881216348049, "learning_rate": 1.434671020004384e-05, "loss": 2.0853, "step": 3845 }, { "epoch": 0.42, "grad_norm": 1.0822003048921027, "learning_rate": 1.4329484126384206e-05, "loss": 2.0565, "step": 3850 }, { "epoch": 0.42, "grad_norm": 1.1082712712937464, "learning_rate": 1.4312242227316689e-05, "loss": 2.074, "step": 3855 }, { "epoch": 0.42, "grad_norm": 1.1446139410055034, "learning_rate": 1.4294984565864983e-05, "loss": 2.037, "step": 3860 }, { "epoch": 0.42, "grad_norm": 1.1376850731396333, "learning_rate": 1.4277711205110398e-05, "loss": 2.1838, "step": 3865 }, { "epoch": 0.42, "grad_norm": 1.1665446356653948, "learning_rate": 1.4260422208191635e-05, "loss": 2.1069, "step": 3870 }, { "epoch": 0.42, "grad_norm": 1.025703810569636, "learning_rate": 1.424311763830454e-05, "loss": 2.0648, "step": 3875 }, { "epoch": 0.43, "grad_norm": 1.1812074726963293, "learning_rate": 1.4225797558701885e-05, "loss": 2.0399, "step": 3880 }, { "epoch": 0.43, "grad_norm": 1.1142231036853525, "learning_rate": 1.4208462032693136e-05, "loss": 2.0821, "step": 3885 }, { "epoch": 0.43, "grad_norm": 1.0776634591216698, "learning_rate": 1.419111112364422e-05, "loss": 2.1015, "step": 3890 }, { "epoch": 0.43, "grad_norm": 1.1573371902729876, "learning_rate": 1.417374489497729e-05, "loss": 2.032, "step": 3895 }, { "epoch": 0.43, "grad_norm": 1.231427478696499, "learning_rate": 1.4156363410170502e-05, "loss": 2.1486, "step": 3900 }, { "epoch": 0.43, "grad_norm": 1.1804878597073283, "learning_rate": 1.4138966732757766e-05, "loss": 2.0569, "step": 3905 }, { "epoch": 0.43, "grad_norm": 1.1563232101411665, "learning_rate": 1.412155492632854e-05, "loss": 2.0592, "step": 3910 }, { "epoch": 0.43, "grad_norm": 1.1773361700217357, "learning_rate": 1.410412805452757e-05, "loss": 2.0804, "step": 3915 }, { "epoch": 0.43, "grad_norm": 1.1389419837577535, "learning_rate": 1.4086686181054677e-05, "loss": 2.1028, "step": 3920 }, { "epoch": 0.43, "grad_norm": 1.1358281256978966, "learning_rate": 1.4069229369664514e-05, "loss": 2.0608, "step": 3925 }, { "epoch": 0.43, "grad_norm": 1.0949423547451935, "learning_rate": 1.4051757684166339e-05, "loss": 2.1393, "step": 3930 }, { "epoch": 0.43, "grad_norm": 1.1272988855272394, "learning_rate": 1.4034271188423771e-05, "loss": 2.0661, "step": 3935 }, { "epoch": 0.43, "grad_norm": 1.1902092609881862, "learning_rate": 1.4016769946354573e-05, "loss": 2.0668, "step": 3940 }, { "epoch": 0.43, "grad_norm": 1.1726610093310714, "learning_rate": 1.3999254021930416e-05, "loss": 2.0377, "step": 3945 }, { "epoch": 0.43, "grad_norm": 1.1570076569088603, "learning_rate": 1.3981723479176614e-05, "loss": 2.0298, "step": 3950 }, { "epoch": 0.43, "grad_norm": 1.1374290256511088, "learning_rate": 1.3964178382171942e-05, "loss": 2.0858, "step": 3955 }, { "epoch": 0.43, "grad_norm": 1.0998759314985755, "learning_rate": 1.3946618795048357e-05, "loss": 2.077, "step": 3960 }, { "epoch": 0.43, "grad_norm": 1.1648830356904165, "learning_rate": 1.392904478199079e-05, "loss": 2.0501, "step": 3965 }, { "epoch": 0.43, "grad_norm": 1.2632484649786138, "learning_rate": 1.39114564072369e-05, "loss": 2.0033, "step": 3970 }, { "epoch": 0.44, "grad_norm": 1.0807705945400357, "learning_rate": 1.3893853735076839e-05, "loss": 2.0595, "step": 3975 }, { "epoch": 0.44, "grad_norm": 1.0371290710566696, "learning_rate": 1.3876236829853026e-05, "loss": 2.1264, "step": 3980 }, { "epoch": 0.44, "grad_norm": 1.1271227874459142, "learning_rate": 1.3858605755959902e-05, "loss": 2.0338, "step": 3985 }, { "epoch": 0.44, "grad_norm": 1.115697299247894, "learning_rate": 1.3840960577843695e-05, "loss": 2.1165, "step": 3990 }, { "epoch": 0.44, "grad_norm": 1.0868186702587643, "learning_rate": 1.3823301360002197e-05, "loss": 2.088, "step": 3995 }, { "epoch": 0.44, "grad_norm": 1.135280795866365, "learning_rate": 1.3805628166984506e-05, "loss": 2.0517, "step": 4000 }, { "epoch": 0.44, "grad_norm": 1.0902611718370974, "learning_rate": 1.3787941063390814e-05, "loss": 2.1019, "step": 4005 }, { "epoch": 0.44, "grad_norm": 1.0701123921168167, "learning_rate": 1.3770240113872153e-05, "loss": 2.0881, "step": 4010 }, { "epoch": 0.44, "grad_norm": 1.1362764312152593, "learning_rate": 1.3752525383130172e-05, "loss": 2.0344, "step": 4015 }, { "epoch": 0.44, "grad_norm": 1.0587800764015347, "learning_rate": 1.3734796935916888e-05, "loss": 2.1076, "step": 4020 }, { "epoch": 0.44, "grad_norm": 1.0554380785401654, "learning_rate": 1.3717054837034459e-05, "loss": 2.0581, "step": 4025 }, { "epoch": 0.44, "grad_norm": 1.1085464327911465, "learning_rate": 1.3699299151334937e-05, "loss": 2.1038, "step": 4030 }, { "epoch": 0.44, "grad_norm": 1.0877441064997873, "learning_rate": 1.368152994372005e-05, "loss": 2.0866, "step": 4035 }, { "epoch": 0.44, "grad_norm": 1.0947817675409057, "learning_rate": 1.366374727914094e-05, "loss": 2.0651, "step": 4040 }, { "epoch": 0.44, "grad_norm": 1.20604881923778, "learning_rate": 1.3645951222597941e-05, "loss": 2.128, "step": 4045 }, { "epoch": 0.44, "grad_norm": 1.089897966096813, "learning_rate": 1.3628141839140345e-05, "loss": 2.0734, "step": 4050 }, { "epoch": 0.44, "grad_norm": 1.0912465441445693, "learning_rate": 1.3610319193866151e-05, "loss": 2.097, "step": 4055 }, { "epoch": 0.44, "grad_norm": 1.0685626627049327, "learning_rate": 1.3592483351921826e-05, "loss": 2.0338, "step": 4060 }, { "epoch": 0.45, "grad_norm": 1.1537960681179606, "learning_rate": 1.3574634378502092e-05, "loss": 2.0549, "step": 4065 }, { "epoch": 0.45, "grad_norm": 1.1941604926633542, "learning_rate": 1.3556772338849658e-05, "loss": 2.0656, "step": 4070 }, { "epoch": 0.45, "grad_norm": 1.0792593508386137, "learning_rate": 1.3538897298254994e-05, "loss": 2.0587, "step": 4075 }, { "epoch": 0.45, "grad_norm": 1.089402390800475, "learning_rate": 1.3521009322056099e-05, "loss": 2.0477, "step": 4080 }, { "epoch": 0.45, "grad_norm": 1.1001896779780547, "learning_rate": 1.3503108475638244e-05, "loss": 2.0848, "step": 4085 }, { "epoch": 0.45, "grad_norm": 1.1552859448053805, "learning_rate": 1.3485194824433754e-05, "loss": 2.1052, "step": 4090 }, { "epoch": 0.45, "grad_norm": 1.1221298521480432, "learning_rate": 1.3467268433921756e-05, "loss": 1.9584, "step": 4095 }, { "epoch": 0.45, "grad_norm": 1.1454626954519598, "learning_rate": 1.3449329369627937e-05, "loss": 2.0622, "step": 4100 }, { "epoch": 0.45, "grad_norm": 1.1713517405126508, "learning_rate": 1.343137769712432e-05, "loss": 2.0498, "step": 4105 }, { "epoch": 0.45, "grad_norm": 1.1238415060612987, "learning_rate": 1.3413413482029008e-05, "loss": 2.0619, "step": 4110 }, { "epoch": 0.45, "grad_norm": 1.0225968668152061, "learning_rate": 1.3395436790005945e-05, "loss": 2.0657, "step": 4115 }, { "epoch": 0.45, "grad_norm": 1.125311964130542, "learning_rate": 1.3377447686764695e-05, "loss": 2.0307, "step": 4120 }, { "epoch": 0.45, "grad_norm": 1.1380628490759972, "learning_rate": 1.335944623806018e-05, "loss": 2.0636, "step": 4125 }, { "epoch": 0.45, "grad_norm": 1.0997313029032674, "learning_rate": 1.3341432509692447e-05, "loss": 2.065, "step": 4130 }, { "epoch": 0.45, "grad_norm": 1.074238075540202, "learning_rate": 1.3323406567506433e-05, "loss": 2.0639, "step": 4135 }, { "epoch": 0.45, "grad_norm": 1.1078501717508566, "learning_rate": 1.3305368477391715e-05, "loss": 2.0276, "step": 4140 }, { "epoch": 0.45, "grad_norm": 1.0534216888778523, "learning_rate": 1.3287318305282277e-05, "loss": 2.0848, "step": 4145 }, { "epoch": 0.45, "grad_norm": 1.092898101907433, "learning_rate": 1.326925611715627e-05, "loss": 2.0912, "step": 4150 }, { "epoch": 0.46, "grad_norm": 1.1103687953119292, "learning_rate": 1.3251181979035748e-05, "loss": 1.9963, "step": 4155 }, { "epoch": 0.46, "grad_norm": 1.1287117241876143, "learning_rate": 1.3233095956986477e-05, "loss": 2.0714, "step": 4160 }, { "epoch": 0.46, "grad_norm": 1.1591499699879233, "learning_rate": 1.321499811711763e-05, "loss": 2.0463, "step": 4165 }, { "epoch": 0.46, "grad_norm": 1.1345356890927996, "learning_rate": 1.3196888525581595e-05, "loss": 2.1235, "step": 4170 }, { "epoch": 0.46, "grad_norm": 1.2213478400684037, "learning_rate": 1.3178767248573715e-05, "loss": 2.093, "step": 4175 }, { "epoch": 0.46, "grad_norm": 1.103662900414304, "learning_rate": 1.3160634352332045e-05, "loss": 2.0464, "step": 4180 }, { "epoch": 0.46, "grad_norm": 1.105911480565854, "learning_rate": 1.3142489903137101e-05, "loss": 2.0159, "step": 4185 }, { "epoch": 0.46, "grad_norm": 1.1095160341379844, "learning_rate": 1.3124333967311644e-05, "loss": 2.0567, "step": 4190 }, { "epoch": 0.46, "grad_norm": 1.1229614019522776, "learning_rate": 1.3106166611220412e-05, "loss": 2.0344, "step": 4195 }, { "epoch": 0.46, "grad_norm": 1.117562790083517, "learning_rate": 1.308798790126989e-05, "loss": 2.0023, "step": 4200 }, { "epoch": 0.46, "grad_norm": 1.0767552578012236, "learning_rate": 1.3069797903908064e-05, "loss": 2.109, "step": 4205 }, { "epoch": 0.46, "grad_norm": 1.1336669158549828, "learning_rate": 1.3051596685624173e-05, "loss": 2.0765, "step": 4210 }, { "epoch": 0.46, "grad_norm": 1.0833967380299871, "learning_rate": 1.3033384312948487e-05, "loss": 2.0834, "step": 4215 }, { "epoch": 0.46, "grad_norm": 1.114081541429066, "learning_rate": 1.301516085245203e-05, "loss": 2.1189, "step": 4220 }, { "epoch": 0.46, "grad_norm": 1.1615085331172228, "learning_rate": 1.299692637074636e-05, "loss": 2.0766, "step": 4225 }, { "epoch": 0.46, "grad_norm": 1.1966221517242868, "learning_rate": 1.2978680934483334e-05, "loss": 2.0394, "step": 4230 }, { "epoch": 0.46, "grad_norm": 1.095687456761838, "learning_rate": 1.2960424610354836e-05, "loss": 2.0318, "step": 4235 }, { "epoch": 0.46, "grad_norm": 1.165429544236628, "learning_rate": 1.2942157465092546e-05, "loss": 2.0674, "step": 4240 }, { "epoch": 0.47, "grad_norm": 1.1353515582352522, "learning_rate": 1.2923879565467712e-05, "loss": 2.058, "step": 4245 }, { "epoch": 0.47, "grad_norm": 1.106678521669802, "learning_rate": 1.290559097829088e-05, "loss": 2.0571, "step": 4250 }, { "epoch": 0.47, "grad_norm": 1.1665501281445383, "learning_rate": 1.2887291770411668e-05, "loss": 2.0146, "step": 4255 }, { "epoch": 0.47, "grad_norm": 1.0527135410827075, "learning_rate": 1.2868982008718509e-05, "loss": 2.0546, "step": 4260 }, { "epoch": 0.47, "grad_norm": 1.141288359698131, "learning_rate": 1.2850661760138423e-05, "loss": 2.0244, "step": 4265 }, { "epoch": 0.47, "grad_norm": 1.1335260883279379, "learning_rate": 1.2832331091636756e-05, "loss": 2.1446, "step": 4270 }, { "epoch": 0.47, "grad_norm": 1.0669405038971442, "learning_rate": 1.281399007021694e-05, "loss": 2.0495, "step": 4275 }, { "epoch": 0.47, "grad_norm": 1.1177528107042043, "learning_rate": 1.2795638762920254e-05, "loss": 2.0841, "step": 4280 }, { "epoch": 0.47, "grad_norm": 1.131873250858352, "learning_rate": 1.2777277236825576e-05, "loss": 2.1001, "step": 4285 }, { "epoch": 0.47, "grad_norm": 1.0630884444587207, "learning_rate": 1.2758905559049124e-05, "loss": 2.0671, "step": 4290 }, { "epoch": 0.47, "grad_norm": 1.1491964715251155, "learning_rate": 1.2740523796744238e-05, "loss": 2.0992, "step": 4295 }, { "epoch": 0.47, "grad_norm": 1.0819989695849115, "learning_rate": 1.2722132017101115e-05, "loss": 2.0526, "step": 4300 }, { "epoch": 0.47, "grad_norm": 1.0953977498773142, "learning_rate": 1.2703730287346565e-05, "loss": 2.1089, "step": 4305 }, { "epoch": 0.47, "grad_norm": 1.069489639129453, "learning_rate": 1.2685318674743769e-05, "loss": 2.047, "step": 4310 }, { "epoch": 0.47, "grad_norm": 1.082906486037135, "learning_rate": 1.2666897246592034e-05, "loss": 2.0692, "step": 4315 }, { "epoch": 0.47, "grad_norm": 1.1650734306581634, "learning_rate": 1.2648466070226547e-05, "loss": 2.0253, "step": 4320 }, { "epoch": 0.47, "grad_norm": 1.110560701641449, "learning_rate": 1.2630025213018127e-05, "loss": 2.0176, "step": 4325 }, { "epoch": 0.47, "grad_norm": 1.1194561001334415, "learning_rate": 1.2611574742372967e-05, "loss": 2.0429, "step": 4330 }, { "epoch": 0.47, "grad_norm": 1.117788927328157, "learning_rate": 1.259311472573242e-05, "loss": 2.0595, "step": 4335 }, { "epoch": 0.48, "grad_norm": 1.0851222670212033, "learning_rate": 1.2574645230572722e-05, "loss": 2.005, "step": 4340 }, { "epoch": 0.48, "grad_norm": 1.127698423329636, "learning_rate": 1.2556166324404747e-05, "loss": 2.0664, "step": 4345 }, { "epoch": 0.48, "grad_norm": 1.133204686155976, "learning_rate": 1.2537678074773785e-05, "loss": 2.0592, "step": 4350 }, { "epoch": 0.48, "grad_norm": 1.1967709089063834, "learning_rate": 1.251918054925927e-05, "loss": 2.0796, "step": 4355 }, { "epoch": 0.48, "grad_norm": 1.0816545522198335, "learning_rate": 1.2500673815474543e-05, "loss": 2.0896, "step": 4360 }, { "epoch": 0.48, "grad_norm": 1.0859512816441796, "learning_rate": 1.2482157941066599e-05, "loss": 2.0545, "step": 4365 }, { "epoch": 0.48, "grad_norm": 1.0704135764094322, "learning_rate": 1.2463632993715853e-05, "loss": 2.0442, "step": 4370 }, { "epoch": 0.48, "grad_norm": 1.0968975869865876, "learning_rate": 1.244509904113588e-05, "loss": 2.1071, "step": 4375 }, { "epoch": 0.48, "grad_norm": 1.1050677665133675, "learning_rate": 1.242655615107317e-05, "loss": 2.0389, "step": 4380 }, { "epoch": 0.48, "grad_norm": 1.0860365258749805, "learning_rate": 1.2408004391306883e-05, "loss": 2.0138, "step": 4385 }, { "epoch": 0.48, "grad_norm": 1.1066217506593872, "learning_rate": 1.2389443829648602e-05, "loss": 2.056, "step": 4390 }, { "epoch": 0.48, "grad_norm": 1.1263715405012742, "learning_rate": 1.2370874533942083e-05, "loss": 2.0938, "step": 4395 }, { "epoch": 0.48, "grad_norm": 1.1186722783283325, "learning_rate": 1.2352296572062999e-05, "loss": 2.0817, "step": 4400 }, { "epoch": 0.48, "grad_norm": 1.033827652010266, "learning_rate": 1.2333710011918713e-05, "loss": 2.0864, "step": 4405 }, { "epoch": 0.48, "grad_norm": 1.1909246617156706, "learning_rate": 1.2315114921448012e-05, "loss": 2.0934, "step": 4410 }, { "epoch": 0.48, "grad_norm": 1.0940128608636932, "learning_rate": 1.2296511368620858e-05, "loss": 2.0601, "step": 4415 }, { "epoch": 0.48, "grad_norm": 1.0929850591973302, "learning_rate": 1.2277899421438155e-05, "loss": 2.0338, "step": 4420 }, { "epoch": 0.48, "grad_norm": 1.0878763098230584, "learning_rate": 1.2259279147931479e-05, "loss": 2.0494, "step": 4425 }, { "epoch": 0.49, "grad_norm": 1.046008975235024, "learning_rate": 1.2240650616162853e-05, "loss": 1.9926, "step": 4430 }, { "epoch": 0.49, "grad_norm": 1.0832286901463481, "learning_rate": 1.2222013894224476e-05, "loss": 2.1067, "step": 4435 }, { "epoch": 0.49, "grad_norm": 1.0706317825883884, "learning_rate": 1.2203369050238489e-05, "loss": 2.0644, "step": 4440 }, { "epoch": 0.49, "grad_norm": 1.120973793734029, "learning_rate": 1.2184716152356725e-05, "loss": 2.0809, "step": 4445 }, { "epoch": 0.49, "grad_norm": 1.1195779633249452, "learning_rate": 1.2166055268760451e-05, "loss": 2.0985, "step": 4450 }, { "epoch": 0.49, "grad_norm": 1.0809834985902398, "learning_rate": 1.2147386467660119e-05, "loss": 2.0593, "step": 4455 }, { "epoch": 0.49, "grad_norm": 1.085198669041652, "learning_rate": 1.2128709817295132e-05, "loss": 2.0905, "step": 4460 }, { "epoch": 0.49, "grad_norm": 1.2256510014016468, "learning_rate": 1.2110025385933582e-05, "loss": 2.0769, "step": 4465 }, { "epoch": 0.49, "grad_norm": 1.1831613178981129, "learning_rate": 1.2091333241871992e-05, "loss": 2.0886, "step": 4470 }, { "epoch": 0.49, "grad_norm": 1.1442486406751624, "learning_rate": 1.2072633453435092e-05, "loss": 2.053, "step": 4475 }, { "epoch": 0.49, "grad_norm": 1.0799725598833219, "learning_rate": 1.2053926088975545e-05, "loss": 2.0529, "step": 4480 }, { "epoch": 0.49, "grad_norm": 1.1421811903871824, "learning_rate": 1.203521121687371e-05, "loss": 2.11, "step": 4485 }, { "epoch": 0.49, "grad_norm": 1.137928301580973, "learning_rate": 1.2016488905537384e-05, "loss": 2.0508, "step": 4490 }, { "epoch": 0.49, "grad_norm": 1.0683293930516091, "learning_rate": 1.1997759223401562e-05, "loss": 2.0654, "step": 4495 }, { "epoch": 0.49, "grad_norm": 1.152603371525857, "learning_rate": 1.1979022238928179e-05, "loss": 2.0937, "step": 4500 }, { "epoch": 0.49, "grad_norm": 1.090807368867559, "learning_rate": 1.1960278020605861e-05, "loss": 2.0302, "step": 4505 }, { "epoch": 0.49, "grad_norm": 1.1919495997875138, "learning_rate": 1.1941526636949674e-05, "loss": 2.1434, "step": 4510 }, { "epoch": 0.49, "grad_norm": 1.0762507580561773, "learning_rate": 1.1922768156500882e-05, "loss": 2.0356, "step": 4515 }, { "epoch": 0.5, "grad_norm": 1.1094334477397263, "learning_rate": 1.1904002647826682e-05, "loss": 2.0529, "step": 4520 }, { "epoch": 0.5, "grad_norm": 1.1232412162723349, "learning_rate": 1.1885230179519963e-05, "loss": 2.0886, "step": 4525 }, { "epoch": 0.5, "grad_norm": 1.0699760221611696, "learning_rate": 1.1866450820199056e-05, "loss": 2.0657, "step": 4530 }, { "epoch": 0.5, "grad_norm": 1.169753321532871, "learning_rate": 1.184766463850748e-05, "loss": 2.0826, "step": 4535 }, { "epoch": 0.5, "grad_norm": 1.1090252429913754, "learning_rate": 1.1828871703113686e-05, "loss": 2.0525, "step": 4540 }, { "epoch": 0.5, "grad_norm": 1.1653659621895398, "learning_rate": 1.1810072082710823e-05, "loss": 2.126, "step": 4545 }, { "epoch": 0.5, "grad_norm": 1.1644246344272575, "learning_rate": 1.1791265846016461e-05, "loss": 2.0762, "step": 4550 }, { "epoch": 0.5, "grad_norm": 1.1191080032985037, "learning_rate": 1.177245306177237e-05, "loss": 2.0729, "step": 4555 }, { "epoch": 0.5, "grad_norm": 1.092653268545582, "learning_rate": 1.175363379874424e-05, "loss": 2.0551, "step": 4560 }, { "epoch": 0.5, "grad_norm": 1.1227573916183717, "learning_rate": 1.1734808125721441e-05, "loss": 2.1146, "step": 4565 }, { "epoch": 0.5, "grad_norm": 1.0807206437968646, "learning_rate": 1.1715976111516794e-05, "loss": 2.0434, "step": 4570 }, { "epoch": 0.5, "grad_norm": 1.099562818023924, "learning_rate": 1.1697137824966273e-05, "loss": 2.0868, "step": 4575 }, { "epoch": 0.5, "grad_norm": 1.1222628071563274, "learning_rate": 1.1678293334928795e-05, "loss": 2.0826, "step": 4580 }, { "epoch": 0.5, "grad_norm": 1.1092127557476292, "learning_rate": 1.1659442710285948e-05, "loss": 2.0302, "step": 4585 }, { "epoch": 0.5, "grad_norm": 1.0838946143363315, "learning_rate": 1.1640586019941742e-05, "loss": 2.0375, "step": 4590 }, { "epoch": 0.5, "grad_norm": 1.1190897246599587, "learning_rate": 1.162172333282236e-05, "loss": 2.0349, "step": 4595 }, { "epoch": 0.5, "grad_norm": 1.044353198531872, "learning_rate": 1.1602854717875906e-05, "loss": 2.0565, "step": 4600 }, { "epoch": 0.5, "grad_norm": 1.1010982478288043, "learning_rate": 1.158398024407215e-05, "loss": 2.0304, "step": 4605 }, { "epoch": 0.5, "grad_norm": 1.0815876169983454, "learning_rate": 1.1565099980402284e-05, "loss": 2.0192, "step": 4610 }, { "epoch": 0.51, "grad_norm": 1.1464933937213106, "learning_rate": 1.1546213995878646e-05, "loss": 2.0996, "step": 4615 }, { "epoch": 0.51, "grad_norm": 1.0281026557831758, "learning_rate": 1.1527322359534506e-05, "loss": 2.0449, "step": 4620 }, { "epoch": 0.51, "grad_norm": 1.1422293846875218, "learning_rate": 1.1508425140423782e-05, "loss": 2.0838, "step": 4625 }, { "epoch": 0.51, "grad_norm": 1.1483591366131618, "learning_rate": 1.1489522407620802e-05, "loss": 2.0442, "step": 4630 }, { "epoch": 0.51, "grad_norm": 1.111748314283335, "learning_rate": 1.1470614230220041e-05, "loss": 2.0179, "step": 4635 }, { "epoch": 0.51, "grad_norm": 1.1218190715384995, "learning_rate": 1.1451700677335889e-05, "loss": 2.0719, "step": 4640 }, { "epoch": 0.51, "grad_norm": 1.1806247755096917, "learning_rate": 1.1432781818102374e-05, "loss": 2.0238, "step": 4645 }, { "epoch": 0.51, "grad_norm": 1.062185041018419, "learning_rate": 1.1413857721672916e-05, "loss": 2.0478, "step": 4650 }, { "epoch": 0.51, "grad_norm": 1.0270707140751227, "learning_rate": 1.1394928457220096e-05, "loss": 2.0722, "step": 4655 }, { "epoch": 0.51, "grad_norm": 1.0975961208626284, "learning_rate": 1.137599409393537e-05, "loss": 2.0324, "step": 4660 }, { "epoch": 0.51, "grad_norm": 1.0944587771746428, "learning_rate": 1.1357054701028836e-05, "loss": 2.0337, "step": 4665 }, { "epoch": 0.51, "grad_norm": 1.0930361282302832, "learning_rate": 1.1338110347728973e-05, "loss": 2.032, "step": 4670 }, { "epoch": 0.51, "grad_norm": 1.0930587369343403, "learning_rate": 1.1319161103282398e-05, "loss": 2.0239, "step": 4675 }, { "epoch": 0.51, "grad_norm": 1.1430731148498792, "learning_rate": 1.1300207036953613e-05, "loss": 2.1165, "step": 4680 }, { "epoch": 0.51, "grad_norm": 1.153773639847964, "learning_rate": 1.1281248218024723e-05, "loss": 2.0675, "step": 4685 }, { "epoch": 0.51, "grad_norm": 1.1109760876070638, "learning_rate": 1.1262284715795223e-05, "loss": 2.0074, "step": 4690 }, { "epoch": 0.51, "grad_norm": 1.1780990537406848, "learning_rate": 1.1243316599581725e-05, "loss": 2.0278, "step": 4695 }, { "epoch": 0.51, "grad_norm": 1.084802747102323, "learning_rate": 1.1224343938717699e-05, "loss": 2.0397, "step": 4700 }, { "epoch": 0.52, "grad_norm": 1.0757740084262926, "learning_rate": 1.1205366802553231e-05, "loss": 2.1034, "step": 4705 }, { "epoch": 0.52, "grad_norm": 1.099232435780472, "learning_rate": 1.1186385260454768e-05, "loss": 2.0776, "step": 4710 }, { "epoch": 0.52, "grad_norm": 1.083658542147341, "learning_rate": 1.1167399381804863e-05, "loss": 2.0487, "step": 4715 }, { "epoch": 0.52, "grad_norm": 1.0878211971168938, "learning_rate": 1.1148409236001907e-05, "loss": 1.9686, "step": 4720 }, { "epoch": 0.52, "grad_norm": 1.1520600564364925, "learning_rate": 1.1129414892459902e-05, "loss": 2.0174, "step": 4725 }, { "epoch": 0.52, "grad_norm": 1.0549456004910753, "learning_rate": 1.1110416420608193e-05, "loss": 1.9806, "step": 4730 }, { "epoch": 0.52, "grad_norm": 1.1490111768807956, "learning_rate": 1.1091413889891211e-05, "loss": 2.0333, "step": 4735 }, { "epoch": 0.52, "grad_norm": 1.1497590384642855, "learning_rate": 1.1072407369768222e-05, "loss": 2.0814, "step": 4740 }, { "epoch": 0.52, "grad_norm": 1.0418251541308143, "learning_rate": 1.1053396929713076e-05, "loss": 2.068, "step": 4745 }, { "epoch": 0.52, "grad_norm": 1.1413761128885032, "learning_rate": 1.1034382639213957e-05, "loss": 2.0864, "step": 4750 }, { "epoch": 0.52, "grad_norm": 1.1359626767924271, "learning_rate": 1.1015364567773114e-05, "loss": 2.098, "step": 4755 }, { "epoch": 0.52, "grad_norm": 1.0692357265546106, "learning_rate": 1.099634278490662e-05, "loss": 1.9997, "step": 4760 }, { "epoch": 0.52, "grad_norm": 1.03161328872651, "learning_rate": 1.097731736014412e-05, "loss": 2.0575, "step": 4765 }, { "epoch": 0.52, "grad_norm": 1.102645591983199, "learning_rate": 1.0958288363028562e-05, "loss": 2.0174, "step": 4770 }, { "epoch": 0.52, "grad_norm": 1.1564182747212546, "learning_rate": 1.0939255863115959e-05, "loss": 2.0686, "step": 4775 }, { "epoch": 0.52, "grad_norm": 1.043956627449105, "learning_rate": 1.0920219929975118e-05, "loss": 2.0747, "step": 4780 }, { "epoch": 0.52, "grad_norm": 1.041085540953387, "learning_rate": 1.090118063318741e-05, "loss": 2.0542, "step": 4785 }, { "epoch": 0.52, "grad_norm": 1.080431218035328, "learning_rate": 1.0882138042346488e-05, "loss": 2.0912, "step": 4790 }, { "epoch": 0.53, "grad_norm": 1.1111212409496378, "learning_rate": 1.0863092227058051e-05, "loss": 2.0098, "step": 4795 }, { "epoch": 0.53, "grad_norm": 1.0220315345377011, "learning_rate": 1.0844043256939585e-05, "loss": 2.0495, "step": 4800 }, { "epoch": 0.53, "grad_norm": 1.2414722969388017, "learning_rate": 1.082499120162011e-05, "loss": 2.1003, "step": 4805 }, { "epoch": 0.53, "grad_norm": 1.0986934023939594, "learning_rate": 1.0805936130739911e-05, "loss": 2.0951, "step": 4810 }, { "epoch": 0.53, "grad_norm": 1.2131395037380222, "learning_rate": 1.0786878113950312e-05, "loss": 2.028, "step": 4815 }, { "epoch": 0.53, "grad_norm": 1.0907833156436297, "learning_rate": 1.0767817220913394e-05, "loss": 2.0356, "step": 4820 }, { "epoch": 0.53, "grad_norm": 1.03929824698217, "learning_rate": 1.0748753521301758e-05, "loss": 2.0682, "step": 4825 }, { "epoch": 0.53, "grad_norm": 1.0632849528818291, "learning_rate": 1.072968708479826e-05, "loss": 2.0763, "step": 4830 }, { "epoch": 0.53, "grad_norm": 1.0818876113989597, "learning_rate": 1.0710617981095754e-05, "loss": 2.0462, "step": 4835 }, { "epoch": 0.53, "grad_norm": 1.0743738752350598, "learning_rate": 1.069154627989686e-05, "loss": 2.0706, "step": 4840 }, { "epoch": 0.53, "grad_norm": 1.125849671935532, "learning_rate": 1.0672472050913678e-05, "loss": 2.027, "step": 4845 }, { "epoch": 0.53, "grad_norm": 1.178731544798949, "learning_rate": 1.0653395363867551e-05, "loss": 2.0334, "step": 4850 }, { "epoch": 0.53, "grad_norm": 1.0897303473021323, "learning_rate": 1.0634316288488808e-05, "loss": 2.0656, "step": 4855 }, { "epoch": 0.53, "grad_norm": 1.1260035150575207, "learning_rate": 1.0615234894516517e-05, "loss": 2.1234, "step": 4860 }, { "epoch": 0.53, "grad_norm": 1.1483933307889824, "learning_rate": 1.05961512516982e-05, "loss": 2.0637, "step": 4865 }, { "epoch": 0.53, "grad_norm": 1.061429926487569, "learning_rate": 1.057706542978962e-05, "loss": 2.0954, "step": 4870 }, { "epoch": 0.53, "grad_norm": 1.103823113341728, "learning_rate": 1.0557977498554489e-05, "loss": 2.0836, "step": 4875 }, { "epoch": 0.53, "grad_norm": 1.3261569785411271, "learning_rate": 1.0538887527764245e-05, "loss": 2.0287, "step": 4880 }, { "epoch": 0.54, "grad_norm": 1.2096300852119426, "learning_rate": 1.0519795587197768e-05, "loss": 2.07, "step": 4885 }, { "epoch": 0.54, "grad_norm": 1.2575715649021257, "learning_rate": 1.0500701746641142e-05, "loss": 2.0753, "step": 4890 }, { "epoch": 0.54, "grad_norm": 1.0609098442371527, "learning_rate": 1.04816060758874e-05, "loss": 1.9882, "step": 4895 }, { "epoch": 0.54, "grad_norm": 1.1155277158554042, "learning_rate": 1.0462508644736262e-05, "loss": 2.0026, "step": 4900 }, { "epoch": 0.54, "grad_norm": 1.0636778011747472, "learning_rate": 1.0443409522993877e-05, "loss": 2.0345, "step": 4905 }, { "epoch": 0.54, "grad_norm": 1.1130136307914735, "learning_rate": 1.0424308780472585e-05, "loss": 2.0322, "step": 4910 }, { "epoch": 0.54, "grad_norm": 1.0757650313164606, "learning_rate": 1.0405206486990642e-05, "loss": 2.0949, "step": 4915 }, { "epoch": 0.54, "grad_norm": 1.0340850025914057, "learning_rate": 1.0386102712371975e-05, "loss": 2.0701, "step": 4920 }, { "epoch": 0.54, "grad_norm": 1.0834919922731008, "learning_rate": 1.0366997526445928e-05, "loss": 2.0651, "step": 4925 }, { "epoch": 0.54, "grad_norm": 1.129493610674135, "learning_rate": 1.0347890999046998e-05, "loss": 2.0258, "step": 4930 }, { "epoch": 0.54, "grad_norm": 1.1026897766186166, "learning_rate": 1.0328783200014588e-05, "loss": 2.0483, "step": 4935 }, { "epoch": 0.54, "grad_norm": 1.1062345888294867, "learning_rate": 1.0309674199192753e-05, "loss": 2.0918, "step": 4940 }, { "epoch": 0.54, "grad_norm": 1.0440081206325804, "learning_rate": 1.0290564066429935e-05, "loss": 2.0879, "step": 4945 }, { "epoch": 0.54, "grad_norm": 1.1163278226732885, "learning_rate": 1.027145287157872e-05, "loss": 2.06, "step": 4950 }, { "epoch": 0.54, "grad_norm": 1.1012821056361937, "learning_rate": 1.025234068449557e-05, "loss": 2.0682, "step": 4955 }, { "epoch": 0.54, "grad_norm": 1.1105047981025156, "learning_rate": 1.0233227575040572e-05, "loss": 2.1237, "step": 4960 }, { "epoch": 0.54, "grad_norm": 1.1281356708024215, "learning_rate": 1.0214113613077197e-05, "loss": 2.0908, "step": 4965 }, { "epoch": 0.54, "grad_norm": 1.0899988297844936, "learning_rate": 1.0194998868472016e-05, "loss": 2.1143, "step": 4970 }, { "epoch": 0.54, "grad_norm": 1.0845356513354827, "learning_rate": 1.0175883411094473e-05, "loss": 2.13, "step": 4975 }, { "epoch": 0.55, "grad_norm": 1.067681353317346, "learning_rate": 1.0156767310816616e-05, "loss": 2.0448, "step": 4980 }, { "epoch": 0.55, "grad_norm": 1.1439364103209957, "learning_rate": 1.0137650637512835e-05, "loss": 2.0623, "step": 4985 }, { "epoch": 0.55, "grad_norm": 1.2058174354762354, "learning_rate": 1.0118533461059617e-05, "loss": 2.1092, "step": 4990 }, { "epoch": 0.55, "grad_norm": 1.0307338887698831, "learning_rate": 1.00994158513353e-05, "loss": 2.0035, "step": 4995 }, { "epoch": 0.55, "grad_norm": 1.0723552466043387, "learning_rate": 1.0080297878219785e-05, "loss": 2.0504, "step": 5000 }, { "epoch": 0.55, "grad_norm": 1.2139015239944049, "learning_rate": 1.0061179611594323e-05, "loss": 2.0566, "step": 5005 }, { "epoch": 0.55, "grad_norm": 1.1541878282112488, "learning_rate": 1.0042061121341222e-05, "loss": 2.0761, "step": 5010 }, { "epoch": 0.55, "grad_norm": 1.2188835001131502, "learning_rate": 1.0022942477343613e-05, "loss": 2.0122, "step": 5015 }, { "epoch": 0.55, "grad_norm": 1.1497588333746003, "learning_rate": 1.0003823749485193e-05, "loss": 2.0955, "step": 5020 }, { "epoch": 0.55, "grad_norm": 1.1414927150858214, "learning_rate": 9.98470500764996e-06, "loss": 2.0322, "step": 5025 }, { "epoch": 0.55, "grad_norm": 1.0788494134872506, "learning_rate": 9.965586321721958e-06, "loss": 2.0913, "step": 5030 }, { "epoch": 0.55, "grad_norm": 1.1471200625763642, "learning_rate": 9.946467761585043e-06, "loss": 2.0346, "step": 5035 }, { "epoch": 0.55, "grad_norm": 1.1260064543343609, "learning_rate": 9.927349397122595e-06, "loss": 2.004, "step": 5040 }, { "epoch": 0.55, "grad_norm": 1.0671331245637625, "learning_rate": 9.90823129821729e-06, "loss": 2.0437, "step": 5045 }, { "epoch": 0.55, "grad_norm": 1.0152338431837347, "learning_rate": 9.889113534750823e-06, "loss": 2.0715, "step": 5050 }, { "epoch": 0.55, "grad_norm": 1.1801097709864117, "learning_rate": 9.869996176603672e-06, "loss": 2.0376, "step": 5055 }, { "epoch": 0.55, "grad_norm": 1.0426691588947268, "learning_rate": 9.850879293654829e-06, "loss": 2.0864, "step": 5060 }, { "epoch": 0.55, "grad_norm": 1.0887954101930313, "learning_rate": 9.831762955781548e-06, "loss": 2.0212, "step": 5065 }, { "epoch": 0.56, "grad_norm": 1.1841606191797287, "learning_rate": 9.8126472328591e-06, "loss": 2.0718, "step": 5070 }, { "epoch": 0.56, "grad_norm": 1.118072183332927, "learning_rate": 9.79353219476049e-06, "loss": 2.0107, "step": 5075 }, { "epoch": 0.56, "grad_norm": 1.129370505582247, "learning_rate": 9.774417911356237e-06, "loss": 2.0825, "step": 5080 }, { "epoch": 0.56, "grad_norm": 1.045684580915944, "learning_rate": 9.755304452514091e-06, "loss": 2.073, "step": 5085 }, { "epoch": 0.56, "grad_norm": 1.0866447508099955, "learning_rate": 9.736191888098797e-06, "loss": 2.0512, "step": 5090 }, { "epoch": 0.56, "grad_norm": 1.0876806012854694, "learning_rate": 9.717080287971824e-06, "loss": 2.0519, "step": 5095 }, { "epoch": 0.56, "grad_norm": 1.1272542912387633, "learning_rate": 9.697969721991114e-06, "loss": 2.0713, "step": 5100 }, { "epoch": 0.56, "grad_norm": 1.0933293589827713, "learning_rate": 9.678860260010834e-06, "loss": 2.0308, "step": 5105 }, { "epoch": 0.56, "grad_norm": 1.083886908851357, "learning_rate": 9.659751971881119e-06, "loss": 2.0268, "step": 5110 }, { "epoch": 0.56, "grad_norm": 1.0354091863655817, "learning_rate": 9.640644927447801e-06, "loss": 2.0482, "step": 5115 }, { "epoch": 0.56, "grad_norm": 1.0811564925193906, "learning_rate": 9.621539196552177e-06, "loss": 2.0864, "step": 5120 }, { "epoch": 0.56, "grad_norm": 1.0740195220883966, "learning_rate": 9.602434849030747e-06, "loss": 2.0515, "step": 5125 }, { "epoch": 0.56, "grad_norm": 1.0811213680540268, "learning_rate": 9.583331954714935e-06, "loss": 1.9967, "step": 5130 }, { "epoch": 0.56, "grad_norm": 1.122139885595036, "learning_rate": 9.564230583430866e-06, "loss": 2.0734, "step": 5135 }, { "epoch": 0.56, "grad_norm": 1.1316701978806596, "learning_rate": 9.5451308049991e-06, "loss": 2.016, "step": 5140 }, { "epoch": 0.56, "grad_norm": 1.077106463794164, "learning_rate": 9.526032689234374e-06, "loss": 2.0172, "step": 5145 }, { "epoch": 0.56, "grad_norm": 1.0990682908328333, "learning_rate": 9.506936305945343e-06, "loss": 2.022, "step": 5150 }, { "epoch": 0.56, "grad_norm": 1.114222800053068, "learning_rate": 9.487841724934325e-06, "loss": 2.0277, "step": 5155 }, { "epoch": 0.57, "grad_norm": 1.0942848536707008, "learning_rate": 9.468749015997063e-06, "loss": 2.1043, "step": 5160 }, { "epoch": 0.57, "grad_norm": 1.0905729012032352, "learning_rate": 9.449658248922452e-06, "loss": 2.1036, "step": 5165 }, { "epoch": 0.57, "grad_norm": 1.0756582489657216, "learning_rate": 9.43056949349228e-06, "loss": 2.0432, "step": 5170 }, { "epoch": 0.57, "grad_norm": 1.0659326971725576, "learning_rate": 9.411482819480994e-06, "loss": 2.0883, "step": 5175 }, { "epoch": 0.57, "grad_norm": 1.1180849241524742, "learning_rate": 9.392398296655432e-06, "loss": 2.0836, "step": 5180 }, { "epoch": 0.57, "grad_norm": 1.0551287354221965, "learning_rate": 9.373315994774558e-06, "loss": 2.0556, "step": 5185 }, { "epoch": 0.57, "grad_norm": 1.1578330048276684, "learning_rate": 9.354235983589229e-06, "loss": 2.0875, "step": 5190 }, { "epoch": 0.57, "grad_norm": 1.089768974650412, "learning_rate": 9.335158332841922e-06, "loss": 2.0622, "step": 5195 }, { "epoch": 0.57, "grad_norm": 1.1010706302814328, "learning_rate": 9.316083112266491e-06, "loss": 2.0584, "step": 5200 }, { "epoch": 0.57, "grad_norm": 1.1227483093133004, "learning_rate": 9.297010391587909e-06, "loss": 2.0397, "step": 5205 }, { "epoch": 0.57, "grad_norm": 1.1217225838838254, "learning_rate": 9.277940240521996e-06, "loss": 2.0252, "step": 5210 }, { "epoch": 0.57, "grad_norm": 1.0418580641493653, "learning_rate": 9.258872728775198e-06, "loss": 2.0744, "step": 5215 }, { "epoch": 0.57, "grad_norm": 1.0611218674748695, "learning_rate": 9.23980792604431e-06, "loss": 2.0413, "step": 5220 }, { "epoch": 0.57, "grad_norm": 1.059603448859148, "learning_rate": 9.22074590201621e-06, "loss": 2.0694, "step": 5225 }, { "epoch": 0.57, "grad_norm": 1.1073002327171813, "learning_rate": 9.201686726367638e-06, "loss": 2.0467, "step": 5230 }, { "epoch": 0.57, "grad_norm": 1.0566888319853762, "learning_rate": 9.182630468764917e-06, "loss": 2.054, "step": 5235 }, { "epoch": 0.57, "grad_norm": 1.2119803787537455, "learning_rate": 9.163577198863695e-06, "loss": 2.0486, "step": 5240 }, { "epoch": 0.57, "grad_norm": 1.0581293727515462, "learning_rate": 9.144526986308704e-06, "loss": 2.0766, "step": 5245 }, { "epoch": 0.58, "grad_norm": 1.074292077742016, "learning_rate": 9.125479900733505e-06, "loss": 2.0438, "step": 5250 }, { "epoch": 0.58, "grad_norm": 1.0516933750471331, "learning_rate": 9.106436011760229e-06, "loss": 2.0308, "step": 5255 }, { "epoch": 0.58, "grad_norm": 1.1426584095527283, "learning_rate": 9.087395388999319e-06, "loss": 2.0127, "step": 5260 }, { "epoch": 0.58, "grad_norm": 1.0445227467497327, "learning_rate": 9.068358102049272e-06, "loss": 2.0365, "step": 5265 }, { "epoch": 0.58, "grad_norm": 1.1544574335706221, "learning_rate": 9.049324220496405e-06, "loss": 2.0159, "step": 5270 }, { "epoch": 0.58, "grad_norm": 1.1035345496013722, "learning_rate": 9.030293813914586e-06, "loss": 2.0304, "step": 5275 }, { "epoch": 0.58, "grad_norm": 1.068389416291131, "learning_rate": 9.01126695186497e-06, "loss": 2.0601, "step": 5280 }, { "epoch": 0.58, "grad_norm": 1.0688188806609953, "learning_rate": 8.992243703895766e-06, "loss": 2.0882, "step": 5285 }, { "epoch": 0.58, "grad_norm": 1.0801959378099826, "learning_rate": 8.973224139541971e-06, "loss": 2.0202, "step": 5290 }, { "epoch": 0.58, "grad_norm": 1.1015616274707378, "learning_rate": 8.954208328325112e-06, "loss": 2.0972, "step": 5295 }, { "epoch": 0.58, "grad_norm": 1.0725442846052415, "learning_rate": 8.935196339753e-06, "loss": 2.0624, "step": 5300 }, { "epoch": 0.58, "grad_norm": 1.0998726234432248, "learning_rate": 8.91618824331948e-06, "loss": 2.0688, "step": 5305 }, { "epoch": 0.58, "grad_norm": 1.0290624070121706, "learning_rate": 8.89718410850416e-06, "loss": 2.0862, "step": 5310 }, { "epoch": 0.58, "grad_norm": 1.1548559651706123, "learning_rate": 8.878184004772174e-06, "loss": 2.0643, "step": 5315 }, { "epoch": 0.58, "grad_norm": 1.15125186733994, "learning_rate": 8.859188001573916e-06, "loss": 2.0846, "step": 5320 }, { "epoch": 0.58, "grad_norm": 1.1295567710991432, "learning_rate": 8.840196168344798e-06, "loss": 2.0223, "step": 5325 }, { "epoch": 0.58, "grad_norm": 1.065128211130043, "learning_rate": 8.821208574504982e-06, "loss": 2.0349, "step": 5330 }, { "epoch": 0.58, "grad_norm": 1.146978776196902, "learning_rate": 8.80222528945914e-06, "loss": 2.0522, "step": 5335 }, { "epoch": 0.58, "grad_norm": 1.1212674741647894, "learning_rate": 8.783246382596195e-06, "loss": 2.0485, "step": 5340 }, { "epoch": 0.59, "grad_norm": 1.0914552108567808, "learning_rate": 8.764271923289064e-06, "loss": 2.0596, "step": 5345 }, { "epoch": 0.59, "grad_norm": 1.1913167295615903, "learning_rate": 8.745301980894399e-06, "loss": 2.0754, "step": 5350 }, { "epoch": 0.59, "grad_norm": 1.0893775596988204, "learning_rate": 8.726336624752355e-06, "loss": 2.0658, "step": 5355 }, { "epoch": 0.59, "grad_norm": 1.0658705211861876, "learning_rate": 8.707375924186321e-06, "loss": 2.098, "step": 5360 }, { "epoch": 0.59, "grad_norm": 1.1025699517960845, "learning_rate": 8.688419948502656e-06, "loss": 2.0508, "step": 5365 }, { "epoch": 0.59, "grad_norm": 1.1123361465895094, "learning_rate": 8.66946876699047e-06, "loss": 2.0105, "step": 5370 }, { "epoch": 0.59, "grad_norm": 1.1007621642720498, "learning_rate": 8.650522448921323e-06, "loss": 2.0829, "step": 5375 }, { "epoch": 0.59, "grad_norm": 1.0290515889207443, "learning_rate": 8.631581063549018e-06, "loss": 2.0705, "step": 5380 }, { "epoch": 0.59, "grad_norm": 1.049186637888095, "learning_rate": 8.61264468010932e-06, "loss": 2.0615, "step": 5385 }, { "epoch": 0.59, "grad_norm": 1.0813759791025181, "learning_rate": 8.59371336781971e-06, "loss": 2.0764, "step": 5390 }, { "epoch": 0.59, "grad_norm": 1.1263140461522694, "learning_rate": 8.574787195879136e-06, "loss": 2.0601, "step": 5395 }, { "epoch": 0.59, "grad_norm": 1.0657094460877041, "learning_rate": 8.555866233467758e-06, "loss": 2.0719, "step": 5400 }, { "epoch": 0.59, "grad_norm": 1.1635064468943173, "learning_rate": 8.536950549746683e-06, "loss": 2.0375, "step": 5405 }, { "epoch": 0.59, "grad_norm": 1.1444778175956825, "learning_rate": 8.518040213857733e-06, "loss": 2.0955, "step": 5410 }, { "epoch": 0.59, "grad_norm": 1.1300389930482988, "learning_rate": 8.499135294923185e-06, "loss": 2.079, "step": 5415 }, { "epoch": 0.59, "grad_norm": 1.0904953321791813, "learning_rate": 8.480235862045503e-06, "loss": 2.0554, "step": 5420 }, { "epoch": 0.59, "grad_norm": 1.0863105033423839, "learning_rate": 8.461341984307115e-06, "loss": 2.0579, "step": 5425 }, { "epoch": 0.59, "grad_norm": 1.1735794289344135, "learning_rate": 8.442453730770122e-06, "loss": 2.1016, "step": 5430 }, { "epoch": 0.6, "grad_norm": 1.1234047726823304, "learning_rate": 8.423571170476086e-06, "loss": 2.0648, "step": 5435 }, { "epoch": 0.6, "grad_norm": 1.1721995708643582, "learning_rate": 8.404694372445752e-06, "loss": 2.0275, "step": 5440 }, { "epoch": 0.6, "grad_norm": 1.071479174896118, "learning_rate": 8.385823405678798e-06, "loss": 2.0277, "step": 5445 }, { "epoch": 0.6, "grad_norm": 1.077312497719103, "learning_rate": 8.3669583391536e-06, "loss": 1.9904, "step": 5450 }, { "epoch": 0.6, "grad_norm": 1.0524625954788391, "learning_rate": 8.348099241826949e-06, "loss": 2.0145, "step": 5455 }, { "epoch": 0.6, "grad_norm": 1.1348971835036348, "learning_rate": 8.329246182633828e-06, "loss": 2.0811, "step": 5460 }, { "epoch": 0.6, "grad_norm": 1.0433493028118779, "learning_rate": 8.310399230487148e-06, "loss": 2.0539, "step": 5465 }, { "epoch": 0.6, "grad_norm": 1.092541260829923, "learning_rate": 8.2915584542775e-06, "loss": 2.0175, "step": 5470 }, { "epoch": 0.6, "grad_norm": 1.0391001139394955, "learning_rate": 8.272723922872893e-06, "loss": 2.0255, "step": 5475 }, { "epoch": 0.6, "grad_norm": 1.0966873981075718, "learning_rate": 8.25389570511852e-06, "loss": 2.0524, "step": 5480 }, { "epoch": 0.6, "grad_norm": 1.129186349801931, "learning_rate": 8.235073869836478e-06, "loss": 2.0715, "step": 5485 }, { "epoch": 0.6, "grad_norm": 1.1445108015533498, "learning_rate": 8.216258485825555e-06, "loss": 2.0862, "step": 5490 }, { "epoch": 0.6, "grad_norm": 1.0777581902555582, "learning_rate": 8.197449621860944e-06, "loss": 2.0324, "step": 5495 }, { "epoch": 0.6, "grad_norm": 1.1170981003460099, "learning_rate": 8.17864734669401e-06, "loss": 2.0102, "step": 5500 }, { "epoch": 0.6, "grad_norm": 1.1833067750724837, "learning_rate": 8.159851729052041e-06, "loss": 1.9831, "step": 5505 }, { "epoch": 0.6, "grad_norm": 1.0830051032625179, "learning_rate": 8.141062837637976e-06, "loss": 2.0166, "step": 5510 }, { "epoch": 0.6, "grad_norm": 1.0913116092753985, "learning_rate": 8.122280741130177e-06, "loss": 2.0919, "step": 5515 }, { "epoch": 0.6, "grad_norm": 1.0687762096783777, "learning_rate": 8.103505508182165e-06, "loss": 2.0281, "step": 5520 }, { "epoch": 0.61, "grad_norm": 1.0934319283987148, "learning_rate": 8.08473720742238e-06, "loss": 2.1081, "step": 5525 }, { "epoch": 0.61, "grad_norm": 1.0949603824897458, "learning_rate": 8.065975907453915e-06, "loss": 2.0361, "step": 5530 }, { "epoch": 0.61, "grad_norm": 1.1094479968561803, "learning_rate": 8.04722167685428e-06, "loss": 2.0278, "step": 5535 }, { "epoch": 0.61, "grad_norm": 1.224018134172163, "learning_rate": 8.028474584175132e-06, "loss": 2.0578, "step": 5540 }, { "epoch": 0.61, "grad_norm": 1.1024386394386596, "learning_rate": 8.009734697942054e-06, "loss": 2.0639, "step": 5545 }, { "epoch": 0.61, "grad_norm": 1.079765072873961, "learning_rate": 7.991002086654273e-06, "loss": 2.0446, "step": 5550 }, { "epoch": 0.61, "grad_norm": 1.0985866046637207, "learning_rate": 7.972276818784435e-06, "loss": 2.0719, "step": 5555 }, { "epoch": 0.61, "grad_norm": 1.1029864700908378, "learning_rate": 7.953558962778342e-06, "loss": 2.0732, "step": 5560 }, { "epoch": 0.61, "grad_norm": 1.1963352878209914, "learning_rate": 7.934848587054695e-06, "loss": 2.0619, "step": 5565 }, { "epoch": 0.61, "grad_norm": 1.0029150765465709, "learning_rate": 7.916145760004859e-06, "loss": 2.0221, "step": 5570 }, { "epoch": 0.61, "grad_norm": 1.1253062918317942, "learning_rate": 7.89745054999261e-06, "loss": 2.0103, "step": 5575 }, { "epoch": 0.61, "grad_norm": 1.0676110710303193, "learning_rate": 7.878763025353875e-06, "loss": 2.1083, "step": 5580 }, { "epoch": 0.61, "grad_norm": 1.0460378069728187, "learning_rate": 7.860083254396491e-06, "loss": 2.0384, "step": 5585 }, { "epoch": 0.61, "grad_norm": 1.0803465717733967, "learning_rate": 7.841411305399961e-06, "loss": 2.0212, "step": 5590 }, { "epoch": 0.61, "grad_norm": 1.0763262864208696, "learning_rate": 7.82274724661518e-06, "loss": 2.0457, "step": 5595 }, { "epoch": 0.61, "grad_norm": 1.1588255859951275, "learning_rate": 7.804091146264219e-06, "loss": 2.0241, "step": 5600 }, { "epoch": 0.61, "grad_norm": 1.0671805845460878, "learning_rate": 7.785443072540047e-06, "loss": 2.0477, "step": 5605 }, { "epoch": 0.61, "grad_norm": 1.1428315311139943, "learning_rate": 7.766803093606304e-06, "loss": 2.0366, "step": 5610 }, { "epoch": 0.62, "grad_norm": 1.0719792235951784, "learning_rate": 7.748171277597029e-06, "loss": 2.0369, "step": 5615 }, { "epoch": 0.62, "grad_norm": 1.0569108521811565, "learning_rate": 7.729547692616437e-06, "loss": 2.0501, "step": 5620 }, { "epoch": 0.62, "grad_norm": 1.1016527186135046, "learning_rate": 7.710932406738643e-06, "loss": 2.0626, "step": 5625 }, { "epoch": 0.62, "grad_norm": 1.2507371953501463, "learning_rate": 7.692325488007439e-06, "loss": 2.0735, "step": 5630 }, { "epoch": 0.62, "grad_norm": 1.0878494760993236, "learning_rate": 7.67372700443602e-06, "loss": 2.0851, "step": 5635 }, { "epoch": 0.62, "grad_norm": 1.0467045149105347, "learning_rate": 7.655137024006762e-06, "loss": 2.0598, "step": 5640 }, { "epoch": 0.62, "grad_norm": 1.0675123858497342, "learning_rate": 7.636555614670953e-06, "loss": 2.1307, "step": 5645 }, { "epoch": 0.62, "grad_norm": 1.0531840400881332, "learning_rate": 7.617982844348547e-06, "loss": 2.0375, "step": 5650 }, { "epoch": 0.62, "grad_norm": 1.0707183090112935, "learning_rate": 7.599418780927925e-06, "loss": 2.0695, "step": 5655 }, { "epoch": 0.62, "grad_norm": 1.0329159295711234, "learning_rate": 7.580863492265642e-06, "loss": 1.9993, "step": 5660 }, { "epoch": 0.62, "grad_norm": 1.0617765214377932, "learning_rate": 7.562317046186182e-06, "loss": 2.0859, "step": 5665 }, { "epoch": 0.62, "grad_norm": 1.0518382495850056, "learning_rate": 7.543779510481703e-06, "loss": 2.0172, "step": 5670 }, { "epoch": 0.62, "grad_norm": 1.0366413109492707, "learning_rate": 7.525250952911787e-06, "loss": 2.0532, "step": 5675 }, { "epoch": 0.62, "grad_norm": 1.0808128527851268, "learning_rate": 7.506731441203209e-06, "loss": 2.0276, "step": 5680 }, { "epoch": 0.62, "grad_norm": 1.1049740952927858, "learning_rate": 7.488221043049679e-06, "loss": 2.0994, "step": 5685 }, { "epoch": 0.62, "grad_norm": 1.1574022275475977, "learning_rate": 7.469719826111585e-06, "loss": 2.0673, "step": 5690 }, { "epoch": 0.62, "grad_norm": 1.052724813445374, "learning_rate": 7.451227858015764e-06, "loss": 2.0068, "step": 5695 }, { "epoch": 0.62, "grad_norm": 1.0927719995403513, "learning_rate": 7.432745206355247e-06, "loss": 2.0216, "step": 5700 }, { "epoch": 0.62, "grad_norm": 1.1201513789399524, "learning_rate": 7.414271938689e-06, "loss": 1.9987, "step": 5705 }, { "epoch": 0.63, "grad_norm": 1.2000436638175427, "learning_rate": 7.395808122541697e-06, "loss": 2.1026, "step": 5710 }, { "epoch": 0.63, "grad_norm": 1.1842829796930912, "learning_rate": 7.377353825403462e-06, "loss": 2.0604, "step": 5715 }, { "epoch": 0.63, "grad_norm": 1.071148811928719, "learning_rate": 7.358909114729629e-06, "loss": 2.0297, "step": 5720 }, { "epoch": 0.63, "grad_norm": 1.1829063428651205, "learning_rate": 7.340474057940484e-06, "loss": 1.9821, "step": 5725 }, { "epoch": 0.63, "grad_norm": 1.1879254595753763, "learning_rate": 7.322048722421024e-06, "loss": 2.0964, "step": 5730 }, { "epoch": 0.63, "grad_norm": 1.1529921620393997, "learning_rate": 7.3036331755207216e-06, "loss": 2.0542, "step": 5735 }, { "epoch": 0.63, "grad_norm": 1.0964431461494535, "learning_rate": 7.285227484553264e-06, "loss": 2.0424, "step": 5740 }, { "epoch": 0.63, "grad_norm": 1.1465402558218125, "learning_rate": 7.266831716796307e-06, "loss": 2.0371, "step": 5745 }, { "epoch": 0.63, "grad_norm": 1.0848583883020673, "learning_rate": 7.248445939491247e-06, "loss": 2.0633, "step": 5750 }, { "epoch": 0.63, "grad_norm": 1.0868066443789028, "learning_rate": 7.2300702198429554e-06, "loss": 2.0906, "step": 5755 }, { "epoch": 0.63, "grad_norm": 1.1431623316532262, "learning_rate": 7.211704625019538e-06, "loss": 2.0126, "step": 5760 }, { "epoch": 0.63, "grad_norm": 1.1072679966619954, "learning_rate": 7.193349222152089e-06, "loss": 2.0748, "step": 5765 }, { "epoch": 0.63, "grad_norm": 1.0652585823559406, "learning_rate": 7.17500407833446e-06, "loss": 2.0847, "step": 5770 }, { "epoch": 0.63, "grad_norm": 1.113443077921772, "learning_rate": 7.156669260622997e-06, "loss": 2.0373, "step": 5775 }, { "epoch": 0.63, "grad_norm": 1.1634294431376044, "learning_rate": 7.138344836036302e-06, "loss": 2.0597, "step": 5780 }, { "epoch": 0.63, "grad_norm": 1.1214495559776985, "learning_rate": 7.12003087155498e-06, "loss": 2.0221, "step": 5785 }, { "epoch": 0.63, "grad_norm": 1.0451601687729262, "learning_rate": 7.101727434121408e-06, "loss": 2.0421, "step": 5790 }, { "epoch": 0.63, "grad_norm": 1.0862263661540281, "learning_rate": 7.0834345906394905e-06, "loss": 2.0156, "step": 5795 }, { "epoch": 0.64, "grad_norm": 1.1182337340204944, "learning_rate": 7.065152407974396e-06, "loss": 2.0513, "step": 5800 }, { "epoch": 0.64, "grad_norm": 1.110886881194482, "learning_rate": 7.046880952952335e-06, "loss": 2.0552, "step": 5805 }, { "epoch": 0.64, "grad_norm": 1.0569780875433503, "learning_rate": 7.028620292360295e-06, "loss": 2.0098, "step": 5810 }, { "epoch": 0.64, "grad_norm": 1.141883153489762, "learning_rate": 7.01037049294582e-06, "loss": 2.0366, "step": 5815 }, { "epoch": 0.64, "grad_norm": 1.1062595512649183, "learning_rate": 6.9921316214167374e-06, "loss": 2.1185, "step": 5820 }, { "epoch": 0.64, "grad_norm": 1.100760273147725, "learning_rate": 6.973903744440949e-06, "loss": 2.0393, "step": 5825 }, { "epoch": 0.64, "grad_norm": 1.0974651479453275, "learning_rate": 6.9556869286461525e-06, "loss": 2.0446, "step": 5830 }, { "epoch": 0.64, "grad_norm": 1.1106691013911072, "learning_rate": 6.937481240619628e-06, "loss": 2.0685, "step": 5835 }, { "epoch": 0.64, "grad_norm": 1.1414921590380411, "learning_rate": 6.9192867469079625e-06, "loss": 2.0721, "step": 5840 }, { "epoch": 0.64, "grad_norm": 1.1429372082988074, "learning_rate": 6.901103514016842e-06, "loss": 2.0757, "step": 5845 }, { "epoch": 0.64, "grad_norm": 1.176111931095732, "learning_rate": 6.882931608410788e-06, "loss": 2.0545, "step": 5850 }, { "epoch": 0.64, "grad_norm": 1.0685955665187419, "learning_rate": 6.864771096512907e-06, "loss": 2.0496, "step": 5855 }, { "epoch": 0.64, "grad_norm": 1.2450103376347597, "learning_rate": 6.846622044704675e-06, "loss": 2.0654, "step": 5860 }, { "epoch": 0.64, "grad_norm": 1.0544042614340512, "learning_rate": 6.82848451932566e-06, "loss": 2.077, "step": 5865 }, { "epoch": 0.64, "grad_norm": 1.0933765517322724, "learning_rate": 6.810358586673314e-06, "loss": 2.0438, "step": 5870 }, { "epoch": 0.64, "grad_norm": 1.0725788392736022, "learning_rate": 6.792244313002703e-06, "loss": 2.0558, "step": 5875 }, { "epoch": 0.64, "grad_norm": 1.0476178462394736, "learning_rate": 6.7741417645262865e-06, "loss": 2.0124, "step": 5880 }, { "epoch": 0.64, "grad_norm": 1.0686915892517284, "learning_rate": 6.7560510074136556e-06, "loss": 2.1191, "step": 5885 }, { "epoch": 0.65, "grad_norm": 1.1076839403970535, "learning_rate": 6.7379721077913095e-06, "loss": 2.0468, "step": 5890 }, { "epoch": 0.65, "grad_norm": 1.07732446997292, "learning_rate": 6.719905131742395e-06, "loss": 2.0968, "step": 5895 }, { "epoch": 0.65, "grad_norm": 1.0836949675361982, "learning_rate": 6.701850145306485e-06, "loss": 2.0655, "step": 5900 }, { "epoch": 0.65, "grad_norm": 1.0859510888268316, "learning_rate": 6.683807214479323e-06, "loss": 2.0797, "step": 5905 }, { "epoch": 0.65, "grad_norm": 1.0615810178520821, "learning_rate": 6.665776405212585e-06, "loss": 2.0265, "step": 5910 }, { "epoch": 0.65, "grad_norm": 1.0938333888806135, "learning_rate": 6.647757783413644e-06, "loss": 2.0715, "step": 5915 }, { "epoch": 0.65, "grad_norm": 1.1120309441047764, "learning_rate": 6.629751414945318e-06, "loss": 2.0513, "step": 5920 }, { "epoch": 0.65, "grad_norm": 1.094981820086556, "learning_rate": 6.611757365625637e-06, "loss": 2.0279, "step": 5925 }, { "epoch": 0.65, "grad_norm": 1.1050724469440145, "learning_rate": 6.593775701227607e-06, "loss": 2.0622, "step": 5930 }, { "epoch": 0.65, "grad_norm": 1.0957912827417855, "learning_rate": 6.575806487478962e-06, "loss": 2.0545, "step": 5935 }, { "epoch": 0.65, "grad_norm": 1.0984274646958023, "learning_rate": 6.557849790061919e-06, "loss": 2.0762, "step": 5940 }, { "epoch": 0.65, "grad_norm": 1.1576613010549697, "learning_rate": 6.539905674612956e-06, "loss": 2.0608, "step": 5945 }, { "epoch": 0.65, "grad_norm": 1.0887085536290133, "learning_rate": 6.521974206722546e-06, "loss": 2.0549, "step": 5950 }, { "epoch": 0.65, "grad_norm": 1.187613565222458, "learning_rate": 6.504055451934944e-06, "loss": 2.0808, "step": 5955 }, { "epoch": 0.65, "grad_norm": 1.1206905466775947, "learning_rate": 6.48614947574793e-06, "loss": 2.0452, "step": 5960 }, { "epoch": 0.65, "grad_norm": 1.0637973790415451, "learning_rate": 6.468256343612575e-06, "loss": 2.0411, "step": 5965 }, { "epoch": 0.65, "grad_norm": 1.1454784700544354, "learning_rate": 6.450376120933008e-06, "loss": 2.0298, "step": 5970 }, { "epoch": 0.65, "grad_norm": 1.024433162395534, "learning_rate": 6.432508873066161e-06, "loss": 2.0155, "step": 5975 }, { "epoch": 0.66, "grad_norm": 1.0555867328011295, "learning_rate": 6.4146546653215404e-06, "loss": 2.0811, "step": 5980 }, { "epoch": 0.66, "grad_norm": 1.072615204743798, "learning_rate": 6.396813562960993e-06, "loss": 2.0512, "step": 5985 }, { "epoch": 0.66, "grad_norm": 1.1549264725581572, "learning_rate": 6.378985631198462e-06, "loss": 2.1108, "step": 5990 }, { "epoch": 0.66, "grad_norm": 1.081486432749601, "learning_rate": 6.361170935199745e-06, "loss": 2.089, "step": 5995 }, { "epoch": 0.66, "grad_norm": 1.0745000990935587, "learning_rate": 6.343369540082262e-06, "loss": 2.0518, "step": 6000 }, { "epoch": 0.66, "grad_norm": 1.0985616235475981, "learning_rate": 6.32558151091481e-06, "loss": 2.0763, "step": 6005 }, { "epoch": 0.66, "grad_norm": 1.1386783556114126, "learning_rate": 6.307806912717336e-06, "loss": 2.048, "step": 6010 }, { "epoch": 0.66, "grad_norm": 1.135643505601148, "learning_rate": 6.290045810460688e-06, "loss": 2.0516, "step": 6015 }, { "epoch": 0.66, "grad_norm": 1.0384633969601875, "learning_rate": 6.272298269066388e-06, "loss": 2.0083, "step": 6020 }, { "epoch": 0.66, "grad_norm": 1.102678614142519, "learning_rate": 6.2545643534063894e-06, "loss": 2.0267, "step": 6025 }, { "epoch": 0.66, "grad_norm": 1.1041568600661347, "learning_rate": 6.236844128302832e-06, "loss": 2.003, "step": 6030 }, { "epoch": 0.66, "grad_norm": 1.0652285482265529, "learning_rate": 6.219137658527819e-06, "loss": 2.0361, "step": 6035 }, { "epoch": 0.66, "grad_norm": 1.116982197351896, "learning_rate": 6.2014450088031755e-06, "loss": 2.0532, "step": 6040 }, { "epoch": 0.66, "grad_norm": 1.078254533346598, "learning_rate": 6.1837662438002086e-06, "loss": 2.0288, "step": 6045 }, { "epoch": 0.66, "grad_norm": 1.0643782258306655, "learning_rate": 6.166101428139472e-06, "loss": 2.0769, "step": 6050 }, { "epoch": 0.66, "grad_norm": 1.078947985838913, "learning_rate": 6.148450626390539e-06, "loss": 1.9868, "step": 6055 }, { "epoch": 0.66, "grad_norm": 1.0169498723848571, "learning_rate": 6.130813903071739e-06, "loss": 2.0458, "step": 6060 }, { "epoch": 0.66, "grad_norm": 1.0169009702774392, "learning_rate": 6.113191322649964e-06, "loss": 2.0488, "step": 6065 }, { "epoch": 0.66, "grad_norm": 1.041120960302728, "learning_rate": 6.095582949540393e-06, "loss": 2.0493, "step": 6070 }, { "epoch": 0.67, "grad_norm": 1.1283602986530428, "learning_rate": 6.0779888481062834e-06, "loss": 2.0387, "step": 6075 }, { "epoch": 0.67, "grad_norm": 1.0876904424453977, "learning_rate": 6.060409082658725e-06, "loss": 2.0688, "step": 6080 }, { "epoch": 0.67, "grad_norm": 1.068284408010192, "learning_rate": 6.042843717456399e-06, "loss": 2.0334, "step": 6085 }, { "epoch": 0.67, "grad_norm": 1.0201746218597543, "learning_rate": 6.025292816705352e-06, "loss": 1.9909, "step": 6090 }, { "epoch": 0.67, "grad_norm": 1.0966909811015106, "learning_rate": 6.007756444558766e-06, "loss": 2.0457, "step": 6095 }, { "epoch": 0.67, "grad_norm": 1.1224611023603956, "learning_rate": 5.990234665116713e-06, "loss": 2.0852, "step": 6100 }, { "epoch": 0.67, "grad_norm": 1.0689731427931881, "learning_rate": 5.97272754242592e-06, "loss": 2.0056, "step": 6105 }, { "epoch": 0.67, "grad_norm": 1.0820108067444485, "learning_rate": 5.95523514047955e-06, "loss": 2.0297, "step": 6110 }, { "epoch": 0.67, "grad_norm": 1.070308643323891, "learning_rate": 5.937757523216945e-06, "loss": 2.0624, "step": 6115 }, { "epoch": 0.67, "grad_norm": 1.0625838781924501, "learning_rate": 5.9202947545234165e-06, "loss": 2.0366, "step": 6120 }, { "epoch": 0.67, "grad_norm": 1.085928056296824, "learning_rate": 5.902846898229993e-06, "loss": 2.0053, "step": 6125 }, { "epoch": 0.67, "grad_norm": 1.1068266825474753, "learning_rate": 5.8854140181132e-06, "loss": 2.0715, "step": 6130 }, { "epoch": 0.67, "grad_norm": 1.102918003298733, "learning_rate": 5.867996177894819e-06, "loss": 2.0948, "step": 6135 }, { "epoch": 0.67, "grad_norm": 1.0397905627365138, "learning_rate": 5.850593441241652e-06, "loss": 2.053, "step": 6140 }, { "epoch": 0.67, "grad_norm": 1.0699314943993576, "learning_rate": 5.833205871765297e-06, "loss": 2.068, "step": 6145 }, { "epoch": 0.67, "grad_norm": 1.0407957685620277, "learning_rate": 5.815833533021916e-06, "loss": 2.0498, "step": 6150 }, { "epoch": 0.67, "grad_norm": 1.1392700492075594, "learning_rate": 5.798476488511992e-06, "loss": 1.9992, "step": 6155 }, { "epoch": 0.67, "grad_norm": 1.0360866093371432, "learning_rate": 5.781134801680115e-06, "loss": 2.0266, "step": 6160 }, { "epoch": 0.68, "grad_norm": 1.0779294106537212, "learning_rate": 5.7638085359147235e-06, "loss": 2.059, "step": 6165 }, { "epoch": 0.68, "grad_norm": 1.12768682574256, "learning_rate": 5.746497754547891e-06, "loss": 2.0602, "step": 6170 }, { "epoch": 0.68, "grad_norm": 1.0996041444381066, "learning_rate": 5.7292025208550975e-06, "loss": 2.0447, "step": 6175 }, { "epoch": 0.68, "grad_norm": 1.0384682937787677, "learning_rate": 5.711922898054991e-06, "loss": 2.0422, "step": 6180 }, { "epoch": 0.68, "grad_norm": 1.1535141109949882, "learning_rate": 5.694658949309158e-06, "loss": 2.0723, "step": 6185 }, { "epoch": 0.68, "grad_norm": 1.1239265610096303, "learning_rate": 5.677410737721893e-06, "loss": 2.0617, "step": 6190 }, { "epoch": 0.68, "grad_norm": 1.1750247235595435, "learning_rate": 5.660178326339956e-06, "loss": 2.0322, "step": 6195 }, { "epoch": 0.68, "grad_norm": 1.1319217747764854, "learning_rate": 5.64296177815237e-06, "loss": 1.9901, "step": 6200 }, { "epoch": 0.68, "grad_norm": 1.033924327593486, "learning_rate": 5.6257611560901596e-06, "loss": 2.0424, "step": 6205 }, { "epoch": 0.68, "grad_norm": 1.1226441643777423, "learning_rate": 5.608576523026142e-06, "loss": 2.0504, "step": 6210 }, { "epoch": 0.68, "grad_norm": 1.0410272492344628, "learning_rate": 5.591407941774693e-06, "loss": 2.0594, "step": 6215 }, { "epoch": 0.68, "grad_norm": 1.0806797983269754, "learning_rate": 5.5742554750915055e-06, "loss": 2.0777, "step": 6220 }, { "epoch": 0.68, "grad_norm": 1.1791256206038145, "learning_rate": 5.5571191856733795e-06, "loss": 2.0836, "step": 6225 }, { "epoch": 0.68, "grad_norm": 1.081731666579369, "learning_rate": 5.539999136157977e-06, "loss": 2.0185, "step": 6230 }, { "epoch": 0.68, "grad_norm": 1.0497871163863046, "learning_rate": 5.522895389123606e-06, "loss": 2.0442, "step": 6235 }, { "epoch": 0.68, "grad_norm": 1.0922564681417128, "learning_rate": 5.505808007088967e-06, "loss": 2.0606, "step": 6240 }, { "epoch": 0.68, "grad_norm": 1.0738404549253175, "learning_rate": 5.488737052512967e-06, "loss": 2.0639, "step": 6245 }, { "epoch": 0.68, "grad_norm": 1.0867735550456459, "learning_rate": 5.4716825877944425e-06, "loss": 2.0569, "step": 6250 }, { "epoch": 0.69, "grad_norm": 1.056686749306022, "learning_rate": 5.454644675271969e-06, "loss": 2.0424, "step": 6255 }, { "epoch": 0.69, "grad_norm": 1.0408490323802606, "learning_rate": 5.43762337722362e-06, "loss": 2.0256, "step": 6260 }, { "epoch": 0.69, "grad_norm": 1.2702794476408228, "learning_rate": 5.420618755866736e-06, "loss": 2.045, "step": 6265 }, { "epoch": 0.69, "grad_norm": 1.229049881614737, "learning_rate": 5.403630873357693e-06, "loss": 2.0502, "step": 6270 }, { "epoch": 0.69, "grad_norm": 1.0375292020518598, "learning_rate": 5.386659791791695e-06, "loss": 2.0288, "step": 6275 }, { "epoch": 0.69, "grad_norm": 1.0597402447229445, "learning_rate": 5.369705573202519e-06, "loss": 2.0301, "step": 6280 }, { "epoch": 0.69, "grad_norm": 1.1034664906740375, "learning_rate": 5.352768279562315e-06, "loss": 2.0514, "step": 6285 }, { "epoch": 0.69, "grad_norm": 1.094300717169376, "learning_rate": 5.3358479727813665e-06, "loss": 2.0585, "step": 6290 }, { "epoch": 0.69, "grad_norm": 1.1101884846242396, "learning_rate": 5.318944714707861e-06, "loss": 2.0498, "step": 6295 }, { "epoch": 0.69, "grad_norm": 1.0745963839955277, "learning_rate": 5.302058567127675e-06, "loss": 2.1563, "step": 6300 }, { "epoch": 0.69, "grad_norm": 1.0629615007166326, "learning_rate": 5.2851895917641345e-06, "loss": 2.068, "step": 6305 }, { "epoch": 0.69, "grad_norm": 1.0814668594747519, "learning_rate": 5.268337850277796e-06, "loss": 2.0364, "step": 6310 }, { "epoch": 0.69, "grad_norm": 1.1136210823939863, "learning_rate": 5.251503404266228e-06, "loss": 2.0396, "step": 6315 }, { "epoch": 0.69, "grad_norm": 1.0774654798500185, "learning_rate": 5.2346863152637776e-06, "loss": 2.0348, "step": 6320 }, { "epoch": 0.69, "grad_norm": 1.0442544241928502, "learning_rate": 5.217886644741348e-06, "loss": 2.0454, "step": 6325 }, { "epoch": 0.69, "grad_norm": 1.0366168034554906, "learning_rate": 5.2011044541061674e-06, "loss": 1.9706, "step": 6330 }, { "epoch": 0.69, "grad_norm": 1.1949959316317988, "learning_rate": 5.184339804701575e-06, "loss": 2.0687, "step": 6335 }, { "epoch": 0.69, "grad_norm": 1.0453279219290244, "learning_rate": 5.167592757806798e-06, "loss": 2.0424, "step": 6340 }, { "epoch": 0.7, "grad_norm": 1.130335321079309, "learning_rate": 5.1508633746367075e-06, "loss": 2.0469, "step": 6345 }, { "epoch": 0.7, "grad_norm": 1.0606368873954024, "learning_rate": 5.13415171634162e-06, "loss": 2.1026, "step": 6350 }, { "epoch": 0.7, "grad_norm": 1.0977845250466591, "learning_rate": 5.117457844007066e-06, "loss": 2.0382, "step": 6355 }, { "epoch": 0.7, "grad_norm": 1.121898876420687, "learning_rate": 5.100781818653549e-06, "loss": 2.0302, "step": 6360 }, { "epoch": 0.7, "grad_norm": 1.0926015071989703, "learning_rate": 5.084123701236347e-06, "loss": 2.0308, "step": 6365 }, { "epoch": 0.7, "grad_norm": 1.0740557335679626, "learning_rate": 5.067483552645282e-06, "loss": 2.0787, "step": 6370 }, { "epoch": 0.7, "grad_norm": 1.0987298217975487, "learning_rate": 5.050861433704495e-06, "loss": 2.0341, "step": 6375 }, { "epoch": 0.7, "grad_norm": 1.0265287820731077, "learning_rate": 5.034257405172216e-06, "loss": 2.1067, "step": 6380 }, { "epoch": 0.7, "grad_norm": 1.1118670732440081, "learning_rate": 5.017671527740551e-06, "loss": 2.0513, "step": 6385 }, { "epoch": 0.7, "grad_norm": 1.092919073068794, "learning_rate": 5.001103862035264e-06, "loss": 2.0779, "step": 6390 }, { "epoch": 0.7, "grad_norm": 1.1716839381414195, "learning_rate": 4.984554468615551e-06, "loss": 2.0742, "step": 6395 }, { "epoch": 0.7, "grad_norm": 1.1318511071505077, "learning_rate": 4.968023407973816e-06, "loss": 2.0674, "step": 6400 }, { "epoch": 0.7, "grad_norm": 1.04614482040218, "learning_rate": 4.951510740535448e-06, "loss": 2.0583, "step": 6405 }, { "epoch": 0.7, "grad_norm": 1.0780447206004922, "learning_rate": 4.935016526658615e-06, "loss": 2.0516, "step": 6410 }, { "epoch": 0.7, "grad_norm": 1.0825052175860206, "learning_rate": 4.9185408266340195e-06, "loss": 2.0579, "step": 6415 }, { "epoch": 0.7, "grad_norm": 1.031992570465647, "learning_rate": 4.902083700684692e-06, "loss": 2.0971, "step": 6420 }, { "epoch": 0.7, "grad_norm": 1.0495460394203766, "learning_rate": 4.885645208965779e-06, "loss": 2.1026, "step": 6425 }, { "epoch": 0.7, "grad_norm": 1.1481615447005535, "learning_rate": 4.869225411564308e-06, "loss": 2.0633, "step": 6430 }, { "epoch": 0.7, "grad_norm": 1.1023544838695916, "learning_rate": 4.852824368498979e-06, "loss": 2.0735, "step": 6435 }, { "epoch": 0.71, "grad_norm": 1.0686424095476494, "learning_rate": 4.836442139719929e-06, "loss": 2.0659, "step": 6440 }, { "epoch": 0.71, "grad_norm": 1.1883035622432618, "learning_rate": 4.820078785108533e-06, "loss": 2.0184, "step": 6445 }, { "epoch": 0.71, "grad_norm": 1.1026116370748873, "learning_rate": 4.803734364477179e-06, "loss": 2.0261, "step": 6450 }, { "epoch": 0.71, "grad_norm": 1.0796819580245074, "learning_rate": 4.787408937569032e-06, "loss": 2.0277, "step": 6455 }, { "epoch": 0.71, "grad_norm": 1.0992695080536736, "learning_rate": 4.771102564057843e-06, "loss": 2.0313, "step": 6460 }, { "epoch": 0.71, "grad_norm": 1.039539567099332, "learning_rate": 4.7548153035477185e-06, "loss": 2.0364, "step": 6465 }, { "epoch": 0.71, "grad_norm": 1.206223329988185, "learning_rate": 4.7385472155728885e-06, "loss": 2.0039, "step": 6470 }, { "epoch": 0.71, "grad_norm": 1.0914115357886938, "learning_rate": 4.722298359597515e-06, "loss": 2.0308, "step": 6475 }, { "epoch": 0.71, "grad_norm": 1.0543000853475375, "learning_rate": 4.706068795015457e-06, "loss": 2.0462, "step": 6480 }, { "epoch": 0.71, "grad_norm": 1.128223494444816, "learning_rate": 4.689858581150064e-06, "loss": 2.0995, "step": 6485 }, { "epoch": 0.71, "grad_norm": 1.1386698403898152, "learning_rate": 4.673667777253944e-06, "loss": 2.0035, "step": 6490 }, { "epoch": 0.71, "grad_norm": 1.1213198443710088, "learning_rate": 4.657496442508759e-06, "loss": 2.0377, "step": 6495 }, { "epoch": 0.71, "grad_norm": 1.0578279866760252, "learning_rate": 4.64134463602501e-06, "loss": 2.1075, "step": 6500 }, { "epoch": 0.71, "grad_norm": 1.0905717314285923, "learning_rate": 4.625212416841816e-06, "loss": 2.1203, "step": 6505 }, { "epoch": 0.71, "grad_norm": 1.1280015650617037, "learning_rate": 4.609099843926701e-06, "loss": 2.0091, "step": 6510 }, { "epoch": 0.71, "grad_norm": 1.0791879358197602, "learning_rate": 4.593006976175375e-06, "loss": 2.0369, "step": 6515 }, { "epoch": 0.71, "grad_norm": 1.1124558482389784, "learning_rate": 4.576933872411512e-06, "loss": 1.9975, "step": 6520 }, { "epoch": 0.71, "grad_norm": 1.1331275476213436, "learning_rate": 4.560880591386561e-06, "loss": 2.0364, "step": 6525 }, { "epoch": 0.72, "grad_norm": 1.0666221396041278, "learning_rate": 4.544847191779495e-06, "loss": 2.1246, "step": 6530 }, { "epoch": 0.72, "grad_norm": 1.031547697947528, "learning_rate": 4.5288337321966265e-06, "loss": 2.0521, "step": 6535 }, { "epoch": 0.72, "grad_norm": 1.0549248565814335, "learning_rate": 4.512840271171381e-06, "loss": 2.0626, "step": 6540 }, { "epoch": 0.72, "grad_norm": 1.1478472798448625, "learning_rate": 4.496866867164087e-06, "loss": 2.0401, "step": 6545 }, { "epoch": 0.72, "grad_norm": 1.1562823698091946, "learning_rate": 4.480913578561744e-06, "loss": 1.9968, "step": 6550 }, { "epoch": 0.72, "grad_norm": 1.1273552558897353, "learning_rate": 4.464980463677846e-06, "loss": 2.056, "step": 6555 }, { "epoch": 0.72, "grad_norm": 1.0664790595253175, "learning_rate": 4.4490675807521255e-06, "loss": 2.0306, "step": 6560 }, { "epoch": 0.72, "grad_norm": 1.0756704008746107, "learning_rate": 4.433174987950377e-06, "loss": 2.0657, "step": 6565 }, { "epoch": 0.72, "grad_norm": 1.0404961590051478, "learning_rate": 4.417302743364229e-06, "loss": 2.056, "step": 6570 }, { "epoch": 0.72, "grad_norm": 1.0806254634569743, "learning_rate": 4.4014509050109185e-06, "loss": 2.0242, "step": 6575 }, { "epoch": 0.72, "grad_norm": 1.0939282492543405, "learning_rate": 4.3856195308331055e-06, "loss": 2.0348, "step": 6580 }, { "epoch": 0.72, "grad_norm": 1.1283631685221096, "learning_rate": 4.3698086786986425e-06, "loss": 2.1101, "step": 6585 }, { "epoch": 0.72, "grad_norm": 1.1795508838079485, "learning_rate": 4.354018406400374e-06, "loss": 2.0506, "step": 6590 }, { "epoch": 0.72, "grad_norm": 1.0738158407485934, "learning_rate": 4.3382487716559095e-06, "loss": 2.0682, "step": 6595 }, { "epoch": 0.72, "grad_norm": 1.0561933301322377, "learning_rate": 4.322499832107434e-06, "loss": 2.024, "step": 6600 }, { "epoch": 0.72, "grad_norm": 1.085332652809342, "learning_rate": 4.306771645321474e-06, "loss": 2.0535, "step": 6605 }, { "epoch": 0.72, "grad_norm": 1.09717002527647, "learning_rate": 4.29106426878871e-06, "loss": 2.0771, "step": 6610 }, { "epoch": 0.72, "grad_norm": 1.075225891903269, "learning_rate": 4.275377759923751e-06, "loss": 2.0644, "step": 6615 }, { "epoch": 0.73, "grad_norm": 1.0470432175890323, "learning_rate": 4.25971217606493e-06, "loss": 2.0156, "step": 6620 }, { "epoch": 0.73, "grad_norm": 1.0255546083135572, "learning_rate": 4.244067574474098e-06, "loss": 2.0549, "step": 6625 }, { "epoch": 0.73, "grad_norm": 1.0691609246164564, "learning_rate": 4.2284440123364e-06, "loss": 2.0508, "step": 6630 }, { "epoch": 0.73, "grad_norm": 1.0606322171659857, "learning_rate": 4.212841546760078e-06, "loss": 2.0229, "step": 6635 }, { "epoch": 0.73, "grad_norm": 1.0690393007468622, "learning_rate": 4.197260234776269e-06, "loss": 2.0428, "step": 6640 }, { "epoch": 0.73, "grad_norm": 1.0485998627321558, "learning_rate": 4.181700133338783e-06, "loss": 2.0676, "step": 6645 }, { "epoch": 0.73, "grad_norm": 1.2165114137160171, "learning_rate": 4.166161299323901e-06, "loss": 2.0746, "step": 6650 }, { "epoch": 0.73, "grad_norm": 1.0991693248485284, "learning_rate": 4.1506437895301664e-06, "loss": 2.0114, "step": 6655 }, { "epoch": 0.73, "grad_norm": 1.018413138145112, "learning_rate": 4.13514766067817e-06, "loss": 2.0606, "step": 6660 }, { "epoch": 0.73, "grad_norm": 1.0758949928121877, "learning_rate": 4.119672969410362e-06, "loss": 2.0423, "step": 6665 }, { "epoch": 0.73, "grad_norm": 1.1364195015648175, "learning_rate": 4.104219772290819e-06, "loss": 2.0317, "step": 6670 }, { "epoch": 0.73, "grad_norm": 1.0421614978767877, "learning_rate": 4.088788125805061e-06, "loss": 2.0233, "step": 6675 }, { "epoch": 0.73, "grad_norm": 1.089959841211098, "learning_rate": 4.073378086359834e-06, "loss": 2.0311, "step": 6680 }, { "epoch": 0.73, "grad_norm": 1.1398527004327725, "learning_rate": 4.057989710282897e-06, "loss": 2.0847, "step": 6685 }, { "epoch": 0.73, "grad_norm": 1.1106392260140925, "learning_rate": 4.0426230538228295e-06, "loss": 2.063, "step": 6690 }, { "epoch": 0.73, "grad_norm": 1.10333211759685, "learning_rate": 4.027278173148821e-06, "loss": 2.1252, "step": 6695 }, { "epoch": 0.73, "grad_norm": 1.0584164767135102, "learning_rate": 4.011955124350465e-06, "loss": 2.057, "step": 6700 }, { "epoch": 0.73, "grad_norm": 1.0879699060871235, "learning_rate": 3.996653963437546e-06, "loss": 1.994, "step": 6705 }, { "epoch": 0.74, "grad_norm": 1.049235440413386, "learning_rate": 3.981374746339854e-06, "loss": 2.0387, "step": 6710 }, { "epoch": 0.74, "grad_norm": 1.0444011642274351, "learning_rate": 3.966117528906956e-06, "loss": 2.0629, "step": 6715 }, { "epoch": 0.74, "grad_norm": 1.1052013451466383, "learning_rate": 3.9508823669080154e-06, "loss": 2.0489, "step": 6720 }, { "epoch": 0.74, "grad_norm": 1.069977370831718, "learning_rate": 3.935669316031573e-06, "loss": 2.0725, "step": 6725 }, { "epoch": 0.74, "grad_norm": 1.0641142281545968, "learning_rate": 3.920478431885345e-06, "loss": 2.0514, "step": 6730 }, { "epoch": 0.74, "grad_norm": 1.0551618532156168, "learning_rate": 3.905309769996031e-06, "loss": 2.1263, "step": 6735 }, { "epoch": 0.74, "grad_norm": 1.1091920925511176, "learning_rate": 3.890163385809092e-06, "loss": 2.0544, "step": 6740 }, { "epoch": 0.74, "grad_norm": 1.0669238131384329, "learning_rate": 3.875039334688556e-06, "loss": 2.0311, "step": 6745 }, { "epoch": 0.74, "grad_norm": 1.0844541282575768, "learning_rate": 3.859937671916833e-06, "loss": 2.0612, "step": 6750 }, { "epoch": 0.74, "grad_norm": 1.0778604908237686, "learning_rate": 3.844858452694483e-06, "loss": 2.0605, "step": 6755 }, { "epoch": 0.74, "grad_norm": 1.0849036117668862, "learning_rate": 3.829801732140039e-06, "loss": 2.0481, "step": 6760 }, { "epoch": 0.74, "grad_norm": 1.0869838337027067, "learning_rate": 3.814767565289793e-06, "loss": 2.0402, "step": 6765 }, { "epoch": 0.74, "grad_norm": 1.1508269138172573, "learning_rate": 3.799756007097588e-06, "loss": 2.0679, "step": 6770 }, { "epoch": 0.74, "grad_norm": 1.1352276771772896, "learning_rate": 3.784767112434641e-06, "loss": 2.016, "step": 6775 }, { "epoch": 0.74, "grad_norm": 1.1685190235142595, "learning_rate": 3.7698009360893127e-06, "loss": 2.1107, "step": 6780 }, { "epoch": 0.74, "grad_norm": 1.1421538649472731, "learning_rate": 3.7548575327669345e-06, "loss": 2.0092, "step": 6785 }, { "epoch": 0.74, "grad_norm": 1.0767155546682732, "learning_rate": 3.739936957089596e-06, "loss": 2.0318, "step": 6790 }, { "epoch": 0.74, "grad_norm": 1.0895952710292436, "learning_rate": 3.725039263595933e-06, "loss": 2.0308, "step": 6795 }, { "epoch": 0.74, "grad_norm": 1.0460121795977542, "learning_rate": 3.7101645067409555e-06, "loss": 2.1095, "step": 6800 }, { "epoch": 0.75, "grad_norm": 1.1669596300256566, "learning_rate": 3.69531274089583e-06, "loss": 2.0571, "step": 6805 }, { "epoch": 0.75, "grad_norm": 1.0891651008284138, "learning_rate": 3.6804840203476768e-06, "loss": 2.0386, "step": 6810 }, { "epoch": 0.75, "grad_norm": 1.1419139874801834, "learning_rate": 3.6656783992993885e-06, "loss": 1.9823, "step": 6815 }, { "epoch": 0.75, "grad_norm": 1.0612264548445145, "learning_rate": 3.6508959318694236e-06, "loss": 1.9998, "step": 6820 }, { "epoch": 0.75, "grad_norm": 1.1517735565278964, "learning_rate": 3.636136672091598e-06, "loss": 2.114, "step": 6825 }, { "epoch": 0.75, "grad_norm": 1.028001542045217, "learning_rate": 3.6214006739149078e-06, "loss": 2.0358, "step": 6830 }, { "epoch": 0.75, "grad_norm": 1.1446661316387627, "learning_rate": 3.606687991203317e-06, "loss": 1.9748, "step": 6835 }, { "epoch": 0.75, "grad_norm": 1.1279476192440228, "learning_rate": 3.591998677735571e-06, "loss": 2.0404, "step": 6840 }, { "epoch": 0.75, "grad_norm": 1.0786150029113748, "learning_rate": 3.5773327872049867e-06, "loss": 2.0225, "step": 6845 }, { "epoch": 0.75, "grad_norm": 1.052337794266004, "learning_rate": 3.562690373219262e-06, "loss": 2.0575, "step": 6850 }, { "epoch": 0.75, "grad_norm": 1.1016492556494766, "learning_rate": 3.5480714893002933e-06, "loss": 2.0162, "step": 6855 }, { "epoch": 0.75, "grad_norm": 1.1027216885860485, "learning_rate": 3.5334761888839587e-06, "loss": 2.0193, "step": 6860 }, { "epoch": 0.75, "grad_norm": 1.0986674609038476, "learning_rate": 3.5189045253199384e-06, "loss": 1.9916, "step": 6865 }, { "epoch": 0.75, "grad_norm": 1.0679116167255394, "learning_rate": 3.5043565518715096e-06, "loss": 2.0325, "step": 6870 }, { "epoch": 0.75, "grad_norm": 1.0388701985796662, "learning_rate": 3.489832321715362e-06, "loss": 2.074, "step": 6875 }, { "epoch": 0.75, "grad_norm": 1.0629289580537227, "learning_rate": 3.475331887941388e-06, "loss": 2.0157, "step": 6880 }, { "epoch": 0.75, "grad_norm": 1.0526371638707026, "learning_rate": 3.4608553035525005e-06, "loss": 2.072, "step": 6885 }, { "epoch": 0.75, "grad_norm": 1.0966942409136857, "learning_rate": 3.44640262146444e-06, "loss": 2.0682, "step": 6890 }, { "epoch": 0.76, "grad_norm": 1.1390378447821399, "learning_rate": 3.431973894505578e-06, "loss": 2.0584, "step": 6895 }, { "epoch": 0.76, "grad_norm": 1.047509517452995, "learning_rate": 3.417569175416725e-06, "loss": 2.1015, "step": 6900 }, { "epoch": 0.76, "grad_norm": 1.119149638810271, "learning_rate": 3.403188516850927e-06, "loss": 2.0348, "step": 6905 }, { "epoch": 0.76, "grad_norm": 1.0650848951187675, "learning_rate": 3.388831971373293e-06, "loss": 2.0462, "step": 6910 }, { "epoch": 0.76, "grad_norm": 1.0492176757892235, "learning_rate": 3.3744995914607905e-06, "loss": 2.0415, "step": 6915 }, { "epoch": 0.76, "grad_norm": 1.0365934450270533, "learning_rate": 3.3601914295020455e-06, "loss": 2.1272, "step": 6920 }, { "epoch": 0.76, "grad_norm": 1.1437706741881266, "learning_rate": 3.345907537797177e-06, "loss": 2.059, "step": 6925 }, { "epoch": 0.76, "grad_norm": 1.0653686564088776, "learning_rate": 3.3316479685575755e-06, "loss": 2.035, "step": 6930 }, { "epoch": 0.76, "grad_norm": 1.0762027827693046, "learning_rate": 3.317412773905734e-06, "loss": 2.0454, "step": 6935 }, { "epoch": 0.76, "grad_norm": 1.1334000654253669, "learning_rate": 3.303202005875049e-06, "loss": 1.9643, "step": 6940 }, { "epoch": 0.76, "grad_norm": 1.0318442741330254, "learning_rate": 3.2890157164096315e-06, "loss": 2.0661, "step": 6945 }, { "epoch": 0.76, "grad_norm": 1.117781767304972, "learning_rate": 3.2748539573641182e-06, "loss": 2.081, "step": 6950 }, { "epoch": 0.76, "grad_norm": 1.0617731156012749, "learning_rate": 3.260716780503476e-06, "loss": 2.0451, "step": 6955 }, { "epoch": 0.76, "grad_norm": 1.0952884296175636, "learning_rate": 3.2466042375028153e-06, "loss": 2.0348, "step": 6960 }, { "epoch": 0.76, "grad_norm": 1.0381093087763817, "learning_rate": 3.2325163799472126e-06, "loss": 1.9772, "step": 6965 }, { "epoch": 0.76, "grad_norm": 1.0801037457075262, "learning_rate": 3.2184532593315077e-06, "loss": 2.0242, "step": 6970 }, { "epoch": 0.76, "grad_norm": 1.098383925177788, "learning_rate": 3.20441492706012e-06, "loss": 1.9647, "step": 6975 }, { "epoch": 0.76, "grad_norm": 1.1091260012293644, "learning_rate": 3.190401434446865e-06, "loss": 2.0504, "step": 6980 }, { "epoch": 0.77, "grad_norm": 1.0487472273643903, "learning_rate": 3.1764128327147515e-06, "loss": 2.0344, "step": 6985 }, { "epoch": 0.77, "grad_norm": 1.1288573614071222, "learning_rate": 3.1624491729958207e-06, "loss": 2.0655, "step": 6990 }, { "epoch": 0.77, "grad_norm": 1.0356981493380066, "learning_rate": 3.1485105063309296e-06, "loss": 2.0772, "step": 6995 }, { "epoch": 0.77, "grad_norm": 1.0365342718836954, "learning_rate": 3.1345968836695904e-06, "loss": 2.0356, "step": 7000 }, { "epoch": 0.77, "grad_norm": 1.040491055629202, "learning_rate": 3.1207083558697673e-06, "loss": 2.0418, "step": 7005 }, { "epoch": 0.77, "grad_norm": 1.0474153169506732, "learning_rate": 3.1068449736977015e-06, "loss": 2.0193, "step": 7010 }, { "epoch": 0.77, "grad_norm": 1.0856817775817103, "learning_rate": 3.0930067878277083e-06, "loss": 2.0604, "step": 7015 }, { "epoch": 0.77, "grad_norm": 1.0827062708341393, "learning_rate": 3.079193848842015e-06, "loss": 2.0611, "step": 7020 }, { "epoch": 0.77, "grad_norm": 1.0623587579427942, "learning_rate": 3.0654062072305667e-06, "loss": 2.0286, "step": 7025 }, { "epoch": 0.77, "grad_norm": 1.0743299150607515, "learning_rate": 3.051643913390826e-06, "loss": 2.0587, "step": 7030 }, { "epoch": 0.77, "grad_norm": 1.081293874461259, "learning_rate": 3.0379070176276193e-06, "loss": 2.0094, "step": 7035 }, { "epoch": 0.77, "grad_norm": 1.1112803071534423, "learning_rate": 3.0241955701529212e-06, "loss": 2.0715, "step": 7040 }, { "epoch": 0.77, "grad_norm": 1.112180637095379, "learning_rate": 3.0105096210856976e-06, "loss": 2.0367, "step": 7045 }, { "epoch": 0.77, "grad_norm": 1.1071137101223008, "learning_rate": 2.9968492204517062e-06, "loss": 2.0164, "step": 7050 }, { "epoch": 0.77, "grad_norm": 1.0485995557178027, "learning_rate": 2.983214418183322e-06, "loss": 2.0386, "step": 7055 }, { "epoch": 0.77, "grad_norm": 1.0879584796836543, "learning_rate": 2.969605264119342e-06, "loss": 2.0391, "step": 7060 }, { "epoch": 0.77, "grad_norm": 1.1182504074105257, "learning_rate": 2.9560218080048243e-06, "loss": 2.0271, "step": 7065 }, { "epoch": 0.77, "grad_norm": 1.093277013299855, "learning_rate": 2.942464099490883e-06, "loss": 2.0667, "step": 7070 }, { "epoch": 0.78, "grad_norm": 1.0922117269089155, "learning_rate": 2.9289321881345257e-06, "loss": 2.0303, "step": 7075 }, { "epoch": 0.78, "grad_norm": 1.0206040169191413, "learning_rate": 2.9154261233984617e-06, "loss": 1.9722, "step": 7080 }, { "epoch": 0.78, "grad_norm": 1.1169675447818994, "learning_rate": 2.9019459546509255e-06, "loss": 2.0403, "step": 7085 }, { "epoch": 0.78, "grad_norm": 1.1049754667694918, "learning_rate": 2.888491731165497e-06, "loss": 2.0698, "step": 7090 }, { "epoch": 0.78, "grad_norm": 1.0652359172685801, "learning_rate": 2.8750635021209095e-06, "loss": 2.0396, "step": 7095 }, { "epoch": 0.78, "grad_norm": 1.1073169718885805, "learning_rate": 2.8616613166008923e-06, "loss": 2.0502, "step": 7100 }, { "epoch": 0.78, "grad_norm": 1.0973810222667622, "learning_rate": 2.8482852235939672e-06, "loss": 2.0481, "step": 7105 }, { "epoch": 0.78, "grad_norm": 1.061759817919837, "learning_rate": 2.834935271993291e-06, "loss": 2.1142, "step": 7110 }, { "epoch": 0.78, "grad_norm": 1.1275703440567955, "learning_rate": 2.8216115105964605e-06, "loss": 2.0291, "step": 7115 }, { "epoch": 0.78, "grad_norm": 1.094108854470618, "learning_rate": 2.8083139881053465e-06, "loss": 2.0064, "step": 7120 }, { "epoch": 0.78, "grad_norm": 1.0578565816598984, "learning_rate": 2.7950427531258995e-06, "loss": 2.0337, "step": 7125 }, { "epoch": 0.78, "grad_norm": 1.1373138287810132, "learning_rate": 2.7817978541679936e-06, "loss": 1.9807, "step": 7130 }, { "epoch": 0.78, "grad_norm": 1.075044286569311, "learning_rate": 2.7685793396452275e-06, "loss": 2.04, "step": 7135 }, { "epoch": 0.78, "grad_norm": 1.0839457343443177, "learning_rate": 2.755387257874764e-06, "loss": 2.0077, "step": 7140 }, { "epoch": 0.78, "grad_norm": 1.1375666907108197, "learning_rate": 2.742221657077151e-06, "loss": 2.0493, "step": 7145 }, { "epoch": 0.78, "grad_norm": 1.078363894438191, "learning_rate": 2.7290825853761293e-06, "loss": 2.1062, "step": 7150 }, { "epoch": 0.78, "grad_norm": 1.0590957681786433, "learning_rate": 2.7159700907984785e-06, "loss": 2.08, "step": 7155 }, { "epoch": 0.78, "grad_norm": 1.0789212815813636, "learning_rate": 2.7028842212738303e-06, "loss": 2.0024, "step": 7160 }, { "epoch": 0.78, "grad_norm": 1.0855217950991025, "learning_rate": 2.689825024634496e-06, "loss": 2.0182, "step": 7165 }, { "epoch": 0.79, "grad_norm": 1.19299127581019, "learning_rate": 2.676792548615282e-06, "loss": 2.0409, "step": 7170 }, { "epoch": 0.79, "grad_norm": 1.107157017870416, "learning_rate": 2.6637868408533373e-06, "loss": 2.0594, "step": 7175 }, { "epoch": 0.79, "grad_norm": 1.0801445421759224, "learning_rate": 2.650807948887951e-06, "loss": 2.0546, "step": 7180 }, { "epoch": 0.79, "grad_norm": 1.1524247147369344, "learning_rate": 2.6378559201604047e-06, "loss": 2.0401, "step": 7185 }, { "epoch": 0.79, "grad_norm": 1.058539490897232, "learning_rate": 2.6249308020137844e-06, "loss": 2.0092, "step": 7190 }, { "epoch": 0.79, "grad_norm": 1.0207889815106557, "learning_rate": 2.612032641692809e-06, "loss": 2.0559, "step": 7195 }, { "epoch": 0.79, "grad_norm": 1.0574359695531035, "learning_rate": 2.5991614863436653e-06, "loss": 2.0686, "step": 7200 }, { "epoch": 0.79, "grad_norm": 1.0441264830123236, "learning_rate": 2.5863173830138212e-06, "loss": 2.0433, "step": 7205 }, { "epoch": 0.79, "grad_norm": 1.0405164768622843, "learning_rate": 2.573500378651864e-06, "loss": 2.0622, "step": 7210 }, { "epoch": 0.79, "grad_norm": 1.071632141992065, "learning_rate": 2.560710520107332e-06, "loss": 2.0548, "step": 7215 }, { "epoch": 0.79, "grad_norm": 1.0856253686393051, "learning_rate": 2.547947854130537e-06, "loss": 2.0805, "step": 7220 }, { "epoch": 0.79, "grad_norm": 1.1630557203684804, "learning_rate": 2.535212427372393e-06, "loss": 2.0324, "step": 7225 }, { "epoch": 0.79, "grad_norm": 1.080082586627047, "learning_rate": 2.522504286384252e-06, "loss": 2.0085, "step": 7230 }, { "epoch": 0.79, "grad_norm": 1.0681232616761207, "learning_rate": 2.509823477617718e-06, "loss": 2.0558, "step": 7235 }, { "epoch": 0.79, "grad_norm": 1.0734199861107971, "learning_rate": 2.497170047424503e-06, "loss": 2.1231, "step": 7240 }, { "epoch": 0.79, "grad_norm": 1.0645455315000067, "learning_rate": 2.4845440420562294e-06, "loss": 2.0474, "step": 7245 }, { "epoch": 0.79, "grad_norm": 1.0474349568913661, "learning_rate": 2.471945507664286e-06, "loss": 2.0014, "step": 7250 }, { "epoch": 0.79, "grad_norm": 1.0883790891998402, "learning_rate": 2.459374490299644e-06, "loss": 2.053, "step": 7255 }, { "epoch": 0.8, "grad_norm": 1.1118965225043536, "learning_rate": 2.446831035912687e-06, "loss": 2.0238, "step": 7260 }, { "epoch": 0.8, "grad_norm": 1.293229099605057, "learning_rate": 2.434315190353056e-06, "loss": 2.0606, "step": 7265 }, { "epoch": 0.8, "grad_norm": 1.0316894453487375, "learning_rate": 2.4218269993694733e-06, "loss": 2.0078, "step": 7270 }, { "epoch": 0.8, "grad_norm": 1.1328582664683062, "learning_rate": 2.409366508609575e-06, "loss": 2.0633, "step": 7275 }, { "epoch": 0.8, "grad_norm": 1.0971620185776176, "learning_rate": 2.3969337636197465e-06, "loss": 2.0119, "step": 7280 }, { "epoch": 0.8, "grad_norm": 1.080647445575166, "learning_rate": 2.384528809844948e-06, "loss": 2.0762, "step": 7285 }, { "epoch": 0.8, "grad_norm": 1.080089853730437, "learning_rate": 2.372151692628566e-06, "loss": 2.0581, "step": 7290 }, { "epoch": 0.8, "grad_norm": 1.0519990046892946, "learning_rate": 2.3598024572122336e-06, "loss": 2.0442, "step": 7295 }, { "epoch": 0.8, "grad_norm": 1.147245101763119, "learning_rate": 2.3474811487356653e-06, "loss": 2.0521, "step": 7300 }, { "epoch": 0.8, "grad_norm": 1.0239452304485515, "learning_rate": 2.335187812236499e-06, "loss": 2.1185, "step": 7305 }, { "epoch": 0.8, "grad_norm": 1.1913769933950669, "learning_rate": 2.3229224926501292e-06, "loss": 2.0965, "step": 7310 }, { "epoch": 0.8, "grad_norm": 1.0514575988354349, "learning_rate": 2.310685234809534e-06, "loss": 2.0938, "step": 7315 }, { "epoch": 0.8, "grad_norm": 1.0345566403769595, "learning_rate": 2.2984760834451224e-06, "loss": 2.0777, "step": 7320 }, { "epoch": 0.8, "grad_norm": 1.1189184018437646, "learning_rate": 2.2862950831845677e-06, "loss": 2.0398, "step": 7325 }, { "epoch": 0.8, "grad_norm": 1.0618233402436388, "learning_rate": 2.2741422785526435e-06, "loss": 2.0955, "step": 7330 }, { "epoch": 0.8, "grad_norm": 1.0873034917920443, "learning_rate": 2.262017713971063e-06, "loss": 2.0434, "step": 7335 }, { "epoch": 0.8, "grad_norm": 1.0750183984656552, "learning_rate": 2.2499214337583062e-06, "loss": 1.9968, "step": 7340 }, { "epoch": 0.8, "grad_norm": 1.1573319117431446, "learning_rate": 2.237853482129475e-06, "loss": 2.0763, "step": 7345 }, { "epoch": 0.81, "grad_norm": 1.0536410478326208, "learning_rate": 2.2258139031961212e-06, "loss": 2.0922, "step": 7350 }, { "epoch": 0.81, "grad_norm": 1.0265455618795605, "learning_rate": 2.213802740966081e-06, "loss": 2.0162, "step": 7355 }, { "epoch": 0.81, "grad_norm": 1.0399182161882259, "learning_rate": 2.2018200393433276e-06, "loss": 2.0804, "step": 7360 }, { "epoch": 0.81, "grad_norm": 1.1011683096870306, "learning_rate": 2.189865842127802e-06, "loss": 2.0358, "step": 7365 }, { "epoch": 0.81, "grad_norm": 1.0759384328912183, "learning_rate": 2.1779401930152457e-06, "loss": 2.0087, "step": 7370 }, { "epoch": 0.81, "grad_norm": 1.0577752049235762, "learning_rate": 2.16604313559706e-06, "loss": 2.0422, "step": 7375 }, { "epoch": 0.81, "grad_norm": 1.0590986895133427, "learning_rate": 2.1541747133601344e-06, "loss": 2.0148, "step": 7380 }, { "epoch": 0.81, "grad_norm": 1.099830554413313, "learning_rate": 2.14233496968668e-06, "loss": 2.056, "step": 7385 }, { "epoch": 0.81, "grad_norm": 1.0593287061595995, "learning_rate": 2.130523947854094e-06, "loss": 2.0775, "step": 7390 }, { "epoch": 0.81, "grad_norm": 1.0680855744400113, "learning_rate": 2.1187416910347723e-06, "loss": 1.9964, "step": 7395 }, { "epoch": 0.81, "grad_norm": 1.0989124248042745, "learning_rate": 2.106988242295981e-06, "loss": 2.0139, "step": 7400 }, { "epoch": 0.81, "grad_norm": 1.0518731242805408, "learning_rate": 2.0952636445996776e-06, "loss": 2.0613, "step": 7405 }, { "epoch": 0.81, "grad_norm": 1.0526966658427388, "learning_rate": 2.0835679408023626e-06, "loss": 2.1113, "step": 7410 }, { "epoch": 0.81, "grad_norm": 1.070790860908526, "learning_rate": 2.0719011736549254e-06, "loss": 2.0395, "step": 7415 }, { "epoch": 0.81, "grad_norm": 1.1105200591541124, "learning_rate": 2.06026338580248e-06, "loss": 2.0536, "step": 7420 }, { "epoch": 0.81, "grad_norm": 1.1058355417375696, "learning_rate": 2.0486546197842096e-06, "loss": 2.0453, "step": 7425 }, { "epoch": 0.81, "grad_norm": 1.0627332451776652, "learning_rate": 2.037074918033223e-06, "loss": 2.0706, "step": 7430 }, { "epoch": 0.81, "grad_norm": 1.0125048714285296, "learning_rate": 2.025524322876389e-06, "loss": 2.03, "step": 7435 }, { "epoch": 0.81, "grad_norm": 1.1402023186717118, "learning_rate": 2.0140028765341826e-06, "loss": 2.0277, "step": 7440 }, { "epoch": 0.82, "grad_norm": 1.1416051182184754, "learning_rate": 2.0025106211205347e-06, "loss": 2.0107, "step": 7445 }, { "epoch": 0.82, "grad_norm": 1.0747467140538594, "learning_rate": 1.9910475986426716e-06, "loss": 1.9911, "step": 7450 }, { "epoch": 0.82, "grad_norm": 1.1636981667241744, "learning_rate": 1.97961385100097e-06, "loss": 2.0111, "step": 7455 }, { "epoch": 0.82, "grad_norm": 1.0341093810100355, "learning_rate": 1.9682094199887937e-06, "loss": 2.0255, "step": 7460 }, { "epoch": 0.82, "grad_norm": 1.066099875251944, "learning_rate": 1.9568343472923524e-06, "loss": 2.0213, "step": 7465 }, { "epoch": 0.82, "grad_norm": 1.1593882286867176, "learning_rate": 1.94548867449054e-06, "loss": 2.0651, "step": 7470 }, { "epoch": 0.82, "grad_norm": 1.0559724070962744, "learning_rate": 1.9341724430547905e-06, "loss": 2.0486, "step": 7475 }, { "epoch": 0.82, "grad_norm": 1.1494795983775352, "learning_rate": 1.9228856943489126e-06, "loss": 2.0005, "step": 7480 }, { "epoch": 0.82, "grad_norm": 1.0393224974377526, "learning_rate": 1.9116284696289557e-06, "loss": 2.0408, "step": 7485 }, { "epoch": 0.82, "grad_norm": 1.081020223342638, "learning_rate": 1.9004008100430526e-06, "loss": 2.0487, "step": 7490 }, { "epoch": 0.82, "grad_norm": 1.0584188005122037, "learning_rate": 1.8892027566312599e-06, "loss": 2.0255, "step": 7495 }, { "epoch": 0.82, "grad_norm": 1.0391838092180707, "learning_rate": 1.8780343503254228e-06, "loss": 2.0119, "step": 7500 }, { "epoch": 0.82, "grad_norm": 1.1426129758468906, "learning_rate": 1.8668956319490128e-06, "loss": 1.9953, "step": 7505 }, { "epoch": 0.82, "grad_norm": 1.047390641207398, "learning_rate": 1.855786642216989e-06, "loss": 2.0193, "step": 7510 }, { "epoch": 0.82, "grad_norm": 1.123368676439886, "learning_rate": 1.844707421735643e-06, "loss": 2.01, "step": 7515 }, { "epoch": 0.82, "grad_norm": 1.0811155986604062, "learning_rate": 1.8336580110024528e-06, "loss": 2.0059, "step": 7520 }, { "epoch": 0.82, "grad_norm": 1.0828529563897735, "learning_rate": 1.8226384504059326e-06, "loss": 2.0539, "step": 7525 }, { "epoch": 0.82, "grad_norm": 1.058794677140321, "learning_rate": 1.8116487802254868e-06, "loss": 2.0809, "step": 7530 }, { "epoch": 0.83, "grad_norm": 1.28402267407909, "learning_rate": 1.8006890406312573e-06, "loss": 1.9654, "step": 7535 }, { "epoch": 0.83, "grad_norm": 1.133108510039709, "learning_rate": 1.7897592716839907e-06, "loss": 2.0328, "step": 7540 }, { "epoch": 0.83, "grad_norm": 1.052363198868776, "learning_rate": 1.7788595133348796e-06, "loss": 2.0357, "step": 7545 }, { "epoch": 0.83, "grad_norm": 1.0468567759333511, "learning_rate": 1.7679898054254175e-06, "loss": 2.0206, "step": 7550 }, { "epoch": 0.83, "grad_norm": 1.10154407852079, "learning_rate": 1.757150187687261e-06, "loss": 2.058, "step": 7555 }, { "epoch": 0.83, "grad_norm": 1.1733125935718713, "learning_rate": 1.7463406997420706e-06, "loss": 1.9768, "step": 7560 }, { "epoch": 0.83, "grad_norm": 1.0483933957458034, "learning_rate": 1.735561381101385e-06, "loss": 2.0339, "step": 7565 }, { "epoch": 0.83, "grad_norm": 1.0192559406966806, "learning_rate": 1.724812271166455e-06, "loss": 2.0508, "step": 7570 }, { "epoch": 0.83, "grad_norm": 1.0882407566463836, "learning_rate": 1.714093409228118e-06, "loss": 2.0353, "step": 7575 }, { "epoch": 0.83, "grad_norm": 1.11218334818641, "learning_rate": 1.7034048344666466e-06, "loss": 2.0838, "step": 7580 }, { "epoch": 0.83, "grad_norm": 1.0573984359910253, "learning_rate": 1.6927465859516057e-06, "loss": 2.02, "step": 7585 }, { "epoch": 0.83, "grad_norm": 1.0458814035005162, "learning_rate": 1.6821187026417052e-06, "loss": 2.083, "step": 7590 }, { "epoch": 0.83, "grad_norm": 1.11709492684873, "learning_rate": 1.6715212233846656e-06, "loss": 2.123, "step": 7595 }, { "epoch": 0.83, "grad_norm": 1.1082721473127022, "learning_rate": 1.6609541869170753e-06, "loss": 2.0397, "step": 7600 }, { "epoch": 0.83, "grad_norm": 1.0308113049894536, "learning_rate": 1.6504176318642406e-06, "loss": 2.0541, "step": 7605 }, { "epoch": 0.83, "grad_norm": 1.0292589914341488, "learning_rate": 1.6399115967400547e-06, "loss": 2.0209, "step": 7610 }, { "epoch": 0.83, "grad_norm": 1.0747743351150567, "learning_rate": 1.6294361199468478e-06, "loss": 2.032, "step": 7615 }, { "epoch": 0.83, "grad_norm": 1.164864798061818, "learning_rate": 1.618991239775255e-06, "loss": 2.033, "step": 7620 }, { "epoch": 0.84, "grad_norm": 1.1487980593281588, "learning_rate": 1.608576994404074e-06, "loss": 2.0519, "step": 7625 }, { "epoch": 0.84, "grad_norm": 1.1014919795798828, "learning_rate": 1.5981934219001215e-06, "loss": 2.0588, "step": 7630 }, { "epoch": 0.84, "grad_norm": 1.2308642222923776, "learning_rate": 1.5878405602180958e-06, "loss": 2.0568, "step": 7635 }, { "epoch": 0.84, "grad_norm": 1.021887790558341, "learning_rate": 1.5775184472004423e-06, "loss": 2.0128, "step": 7640 }, { "epoch": 0.84, "grad_norm": 1.1309227468840857, "learning_rate": 1.567227120577206e-06, "loss": 2.0872, "step": 7645 }, { "epoch": 0.84, "grad_norm": 1.137484348287824, "learning_rate": 1.556966617965907e-06, "loss": 2.0237, "step": 7650 }, { "epoch": 0.84, "grad_norm": 1.1053898347267768, "learning_rate": 1.5467369768713924e-06, "loss": 2.0454, "step": 7655 }, { "epoch": 0.84, "grad_norm": 1.0686279893865425, "learning_rate": 1.5365382346857005e-06, "loss": 2.0331, "step": 7660 }, { "epoch": 0.84, "grad_norm": 1.0660800829665595, "learning_rate": 1.5263704286879311e-06, "loss": 2.0824, "step": 7665 }, { "epoch": 0.84, "grad_norm": 1.1084374583060281, "learning_rate": 1.5162335960440966e-06, "loss": 2.0619, "step": 7670 }, { "epoch": 0.84, "grad_norm": 1.0525151999771922, "learning_rate": 1.5061277738069957e-06, "loss": 2.0146, "step": 7675 }, { "epoch": 0.84, "grad_norm": 1.1018226956346946, "learning_rate": 1.496052998916081e-06, "loss": 2.0256, "step": 7680 }, { "epoch": 0.84, "grad_norm": 1.1399292711427105, "learning_rate": 1.4860093081973148e-06, "loss": 1.9752, "step": 7685 }, { "epoch": 0.84, "grad_norm": 1.0785850423945678, "learning_rate": 1.4759967383630402e-06, "loss": 2.0946, "step": 7690 }, { "epoch": 0.84, "grad_norm": 1.0761880778402513, "learning_rate": 1.4660153260118414e-06, "loss": 2.0379, "step": 7695 }, { "epoch": 0.84, "grad_norm": 1.0529898188520448, "learning_rate": 1.4560651076284183e-06, "loss": 2.0461, "step": 7700 }, { "epoch": 0.84, "grad_norm": 1.0694699477335194, "learning_rate": 1.4461461195834491e-06, "loss": 2.0113, "step": 7705 }, { "epoch": 0.84, "grad_norm": 1.0672452768063714, "learning_rate": 1.4362583981334498e-06, "loss": 2.0991, "step": 7710 }, { "epoch": 0.85, "grad_norm": 1.0445394524483362, "learning_rate": 1.4264019794206564e-06, "loss": 2.0339, "step": 7715 }, { "epoch": 0.85, "grad_norm": 1.0215872569244284, "learning_rate": 1.4165768994728835e-06, "loss": 2.0352, "step": 7720 }, { "epoch": 0.85, "grad_norm": 1.0665892186102344, "learning_rate": 1.4067831942033904e-06, "loss": 2.0418, "step": 7725 }, { "epoch": 0.85, "grad_norm": 1.1468610969263697, "learning_rate": 1.397020899410757e-06, "loss": 2.0092, "step": 7730 }, { "epoch": 0.85, "grad_norm": 1.0846612649919403, "learning_rate": 1.3872900507787502e-06, "loss": 1.9853, "step": 7735 }, { "epoch": 0.85, "grad_norm": 1.1115684879015775, "learning_rate": 1.3775906838761933e-06, "loss": 2.0291, "step": 7740 }, { "epoch": 0.85, "grad_norm": 1.152213918347173, "learning_rate": 1.3679228341568308e-06, "loss": 2.0374, "step": 7745 }, { "epoch": 0.85, "grad_norm": 1.0996834711257855, "learning_rate": 1.3582865369592058e-06, "loss": 2.0921, "step": 7750 }, { "epoch": 0.85, "grad_norm": 1.0914503037949708, "learning_rate": 1.3486818275065305e-06, "loss": 2.0314, "step": 7755 }, { "epoch": 0.85, "grad_norm": 1.179337565615113, "learning_rate": 1.3391087409065562e-06, "loss": 2.0351, "step": 7760 }, { "epoch": 0.85, "grad_norm": 1.004729908605616, "learning_rate": 1.3295673121514408e-06, "loss": 2.0322, "step": 7765 }, { "epoch": 0.85, "grad_norm": 1.0949618296584758, "learning_rate": 1.3200575761176272e-06, "loss": 2.0376, "step": 7770 }, { "epoch": 0.85, "grad_norm": 1.123120040486113, "learning_rate": 1.3105795675657141e-06, "loss": 2.0522, "step": 7775 }, { "epoch": 0.85, "grad_norm": 1.0824098032013423, "learning_rate": 1.3011333211403233e-06, "loss": 2.0742, "step": 7780 }, { "epoch": 0.85, "grad_norm": 1.1053894218394558, "learning_rate": 1.2917188713699791e-06, "loss": 2.0009, "step": 7785 }, { "epoch": 0.85, "grad_norm": 1.048890903405524, "learning_rate": 1.2823362526669825e-06, "loss": 2.0667, "step": 7790 }, { "epoch": 0.85, "grad_norm": 1.0350496793098343, "learning_rate": 1.272985499327284e-06, "loss": 2.0743, "step": 7795 }, { "epoch": 0.85, "grad_norm": 1.0897469407198217, "learning_rate": 1.2636666455303581e-06, "loss": 2.0835, "step": 7800 }, { "epoch": 0.85, "grad_norm": 1.0226024552442936, "learning_rate": 1.2543797253390722e-06, "loss": 2.073, "step": 7805 }, { "epoch": 0.86, "grad_norm": 1.1059099798656515, "learning_rate": 1.2451247726995751e-06, "loss": 2.0469, "step": 7810 }, { "epoch": 0.86, "grad_norm": 1.0765474830795538, "learning_rate": 1.2359018214411634e-06, "loss": 2.0292, "step": 7815 }, { "epoch": 0.86, "grad_norm": 1.0768256976760613, "learning_rate": 1.2267109052761572e-06, "loss": 2.0206, "step": 7820 }, { "epoch": 0.86, "grad_norm": 1.0979105050656295, "learning_rate": 1.2175520577997834e-06, "loss": 2.0745, "step": 7825 }, { "epoch": 0.86, "grad_norm": 1.157884789584407, "learning_rate": 1.2084253124900514e-06, "loss": 2.0534, "step": 7830 }, { "epoch": 0.86, "grad_norm": 1.048169247611216, "learning_rate": 1.1993307027076195e-06, "loss": 2.03, "step": 7835 }, { "epoch": 0.86, "grad_norm": 1.0566988430263573, "learning_rate": 1.190268261695693e-06, "loss": 2.0375, "step": 7840 }, { "epoch": 0.86, "grad_norm": 1.1510022979188086, "learning_rate": 1.1812380225798848e-06, "loss": 2.0808, "step": 7845 }, { "epoch": 0.86, "grad_norm": 1.029461320966653, "learning_rate": 1.1722400183681081e-06, "loss": 2.0624, "step": 7850 }, { "epoch": 0.86, "grad_norm": 1.0768970087054741, "learning_rate": 1.1632742819504406e-06, "loss": 2.0346, "step": 7855 }, { "epoch": 0.86, "grad_norm": 1.1717731513620524, "learning_rate": 1.1543408460990157e-06, "loss": 2.069, "step": 7860 }, { "epoch": 0.86, "grad_norm": 1.0591566912859647, "learning_rate": 1.1454397434679022e-06, "loss": 2.0303, "step": 7865 }, { "epoch": 0.86, "grad_norm": 1.075763835291437, "learning_rate": 1.1365710065929814e-06, "loss": 2.0429, "step": 7870 }, { "epoch": 0.86, "grad_norm": 1.0746497726025446, "learning_rate": 1.1277346678918277e-06, "loss": 2.0377, "step": 7875 }, { "epoch": 0.86, "grad_norm": 1.0475980193521743, "learning_rate": 1.1189307596635968e-06, "loss": 2.061, "step": 7880 }, { "epoch": 0.86, "grad_norm": 1.0761899633894798, "learning_rate": 1.1101593140888933e-06, "loss": 2.0706, "step": 7885 }, { "epoch": 0.86, "grad_norm": 1.0932799611822004, "learning_rate": 1.1014203632296705e-06, "loss": 2.0297, "step": 7890 }, { "epoch": 0.86, "grad_norm": 1.1108383463015383, "learning_rate": 1.0927139390290985e-06, "loss": 2.1333, "step": 7895 }, { "epoch": 0.87, "grad_norm": 1.0448352226682285, "learning_rate": 1.0840400733114586e-06, "loss": 2.0095, "step": 7900 }, { "epoch": 0.87, "grad_norm": 1.1077957795576723, "learning_rate": 1.0753987977820214e-06, "loss": 2.0472, "step": 7905 }, { "epoch": 0.87, "grad_norm": 1.0957507640985134, "learning_rate": 1.0667901440269335e-06, "loss": 2.0558, "step": 7910 }, { "epoch": 0.87, "grad_norm": 1.052811611028974, "learning_rate": 1.0582141435130922e-06, "loss": 2.0469, "step": 7915 }, { "epoch": 0.87, "grad_norm": 1.075569569811823, "learning_rate": 1.0496708275880497e-06, "loss": 2.0257, "step": 7920 }, { "epoch": 0.87, "grad_norm": 1.0872043498749104, "learning_rate": 1.0411602274798771e-06, "loss": 1.995, "step": 7925 }, { "epoch": 0.87, "grad_norm": 1.0797581112826826, "learning_rate": 1.032682374297068e-06, "loss": 1.9758, "step": 7930 }, { "epoch": 0.87, "grad_norm": 1.0555628844594165, "learning_rate": 1.0242372990284143e-06, "loss": 2.0399, "step": 7935 }, { "epoch": 0.87, "grad_norm": 1.0823133640884846, "learning_rate": 1.0158250325428986e-06, "loss": 2.0593, "step": 7940 }, { "epoch": 0.87, "grad_norm": 1.0814158742355189, "learning_rate": 1.007445605589573e-06, "loss": 2.0719, "step": 7945 }, { "epoch": 0.87, "grad_norm": 1.0446905229554941, "learning_rate": 9.990990487974583e-07, "loss": 2.0727, "step": 7950 }, { "epoch": 0.87, "grad_norm": 1.1271520247433824, "learning_rate": 9.907853926754241e-07, "loss": 2.0912, "step": 7955 }, { "epoch": 0.87, "grad_norm": 1.0525719429116878, "learning_rate": 9.825046676120764e-07, "loss": 2.0702, "step": 7960 }, { "epoch": 0.87, "grad_norm": 1.138816137263595, "learning_rate": 9.742569038756567e-07, "loss": 2.0499, "step": 7965 }, { "epoch": 0.87, "grad_norm": 1.1063752462669456, "learning_rate": 9.660421316139134e-07, "loss": 2.0283, "step": 7970 }, { "epoch": 0.87, "grad_norm": 1.0829334483983872, "learning_rate": 9.578603808540132e-07, "loss": 2.056, "step": 7975 }, { "epoch": 0.87, "grad_norm": 1.0502449212769656, "learning_rate": 9.497116815024155e-07, "loss": 1.9925, "step": 7980 }, { "epoch": 0.87, "grad_norm": 1.1566875149865863, "learning_rate": 9.415960633447674e-07, "loss": 2.052, "step": 7985 }, { "epoch": 0.88, "grad_norm": 1.2092489386810032, "learning_rate": 9.335135560457986e-07, "loss": 2.0492, "step": 7990 }, { "epoch": 0.88, "grad_norm": 1.062817668190643, "learning_rate": 9.254641891492066e-07, "loss": 1.995, "step": 7995 }, { "epoch": 0.88, "grad_norm": 1.073635060785353, "learning_rate": 9.174479920775503e-07, "loss": 2.047, "step": 8000 }, { "epoch": 0.88, "grad_norm": 1.0448493925530267, "learning_rate": 9.094649941321498e-07, "loss": 2.0512, "step": 8005 }, { "epoch": 0.88, "grad_norm": 1.0537549382760276, "learning_rate": 9.015152244929715e-07, "loss": 2.0322, "step": 8010 }, { "epoch": 0.88, "grad_norm": 1.0594483848619538, "learning_rate": 8.935987122185219e-07, "loss": 2.0704, "step": 8015 }, { "epoch": 0.88, "grad_norm": 1.1163520291812772, "learning_rate": 8.857154862457451e-07, "loss": 2.11, "step": 8020 }, { "epoch": 0.88, "grad_norm": 1.0688793678003834, "learning_rate": 8.778655753899124e-07, "loss": 1.992, "step": 8025 }, { "epoch": 0.88, "grad_norm": 1.0860920499256197, "learning_rate": 8.700490083445234e-07, "loss": 2.011, "step": 8030 }, { "epoch": 0.88, "grad_norm": 1.1386651665480276, "learning_rate": 8.622658136811913e-07, "loss": 2.0643, "step": 8035 }, { "epoch": 0.88, "grad_norm": 1.129531688867724, "learning_rate": 8.545160198495506e-07, "loss": 2.0455, "step": 8040 }, { "epoch": 0.88, "grad_norm": 1.0349064866943611, "learning_rate": 8.467996551771463e-07, "loss": 2.0104, "step": 8045 }, { "epoch": 0.88, "grad_norm": 1.0623576700529331, "learning_rate": 8.391167478693241e-07, "loss": 2.0441, "step": 8050 }, { "epoch": 0.88, "grad_norm": 1.0351824080835033, "learning_rate": 8.314673260091433e-07, "loss": 2.0015, "step": 8055 }, { "epoch": 0.88, "grad_norm": 1.1116424659802349, "learning_rate": 8.238514175572588e-07, "loss": 2.1216, "step": 8060 }, { "epoch": 0.88, "grad_norm": 1.1875460419326873, "learning_rate": 8.1626905035183e-07, "loss": 2.0412, "step": 8065 }, { "epoch": 0.88, "grad_norm": 1.099213915765255, "learning_rate": 8.087202521084092e-07, "loss": 2.0486, "step": 8070 }, { "epoch": 0.88, "grad_norm": 1.0555425176635544, "learning_rate": 8.012050504198488e-07, "loss": 2.0364, "step": 8075 }, { "epoch": 0.89, "grad_norm": 1.1968668168036014, "learning_rate": 7.937234727561938e-07, "loss": 1.9846, "step": 8080 }, { "epoch": 0.89, "grad_norm": 1.1591519279297744, "learning_rate": 7.862755464645866e-07, "loss": 2.0501, "step": 8085 }, { "epoch": 0.89, "grad_norm": 1.05425843274438, "learning_rate": 7.788612987691669e-07, "loss": 2.0038, "step": 8090 }, { "epoch": 0.89, "grad_norm": 1.0406970269753353, "learning_rate": 7.71480756770967e-07, "loss": 2.0172, "step": 8095 }, { "epoch": 0.89, "grad_norm": 1.0836611624899524, "learning_rate": 7.641339474478194e-07, "loss": 2.0658, "step": 8100 }, { "epoch": 0.89, "grad_norm": 1.0602073326632737, "learning_rate": 7.568208976542491e-07, "loss": 2.0904, "step": 8105 }, { "epoch": 0.89, "grad_norm": 1.0822831692909778, "learning_rate": 7.495416341213846e-07, "loss": 2.0271, "step": 8110 }, { "epoch": 0.89, "grad_norm": 1.0589810573748843, "learning_rate": 7.422961834568565e-07, "loss": 2.0815, "step": 8115 }, { "epoch": 0.89, "grad_norm": 1.1365306371105701, "learning_rate": 7.350845721447019e-07, "loss": 2.0618, "step": 8120 }, { "epoch": 0.89, "grad_norm": 1.1566828995196927, "learning_rate": 7.279068265452649e-07, "loss": 2.0478, "step": 8125 }, { "epoch": 0.89, "grad_norm": 1.0647577721558645, "learning_rate": 7.207629728951015e-07, "loss": 2.0598, "step": 8130 }, { "epoch": 0.89, "grad_norm": 1.0506717328348292, "learning_rate": 7.136530373068818e-07, "loss": 1.9928, "step": 8135 }, { "epoch": 0.89, "grad_norm": 1.0511275869171615, "learning_rate": 7.065770457693e-07, "loss": 1.9949, "step": 8140 }, { "epoch": 0.89, "grad_norm": 1.0664973436253407, "learning_rate": 6.995350241469701e-07, "loss": 1.9959, "step": 8145 }, { "epoch": 0.89, "grad_norm": 1.1106315044069988, "learning_rate": 6.925269981803451e-07, "loss": 2.0495, "step": 8150 }, { "epoch": 0.89, "grad_norm": 1.0996222185294755, "learning_rate": 6.855529934856098e-07, "loss": 2.0412, "step": 8155 }, { "epoch": 0.89, "grad_norm": 1.0853325276815844, "learning_rate": 6.786130355545917e-07, "loss": 1.9552, "step": 8160 }, { "epoch": 0.89, "grad_norm": 1.0427515110067045, "learning_rate": 6.717071497546701e-07, "loss": 2.0333, "step": 8165 }, { "epoch": 0.89, "grad_norm": 1.1360805234969171, "learning_rate": 6.64835361328684e-07, "loss": 2.004, "step": 8170 }, { "epoch": 0.9, "grad_norm": 1.07728018194882, "learning_rate": 6.579976953948308e-07, "loss": 2.0423, "step": 8175 }, { "epoch": 0.9, "grad_norm": 1.0828798048654558, "learning_rate": 6.51194176946588e-07, "loss": 1.9985, "step": 8180 }, { "epoch": 0.9, "grad_norm": 1.118079913048319, "learning_rate": 6.444248308526125e-07, "loss": 2.0415, "step": 8185 }, { "epoch": 0.9, "grad_norm": 1.0885784946229151, "learning_rate": 6.376896818566503e-07, "loss": 2.0482, "step": 8190 }, { "epoch": 0.9, "grad_norm": 1.0976352368028202, "learning_rate": 6.309887545774507e-07, "loss": 2.0569, "step": 8195 }, { "epoch": 0.9, "grad_norm": 1.13799776526284, "learning_rate": 6.243220735086719e-07, "loss": 2.083, "step": 8200 }, { "epoch": 0.9, "grad_norm": 1.0537327262126115, "learning_rate": 6.176896630187967e-07, "loss": 1.9709, "step": 8205 }, { "epoch": 0.9, "grad_norm": 1.0453121302403499, "learning_rate": 6.110915473510348e-07, "loss": 2.0119, "step": 8210 }, { "epoch": 0.9, "grad_norm": 1.0945982167917123, "learning_rate": 6.045277506232394e-07, "loss": 2.0602, "step": 8215 }, { "epoch": 0.9, "grad_norm": 1.1309606426598684, "learning_rate": 5.979982968278241e-07, "loss": 2.0545, "step": 8220 }, { "epoch": 0.9, "grad_norm": 1.1481016568716318, "learning_rate": 5.915032098316653e-07, "loss": 2.0534, "step": 8225 }, { "epoch": 0.9, "grad_norm": 1.0557817391234714, "learning_rate": 5.850425133760218e-07, "loss": 1.9797, "step": 8230 }, { "epoch": 0.9, "grad_norm": 1.0214290810025175, "learning_rate": 5.786162310764454e-07, "loss": 2.0476, "step": 8235 }, { "epoch": 0.9, "grad_norm": 1.1222356218896603, "learning_rate": 5.722243864226972e-07, "loss": 1.9998, "step": 8240 }, { "epoch": 0.9, "grad_norm": 1.0637637891177105, "learning_rate": 5.658670027786561e-07, "loss": 2.0103, "step": 8245 }, { "epoch": 0.9, "grad_norm": 1.0340168765926214, "learning_rate": 5.595441033822358e-07, "loss": 1.9818, "step": 8250 }, { "epoch": 0.9, "grad_norm": 1.0275777602340914, "learning_rate": 5.532557113453041e-07, "loss": 2.0259, "step": 8255 }, { "epoch": 0.9, "grad_norm": 1.0946784158713294, "learning_rate": 5.470018496535967e-07, "loss": 2.0265, "step": 8260 }, { "epoch": 0.91, "grad_norm": 1.1162861802327289, "learning_rate": 5.407825411666312e-07, "loss": 2.0085, "step": 8265 }, { "epoch": 0.91, "grad_norm": 1.0225527146410063, "learning_rate": 5.345978086176174e-07, "loss": 2.0233, "step": 8270 }, { "epoch": 0.91, "grad_norm": 1.0450962690253733, "learning_rate": 5.284476746133904e-07, "loss": 2.0456, "step": 8275 }, { "epoch": 0.91, "grad_norm": 1.088527966203044, "learning_rate": 5.223321616343136e-07, "loss": 2.0252, "step": 8280 }, { "epoch": 0.91, "grad_norm": 1.1091290242456069, "learning_rate": 5.162512920341989e-07, "loss": 2.147, "step": 8285 }, { "epoch": 0.91, "grad_norm": 1.0552554321671914, "learning_rate": 5.102050880402343e-07, "loss": 2.0415, "step": 8290 }, { "epoch": 0.91, "grad_norm": 1.0795168823213537, "learning_rate": 5.041935717528912e-07, "loss": 2.0175, "step": 8295 }, { "epoch": 0.91, "grad_norm": 1.111010667182388, "learning_rate": 4.98216765145848e-07, "loss": 2.0334, "step": 8300 }, { "epoch": 0.91, "grad_norm": 1.0394393229730516, "learning_rate": 4.922746900659125e-07, "loss": 2.0425, "step": 8305 }, { "epoch": 0.91, "grad_norm": 1.048585427174845, "learning_rate": 4.863673682329373e-07, "loss": 2.0532, "step": 8310 }, { "epoch": 0.91, "grad_norm": 1.0272476498461895, "learning_rate": 4.804948212397465e-07, "loss": 2.0277, "step": 8315 }, { "epoch": 0.91, "grad_norm": 1.1080132193080967, "learning_rate": 4.746570705520481e-07, "loss": 2.0433, "step": 8320 }, { "epoch": 0.91, "grad_norm": 1.0964472456564187, "learning_rate": 4.6885413750836217e-07, "loss": 2.0673, "step": 8325 }, { "epoch": 0.91, "grad_norm": 1.0428818640568136, "learning_rate": 4.6308604331994133e-07, "loss": 2.0337, "step": 8330 }, { "epoch": 0.91, "grad_norm": 1.0562406975376029, "learning_rate": 4.5735280907069467e-07, "loss": 1.9779, "step": 8335 }, { "epoch": 0.91, "grad_norm": 1.09533228732798, "learning_rate": 4.516544557171065e-07, "loss": 2.042, "step": 8340 }, { "epoch": 0.91, "grad_norm": 1.1291047196461959, "learning_rate": 4.459910040881632e-07, "loss": 1.9799, "step": 8345 }, { "epoch": 0.91, "grad_norm": 1.0366919408959898, "learning_rate": 4.4036247488527864e-07, "loss": 2.0255, "step": 8350 }, { "epoch": 0.92, "grad_norm": 1.0718948837067552, "learning_rate": 4.34768888682211e-07, "loss": 2.0544, "step": 8355 }, { "epoch": 0.92, "grad_norm": 1.087545888211063, "learning_rate": 4.2921026592499526e-07, "loss": 2.0912, "step": 8360 }, { "epoch": 0.92, "grad_norm": 1.0996268345969746, "learning_rate": 4.2368662693186403e-07, "loss": 2.0484, "step": 8365 }, { "epoch": 0.92, "grad_norm": 1.1020586835335737, "learning_rate": 4.1819799189318e-07, "loss": 2.105, "step": 8370 }, { "epoch": 0.92, "grad_norm": 1.089585909485487, "learning_rate": 4.1274438087135273e-07, "loss": 2.0382, "step": 8375 }, { "epoch": 0.92, "grad_norm": 1.0775689510706492, "learning_rate": 4.0732581380076964e-07, "loss": 2.0356, "step": 8380 }, { "epoch": 0.92, "grad_norm": 1.023592318390289, "learning_rate": 4.0194231048772514e-07, "loss": 2.088, "step": 8385 }, { "epoch": 0.92, "grad_norm": 1.1697892715076412, "learning_rate": 3.9659389061034617e-07, "loss": 2.0418, "step": 8390 }, { "epoch": 0.92, "grad_norm": 1.0512333774805902, "learning_rate": 3.912805737185177e-07, "loss": 2.0286, "step": 8395 }, { "epoch": 0.92, "grad_norm": 1.073639095500842, "learning_rate": 3.860023792338197e-07, "loss": 2.0107, "step": 8400 }, { "epoch": 0.92, "grad_norm": 1.1962886636229824, "learning_rate": 3.807593264494436e-07, "loss": 2.0313, "step": 8405 }, { "epoch": 0.92, "grad_norm": 1.0517093051039863, "learning_rate": 3.755514345301325e-07, "loss": 2.0637, "step": 8410 }, { "epoch": 0.92, "grad_norm": 1.0677193338878124, "learning_rate": 3.7037872251210917e-07, "loss": 2.0041, "step": 8415 }, { "epoch": 0.92, "grad_norm": 1.118750093570913, "learning_rate": 3.65241209303e-07, "loss": 2.0548, "step": 8420 }, { "epoch": 0.92, "grad_norm": 1.1360553727907146, "learning_rate": 3.6013891368177345e-07, "loss": 2.0439, "step": 8425 }, { "epoch": 0.92, "grad_norm": 1.0915309305710958, "learning_rate": 3.5507185429866754e-07, "loss": 1.9644, "step": 8430 }, { "epoch": 0.92, "grad_norm": 1.071355858214417, "learning_rate": 3.5004004967512104e-07, "loss": 2.023, "step": 8435 }, { "epoch": 0.92, "grad_norm": 1.1167386701676851, "learning_rate": 3.450435182037104e-07, "loss": 2.048, "step": 8440 }, { "epoch": 0.93, "grad_norm": 1.0486011249287561, "learning_rate": 3.4008227814807726e-07, "loss": 1.9954, "step": 8445 }, { "epoch": 0.93, "grad_norm": 1.0617158930999893, "learning_rate": 3.351563476428632e-07, "loss": 2.0777, "step": 8450 }, { "epoch": 0.93, "grad_norm": 1.057990858544036, "learning_rate": 3.302657446936486e-07, "loss": 2.082, "step": 8455 }, { "epoch": 0.93, "grad_norm": 1.0549001996685448, "learning_rate": 3.2541048717687485e-07, "loss": 1.9959, "step": 8460 }, { "epoch": 0.93, "grad_norm": 1.0549300955097738, "learning_rate": 3.205905928397923e-07, "loss": 2.0469, "step": 8465 }, { "epoch": 0.93, "grad_norm": 1.0468566463287496, "learning_rate": 3.1580607930038587e-07, "loss": 2.1097, "step": 8470 }, { "epoch": 0.93, "grad_norm": 1.1841194197406848, "learning_rate": 3.110569640473182e-07, "loss": 2.1122, "step": 8475 }, { "epoch": 0.93, "grad_norm": 1.0616403335729612, "learning_rate": 3.06343264439859e-07, "loss": 2.0431, "step": 8480 }, { "epoch": 0.93, "grad_norm": 1.016678294544556, "learning_rate": 3.0166499770782677e-07, "loss": 2.0534, "step": 8485 }, { "epoch": 0.93, "grad_norm": 1.0045182741274552, "learning_rate": 2.9702218095151945e-07, "loss": 2.023, "step": 8490 }, { "epoch": 0.93, "grad_norm": 1.0550304789172613, "learning_rate": 2.924148311416597e-07, "loss": 2.0715, "step": 8495 }, { "epoch": 0.93, "grad_norm": 1.18876541116007, "learning_rate": 2.8784296511932706e-07, "loss": 2.0454, "step": 8500 }, { "epoch": 0.93, "grad_norm": 1.0390712660472083, "learning_rate": 2.8330659959589944e-07, "loss": 2.066, "step": 8505 }, { "epoch": 0.93, "grad_norm": 1.0661904822219928, "learning_rate": 2.7880575115299293e-07, "loss": 2.0772, "step": 8510 }, { "epoch": 0.93, "grad_norm": 1.088263757778212, "learning_rate": 2.7434043624239404e-07, "loss": 2.0586, "step": 8515 }, { "epoch": 0.93, "grad_norm": 1.0624167907741053, "learning_rate": 2.6991067118601e-07, "loss": 2.0427, "step": 8520 }, { "epoch": 0.93, "grad_norm": 1.0396200278995997, "learning_rate": 2.655164721758008e-07, "loss": 2.0254, "step": 8525 }, { "epoch": 0.93, "grad_norm": 1.1099379377033909, "learning_rate": 2.6115785527372705e-07, "loss": 2.0605, "step": 8530 }, { "epoch": 0.93, "grad_norm": 1.0501754618429333, "learning_rate": 2.568348364116813e-07, "loss": 2.0516, "step": 8535 }, { "epoch": 0.94, "grad_norm": 1.088200827861936, "learning_rate": 2.525474313914411e-07, "loss": 2.0526, "step": 8540 }, { "epoch": 0.94, "grad_norm": 1.0441535548015923, "learning_rate": 2.482956558846017e-07, "loss": 2.0615, "step": 8545 }, { "epoch": 0.94, "grad_norm": 1.0679346914069165, "learning_rate": 2.44079525432529e-07, "loss": 2.0596, "step": 8550 }, { "epoch": 0.94, "grad_norm": 1.0485625096457418, "learning_rate": 2.398990554462899e-07, "loss": 2.0013, "step": 8555 }, { "epoch": 0.94, "grad_norm": 1.0629055931173041, "learning_rate": 2.3575426120660883e-07, "loss": 2.0181, "step": 8560 }, { "epoch": 0.94, "grad_norm": 1.0263204578249079, "learning_rate": 2.3164515786380348e-07, "loss": 2.0582, "step": 8565 }, { "epoch": 0.94, "grad_norm": 1.0969503590973775, "learning_rate": 2.275717604377292e-07, "loss": 2.0175, "step": 8570 }, { "epoch": 0.94, "grad_norm": 1.0278424818775862, "learning_rate": 2.2353408381773133e-07, "loss": 2.0534, "step": 8575 }, { "epoch": 0.94, "grad_norm": 1.099794410239063, "learning_rate": 2.1953214276258294e-07, "loss": 2.0853, "step": 8580 }, { "epoch": 0.94, "grad_norm": 1.0643381363485742, "learning_rate": 2.1556595190043718e-07, "loss": 2.0412, "step": 8585 }, { "epoch": 0.94, "grad_norm": 1.0780613743741922, "learning_rate": 2.1163552572876722e-07, "loss": 2.0667, "step": 8590 }, { "epoch": 0.94, "grad_norm": 1.107016249637676, "learning_rate": 2.077408786143209e-07, "loss": 1.9996, "step": 8595 }, { "epoch": 0.94, "grad_norm": 1.0697798782623968, "learning_rate": 2.0388202479306062e-07, "loss": 2.0485, "step": 8600 }, { "epoch": 0.94, "grad_norm": 1.0957489912890277, "learning_rate": 2.0005897837011568e-07, "loss": 2.0407, "step": 8605 }, { "epoch": 0.94, "grad_norm": 1.1371789685732685, "learning_rate": 1.9627175331973114e-07, "loss": 2.0537, "step": 8610 }, { "epoch": 0.94, "grad_norm": 1.087986622024545, "learning_rate": 1.9252036348521464e-07, "loss": 2.0187, "step": 8615 }, { "epoch": 0.94, "grad_norm": 1.0956737728411794, "learning_rate": 1.8880482257888854e-07, "loss": 2.0117, "step": 8620 }, { "epoch": 0.94, "grad_norm": 1.097422272197665, "learning_rate": 1.851251441820323e-07, "loss": 2.0159, "step": 8625 }, { "epoch": 0.95, "grad_norm": 0.987732027112551, "learning_rate": 1.8148134174484467e-07, "loss": 2.0063, "step": 8630 }, { "epoch": 0.95, "grad_norm": 1.085665061748627, "learning_rate": 1.7787342858638589e-07, "loss": 2.0424, "step": 8635 }, { "epoch": 0.95, "grad_norm": 1.0575473085344036, "learning_rate": 1.7430141789453124e-07, "loss": 2.0733, "step": 8640 }, { "epoch": 0.95, "grad_norm": 1.0553994194850556, "learning_rate": 1.7076532272592315e-07, "loss": 2.0531, "step": 8645 }, { "epoch": 0.95, "grad_norm": 1.0512782780560663, "learning_rate": 1.6726515600592353e-07, "loss": 2.0627, "step": 8650 }, { "epoch": 0.95, "grad_norm": 1.0583883063467834, "learning_rate": 1.6380093052856482e-07, "loss": 2.059, "step": 8655 }, { "epoch": 0.95, "grad_norm": 1.077165172097117, "learning_rate": 1.6037265895650688e-07, "loss": 2.0363, "step": 8660 }, { "epoch": 0.95, "grad_norm": 1.0367627703088333, "learning_rate": 1.5698035382098687e-07, "loss": 1.9644, "step": 8665 }, { "epoch": 0.95, "grad_norm": 1.1047380298188336, "learning_rate": 1.536240275217793e-07, "loss": 2.0211, "step": 8670 }, { "epoch": 0.95, "grad_norm": 1.0842764632237492, "learning_rate": 1.5030369232713838e-07, "loss": 1.9933, "step": 8675 }, { "epoch": 0.95, "grad_norm": 1.0702597880621731, "learning_rate": 1.4701936037377019e-07, "loss": 2.0707, "step": 8680 }, { "epoch": 0.95, "grad_norm": 1.109168895086716, "learning_rate": 1.4377104366677274e-07, "loss": 2.0304, "step": 8685 }, { "epoch": 0.95, "grad_norm": 1.0649607830431849, "learning_rate": 1.4055875407960274e-07, "loss": 2.0592, "step": 8690 }, { "epoch": 0.95, "grad_norm": 1.0742151668956033, "learning_rate": 1.3738250335402658e-07, "loss": 2.0035, "step": 8695 }, { "epoch": 0.95, "grad_norm": 1.0774763659886095, "learning_rate": 1.3424230310007946e-07, "loss": 1.97, "step": 8700 }, { "epoch": 0.95, "grad_norm": 1.0818855021565348, "learning_rate": 1.3113816479602304e-07, "loss": 2.0233, "step": 8705 }, { "epoch": 0.95, "grad_norm": 1.08746463612767, "learning_rate": 1.280700997883033e-07, "loss": 2.0321, "step": 8710 }, { "epoch": 0.95, "grad_norm": 1.0732692240961477, "learning_rate": 1.250381192915051e-07, "loss": 2.0714, "step": 8715 }, { "epoch": 0.96, "grad_norm": 1.1058789574701493, "learning_rate": 1.2204223438832096e-07, "loss": 2.0614, "step": 8720 }, { "epoch": 0.96, "grad_norm": 1.1668656560261987, "learning_rate": 1.1908245602950009e-07, "loss": 2.0133, "step": 8725 }, { "epoch": 0.96, "grad_norm": 1.0727357901040624, "learning_rate": 1.16158795033815e-07, "loss": 2.0797, "step": 8730 }, { "epoch": 0.96, "grad_norm": 0.987848716822071, "learning_rate": 1.132712620880172e-07, "loss": 2.0347, "step": 8735 }, { "epoch": 0.96, "grad_norm": 1.0891027635300488, "learning_rate": 1.1041986774680269e-07, "loss": 2.0499, "step": 8740 }, { "epoch": 0.96, "grad_norm": 1.0735159022929956, "learning_rate": 1.0760462243277204e-07, "loss": 2.0312, "step": 8745 }, { "epoch": 0.96, "grad_norm": 1.0699090487346967, "learning_rate": 1.0482553643638927e-07, "loss": 2.0163, "step": 8750 }, { "epoch": 0.96, "grad_norm": 1.1171505032028382, "learning_rate": 1.0208261991594859e-07, "loss": 2.0311, "step": 8755 }, { "epoch": 0.96, "grad_norm": 1.1354531981339695, "learning_rate": 9.93758828975333e-08, "loss": 2.0377, "step": 8760 }, { "epoch": 0.96, "grad_norm": 1.0880541286093504, "learning_rate": 9.670533527498139e-08, "loss": 2.1072, "step": 8765 }, { "epoch": 0.96, "grad_norm": 1.036072204513383, "learning_rate": 9.407098680985104e-08, "loss": 2.0026, "step": 8770 }, { "epoch": 0.96, "grad_norm": 1.097363164400368, "learning_rate": 9.147284713138082e-08, "loss": 2.0533, "step": 8775 }, { "epoch": 0.96, "grad_norm": 1.0561426119232993, "learning_rate": 8.891092573645955e-08, "loss": 2.0615, "step": 8780 }, { "epoch": 0.96, "grad_norm": 1.0890069718421924, "learning_rate": 8.638523198958415e-08, "loss": 2.0076, "step": 8785 }, { "epoch": 0.96, "grad_norm": 1.071199114908289, "learning_rate": 8.389577512283198e-08, "loss": 2.0648, "step": 8790 }, { "epoch": 0.96, "grad_norm": 1.0820298510287865, "learning_rate": 8.144256423582631e-08, "loss": 2.0193, "step": 8795 }, { "epoch": 0.96, "grad_norm": 1.5070803241716144, "learning_rate": 7.902560829570194e-08, "loss": 2.0976, "step": 8800 }, { "epoch": 0.96, "grad_norm": 1.0928669357324612, "learning_rate": 7.664491613706859e-08, "loss": 2.0135, "step": 8805 }, { "epoch": 0.97, "grad_norm": 1.0044790259389607, "learning_rate": 7.430049646198755e-08, "loss": 2.0304, "step": 8810 }, { "epoch": 0.97, "grad_norm": 1.0831509475995482, "learning_rate": 7.199235783992953e-08, "loss": 2.08, "step": 8815 }, { "epoch": 0.97, "grad_norm": 1.155196443569112, "learning_rate": 6.972050870775238e-08, "loss": 2.0178, "step": 8820 }, { "epoch": 0.97, "grad_norm": 1.0140482490269411, "learning_rate": 6.748495736966454e-08, "loss": 2.0502, "step": 8825 }, { "epoch": 0.97, "grad_norm": 1.3552122855175122, "learning_rate": 6.528571199719502e-08, "loss": 2.0603, "step": 8830 }, { "epoch": 0.97, "grad_norm": 1.074806553206276, "learning_rate": 6.312278062916788e-08, "loss": 2.0693, "step": 8835 }, { "epoch": 0.97, "grad_norm": 1.1278601480344714, "learning_rate": 6.099617117166889e-08, "loss": 2.0342, "step": 8840 }, { "epoch": 0.97, "grad_norm": 1.0940988702740788, "learning_rate": 5.8905891398015614e-08, "loss": 2.0179, "step": 8845 }, { "epoch": 0.97, "grad_norm": 1.0239882580761461, "learning_rate": 5.6851948948734024e-08, "loss": 2.029, "step": 8850 }, { "epoch": 0.97, "grad_norm": 1.1032044850133964, "learning_rate": 5.483435133152526e-08, "loss": 2.0594, "step": 8855 }, { "epoch": 0.97, "grad_norm": 1.1082539010323642, "learning_rate": 5.2853105921242265e-08, "loss": 2.0468, "step": 8860 }, { "epoch": 0.97, "grad_norm": 1.0950084292485398, "learning_rate": 5.090821995986095e-08, "loss": 1.9796, "step": 8865 }, { "epoch": 0.97, "grad_norm": 1.0289756922228421, "learning_rate": 4.899970055645131e-08, "loss": 2.0351, "step": 8870 }, { "epoch": 0.97, "grad_norm": 1.1023969816775974, "learning_rate": 4.712755468715524e-08, "loss": 2.0559, "step": 8875 }, { "epoch": 0.97, "grad_norm": 1.0618885148277757, "learning_rate": 4.5291789195163196e-08, "loss": 2.0659, "step": 8880 }, { "epoch": 0.97, "grad_norm": 1.0260767202718877, "learning_rate": 4.349241079068089e-08, "loss": 2.0325, "step": 8885 }, { "epoch": 0.97, "grad_norm": 1.0681525586755578, "learning_rate": 4.172942605091268e-08, "loss": 2.0896, "step": 8890 }, { "epoch": 0.97, "grad_norm": 1.0669271894840704, "learning_rate": 4.000284142003264e-08, "loss": 2.0364, "step": 8895 }, { "epoch": 0.97, "grad_norm": 1.1371835932050292, "learning_rate": 3.831266320916349e-08, "loss": 2.0233, "step": 8900 }, { "epoch": 0.98, "grad_norm": 1.055916861964122, "learning_rate": 3.665889759635222e-08, "loss": 2.0363, "step": 8905 }, { "epoch": 0.98, "grad_norm": 1.1629478651512861, "learning_rate": 3.5041550626551124e-08, "loss": 2.0763, "step": 8910 }, { "epoch": 0.98, "grad_norm": 1.0611315608166385, "learning_rate": 3.346062821158902e-08, "loss": 2.0082, "step": 8915 }, { "epoch": 0.98, "grad_norm": 1.0932723093312926, "learning_rate": 3.191613613015454e-08, "loss": 2.0291, "step": 8920 }, { "epoch": 0.98, "grad_norm": 1.022423326884814, "learning_rate": 3.0408080027775066e-08, "loss": 2.0379, "step": 8925 }, { "epoch": 0.98, "grad_norm": 1.0508580611198997, "learning_rate": 2.8936465416794514e-08, "loss": 1.9958, "step": 8930 }, { "epoch": 0.98, "grad_norm": 1.1156856767395884, "learning_rate": 2.7501297676350013e-08, "loss": 2.0388, "step": 8935 }, { "epoch": 0.98, "grad_norm": 1.08247701540056, "learning_rate": 2.6102582052361937e-08, "loss": 2.0501, "step": 8940 }, { "epoch": 0.98, "grad_norm": 1.1464987365463157, "learning_rate": 2.4740323657503895e-08, "loss": 2.0438, "step": 8945 }, { "epoch": 0.98, "grad_norm": 1.0735979188370217, "learning_rate": 2.341452747118944e-08, "loss": 1.973, "step": 8950 }, { "epoch": 0.98, "grad_norm": 1.0635374189668498, "learning_rate": 2.21251983395554e-08, "loss": 2.0199, "step": 8955 }, { "epoch": 0.98, "grad_norm": 1.1228559734385861, "learning_rate": 2.0872340975438555e-08, "loss": 2.022, "step": 8960 }, { "epoch": 0.98, "grad_norm": 1.085695489853954, "learning_rate": 1.9655959958364557e-08, "loss": 2.0329, "step": 8965 }, { "epoch": 0.98, "grad_norm": 1.0726258671585331, "learning_rate": 1.847605973452682e-08, "loss": 2.0817, "step": 8970 }, { "epoch": 0.98, "grad_norm": 1.0008401797559021, "learning_rate": 1.7332644616773198e-08, "loss": 2.0138, "step": 8975 }, { "epoch": 0.98, "grad_norm": 1.0213124680903753, "learning_rate": 1.6225718784586008e-08, "loss": 2.049, "step": 8980 }, { "epoch": 0.98, "grad_norm": 1.026092387360889, "learning_rate": 1.5155286284073146e-08, "loss": 2.042, "step": 8985 }, { "epoch": 0.98, "grad_norm": 1.0497253457193707, "learning_rate": 1.4121351027946984e-08, "loss": 1.9693, "step": 8990 }, { "epoch": 0.99, "grad_norm": 1.0454053017901987, "learning_rate": 1.31239167955155e-08, "loss": 2.0103, "step": 8995 }, { "epoch": 0.99, "grad_norm": 1.0337975997855344, "learning_rate": 1.2162987232662294e-08, "loss": 1.9555, "step": 9000 }, { "epoch": 0.99, "grad_norm": 1.050849215880343, "learning_rate": 1.1238565851841021e-08, "loss": 2.0311, "step": 9005 }, { "epoch": 0.99, "grad_norm": 1.0996592945231747, "learning_rate": 1.0350656032053208e-08, "loss": 1.9777, "step": 9010 }, { "epoch": 0.99, "grad_norm": 1.0362940418612334, "learning_rate": 9.499261018846018e-09, "loss": 2.0486, "step": 9015 }, { "epoch": 0.99, "grad_norm": 1.0655396511886823, "learning_rate": 8.684383924291161e-09, "loss": 2.0641, "step": 9020 }, { "epoch": 0.99, "grad_norm": 1.1295664221703239, "learning_rate": 7.906027726981568e-09, "loss": 2.047, "step": 9025 }, { "epoch": 0.99, "grad_norm": 1.0737623803911402, "learning_rate": 7.1641952720136185e-09, "loss": 1.9904, "step": 9030 }, { "epoch": 0.99, "grad_norm": 1.231548458449595, "learning_rate": 6.458889270980484e-09, "loss": 2.0526, "step": 9035 }, { "epoch": 0.99, "grad_norm": 1.0771352268883891, "learning_rate": 5.7901123019632465e-09, "loss": 2.0222, "step": 9040 }, { "epoch": 0.99, "grad_norm": 1.0830251903699233, "learning_rate": 5.157866809516465e-09, "loss": 1.9958, "step": 9045 }, { "epoch": 0.99, "grad_norm": 1.1013012250702732, "learning_rate": 4.562155104665955e-09, "loss": 2.01, "step": 9050 }, { "epoch": 0.99, "grad_norm": 1.017657043201425, "learning_rate": 4.002979364895465e-09, "loss": 2.0381, "step": 9055 }, { "epoch": 0.99, "grad_norm": 1.0875102824181286, "learning_rate": 3.480341634138906e-09, "loss": 2.0245, "step": 9060 }, { "epoch": 0.99, "grad_norm": 1.0739859707038664, "learning_rate": 2.9942438227748004e-09, "loss": 2.0614, "step": 9065 }, { "epoch": 0.99, "grad_norm": 1.0472858874471227, "learning_rate": 2.54468770762073e-09, "loss": 2.0685, "step": 9070 }, { "epoch": 0.99, "grad_norm": 1.0301681090028931, "learning_rate": 2.131674931922234e-09, "loss": 2.0087, "step": 9075 }, { "epoch": 0.99, "grad_norm": 1.043055706471751, "learning_rate": 1.7552070053494796e-09, "loss": 2.1004, "step": 9080 }, { "epoch": 1.0, "grad_norm": 1.052957816215289, "learning_rate": 1.4152853039928194e-09, "loss": 1.9864, "step": 9085 }, { "epoch": 1.0, "grad_norm": 1.065283127980355, "learning_rate": 1.111911070356131e-09, "loss": 2.0063, "step": 9090 }, { "epoch": 1.0, "grad_norm": 1.0742080000302494, "learning_rate": 8.450854133512653e-10, "loss": 1.9976, "step": 9095 }, { "epoch": 1.0, "grad_norm": 1.0185736369490819, "learning_rate": 6.148093082969375e-10, "loss": 2.0287, "step": 9100 }, { "epoch": 1.0, "grad_norm": 1.081781229053032, "learning_rate": 4.210835969142846e-10, "loss": 2.0759, "step": 9105 }, { "epoch": 1.0, "grad_norm": 1.162041908306624, "learning_rate": 2.6390898732020496e-10, "loss": 1.9957, "step": 9110 }, { "epoch": 1.0, "grad_norm": 1.125025100973935, "learning_rate": 1.4328605403068906e-10, "loss": 2.0975, "step": 9115 }, { "epoch": 1.0, "grad_norm": 1.0639351258385779, "learning_rate": 5.921523795304752e-11, "loss": 2.0413, "step": 9120 }, { "epoch": 1.0, "grad_norm": 1.0762515955225003, "learning_rate": 1.1696846390352178e-11, "loss": 2.075, "step": 9125 }, { "epoch": 1.0, "eval_loss": 2.0463554859161377, "eval_runtime": 26.7014, "eval_samples_per_second": 1210.725, "eval_steps_per_second": 37.863, "step": 9129 }, { "epoch": 1.0, "step": 9129, "total_flos": 6892164218880.0, "train_loss": 2.1077401449730213, "train_runtime": 1063.9239, "train_samples_per_second": 274.555, "train_steps_per_second": 8.581 } ], "logging_steps": 5, "max_steps": 9129, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 6892164218880.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }