ArabianGPT-0.8B-FT-QA / trainer_state.json
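The JSON below is the raw trainer_state.json that the Hugging Face transformers Trainer writes alongside a checkpoint: top-level fields for the best metric/checkpoint, current epoch, eval_steps, and global_step, followed by a log_history list with one entry per logging interval (here every 500 steps) carrying "epoch", "grad_norm", "learning_rate", "loss", and "step". As a minimal sketch (assuming the file has been downloaded locally as trainer_state.json and matplotlib is installed), the log can be loaded and the training loss plotted like this:

# Minimal sketch: load the Trainer state and plot training loss vs. global step.
# Assumes trainer_state.json is in the working directory and matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json", encoding="utf-8") as f:
    state = json.load(f)

# Keep only the training-log entries; evaluation entries (if present) use "eval_*" keys.
logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("ArabianGPT-0.8B-FT-QA fine-tuning loss")
plt.show()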
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9999981318415465,
"eval_steps": 500,
"global_step": 535286,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018681584534637058,
"grad_norm": 7.906698703765869,
"learning_rate": 9.999978643682902e-05,
"loss": 3.2962,
"step": 500
},
{
"epoch": 0.0037363169069274116,
"grad_norm": 8.97018051147461,
"learning_rate": 9.99991423149794e-05,
"loss": 3.1699,
"step": 1000
},
{
"epoch": 0.005604475360391117,
"grad_norm": 15.646541595458984,
"learning_rate": 9.999806763655335e-05,
"loss": 3.0952,
"step": 1500
},
{
"epoch": 0.007472633813854823,
"grad_norm": 7.570132732391357,
"learning_rate": 9.999656241080522e-05,
"loss": 3.0763,
"step": 2000
},
{
"epoch": 0.009340792267318529,
"grad_norm": 5.853657245635986,
"learning_rate": 9.999462665069693e-05,
"loss": 3.0355,
"step": 2500
},
{
"epoch": 0.011208950720782235,
"grad_norm": 6.587205410003662,
"learning_rate": 9.999226553509718e-05,
"loss": 3.0291,
"step": 3000
},
{
"epoch": 0.01307710917424594,
"grad_norm": 6.537953853607178,
"learning_rate": 9.998946962095583e-05,
"loss": 2.9902,
"step": 3500
},
{
"epoch": 0.014945267627709646,
"grad_norm": 5.199572563171387,
"learning_rate": 9.998624323353232e-05,
"loss": 2.9598,
"step": 4000
},
{
"epoch": 0.016813426081173352,
"grad_norm": 6.676388263702393,
"learning_rate": 9.998258640060996e-05,
"loss": 2.9677,
"step": 4500
},
{
"epoch": 0.018681584534637058,
"grad_norm": 5.827391147613525,
"learning_rate": 9.997849915367876e-05,
"loss": 2.9525,
"step": 5000
},
{
"epoch": 0.020549742988100764,
"grad_norm": 7.3195953369140625,
"learning_rate": 9.997398152793517e-05,
"loss": 2.8857,
"step": 5500
},
{
"epoch": 0.02241790144156447,
"grad_norm": 5.648918151855469,
"learning_rate": 9.99690335622817e-05,
"loss": 2.8672,
"step": 6000
},
{
"epoch": 0.024286059895028175,
"grad_norm": 5.5112528800964355,
"learning_rate": 9.996366648525912e-05,
"loss": 2.9211,
"step": 6500
},
{
"epoch": 0.02615421834849188,
"grad_norm": 5.33510684967041,
"learning_rate": 9.995785883176955e-05,
"loss": 2.8746,
"step": 7000
},
{
"epoch": 0.028022376801955587,
"grad_norm": 4.034855365753174,
"learning_rate": 9.995162097720716e-05,
"loss": 2.8976,
"step": 7500
},
{
"epoch": 0.029890535255419293,
"grad_norm": 6.461133003234863,
"learning_rate": 9.994495297528784e-05,
"loss": 2.8834,
"step": 8000
},
{
"epoch": 0.031758693708883,
"grad_norm": 5.551321029663086,
"learning_rate": 9.993785488343162e-05,
"loss": 2.7976,
"step": 8500
},
{
"epoch": 0.033626852162346704,
"grad_norm": 4.432355880737305,
"learning_rate": 9.993032676276217e-05,
"loss": 2.8252,
"step": 9000
},
{
"epoch": 0.03549501061581041,
"grad_norm": 4.1577839851379395,
"learning_rate": 9.99223686781062e-05,
"loss": 2.8277,
"step": 9500
},
{
"epoch": 0.037363169069274116,
"grad_norm": 4.840776443481445,
"learning_rate": 9.991398069799303e-05,
"loss": 2.8151,
"step": 10000
},
{
"epoch": 0.03923132752273782,
"grad_norm": 5.0539093017578125,
"learning_rate": 9.99051628946539e-05,
"loss": 2.8342,
"step": 10500
},
{
"epoch": 0.04109948597620153,
"grad_norm": 4.669162750244141,
"learning_rate": 9.989593426795811e-05,
"loss": 2.8473,
"step": 11000
},
{
"epoch": 0.04296764442966523,
"grad_norm": 6.490626811981201,
"learning_rate": 9.98862579089188e-05,
"loss": 2.7918,
"step": 11500
},
{
"epoch": 0.04483580288312894,
"grad_norm": 5.523123741149902,
"learning_rate": 9.98761519653822e-05,
"loss": 2.8058,
"step": 12000
},
{
"epoch": 0.046703961336592645,
"grad_norm": 4.8026227951049805,
"learning_rate": 9.98656165243734e-05,
"loss": 2.76,
"step": 12500
},
{
"epoch": 0.04857211979005635,
"grad_norm": 4.3018083572387695,
"learning_rate": 9.985467403479736e-05,
"loss": 2.7533,
"step": 13000
},
{
"epoch": 0.050440278243520056,
"grad_norm": 3.7984395027160645,
"learning_rate": 9.984330394823319e-05,
"loss": 2.7928,
"step": 13500
},
{
"epoch": 0.05230843669698376,
"grad_norm": 3.971073627471924,
"learning_rate": 9.983148229059621e-05,
"loss": 2.7542,
"step": 14000
},
{
"epoch": 0.05417659515044747,
"grad_norm": 4.415925979614258,
"learning_rate": 9.98192315201501e-05,
"loss": 2.7767,
"step": 14500
},
{
"epoch": 0.056044753603911174,
"grad_norm": 4.695183277130127,
"learning_rate": 9.980655174238964e-05,
"loss": 2.7724,
"step": 15000
},
{
"epoch": 0.05791291205737488,
"grad_norm": 5.4851484298706055,
"learning_rate": 9.979344306650395e-05,
"loss": 2.7768,
"step": 15500
},
{
"epoch": 0.059781070510838585,
"grad_norm": 4.120709419250488,
"learning_rate": 9.977990560537549e-05,
"loss": 2.7775,
"step": 16000
},
{
"epoch": 0.06164922896430229,
"grad_norm": 3.63053560256958,
"learning_rate": 9.976593947557912e-05,
"loss": 2.7329,
"step": 16500
},
{
"epoch": 0.063517387417766,
"grad_norm": 4.178781509399414,
"learning_rate": 9.97515447973811e-05,
"loss": 2.7428,
"step": 17000
},
{
"epoch": 0.0653855458712297,
"grad_norm": 3.8429136276245117,
"learning_rate": 9.973675176842667e-05,
"loss": 2.7136,
"step": 17500
},
{
"epoch": 0.06725370432469341,
"grad_norm": 3.6935720443725586,
"learning_rate": 9.972150122544814e-05,
"loss": 2.6918,
"step": 18000
},
{
"epoch": 0.06912186277815711,
"grad_norm": 4.678779125213623,
"learning_rate": 9.970582251673812e-05,
"loss": 2.686,
"step": 18500
},
{
"epoch": 0.07099002123162082,
"grad_norm": 5.219886779785156,
"learning_rate": 9.968971577731036e-05,
"loss": 2.7664,
"step": 19000
},
{
"epoch": 0.07285817968508453,
"grad_norm": 3.985466241836548,
"learning_rate": 9.967318114586451e-05,
"loss": 2.7409,
"step": 19500
},
{
"epoch": 0.07472633813854823,
"grad_norm": 5.018237590789795,
"learning_rate": 9.965621876478483e-05,
"loss": 2.7278,
"step": 20000
},
{
"epoch": 0.07659449659201194,
"grad_norm": 4.305635452270508,
"learning_rate": 9.963882878013921e-05,
"loss": 2.7453,
"step": 20500
},
{
"epoch": 0.07846265504547564,
"grad_norm": 3.6431195735931396,
"learning_rate": 9.962101134167761e-05,
"loss": 2.6693,
"step": 21000
},
{
"epoch": 0.08033081349893935,
"grad_norm": 3.750077962875366,
"learning_rate": 9.960280351865064e-05,
"loss": 2.7108,
"step": 21500
},
{
"epoch": 0.08219897195240305,
"grad_norm": 3.730613946914673,
"learning_rate": 9.95841324906568e-05,
"loss": 2.6607,
"step": 22000
},
{
"epoch": 0.08406713040586676,
"grad_norm": 4.009971618652344,
"learning_rate": 9.956503447985205e-05,
"loss": 2.7232,
"step": 22500
},
{
"epoch": 0.08593528885933047,
"grad_norm": 3.1298115253448486,
"learning_rate": 9.954550965069465e-05,
"loss": 2.6655,
"step": 23000
},
{
"epoch": 0.08780344731279417,
"grad_norm": 3.9897303581237793,
"learning_rate": 9.952555817131835e-05,
"loss": 2.6755,
"step": 23500
},
{
"epoch": 0.08967160576625788,
"grad_norm": 5.565286636352539,
"learning_rate": 9.950522139495593e-05,
"loss": 2.6854,
"step": 24000
},
{
"epoch": 0.09153976421972158,
"grad_norm": 3.5274269580841064,
"learning_rate": 9.948441798666596e-05,
"loss": 2.6821,
"step": 24500
},
{
"epoch": 0.09340792267318529,
"grad_norm": 4.026999473571777,
"learning_rate": 9.946323133845033e-05,
"loss": 2.6389,
"step": 25000
},
{
"epoch": 0.095276081126649,
"grad_norm": 4.1627326011657715,
"learning_rate": 9.944157671638854e-05,
"loss": 2.6786,
"step": 25500
},
{
"epoch": 0.0971442395801127,
"grad_norm": 3.341585159301758,
"learning_rate": 9.94194963391034e-05,
"loss": 2.6419,
"step": 26000
},
{
"epoch": 0.09901239803357641,
"grad_norm": 3.5735983848571777,
"learning_rate": 9.939699039673516e-05,
"loss": 2.652,
"step": 26500
},
{
"epoch": 0.10088055648704011,
"grad_norm": 3.736764669418335,
"learning_rate": 9.937405908308882e-05,
"loss": 2.701,
"step": 27000
},
{
"epoch": 0.10274871494050382,
"grad_norm": 3.172218084335327,
"learning_rate": 9.935070259563231e-05,
"loss": 2.6086,
"step": 27500
},
{
"epoch": 0.10461687339396752,
"grad_norm": 3.945516347885132,
"learning_rate": 9.932692113549484e-05,
"loss": 2.6714,
"step": 28000
},
{
"epoch": 0.10648503184743123,
"grad_norm": 2.7730209827423096,
"learning_rate": 9.930271490746525e-05,
"loss": 2.6346,
"step": 28500
},
{
"epoch": 0.10835319030089494,
"grad_norm": 3.7872776985168457,
"learning_rate": 9.92780841199901e-05,
"loss": 2.6376,
"step": 29000
},
{
"epoch": 0.11022134875435864,
"grad_norm": 3.9411559104919434,
"learning_rate": 9.925302898517198e-05,
"loss": 2.6674,
"step": 29500
},
{
"epoch": 0.11208950720782235,
"grad_norm": 4.368437767028809,
"learning_rate": 9.922760110043857e-05,
"loss": 2.6232,
"step": 30000
},
{
"epoch": 0.11395766566128605,
"grad_norm": 4.385318279266357,
"learning_rate": 9.920169876946009e-05,
"loss": 2.595,
"step": 30500
},
{
"epoch": 0.11582582411474976,
"grad_norm": 3.4636647701263428,
"learning_rate": 9.917537274891421e-05,
"loss": 2.6073,
"step": 31000
},
{
"epoch": 0.11769398256821346,
"grad_norm": 2.474412202835083,
"learning_rate": 9.914862326550168e-05,
"loss": 2.655,
"step": 31500
},
{
"epoch": 0.11956214102167717,
"grad_norm": 3.5162529945373535,
"learning_rate": 9.912145054956974e-05,
"loss": 2.6259,
"step": 32000
},
{
"epoch": 0.12143029947514088,
"grad_norm": 3.149369716644287,
"learning_rate": 9.909385483511026e-05,
"loss": 2.6045,
"step": 32500
},
{
"epoch": 0.12329845792860458,
"grad_norm": 3.873689651489258,
"learning_rate": 9.906583635975763e-05,
"loss": 2.6476,
"step": 33000
},
{
"epoch": 0.1251666163820683,
"grad_norm": 4.371992588043213,
"learning_rate": 9.90374526682891e-05,
"loss": 2.6149,
"step": 33500
},
{
"epoch": 0.127034774835532,
"grad_norm": 4.554148197174072,
"learning_rate": 9.900859024291592e-05,
"loss": 2.6146,
"step": 34000
},
{
"epoch": 0.1289029332889957,
"grad_norm": 4.277965545654297,
"learning_rate": 9.897930579088681e-05,
"loss": 2.5902,
"step": 34500
},
{
"epoch": 0.1307710917424594,
"grad_norm": 4.317843914031982,
"learning_rate": 9.894959956437835e-05,
"loss": 2.6276,
"step": 35000
},
{
"epoch": 0.13263925019592313,
"grad_norm": 3.6088337898254395,
"learning_rate": 9.891953249519332e-05,
"loss": 2.5647,
"step": 35500
},
{
"epoch": 0.13450740864938682,
"grad_norm": 2.6994011402130127,
"learning_rate": 9.888898433303897e-05,
"loss": 2.6306,
"step": 36000
},
{
"epoch": 0.13637556710285054,
"grad_norm": 3.670053005218506,
"learning_rate": 9.885801517418857e-05,
"loss": 2.6103,
"step": 36500
},
{
"epoch": 0.13824372555631423,
"grad_norm": 3.3493151664733887,
"learning_rate": 9.882662528532621e-05,
"loss": 2.5293,
"step": 37000
},
{
"epoch": 0.14011188400977795,
"grad_norm": 4.308838844299316,
"learning_rate": 9.879481493675895e-05,
"loss": 2.5701,
"step": 37500
},
{
"epoch": 0.14198004246324164,
"grad_norm": 3.550856828689575,
"learning_rate": 9.876258440241463e-05,
"loss": 2.5949,
"step": 38000
},
{
"epoch": 0.14384820091670536,
"grad_norm": 3.9775218963623047,
"learning_rate": 9.872999967960666e-05,
"loss": 2.5844,
"step": 38500
},
{
"epoch": 0.14571635937016905,
"grad_norm": 3.936997413635254,
"learning_rate": 9.869693044893364e-05,
"loss": 2.5558,
"step": 39000
},
{
"epoch": 0.14758451782363277,
"grad_norm": 4.209615707397461,
"learning_rate": 9.866344187539423e-05,
"loss": 2.5605,
"step": 39500
},
{
"epoch": 0.14945267627709646,
"grad_norm": 4.603176116943359,
"learning_rate": 9.862960248064681e-05,
"loss": 2.6045,
"step": 40000
},
{
"epoch": 0.15132083473056018,
"grad_norm": 3.0863678455352783,
"learning_rate": 9.859527692735271e-05,
"loss": 2.5638,
"step": 40500
},
{
"epoch": 0.15318899318402387,
"grad_norm": 3.8357596397399902,
"learning_rate": 9.856053290655904e-05,
"loss": 2.5569,
"step": 41000
},
{
"epoch": 0.1550571516374876,
"grad_norm": 3.3822269439697266,
"learning_rate": 9.85253707174563e-05,
"loss": 2.5459,
"step": 41500
},
{
"epoch": 0.1569253100909513,
"grad_norm": 4.058901309967041,
"learning_rate": 9.848979066283589e-05,
"loss": 2.6128,
"step": 42000
},
{
"epoch": 0.158793468544415,
"grad_norm": 4.78932523727417,
"learning_rate": 9.84537930490876e-05,
"loss": 2.5862,
"step": 42500
},
{
"epoch": 0.1606616269978787,
"grad_norm": 3.3654229640960693,
"learning_rate": 9.841737818619692e-05,
"loss": 2.5509,
"step": 43000
},
{
"epoch": 0.16252978545134242,
"grad_norm": 3.9686570167541504,
"learning_rate": 9.838054638774244e-05,
"loss": 2.5089,
"step": 43500
},
{
"epoch": 0.1643979439048061,
"grad_norm": 2.973649740219116,
"learning_rate": 9.834329797089303e-05,
"loss": 2.5321,
"step": 44000
},
{
"epoch": 0.16626610235826983,
"grad_norm": 2.5326201915740967,
"learning_rate": 9.83056332564052e-05,
"loss": 2.5408,
"step": 44500
},
{
"epoch": 0.16813426081173352,
"grad_norm": 3.884883165359497,
"learning_rate": 9.826762914491992e-05,
"loss": 2.5352,
"step": 45000
},
{
"epoch": 0.17000241926519724,
"grad_norm": 3.9567508697509766,
"learning_rate": 9.822913364272259e-05,
"loss": 2.5619,
"step": 45500
},
{
"epoch": 0.17187057771866093,
"grad_norm": 3.041057825088501,
"learning_rate": 9.819022282598776e-05,
"loss": 2.555,
"step": 46000
},
{
"epoch": 0.17373873617212465,
"grad_norm": 3.1877288818359375,
"learning_rate": 9.815089702978735e-05,
"loss": 2.5458,
"step": 46500
},
{
"epoch": 0.17560689462558834,
"grad_norm": 3.142703056335449,
"learning_rate": 9.811115659276677e-05,
"loss": 2.5607,
"step": 47000
},
{
"epoch": 0.17747505307905206,
"grad_norm": 3.609555959701538,
"learning_rate": 9.807100185714202e-05,
"loss": 2.5683,
"step": 47500
},
{
"epoch": 0.17934321153251576,
"grad_norm": 3.200345277786255,
"learning_rate": 9.803051471896693e-05,
"loss": 2.5496,
"step": 48000
},
{
"epoch": 0.18121136998597948,
"grad_norm": 3.56850266456604,
"learning_rate": 9.798953325390536e-05,
"loss": 2.5425,
"step": 48500
},
{
"epoch": 0.18307952843944317,
"grad_norm": 3.4314849376678467,
"learning_rate": 9.794813853757214e-05,
"loss": 2.5238,
"step": 49000
},
{
"epoch": 0.1849476868929069,
"grad_norm": 3.024343967437744,
"learning_rate": 9.790633092642875e-05,
"loss": 2.5786,
"step": 49500
},
{
"epoch": 0.18681584534637058,
"grad_norm": 3.2595534324645996,
"learning_rate": 9.786419563225273e-05,
"loss": 2.5386,
"step": 50000
},
{
"epoch": 0.1886840037998343,
"grad_norm": 3.6985089778900146,
"learning_rate": 9.782156413906974e-05,
"loss": 2.5338,
"step": 50500
},
{
"epoch": 0.190552162253298,
"grad_norm": 2.9342880249023438,
"learning_rate": 9.777852084104404e-05,
"loss": 2.4992,
"step": 51000
},
{
"epoch": 0.1924203207067617,
"grad_norm": 2.8690543174743652,
"learning_rate": 9.773506610883352e-05,
"loss": 2.571,
"step": 51500
},
{
"epoch": 0.1942884791602254,
"grad_norm": 2.8353734016418457,
"learning_rate": 9.769120031663902e-05,
"loss": 2.4895,
"step": 52000
},
{
"epoch": 0.19615663761368912,
"grad_norm": 3.6773738861083984,
"learning_rate": 9.764692384220111e-05,
"loss": 2.5121,
"step": 52500
},
{
"epoch": 0.19802479606715281,
"grad_norm": 3.3569443225860596,
"learning_rate": 9.760223706679688e-05,
"loss": 2.527,
"step": 53000
},
{
"epoch": 0.19989295452061653,
"grad_norm": 2.970712184906006,
"learning_rate": 9.755714037523662e-05,
"loss": 2.5337,
"step": 53500
},
{
"epoch": 0.20176111297408023,
"grad_norm": 3.2004318237304688,
"learning_rate": 9.751172557674817e-05,
"loss": 2.5342,
"step": 54000
},
{
"epoch": 0.20362927142754395,
"grad_norm": 3.16782546043396,
"learning_rate": 9.746581103930153e-05,
"loss": 2.524,
"step": 54500
},
{
"epoch": 0.20549742988100764,
"grad_norm": 3.3260490894317627,
"learning_rate": 9.741948776050147e-05,
"loss": 2.4701,
"step": 55000
},
{
"epoch": 0.20736558833447136,
"grad_norm": 3.6631577014923096,
"learning_rate": 9.737275613925072e-05,
"loss": 2.5314,
"step": 55500
},
{
"epoch": 0.20923374678793505,
"grad_norm": 2.5733258724212646,
"learning_rate": 9.732561657796828e-05,
"loss": 2.5362,
"step": 56000
},
{
"epoch": 0.21110190524139877,
"grad_norm": 3.8227956295013428,
"learning_rate": 9.727816498322433e-05,
"loss": 2.4807,
"step": 56500
},
{
"epoch": 0.21297006369486246,
"grad_norm": 3.5182738304138184,
"learning_rate": 9.723021157702207e-05,
"loss": 2.5263,
"step": 57000
},
{
"epoch": 0.21483822214832618,
"grad_norm": 3.405224084854126,
"learning_rate": 9.71818514582792e-05,
"loss": 2.5105,
"step": 57500
},
{
"epoch": 0.21670638060178987,
"grad_norm": 2.988802671432495,
"learning_rate": 9.713308504343815e-05,
"loss": 2.5297,
"step": 58000
},
{
"epoch": 0.2185745390552536,
"grad_norm": 2.3862366676330566,
"learning_rate": 9.708391275244016e-05,
"loss": 2.5006,
"step": 58500
},
{
"epoch": 0.22044269750871728,
"grad_norm": 3.3643691539764404,
"learning_rate": 9.703433500872156e-05,
"loss": 2.5255,
"step": 59000
},
{
"epoch": 0.222310855962181,
"grad_norm": 3.6664035320281982,
"learning_rate": 9.698435223921016e-05,
"loss": 2.4421,
"step": 59500
},
{
"epoch": 0.2241790144156447,
"grad_norm": 3.3508718013763428,
"learning_rate": 9.693396487432153e-05,
"loss": 2.4893,
"step": 60000
},
{
"epoch": 0.22604717286910841,
"grad_norm": 3.5202372074127197,
"learning_rate": 9.688337731857194e-05,
"loss": 2.505,
"step": 60500
},
{
"epoch": 0.2279153313225721,
"grad_norm": 4.265177249908447,
"learning_rate": 9.683218368212872e-05,
"loss": 2.5134,
"step": 61000
},
{
"epoch": 0.22978348977603583,
"grad_norm": 3.761479377746582,
"learning_rate": 9.67805867606742e-05,
"loss": 2.477,
"step": 61500
},
{
"epoch": 0.23165164822949952,
"grad_norm": 3.254711866378784,
"learning_rate": 9.67285869985239e-05,
"loss": 2.4894,
"step": 62000
},
{
"epoch": 0.23351980668296324,
"grad_norm": 3.4447569847106934,
"learning_rate": 9.667629004906115e-05,
"loss": 2.5338,
"step": 62500
},
{
"epoch": 0.23538796513642693,
"grad_norm": 3.283677577972412,
"learning_rate": 9.662348675576849e-05,
"loss": 2.5028,
"step": 63000
},
{
"epoch": 0.23725612358989065,
"grad_norm": 3.641008138656616,
"learning_rate": 9.657028197461201e-05,
"loss": 2.5102,
"step": 63500
},
{
"epoch": 0.23912428204335434,
"grad_norm": 2.3239517211914062,
"learning_rate": 9.651667616375301e-05,
"loss": 2.4692,
"step": 64000
},
{
"epoch": 0.24099244049681806,
"grad_norm": 2.590287446975708,
"learning_rate": 9.646266978480605e-05,
"loss": 2.4753,
"step": 64500
},
{
"epoch": 0.24286059895028175,
"grad_norm": 3.5106756687164307,
"learning_rate": 9.640826330283514e-05,
"loss": 2.4541,
"step": 65000
},
{
"epoch": 0.24472875740374547,
"grad_norm": 2.9911463260650635,
"learning_rate": 9.635345718634972e-05,
"loss": 2.5228,
"step": 65500
},
{
"epoch": 0.24659691585720916,
"grad_norm": 3.7811479568481445,
"learning_rate": 9.629825190730053e-05,
"loss": 2.468,
"step": 66000
},
{
"epoch": 0.24846507431067288,
"grad_norm": 3.073608875274658,
"learning_rate": 9.624275954658023e-05,
"loss": 2.5416,
"step": 66500
},
{
"epoch": 0.2503332327641366,
"grad_norm": 2.943208932876587,
"learning_rate": 9.618675816793752e-05,
"loss": 2.4685,
"step": 67000
},
{
"epoch": 0.25220139121760027,
"grad_norm": 2.2683610916137695,
"learning_rate": 9.613047225704368e-05,
"loss": 2.4953,
"step": 67500
},
{
"epoch": 0.254069549671064,
"grad_norm": 3.0341203212738037,
"learning_rate": 9.607367670392133e-05,
"loss": 2.4601,
"step": 68000
},
{
"epoch": 0.2559377081245277,
"grad_norm": 3.2594239711761475,
"learning_rate": 9.60164843975031e-05,
"loss": 2.4339,
"step": 68500
},
{
"epoch": 0.2578058665779914,
"grad_norm": 3.045818328857422,
"learning_rate": 9.595889583028791e-05,
"loss": 2.4237,
"step": 69000
},
{
"epoch": 0.2596740250314551,
"grad_norm": 3.0980165004730225,
"learning_rate": 9.590091149818697e-05,
"loss": 2.5111,
"step": 69500
},
{
"epoch": 0.2615421834849188,
"grad_norm": 2.206389904022217,
"learning_rate": 9.584253190051957e-05,
"loss": 2.4885,
"step": 70000
},
{
"epoch": 0.26341034193838253,
"grad_norm": 3.909090518951416,
"learning_rate": 9.578387548236723e-05,
"loss": 2.4945,
"step": 70500
},
{
"epoch": 0.26527850039184625,
"grad_norm": 3.3355019092559814,
"learning_rate": 9.572470765314143e-05,
"loss": 2.4225,
"step": 71000
},
{
"epoch": 0.2671466588453099,
"grad_norm": 2.9104554653167725,
"learning_rate": 9.56651460756897e-05,
"loss": 2.4666,
"step": 71500
},
{
"epoch": 0.26901481729877363,
"grad_norm": 2.195571184158325,
"learning_rate": 9.560519126291337e-05,
"loss": 2.4738,
"step": 72000
},
{
"epoch": 0.27088297575223735,
"grad_norm": 2.8600668907165527,
"learning_rate": 9.554484373110011e-05,
"loss": 2.3982,
"step": 72500
},
{
"epoch": 0.2727511342057011,
"grad_norm": 2.985612630844116,
"learning_rate": 9.54842258704496e-05,
"loss": 2.4708,
"step": 73000
},
{
"epoch": 0.27461929265916474,
"grad_norm": 2.609339475631714,
"learning_rate": 9.542309524577655e-05,
"loss": 2.4385,
"step": 73500
},
{
"epoch": 0.27648745111262846,
"grad_norm": 2.9328203201293945,
"learning_rate": 9.536157347014623e-05,
"loss": 2.3942,
"step": 74000
},
{
"epoch": 0.2783556095660922,
"grad_norm": 3.242722511291504,
"learning_rate": 9.529966107333978e-05,
"loss": 2.4568,
"step": 74500
},
{
"epoch": 0.2802237680195559,
"grad_norm": 2.90252423286438,
"learning_rate": 9.523735858850218e-05,
"loss": 2.4495,
"step": 75000
},
{
"epoch": 0.2820919264730196,
"grad_norm": 2.491132974624634,
"learning_rate": 9.517466655213752e-05,
"loss": 2.4401,
"step": 75500
},
{
"epoch": 0.2839600849264833,
"grad_norm": 2.714989185333252,
"learning_rate": 9.511171205407364e-05,
"loss": 2.4607,
"step": 76000
},
{
"epoch": 0.285828243379947,
"grad_norm": 3.1541576385498047,
"learning_rate": 9.50482433139732e-05,
"loss": 2.4522,
"step": 76500
},
{
"epoch": 0.2876964018334107,
"grad_norm": 3.280564546585083,
"learning_rate": 9.498438665087013e-05,
"loss": 2.4696,
"step": 77000
},
{
"epoch": 0.28956456028687444,
"grad_norm": 3.0421793460845947,
"learning_rate": 9.492014261465201e-05,
"loss": 2.482,
"step": 77500
},
{
"epoch": 0.2914327187403381,
"grad_norm": 2.658756971359253,
"learning_rate": 9.485551175854214e-05,
"loss": 2.4464,
"step": 78000
},
{
"epoch": 0.2933008771938018,
"grad_norm": 4.537105083465576,
"learning_rate": 9.479049463909488e-05,
"loss": 2.444,
"step": 78500
},
{
"epoch": 0.29516903564726554,
"grad_norm": 2.9097115993499756,
"learning_rate": 9.472509181619083e-05,
"loss": 2.4631,
"step": 79000
},
{
"epoch": 0.29703719410072926,
"grad_norm": 2.133843421936035,
"learning_rate": 9.465943581295223e-05,
"loss": 2.4159,
"step": 79500
},
{
"epoch": 0.2989053525541929,
"grad_norm": 2.5699055194854736,
"learning_rate": 9.459326404463687e-05,
"loss": 2.4392,
"step": 80000
},
{
"epoch": 0.30077351100765665,
"grad_norm": 2.927656412124634,
"learning_rate": 9.452684176567582e-05,
"loss": 2.4121,
"step": 80500
},
{
"epoch": 0.30264166946112037,
"grad_norm": 3.3542892932891846,
"learning_rate": 9.44599033266823e-05,
"loss": 2.4138,
"step": 81000
},
{
"epoch": 0.3045098279145841,
"grad_norm": 2.9518256187438965,
"learning_rate": 9.439258203104611e-05,
"loss": 2.4193,
"step": 81500
},
{
"epoch": 0.30637798636804775,
"grad_norm": 2.9476184844970703,
"learning_rate": 9.432487845848965e-05,
"loss": 2.3944,
"step": 82000
},
{
"epoch": 0.30824614482151147,
"grad_norm": 2.688512086868286,
"learning_rate": 9.425679319202733e-05,
"loss": 2.4331,
"step": 82500
},
{
"epoch": 0.3101143032749752,
"grad_norm": 2.971700429916382,
"learning_rate": 9.418832681796042e-05,
"loss": 2.4513,
"step": 83000
},
{
"epoch": 0.3119824617284389,
"grad_norm": 2.495612382888794,
"learning_rate": 9.411947992587194e-05,
"loss": 2.3972,
"step": 83500
},
{
"epoch": 0.3138506201819026,
"grad_norm": 3.071038246154785,
"learning_rate": 9.405025310862172e-05,
"loss": 2.4309,
"step": 84000
},
{
"epoch": 0.3157187786353663,
"grad_norm": 3.627650260925293,
"learning_rate": 9.398064696234121e-05,
"loss": 2.4297,
"step": 84500
},
{
"epoch": 0.31758693708883,
"grad_norm": 2.077777147293091,
"learning_rate": 9.391066208642838e-05,
"loss": 2.4245,
"step": 85000
},
{
"epoch": 0.31945509554229373,
"grad_norm": 3.0603654384613037,
"learning_rate": 9.384044018651683e-05,
"loss": 2.4145,
"step": 85500
},
{
"epoch": 0.3213232539957574,
"grad_norm": 2.993283271789551,
"learning_rate": 9.37697004170087e-05,
"loss": 2.4095,
"step": 86000
},
{
"epoch": 0.3231914124492211,
"grad_norm": 2.8521878719329834,
"learning_rate": 9.369858373438785e-05,
"loss": 2.3967,
"step": 86500
},
{
"epoch": 0.32505957090268484,
"grad_norm": 3.297847032546997,
"learning_rate": 9.362709075105988e-05,
"loss": 2.4343,
"step": 87000
},
{
"epoch": 0.32692772935614856,
"grad_norm": 2.3240292072296143,
"learning_rate": 9.355522208267086e-05,
"loss": 2.3947,
"step": 87500
},
{
"epoch": 0.3287958878096122,
"grad_norm": 3.8041253089904785,
"learning_rate": 9.348297834810195e-05,
"loss": 2.4111,
"step": 88000
},
{
"epoch": 0.33066404626307594,
"grad_norm": 2.6961183547973633,
"learning_rate": 9.341036016946413e-05,
"loss": 2.4159,
"step": 88500
},
{
"epoch": 0.33253220471653966,
"grad_norm": 3.0299246311187744,
"learning_rate": 9.33373681720928e-05,
"loss": 2.4012,
"step": 89000
},
{
"epoch": 0.3344003631700034,
"grad_norm": 2.75026273727417,
"learning_rate": 9.326415008694199e-05,
"loss": 2.3755,
"step": 89500
},
{
"epoch": 0.33626852162346704,
"grad_norm": 2.4696195125579834,
"learning_rate": 9.319056093086089e-05,
"loss": 2.3953,
"step": 90000
},
{
"epoch": 0.33813668007693076,
"grad_norm": 2.428610324859619,
"learning_rate": 9.311645274788967e-05,
"loss": 2.4433,
"step": 90500
},
{
"epoch": 0.3400048385303945,
"grad_norm": 2.851217269897461,
"learning_rate": 9.304197327710381e-05,
"loss": 2.429,
"step": 91000
},
{
"epoch": 0.3418729969838582,
"grad_norm": 3.0488922595977783,
"learning_rate": 9.296712315986686e-05,
"loss": 2.417,
"step": 91500
},
{
"epoch": 0.34374115543732187,
"grad_norm": 2.7306880950927734,
"learning_rate": 9.289190304073406e-05,
"loss": 2.4539,
"step": 92000
},
{
"epoch": 0.3456093138907856,
"grad_norm": 3.2483866214752197,
"learning_rate": 9.281631356744687e-05,
"loss": 2.3616,
"step": 92500
},
{
"epoch": 0.3474774723442493,
"grad_norm": 2.66874098777771,
"learning_rate": 9.274035539092736e-05,
"loss": 2.3984,
"step": 93000
},
{
"epoch": 0.349345630797713,
"grad_norm": 2.5911643505096436,
"learning_rate": 9.266402916527259e-05,
"loss": 2.4403,
"step": 93500
},
{
"epoch": 0.3512137892511767,
"grad_norm": 3.084787607192993,
"learning_rate": 9.258748930120269e-05,
"loss": 2.3685,
"step": 94000
},
{
"epoch": 0.3530819477046404,
"grad_norm": 3.077162742614746,
"learning_rate": 9.251042968504211e-05,
"loss": 2.4033,
"step": 94500
},
{
"epoch": 0.35495010615810413,
"grad_norm": 2.7327165603637695,
"learning_rate": 9.243300399970075e-05,
"loss": 2.357,
"step": 95000
},
{
"epoch": 0.35681826461156785,
"grad_norm": 2.942444324493408,
"learning_rate": 9.235521291191276e-05,
"loss": 2.4114,
"step": 95500
},
{
"epoch": 0.3586864230650315,
"grad_norm": 2.504429817199707,
"learning_rate": 9.227705709155896e-05,
"loss": 2.3763,
"step": 96000
},
{
"epoch": 0.36055458151849523,
"grad_norm": 3.322981119155884,
"learning_rate": 9.219853721166094e-05,
"loss": 2.4037,
"step": 96500
},
{
"epoch": 0.36242273997195895,
"grad_norm": 2.8509936332702637,
"learning_rate": 9.21196539483753e-05,
"loss": 2.4089,
"step": 97000
},
{
"epoch": 0.36429089842542267,
"grad_norm": 3.585662603378296,
"learning_rate": 9.204040798098783e-05,
"loss": 2.4132,
"step": 97500
},
{
"epoch": 0.36615905687888634,
"grad_norm": 2.8213889598846436,
"learning_rate": 9.196095956872841e-05,
"loss": 2.3647,
"step": 98000
},
{
"epoch": 0.36802721533235006,
"grad_norm": 3.3626108169555664,
"learning_rate": 9.188099096546838e-05,
"loss": 2.4143,
"step": 98500
},
{
"epoch": 0.3698953737858138,
"grad_norm": 2.993591785430908,
"learning_rate": 9.180066171330013e-05,
"loss": 2.3806,
"step": 99000
},
{
"epoch": 0.3717635322392775,
"grad_norm": 2.9788472652435303,
"learning_rate": 9.171997250396128e-05,
"loss": 2.3571,
"step": 99500
},
{
"epoch": 0.37363169069274116,
"grad_norm": 2.3888766765594482,
"learning_rate": 9.163908648731292e-05,
"loss": 2.3841,
"step": 100000
},
{
"epoch": 0.3754998491462049,
"grad_norm": 3.0424160957336426,
"learning_rate": 9.155768016766876e-05,
"loss": 2.4152,
"step": 100500
},
{
"epoch": 0.3773680075996686,
"grad_norm": 2.592036724090576,
"learning_rate": 9.147591598323593e-05,
"loss": 2.3465,
"step": 101000
},
{
"epoch": 0.3792361660531323,
"grad_norm": 2.8690261840820312,
"learning_rate": 9.139379463810866e-05,
"loss": 2.3974,
"step": 101500
},
{
"epoch": 0.381104324506596,
"grad_norm": 2.7227180004119873,
"learning_rate": 9.131148215032317e-05,
"loss": 2.3688,
"step": 102000
},
{
"epoch": 0.3829724829600597,
"grad_norm": 2.856623888015747,
"learning_rate": 9.12286493191618e-05,
"loss": 2.4341,
"step": 102500
},
{
"epoch": 0.3848406414135234,
"grad_norm": 2.56028151512146,
"learning_rate": 9.114546145658827e-05,
"loss": 2.427,
"step": 103000
},
{
"epoch": 0.38670879986698714,
"grad_norm": 3.3118507862091064,
"learning_rate": 9.106208671644056e-05,
"loss": 2.3166,
"step": 103500
},
{
"epoch": 0.3885769583204508,
"grad_norm": 3.2025699615478516,
"learning_rate": 9.097819164962692e-05,
"loss": 2.4462,
"step": 104000
},
{
"epoch": 0.3904451167739145,
"grad_norm": 3.240300416946411,
"learning_rate": 9.089394370816208e-05,
"loss": 2.4285,
"step": 104500
},
{
"epoch": 0.39231327522737824,
"grad_norm": 3.5723962783813477,
"learning_rate": 9.080934361752857e-05,
"loss": 2.355,
"step": 105000
},
{
"epoch": 0.39418143368084196,
"grad_norm": 3.186774253845215,
"learning_rate": 9.072456235949608e-05,
"loss": 2.4029,
"step": 105500
},
{
"epoch": 0.39604959213430563,
"grad_norm": 2.629359006881714,
"learning_rate": 9.063926085974259e-05,
"loss": 2.3459,
"step": 106000
},
{
"epoch": 0.39791775058776935,
"grad_norm": 3.2429652214050293,
"learning_rate": 9.055360940396558e-05,
"loss": 2.3847,
"step": 106500
},
{
"epoch": 0.39978590904123307,
"grad_norm": 2.427645206451416,
"learning_rate": 9.046760872973364e-05,
"loss": 2.3435,
"step": 107000
},
{
"epoch": 0.4016540674946968,
"grad_norm": 2.556652784347534,
"learning_rate": 9.038143262321399e-05,
"loss": 2.4121,
"step": 107500
},
{
"epoch": 0.40352222594816045,
"grad_norm": 2.9563798904418945,
"learning_rate": 9.029473643152501e-05,
"loss": 2.3786,
"step": 108000
},
{
"epoch": 0.40539038440162417,
"grad_norm": 2.457141876220703,
"learning_rate": 9.020769325060857e-05,
"loss": 2.3734,
"step": 108500
},
{
"epoch": 0.4072585428550879,
"grad_norm": 2.489871025085449,
"learning_rate": 9.012030383001778e-05,
"loss": 2.3934,
"step": 109000
},
{
"epoch": 0.4091267013085516,
"grad_norm": 2.9061882495880127,
"learning_rate": 9.003256892228738e-05,
"loss": 2.3507,
"step": 109500
},
{
"epoch": 0.4109948597620153,
"grad_norm": 3.2263598442077637,
"learning_rate": 8.994448928292711e-05,
"loss": 2.3866,
"step": 110000
},
{
"epoch": 0.412863018215479,
"grad_norm": 2.9006874561309814,
"learning_rate": 8.985606567041537e-05,
"loss": 2.3546,
"step": 110500
},
{
"epoch": 0.4147311766689427,
"grad_norm": 2.51509428024292,
"learning_rate": 8.976747672185874e-05,
"loss": 2.3669,
"step": 111000
},
{
"epoch": 0.41659933512240643,
"grad_norm": 2.6938908100128174,
"learning_rate": 8.967836813445061e-05,
"loss": 2.3485,
"step": 111500
},
{
"epoch": 0.4184674935758701,
"grad_norm": 2.7218174934387207,
"learning_rate": 8.958891786553452e-05,
"loss": 2.3798,
"step": 112000
},
{
"epoch": 0.4203356520293338,
"grad_norm": 3.0031161308288574,
"learning_rate": 8.949912668539173e-05,
"loss": 2.3501,
"step": 112500
},
{
"epoch": 0.42220381048279754,
"grad_norm": 2.5878889560699463,
"learning_rate": 8.940899536723916e-05,
"loss": 2.3512,
"step": 113000
},
{
"epoch": 0.42407196893626126,
"grad_norm": 2.7273967266082764,
"learning_rate": 8.931852468722277e-05,
"loss": 2.3394,
"step": 113500
},
{
"epoch": 0.4259401273897249,
"grad_norm": 2.3990983963012695,
"learning_rate": 8.922771542441081e-05,
"loss": 2.3104,
"step": 114000
},
{
"epoch": 0.42780828584318864,
"grad_norm": 3.0549476146698,
"learning_rate": 8.913656836078725e-05,
"loss": 2.3557,
"step": 114500
},
{
"epoch": 0.42967644429665236,
"grad_norm": 2.417224168777466,
"learning_rate": 8.904508428124488e-05,
"loss": 2.32,
"step": 115000
},
{
"epoch": 0.4315446027501161,
"grad_norm": 2.56392502784729,
"learning_rate": 8.895363192352878e-05,
"loss": 2.3651,
"step": 115500
},
{
"epoch": 0.43341276120357974,
"grad_norm": 2.027083396911621,
"learning_rate": 8.886147751859986e-05,
"loss": 2.3277,
"step": 116000
},
{
"epoch": 0.43528091965704346,
"grad_norm": 1.902034044265747,
"learning_rate": 8.876898846663621e-05,
"loss": 2.3185,
"step": 116500
},
{
"epoch": 0.4371490781105072,
"grad_norm": 2.7564985752105713,
"learning_rate": 8.867616556408684e-05,
"loss": 2.3674,
"step": 117000
},
{
"epoch": 0.4390172365639709,
"grad_norm": 3.024198532104492,
"learning_rate": 8.858300961027575e-05,
"loss": 2.3832,
"step": 117500
},
{
"epoch": 0.44088539501743457,
"grad_norm": 2.2952866554260254,
"learning_rate": 8.84895214073948e-05,
"loss": 2.3799,
"step": 118000
},
{
"epoch": 0.4427535534708983,
"grad_norm": 2.352498769760132,
"learning_rate": 8.839570176049705e-05,
"loss": 2.3958,
"step": 118500
},
{
"epoch": 0.444621711924362,
"grad_norm": 3.565748453140259,
"learning_rate": 8.830155147748969e-05,
"loss": 2.3614,
"step": 119000
},
{
"epoch": 0.4464898703778257,
"grad_norm": 3.0577287673950195,
"learning_rate": 8.82072606579692e-05,
"loss": 2.3458,
"step": 119500
},
{
"epoch": 0.4483580288312894,
"grad_norm": 2.6253695487976074,
"learning_rate": 8.81124521950556e-05,
"loss": 2.3273,
"step": 120000
},
{
"epoch": 0.4502261872847531,
"grad_norm": 2.1585161685943604,
"learning_rate": 8.801731553517346e-05,
"loss": 2.3298,
"step": 120500
},
{
"epoch": 0.45209434573821683,
"grad_norm": 2.5908641815185547,
"learning_rate": 8.792185149757116e-05,
"loss": 2.323,
"step": 121000
},
{
"epoch": 0.45396250419168055,
"grad_norm": 1.9700515270233154,
"learning_rate": 8.78262528108574e-05,
"loss": 2.3285,
"step": 121500
},
{
"epoch": 0.4558306626451442,
"grad_norm": 2.0091867446899414,
"learning_rate": 8.773013713746569e-05,
"loss": 2.3353,
"step": 122000
},
{
"epoch": 0.45769882109860793,
"grad_norm": 3.026522159576416,
"learning_rate": 8.763369655932719e-05,
"loss": 2.3478,
"step": 122500
},
{
"epoch": 0.45956697955207165,
"grad_norm": 2.7834973335266113,
"learning_rate": 8.753693190691863e-05,
"loss": 2.3256,
"step": 123000
},
{
"epoch": 0.4614351380055354,
"grad_norm": 3.004798173904419,
"learning_rate": 8.743984401350747e-05,
"loss": 2.3466,
"step": 123500
},
{
"epoch": 0.46330329645899904,
"grad_norm": 2.611668586730957,
"learning_rate": 8.734262885694443e-05,
"loss": 2.3222,
"step": 124000
},
{
"epoch": 0.46517145491246276,
"grad_norm": 2.902439594268799,
"learning_rate": 8.72448976347505e-05,
"loss": 2.3485,
"step": 124500
},
{
"epoch": 0.4670396133659265,
"grad_norm": 2.932037353515625,
"learning_rate": 8.714684568634262e-05,
"loss": 2.3258,
"step": 125000
},
{
"epoch": 0.4689077718193902,
"grad_norm": 2.526458263397217,
"learning_rate": 8.70484738560735e-05,
"loss": 2.3549,
"step": 125500
},
{
"epoch": 0.47077593027285386,
"grad_norm": 2.8670670986175537,
"learning_rate": 8.694978299105044e-05,
"loss": 2.3685,
"step": 126000
},
{
"epoch": 0.4726440887263176,
"grad_norm": 2.95123553276062,
"learning_rate": 8.685077394112803e-05,
"loss": 2.327,
"step": 126500
},
{
"epoch": 0.4745122471797813,
"grad_norm": 3.010820150375366,
"learning_rate": 8.675164652779493e-05,
"loss": 2.3247,
"step": 127000
},
{
"epoch": 0.476380405633245,
"grad_norm": 1.896767258644104,
"learning_rate": 8.665200430068873e-05,
"loss": 2.3158,
"step": 127500
},
{
"epoch": 0.4782485640867087,
"grad_norm": 2.559565305709839,
"learning_rate": 8.655204645293866e-05,
"loss": 2.3425,
"step": 128000
},
{
"epoch": 0.4801167225401724,
"grad_norm": 2.658048391342163,
"learning_rate": 8.645177384530965e-05,
"loss": 2.3565,
"step": 128500
},
{
"epoch": 0.4819848809936361,
"grad_norm": 1.818748116493225,
"learning_rate": 8.635118734127712e-05,
"loss": 2.3441,
"step": 129000
},
{
"epoch": 0.48385303944709984,
"grad_norm": 2.627014398574829,
"learning_rate": 8.625028780701953e-05,
"loss": 2.3296,
"step": 129500
},
{
"epoch": 0.4857211979005635,
"grad_norm": 2.687391519546509,
"learning_rate": 8.614907611141099e-05,
"loss": 2.3334,
"step": 130000
},
{
"epoch": 0.4875893563540272,
"grad_norm": 3.092353582382202,
"learning_rate": 8.604755312601363e-05,
"loss": 2.3278,
"step": 130500
},
{
"epoch": 0.48945751480749095,
"grad_norm": 3.0431768894195557,
"learning_rate": 8.59459237010844e-05,
"loss": 2.299,
"step": 131000
},
{
"epoch": 0.49132567326095467,
"grad_norm": 2.2302520275115967,
"learning_rate": 8.584378137971116e-05,
"loss": 2.2837,
"step": 131500
},
{
"epoch": 0.49319383171441833,
"grad_norm": 2.7669031620025635,
"learning_rate": 8.574133039752728e-05,
"loss": 2.3202,
"step": 132000
},
{
"epoch": 0.49506199016788205,
"grad_norm": 2.6957993507385254,
"learning_rate": 8.563857163676681e-05,
"loss": 2.3214,
"step": 132500
},
{
"epoch": 0.49693014862134577,
"grad_norm": 2.662504196166992,
"learning_rate": 8.553571241931346e-05,
"loss": 2.2907,
"step": 133000
},
{
"epoch": 0.4987983070748095,
"grad_norm": 2.6600215435028076,
"learning_rate": 8.54323413698205e-05,
"loss": 2.2866,
"step": 133500
},
{
"epoch": 0.5006664655282732,
"grad_norm": 1.6196849346160889,
"learning_rate": 8.532866520254174e-05,
"loss": 2.3064,
"step": 134000
},
{
"epoch": 0.5025346239817369,
"grad_norm": 2.3502981662750244,
"learning_rate": 8.522468481026161e-05,
"loss": 2.3447,
"step": 134500
},
{
"epoch": 0.5044027824352005,
"grad_norm": 2.94901442527771,
"learning_rate": 8.512040108838428e-05,
"loss": 2.3602,
"step": 135000
},
{
"epoch": 0.5062709408886643,
"grad_norm": 2.749366283416748,
"learning_rate": 8.501581493492603e-05,
"loss": 2.3389,
"step": 135500
},
{
"epoch": 0.508139099342128,
"grad_norm": 3.2299070358276367,
"learning_rate": 8.491113732620424e-05,
"loss": 2.3348,
"step": 136000
},
{
"epoch": 0.5100072577955918,
"grad_norm": 2.3727314472198486,
"learning_rate": 8.480616028924504e-05,
"loss": 2.2864,
"step": 136500
},
{
"epoch": 0.5118754162490554,
"grad_norm": 1.8499844074249268,
"learning_rate": 8.470067345222588e-05,
"loss": 2.271,
"step": 137000
},
{
"epoch": 0.5137435747025191,
"grad_norm": 3.1945462226867676,
"learning_rate": 8.459488779801767e-05,
"loss": 2.2967,
"step": 137500
},
{
"epoch": 0.5156117331559829,
"grad_norm": 2.6457462310791016,
"learning_rate": 8.448880423757021e-05,
"loss": 2.2784,
"step": 138000
},
{
"epoch": 0.5174798916094465,
"grad_norm": 2.016098976135254,
"learning_rate": 8.438242368439869e-05,
"loss": 2.3013,
"step": 138500
},
{
"epoch": 0.5193480500629102,
"grad_norm": 1.97508704662323,
"learning_rate": 8.42757470545757e-05,
"loss": 2.3232,
"step": 139000
},
{
"epoch": 0.521216208516374,
"grad_norm": 2.349184274673462,
"learning_rate": 8.416877526672355e-05,
"loss": 2.3266,
"step": 139500
},
{
"epoch": 0.5230843669698376,
"grad_norm": 2.6522152423858643,
"learning_rate": 8.406150924200616e-05,
"loss": 2.2941,
"step": 140000
},
{
"epoch": 0.5249525254233014,
"grad_norm": 3.5393903255462646,
"learning_rate": 8.395394990412121e-05,
"loss": 2.3459,
"step": 140500
},
{
"epoch": 0.5268206838767651,
"grad_norm": 2.5476553440093994,
"learning_rate": 8.38460981792922e-05,
"loss": 2.2942,
"step": 141000
},
{
"epoch": 0.5286888423302287,
"grad_norm": 2.8197927474975586,
"learning_rate": 8.373817157288324e-05,
"loss": 2.3426,
"step": 141500
},
{
"epoch": 0.5305570007836925,
"grad_norm": 2.1316707134246826,
"learning_rate": 8.362973844302275e-05,
"loss": 2.2985,
"step": 142000
},
{
"epoch": 0.5324251592371562,
"grad_norm": 1.9890694618225098,
"learning_rate": 8.352101571809362e-05,
"loss": 2.2896,
"step": 142500
},
{
"epoch": 0.5342933176906198,
"grad_norm": 3.057724952697754,
"learning_rate": 8.34120043343376e-05,
"loss": 2.3079,
"step": 143000
},
{
"epoch": 0.5361614761440836,
"grad_norm": 2.373011350631714,
"learning_rate": 8.330270523048216e-05,
"loss": 2.3294,
"step": 143500
},
{
"epoch": 0.5380296345975473,
"grad_norm": 2.1205389499664307,
"learning_rate": 8.31931193477324e-05,
"loss": 2.2969,
"step": 144000
},
{
"epoch": 0.539897793051011,
"grad_norm": 2.767277956008911,
"learning_rate": 8.308324762976294e-05,
"loss": 2.2901,
"step": 144500
},
{
"epoch": 0.5417659515044747,
"grad_norm": 2.847618579864502,
"learning_rate": 8.297309102270986e-05,
"loss": 2.3128,
"step": 145000
},
{
"epoch": 0.5436341099579384,
"grad_norm": 2.3643147945404053,
"learning_rate": 8.286287163899844e-05,
"loss": 2.2991,
"step": 145500
},
{
"epoch": 0.5455022684114021,
"grad_norm": 3.874725103378296,
"learning_rate": 8.275214866701926e-05,
"loss": 2.2602,
"step": 146000
},
{
"epoch": 0.5473704268648658,
"grad_norm": 2.4457411766052246,
"learning_rate": 8.264114365714206e-05,
"loss": 2.3038,
"step": 146500
},
{
"epoch": 0.5492385853183295,
"grad_norm": 2.56156063079834,
"learning_rate": 8.252985756526198e-05,
"loss": 2.3193,
"step": 147000
},
{
"epoch": 0.5511067437717933,
"grad_norm": 3.2425754070281982,
"learning_rate": 8.241851476105105e-05,
"loss": 2.294,
"step": 147500
},
{
"epoch": 0.5529749022252569,
"grad_norm": 3.299207925796509,
"learning_rate": 8.23066699398898e-05,
"loss": 2.2933,
"step": 148000
},
{
"epoch": 0.5548430606787207,
"grad_norm": 2.3422181606292725,
"learning_rate": 8.219454691697226e-05,
"loss": 2.3066,
"step": 148500
},
{
"epoch": 0.5567112191321844,
"grad_norm": 2.9155092239379883,
"learning_rate": 8.208214665782109e-05,
"loss": 2.2698,
"step": 149000
},
{
"epoch": 0.558579377585648,
"grad_norm": 3.0940420627593994,
"learning_rate": 8.196969575847251e-05,
"loss": 2.2787,
"step": 149500
},
{
"epoch": 0.5604475360391118,
"grad_norm": 3.761610507965088,
"learning_rate": 8.185674448258929e-05,
"loss": 2.3008,
"step": 150000
},
{
"epoch": 0.5623156944925755,
"grad_norm": 2.735173463821411,
"learning_rate": 8.174374560372093e-05,
"loss": 2.3122,
"step": 150500
},
{
"epoch": 0.5641838529460392,
"grad_norm": 2.3430800437927246,
"learning_rate": 8.163024719393988e-05,
"loss": 2.2645,
"step": 151000
},
{
"epoch": 0.5660520113995029,
"grad_norm": 2.489206314086914,
"learning_rate": 8.151647640726769e-05,
"loss": 2.2695,
"step": 151500
},
{
"epoch": 0.5679201698529666,
"grad_norm": 3.2072606086730957,
"learning_rate": 8.140243422341638e-05,
"loss": 2.2641,
"step": 152000
},
{
"epoch": 0.5697883283064303,
"grad_norm": 3.0480380058288574,
"learning_rate": 8.128812162443502e-05,
"loss": 2.3294,
"step": 152500
},
{
"epoch": 0.571656486759894,
"grad_norm": 3.000128746032715,
"learning_rate": 8.117353959470134e-05,
"loss": 2.2637,
"step": 153000
},
{
"epoch": 0.5735246452133577,
"grad_norm": 3.1820998191833496,
"learning_rate": 8.105868912091317e-05,
"loss": 2.2759,
"step": 153500
},
{
"epoch": 0.5753928036668214,
"grad_norm": 2.6837666034698486,
"learning_rate": 8.094357119208004e-05,
"loss": 2.2549,
"step": 154000
},
{
"epoch": 0.5772609621202851,
"grad_norm": 2.4082396030426025,
"learning_rate": 8.082841783357048e-05,
"loss": 2.3007,
"step": 154500
},
{
"epoch": 0.5791291205737489,
"grad_norm": 2.461305618286133,
"learning_rate": 8.0712768500827e-05,
"loss": 2.2654,
"step": 155000
},
{
"epoch": 0.5809972790272125,
"grad_norm": 2.9279286861419678,
"learning_rate": 8.059708678275976e-05,
"loss": 2.2669,
"step": 155500
},
{
"epoch": 0.5828654374806762,
"grad_norm": 2.3760006427764893,
"learning_rate": 8.048091002168906e-05,
"loss": 2.2429,
"step": 156000
},
{
"epoch": 0.58473359593414,
"grad_norm": 2.879556894302368,
"learning_rate": 8.036447078099056e-05,
"loss": 2.2694,
"step": 156500
},
{
"epoch": 0.5866017543876036,
"grad_norm": 1.9433120489120483,
"learning_rate": 8.024777006335506e-05,
"loss": 2.243,
"step": 157000
},
{
"epoch": 0.5884699128410673,
"grad_norm": 2.5363948345184326,
"learning_rate": 8.013080887372506e-05,
"loss": 2.267,
"step": 157500
},
{
"epoch": 0.5903380712945311,
"grad_norm": 2.3004775047302246,
"learning_rate": 8.001358821928599e-05,
"loss": 2.2711,
"step": 158000
},
{
"epoch": 0.5922062297479948,
"grad_norm": 2.1187326908111572,
"learning_rate": 7.989610910945766e-05,
"loss": 2.2733,
"step": 158500
},
{
"epoch": 0.5940743882014585,
"grad_norm": 2.612976312637329,
"learning_rate": 7.977860828524794e-05,
"loss": 2.2617,
"step": 159000
},
{
"epoch": 0.5959425466549222,
"grad_norm": 2.5254204273223877,
"learning_rate": 7.96606158136407e-05,
"loss": 2.2624,
"step": 159500
},
{
"epoch": 0.5978107051083859,
"grad_norm": 2.352216958999634,
"learning_rate": 7.954236792618814e-05,
"loss": 2.2923,
"step": 160000
},
{
"epoch": 0.5996788635618496,
"grad_norm": 2.5276451110839844,
"learning_rate": 7.942386564115584e-05,
"loss": 2.281,
"step": 160500
},
{
"epoch": 0.6015470220153133,
"grad_norm": 2.3592355251312256,
"learning_rate": 7.930510997900007e-05,
"loss": 2.252,
"step": 161000
},
{
"epoch": 0.603415180468777,
"grad_norm": 3.495464324951172,
"learning_rate": 7.918610196235899e-05,
"loss": 2.2379,
"step": 161500
},
{
"epoch": 0.6052833389222407,
"grad_norm": 2.2157094478607178,
"learning_rate": 7.906684261604388e-05,
"loss": 2.2813,
"step": 162000
},
{
"epoch": 0.6071514973757044,
"grad_norm": 3.170558452606201,
"learning_rate": 7.894733296703025e-05,
"loss": 2.2457,
"step": 162500
},
{
"epoch": 0.6090196558291682,
"grad_norm": 3.1325762271881104,
"learning_rate": 7.882781381038415e-05,
"loss": 2.2531,
"step": 163000
},
{
"epoch": 0.6108878142826318,
"grad_norm": 2.3855438232421875,
"learning_rate": 7.87078071409669e-05,
"loss": 2.2665,
"step": 163500
},
{
"epoch": 0.6127559727360955,
"grad_norm": 2.261495351791382,
"learning_rate": 7.858755326060588e-05,
"loss": 2.2769,
"step": 164000
},
{
"epoch": 0.6146241311895593,
"grad_norm": 3.212700128555298,
"learning_rate": 7.846705320484082e-05,
"loss": 2.2719,
"step": 164500
},
{
"epoch": 0.6164922896430229,
"grad_norm": 2.875687837600708,
"learning_rate": 7.83465497456751e-05,
"loss": 2.2756,
"step": 165000
},
{
"epoch": 0.6183604480964866,
"grad_norm": 3.213188886642456,
"learning_rate": 7.822556094134869e-05,
"loss": 2.2475,
"step": 165500
},
{
"epoch": 0.6202286065499504,
"grad_norm": 2.9114816188812256,
"learning_rate": 7.81043290788352e-05,
"loss": 2.2411,
"step": 166000
},
{
"epoch": 0.622096765003414,
"grad_norm": 2.960690498352051,
"learning_rate": 7.798285520209603e-05,
"loss": 2.2823,
"step": 166500
},
{
"epoch": 0.6239649234568778,
"grad_norm": 2.9522547721862793,
"learning_rate": 7.786138402665644e-05,
"loss": 2.2186,
"step": 167000
},
{
"epoch": 0.6258330819103415,
"grad_norm": 2.8541057109832764,
"learning_rate": 7.773942974047013e-05,
"loss": 2.2735,
"step": 167500
},
{
"epoch": 0.6277012403638051,
"grad_norm": 2.182999849319458,
"learning_rate": 7.761723658230827e-05,
"loss": 2.2556,
"step": 168000
},
{
"epoch": 0.6295693988172689,
"grad_norm": 2.0711419582366943,
"learning_rate": 7.749480560441025e-05,
"loss": 2.2949,
"step": 168500
},
{
"epoch": 0.6314375572707326,
"grad_norm": 2.7931690216064453,
"learning_rate": 7.737238343214024e-05,
"loss": 2.2579,
"step": 169000
},
{
"epoch": 0.6333057157241962,
"grad_norm": 2.2357709407806396,
"learning_rate": 7.724948045003347e-05,
"loss": 2.2145,
"step": 169500
},
{
"epoch": 0.63517387417766,
"grad_norm": 2.4123311042785645,
"learning_rate": 7.712634281504125e-05,
"loss": 2.2908,
"step": 170000
},
{
"epoch": 0.6370420326311237,
"grad_norm": 3.390855312347412,
"learning_rate": 7.700321856241075e-05,
"loss": 2.1975,
"step": 170500
},
{
"epoch": 0.6389101910845875,
"grad_norm": 2.8016293048858643,
"learning_rate": 7.687961526877562e-05,
"loss": 2.2842,
"step": 171000
},
{
"epoch": 0.6407783495380511,
"grad_norm": 2.734112501144409,
"learning_rate": 7.675578050726744e-05,
"loss": 2.2881,
"step": 171500
},
{
"epoch": 0.6426465079915148,
"grad_norm": 2.7221627235412598,
"learning_rate": 7.66317153442619e-05,
"loss": 2.2748,
"step": 172000
},
{
"epoch": 0.6445146664449786,
"grad_norm": 2.9320507049560547,
"learning_rate": 7.650766966527448e-05,
"loss": 2.2157,
"step": 172500
},
{
"epoch": 0.6463828248984422,
"grad_norm": 2.428924798965454,
"learning_rate": 7.638314736178451e-05,
"loss": 2.2613,
"step": 173000
},
{
"epoch": 0.6482509833519059,
"grad_norm": 2.5038206577301025,
"learning_rate": 7.62583978656453e-05,
"loss": 2.2606,
"step": 173500
},
{
"epoch": 0.6501191418053697,
"grad_norm": 2.3970868587493896,
"learning_rate": 7.613342225110954e-05,
"loss": 2.2383,
"step": 174000
},
{
"epoch": 0.6519873002588333,
"grad_norm": 2.124425172805786,
"learning_rate": 7.60082215943772e-05,
"loss": 2.2513,
"step": 174500
},
{
"epoch": 0.6538554587122971,
"grad_norm": 3.180497884750366,
"learning_rate": 7.58830480456262e-05,
"loss": 2.2722,
"step": 175000
},
{
"epoch": 0.6557236171657608,
"grad_norm": 2.8902299404144287,
"learning_rate": 7.575740098553152e-05,
"loss": 2.2439,
"step": 175500
},
{
"epoch": 0.6575917756192244,
"grad_norm": 2.987680196762085,
"learning_rate": 7.563153212126435e-05,
"loss": 2.233,
"step": 176000
},
{
"epoch": 0.6594599340726882,
"grad_norm": 2.5328335762023926,
"learning_rate": 7.550544253671663e-05,
"loss": 2.2434,
"step": 176500
},
{
"epoch": 0.6613280925261519,
"grad_norm": 2.5823991298675537,
"learning_rate": 7.537913331768098e-05,
"loss": 2.2261,
"step": 177000
},
{
"epoch": 0.6631962509796155,
"grad_norm": 3.252668619155884,
"learning_rate": 7.525260555184135e-05,
"loss": 2.2626,
"step": 177500
},
{
"epoch": 0.6650644094330793,
"grad_norm": 2.427614688873291,
"learning_rate": 7.512586032876367e-05,
"loss": 2.2249,
"step": 178000
},
{
"epoch": 0.666932567886543,
"grad_norm": 2.6210880279541016,
"learning_rate": 7.49988987398865e-05,
"loss": 2.2602,
"step": 178500
},
{
"epoch": 0.6688007263400068,
"grad_norm": 2.7572479248046875,
"learning_rate": 7.487223101332892e-05,
"loss": 2.2325,
"step": 179000
},
{
"epoch": 0.6706688847934704,
"grad_norm": 3.2144672870635986,
"learning_rate": 7.474484082913688e-05,
"loss": 2.2835,
"step": 179500
},
{
"epoch": 0.6725370432469341,
"grad_norm": 2.4524009227752686,
"learning_rate": 7.461723756021062e-05,
"loss": 2.274,
"step": 180000
},
{
"epoch": 0.6744052017003979,
"grad_norm": 2.676546335220337,
"learning_rate": 7.44894223053775e-05,
"loss": 2.2941,
"step": 180500
},
{
"epoch": 0.6762733601538615,
"grad_norm": 3.0090246200561523,
"learning_rate": 7.43613961652904e-05,
"loss": 2.2545,
"step": 181000
},
{
"epoch": 0.6781415186073252,
"grad_norm": 2.6397953033447266,
"learning_rate": 7.423316024241814e-05,
"loss": 2.2541,
"step": 181500
},
{
"epoch": 0.680009677060789,
"grad_norm": 3.0165371894836426,
"learning_rate": 7.410471564103606e-05,
"loss": 2.2319,
"step": 182000
},
{
"epoch": 0.6818778355142526,
"grad_norm": 2.1070499420166016,
"learning_rate": 7.39760634672165e-05,
"loss": 2.2617,
"step": 182500
},
{
"epoch": 0.6837459939677164,
"grad_norm": 2.777233123779297,
"learning_rate": 7.384746275141047e-05,
"loss": 2.2206,
"step": 183000
},
{
"epoch": 0.6856141524211801,
"grad_norm": 2.188089370727539,
"learning_rate": 7.371839916767453e-05,
"loss": 2.2428,
"step": 183500
},
{
"epoch": 0.6874823108746437,
"grad_norm": 2.427400827407837,
"learning_rate": 7.358913133818016e-05,
"loss": 2.2161,
"step": 184000
},
{
"epoch": 0.6893504693281075,
"grad_norm": 2.542616605758667,
"learning_rate": 7.34596603760887e-05,
"loss": 2.266,
"step": 184500
},
{
"epoch": 0.6912186277815712,
"grad_norm": 2.6249241828918457,
"learning_rate": 7.333024694314207e-05,
"loss": 2.2383,
"step": 185000
},
{
"epoch": 0.6930867862350348,
"grad_norm": 2.5798895359039307,
"learning_rate": 7.320037346301442e-05,
"loss": 2.2524,
"step": 185500
},
{
"epoch": 0.6949549446884986,
"grad_norm": 2.9020352363586426,
"learning_rate": 7.307030019799232e-05,
"loss": 2.2251,
"step": 186000
},
{
"epoch": 0.6968231031419623,
"grad_norm": 3.3277840614318848,
"learning_rate": 7.294002826817298e-05,
"loss": 2.2608,
"step": 186500
},
{
"epoch": 0.698691261595426,
"grad_norm": 2.6658146381378174,
"learning_rate": 7.280955879536435e-05,
"loss": 2.2689,
"step": 187000
},
{
"epoch": 0.7005594200488897,
"grad_norm": 2.736542224884033,
"learning_rate": 7.267915443013911e-05,
"loss": 2.2004,
"step": 187500
},
{
"epoch": 0.7024275785023534,
"grad_norm": 2.440765619277954,
"learning_rate": 7.254829363303503e-05,
"loss": 2.2541,
"step": 188000
},
{
"epoch": 0.7042957369558172,
"grad_norm": 2.6804561614990234,
"learning_rate": 7.241723866627799e-05,
"loss": 2.2647,
"step": 188500
},
{
"epoch": 0.7061638954092808,
"grad_norm": 2.6702585220336914,
"learning_rate": 7.228599065841891e-05,
"loss": 2.2004,
"step": 189000
},
{
"epoch": 0.7080320538627445,
"grad_norm": 2.5987019538879395,
"learning_rate": 7.215481381028357e-05,
"loss": 2.2509,
"step": 189500
},
{
"epoch": 0.7099002123162083,
"grad_norm": 2.9680731296539307,
"learning_rate": 7.20231834929401e-05,
"loss": 2.2262,
"step": 190000
},
{
"epoch": 0.7117683707696719,
"grad_norm": 3.8419201374053955,
"learning_rate": 7.189136352781376e-05,
"loss": 2.2313,
"step": 190500
},
{
"epoch": 0.7136365292231357,
"grad_norm": 2.6179468631744385,
"learning_rate": 7.175935505004304e-05,
"loss": 2.2466,
"step": 191000
},
{
"epoch": 0.7155046876765994,
"grad_norm": 1.9412791728973389,
"learning_rate": 7.162742377434187e-05,
"loss": 2.2336,
"step": 191500
},
{
"epoch": 0.717372846130063,
"grad_norm": 2.312648057937622,
"learning_rate": 7.149504205451939e-05,
"loss": 2.2124,
"step": 192000
},
{
"epoch": 0.7192410045835268,
"grad_norm": 2.4080445766448975,
"learning_rate": 7.136247523488743e-05,
"loss": 2.2103,
"step": 192500
},
{
"epoch": 0.7211091630369905,
"grad_norm": 3.0859153270721436,
"learning_rate": 7.122972445701587e-05,
"loss": 2.1961,
"step": 193000
},
{
"epoch": 0.7229773214904541,
"grad_norm": 3.438227415084839,
"learning_rate": 7.10970569129335e-05,
"loss": 2.2128,
"step": 193500
},
{
"epoch": 0.7248454799439179,
"grad_norm": 2.6577913761138916,
"learning_rate": 7.096394201181632e-05,
"loss": 2.2254,
"step": 194000
},
{
"epoch": 0.7267136383973816,
"grad_norm": 2.579580068588257,
"learning_rate": 7.083064658434042e-05,
"loss": 2.2562,
"step": 194500
},
{
"epoch": 0.7285817968508453,
"grad_norm": 2.957392454147339,
"learning_rate": 7.069717177834997e-05,
"loss": 2.2762,
"step": 195000
},
{
"epoch": 0.730449955304309,
"grad_norm": 1.9975017309188843,
"learning_rate": 7.056378622641193e-05,
"loss": 2.2385,
"step": 195500
},
{
"epoch": 0.7323181137577727,
"grad_norm": 3.1538219451904297,
"learning_rate": 7.042995646610036e-05,
"loss": 2.2086,
"step": 196000
},
{
"epoch": 0.7341862722112364,
"grad_norm": 2.2817578315734863,
"learning_rate": 7.02959507777287e-05,
"loss": 2.2153,
"step": 196500
},
{
"epoch": 0.7360544306647001,
"grad_norm": 2.5474236011505127,
"learning_rate": 7.016177031525738e-05,
"loss": 2.2388,
"step": 197000
},
{
"epoch": 0.7379225891181638,
"grad_norm": 2.5271482467651367,
"learning_rate": 7.002795399479169e-05,
"loss": 2.2344,
"step": 197500
},
{
"epoch": 0.7397907475716275,
"grad_norm": 1.9711894989013672,
"learning_rate": 6.989342813955246e-05,
"loss": 2.1875,
"step": 198000
},
{
"epoch": 0.7416589060250912,
"grad_norm": 2.832296133041382,
"learning_rate": 6.97587309764484e-05,
"loss": 2.2378,
"step": 198500
},
{
"epoch": 0.743527064478555,
"grad_norm": 3.224106788635254,
"learning_rate": 6.962386366539439e-05,
"loss": 2.1749,
"step": 199000
},
{
"epoch": 0.7453952229320187,
"grad_norm": 2.2426908016204834,
"learning_rate": 6.948882736777054e-05,
"loss": 2.1997,
"step": 199500
},
{
"epoch": 0.7472633813854823,
"grad_norm": 2.7945656776428223,
"learning_rate": 6.935362324641206e-05,
"loss": 2.2217,
"step": 200000
},
{
"epoch": 0.7491315398389461,
"grad_norm": 2.7567574977874756,
"learning_rate": 6.921825246559942e-05,
"loss": 2.2296,
"step": 200500
},
{
"epoch": 0.7509996982924098,
"grad_norm": 2.5919723510742188,
"learning_rate": 6.908298742798458e-05,
"loss": 2.2364,
"step": 201000
},
{
"epoch": 0.7528678567458734,
"grad_norm": 2.993880271911621,
"learning_rate": 6.894728715432299e-05,
"loss": 2.2065,
"step": 201500
},
{
"epoch": 0.7547360151993372,
"grad_norm": 2.4301109313964844,
"learning_rate": 6.881142372028077e-05,
"loss": 2.2457,
"step": 202000
},
{
"epoch": 0.7566041736528009,
"grad_norm": 2.623084783554077,
"learning_rate": 6.867539829581595e-05,
"loss": 2.1742,
"step": 202500
},
{
"epoch": 0.7584723321062646,
"grad_norm": 3.4304981231689453,
"learning_rate": 6.853921205228139e-05,
"loss": 2.2292,
"step": 203000
},
{
"epoch": 0.7603404905597283,
"grad_norm": 1.7889618873596191,
"learning_rate": 6.84028661624149e-05,
"loss": 2.217,
"step": 203500
},
{
"epoch": 0.762208649013192,
"grad_norm": 2.954709053039551,
"learning_rate": 6.8266361800329e-05,
"loss": 2.2491,
"step": 204000
},
{
"epoch": 0.7640768074666557,
"grad_norm": 2.892221212387085,
"learning_rate": 6.812970014150086e-05,
"loss": 2.2431,
"step": 204500
},
{
"epoch": 0.7659449659201194,
"grad_norm": 1.9717577695846558,
"learning_rate": 6.799315615334446e-05,
"loss": 2.2397,
"step": 205000
},
{
"epoch": 0.7678131243735831,
"grad_norm": 2.904269218444824,
"learning_rate": 6.785618374157811e-05,
"loss": 2.1972,
"step": 205500
},
{
"epoch": 0.7696812828270468,
"grad_norm": 3.807295083999634,
"learning_rate": 6.771933197025247e-05,
"loss": 2.2292,
"step": 206000
},
{
"epoch": 0.7715494412805105,
"grad_norm": 3.4538333415985107,
"learning_rate": 6.758205351413722e-05,
"loss": 2.1935,
"step": 206500
},
{
"epoch": 0.7734175997339743,
"grad_norm": 2.769444227218628,
"learning_rate": 6.744462365404948e-05,
"loss": 2.1709,
"step": 207000
},
{
"epoch": 0.775285758187438,
"grad_norm": 3.002584934234619,
"learning_rate": 6.730704357343616e-05,
"loss": 2.1863,
"step": 207500
},
{
"epoch": 0.7771539166409016,
"grad_norm": 2.559108257293701,
"learning_rate": 6.716959006322012e-05,
"loss": 2.2118,
"step": 208000
},
{
"epoch": 0.7790220750943654,
"grad_norm": 3.1521153450012207,
"learning_rate": 6.703171339157552e-05,
"loss": 2.19,
"step": 208500
},
{
"epoch": 0.780890233547829,
"grad_norm": 2.7111008167266846,
"learning_rate": 6.689369005509088e-05,
"loss": 2.2044,
"step": 209000
},
{
"epoch": 0.7827583920012927,
"grad_norm": 2.8580000400543213,
"learning_rate": 6.675552124232371e-05,
"loss": 2.2458,
"step": 209500
},
{
"epoch": 0.7846265504547565,
"grad_norm": 2.7248494625091553,
"learning_rate": 6.661720814308425e-05,
"loss": 2.2096,
"step": 210000
},
{
"epoch": 0.7864947089082202,
"grad_norm": 3.5847723484039307,
"learning_rate": 6.647875194842521e-05,
"loss": 2.2238,
"step": 210500
},
{
"epoch": 0.7883628673616839,
"grad_norm": 3.013185977935791,
"learning_rate": 6.634015385063155e-05,
"loss": 2.2128,
"step": 211000
},
{
"epoch": 0.7902310258151476,
"grad_norm": 3.160470962524414,
"learning_rate": 6.620141504321021e-05,
"loss": 2.2604,
"step": 211500
},
{
"epoch": 0.7920991842686113,
"grad_norm": 3.009772300720215,
"learning_rate": 6.606281461596562e-05,
"loss": 2.2169,
"step": 212000
},
{
"epoch": 0.793967342722075,
"grad_norm": 2.7089791297912598,
"learning_rate": 6.592379825008977e-05,
"loss": 2.1894,
"step": 212500
},
{
"epoch": 0.7958355011755387,
"grad_norm": 2.2874131202697754,
"learning_rate": 6.578492320297462e-05,
"loss": 2.2472,
"step": 213000
},
{
"epoch": 0.7977036596290024,
"grad_norm": 3.115208864212036,
"learning_rate": 6.564563405749691e-05,
"loss": 2.1696,
"step": 213500
},
{
"epoch": 0.7995718180824661,
"grad_norm": 3.074309825897217,
"learning_rate": 6.550621018309538e-05,
"loss": 2.2022,
"step": 214000
},
{
"epoch": 0.8014399765359298,
"grad_norm": 2.6160593032836914,
"learning_rate": 6.536665278038796e-05,
"loss": 2.2136,
"step": 214500
},
{
"epoch": 0.8033081349893936,
"grad_norm": 2.875887155532837,
"learning_rate": 6.522696305114238e-05,
"loss": 2.222,
"step": 215000
},
{
"epoch": 0.8051762934428572,
"grad_norm": 1.9582101106643677,
"learning_rate": 6.508714219826595e-05,
"loss": 2.1975,
"step": 215500
},
{
"epoch": 0.8070444518963209,
"grad_norm": 3.11397647857666,
"learning_rate": 6.494719142579506e-05,
"loss": 2.2285,
"step": 216000
},
{
"epoch": 0.8089126103497847,
"grad_norm": 2.7110836505889893,
"learning_rate": 6.480711193888488e-05,
"loss": 2.1638,
"step": 216500
},
{
"epoch": 0.8107807688032483,
"grad_norm": 2.2085702419281006,
"learning_rate": 6.4666904943799e-05,
"loss": 2.2144,
"step": 217000
},
{
"epoch": 0.812648927256712,
"grad_norm": 3.44262957572937,
"learning_rate": 6.452657164789899e-05,
"loss": 2.2248,
"step": 217500
},
{
"epoch": 0.8145170857101758,
"grad_norm": 2.770791530609131,
"learning_rate": 6.438639430044904e-05,
"loss": 2.1861,
"step": 218000
},
{
"epoch": 0.8163852441636394,
"grad_norm": 3.2068679332733154,
"learning_rate": 6.424581227590346e-05,
"loss": 2.1691,
"step": 218500
},
{
"epoch": 0.8182534026171032,
"grad_norm": 3.264312744140625,
"learning_rate": 6.410510757669032e-05,
"loss": 2.159,
"step": 219000
},
{
"epoch": 0.8201215610705669,
"grad_norm": 3.264051675796509,
"learning_rate": 6.396428141445709e-05,
"loss": 2.1775,
"step": 219500
},
{
"epoch": 0.8219897195240305,
"grad_norm": 2.961418867111206,
"learning_rate": 6.382333500189714e-05,
"loss": 2.1851,
"step": 220000
},
{
"epoch": 0.8238578779774943,
"grad_norm": 4.034390449523926,
"learning_rate": 6.368226955273941e-05,
"loss": 2.1552,
"step": 220500
},
{
"epoch": 0.825726036430958,
"grad_norm": 2.0030012130737305,
"learning_rate": 6.354136876505816e-05,
"loss": 2.1762,
"step": 221000
},
{
"epoch": 0.8275941948844217,
"grad_norm": 2.7552449703216553,
"learning_rate": 6.340006911997954e-05,
"loss": 2.1758,
"step": 221500
},
{
"epoch": 0.8294623533378854,
"grad_norm": 2.4928476810455322,
"learning_rate": 6.325865408316381e-05,
"loss": 2.1951,
"step": 222000
},
{
"epoch": 0.8313305117913491,
"grad_norm": 2.8218753337860107,
"learning_rate": 6.311712487237538e-05,
"loss": 2.1348,
"step": 222500
},
{
"epoch": 0.8331986702448129,
"grad_norm": 3.4085326194763184,
"learning_rate": 6.297548270636179e-05,
"loss": 2.2058,
"step": 223000
},
{
"epoch": 0.8350668286982765,
"grad_norm": 3.3644134998321533,
"learning_rate": 6.283372880484332e-05,
"loss": 2.1574,
"step": 223500
},
{
"epoch": 0.8369349871517402,
"grad_norm": 3.0675761699676514,
"learning_rate": 6.269186438850234e-05,
"loss": 2.1725,
"step": 224000
},
{
"epoch": 0.838803145605204,
"grad_norm": 2.6877012252807617,
"learning_rate": 6.2549890678973e-05,
"loss": 2.1889,
"step": 224500
},
{
"epoch": 0.8406713040586676,
"grad_norm": 3.4169256687164307,
"learning_rate": 6.240837743960651e-05,
"loss": 2.1423,
"step": 225000
},
{
"epoch": 0.8425394625121313,
"grad_norm": 3.0024383068084717,
"learning_rate": 6.22661892373068e-05,
"loss": 2.178,
"step": 225500
},
{
"epoch": 0.8444076209655951,
"grad_norm": 3.079028606414795,
"learning_rate": 6.212389540742632e-05,
"loss": 2.2295,
"step": 226000
},
{
"epoch": 0.8462757794190587,
"grad_norm": 2.90077805519104,
"learning_rate": 6.198149717529692e-05,
"loss": 2.1684,
"step": 226500
},
{
"epoch": 0.8481439378725225,
"grad_norm": 3.053629159927368,
"learning_rate": 6.18389957671496e-05,
"loss": 2.1738,
"step": 227000
},
{
"epoch": 0.8500120963259862,
"grad_norm": 3.0925843715667725,
"learning_rate": 6.16963924101038e-05,
"loss": 2.1551,
"step": 227500
},
{
"epoch": 0.8518802547794498,
"grad_norm": 3.0221009254455566,
"learning_rate": 6.155368833215677e-05,
"loss": 2.1966,
"step": 228000
},
{
"epoch": 0.8537484132329136,
"grad_norm": 2.5803329944610596,
"learning_rate": 6.141088476217323e-05,
"loss": 2.164,
"step": 228500
},
{
"epoch": 0.8556165716863773,
"grad_norm": 3.4956555366516113,
"learning_rate": 6.126826883078718e-05,
"loss": 2.1776,
"step": 229000
},
{
"epoch": 0.8574847301398411,
"grad_norm": 2.8954169750213623,
"learning_rate": 6.112527015957583e-05,
"loss": 2.1944,
"step": 229500
},
{
"epoch": 0.8593528885933047,
"grad_norm": 3.2150614261627197,
"learning_rate": 6.0982175685556475e-05,
"loss": 2.1942,
"step": 230000
},
{
"epoch": 0.8612210470467684,
"grad_norm": 2.8969147205352783,
"learning_rate": 6.083898664095558e-05,
"loss": 2.152,
"step": 230500
},
{
"epoch": 0.8630892055002322,
"grad_norm": 2.898751974105835,
"learning_rate": 6.069599091590918e-05,
"loss": 2.1624,
"step": 231000
},
{
"epoch": 0.8649573639536958,
"grad_norm": 3.5042660236358643,
"learning_rate": 6.05529034527542e-05,
"loss": 2.1428,
"step": 231500
},
{
"epoch": 0.8668255224071595,
"grad_norm": 3.0192151069641113,
"learning_rate": 6.040943845887397e-05,
"loss": 2.1942,
"step": 232000
},
{
"epoch": 0.8686936808606233,
"grad_norm": 3.0444955825805664,
"learning_rate": 6.026588382641243e-05,
"loss": 2.1533,
"step": 232500
},
{
"epoch": 0.8705618393140869,
"grad_norm": 3.1138992309570312,
"learning_rate": 6.012224079155855e-05,
"loss": 2.1841,
"step": 233000
},
{
"epoch": 0.8724299977675507,
"grad_norm": 2.3980443477630615,
"learning_rate": 5.997879813783181e-05,
"loss": 2.1724,
"step": 233500
},
{
"epoch": 0.8742981562210144,
"grad_norm": 2.9543912410736084,
"learning_rate": 5.9834982180414524e-05,
"loss": 2.1502,
"step": 234000
},
{
"epoch": 0.876166314674478,
"grad_norm": 2.555027961730957,
"learning_rate": 5.969108153121932e-05,
"loss": 2.1499,
"step": 234500
},
{
"epoch": 0.8780344731279418,
"grad_norm": 2.4806180000305176,
"learning_rate": 5.954709742941489e-05,
"loss": 2.1733,
"step": 235000
},
{
"epoch": 0.8799026315814055,
"grad_norm": 2.855769634246826,
"learning_rate": 5.9403031114888505e-05,
"loss": 2.1783,
"step": 235500
},
{
"epoch": 0.8817707900348691,
"grad_norm": 2.85447359085083,
"learning_rate": 5.9258883828235466e-05,
"loss": 2.1684,
"step": 236000
},
{
"epoch": 0.8836389484883329,
"grad_norm": 3.5129261016845703,
"learning_rate": 5.911494534352925e-05,
"loss": 2.1825,
"step": 236500
},
{
"epoch": 0.8855071069417966,
"grad_norm": 3.9751412868499756,
"learning_rate": 5.8970639992924826e-05,
"loss": 2.1827,
"step": 237000
},
{
"epoch": 0.8873752653952603,
"grad_norm": 3.1551120281219482,
"learning_rate": 5.882625739363443e-05,
"loss": 2.2232,
"step": 237500
},
{
"epoch": 0.889243423848724,
"grad_norm": 3.2931878566741943,
"learning_rate": 5.868179878897693e-05,
"loss": 2.1291,
"step": 238000
},
{
"epoch": 0.8911115823021877,
"grad_norm": 3.2662160396575928,
"learning_rate": 5.853726542292572e-05,
"loss": 2.1776,
"step": 238500
},
{
"epoch": 0.8929797407556515,
"grad_norm": 2.764841079711914,
"learning_rate": 5.8392658540097975e-05,
"loss": 2.1069,
"step": 239000
},
{
"epoch": 0.8948478992091151,
"grad_norm": 1.903836965560913,
"learning_rate": 5.8247979385743945e-05,
"loss": 2.1436,
"step": 239500
},
{
"epoch": 0.8967160576625788,
"grad_norm": 2.859905481338501,
"learning_rate": 5.8103229205736235e-05,
"loss": 2.1426,
"step": 240000
},
{
"epoch": 0.8985842161160426,
"grad_norm": 3.1984663009643555,
"learning_rate": 5.79586989552882e-05,
"loss": 2.1798,
"step": 240500
},
{
"epoch": 0.9004523745695062,
"grad_norm": 2.157151222229004,
"learning_rate": 5.781381059984584e-05,
"loss": 2.1766,
"step": 241000
},
{
"epoch": 0.90232053302297,
"grad_norm": 3.674839973449707,
"learning_rate": 5.7668854957498444e-05,
"loss": 2.1925,
"step": 241500
},
{
"epoch": 0.9041886914764337,
"grad_norm": 2.9118549823760986,
"learning_rate": 5.752383327649953e-05,
"loss": 2.1655,
"step": 242000
},
{
"epoch": 0.9060568499298973,
"grad_norm": 3.0006792545318604,
"learning_rate": 5.737903704244284e-05,
"loss": 2.1639,
"step": 242500
},
{
"epoch": 0.9079250083833611,
"grad_norm": 3.3966879844665527,
"learning_rate": 5.723388715699902e-05,
"loss": 2.1106,
"step": 243000
},
{
"epoch": 0.9097931668368248,
"grad_norm": 3.6091904640197754,
"learning_rate": 5.708896546422721e-05,
"loss": 2.1847,
"step": 243500
},
{
"epoch": 0.9116613252902884,
"grad_norm": 2.7571775913238525,
"learning_rate": 5.694369236403816e-05,
"loss": 2.1453,
"step": 244000
},
{
"epoch": 0.9135294837437522,
"grad_norm": 3.4625306129455566,
"learning_rate": 5.6798359469775195e-05,
"loss": 2.1599,
"step": 244500
},
{
"epoch": 0.9153976421972159,
"grad_norm": 2.573812246322632,
"learning_rate": 5.665296803294042e-05,
"loss": 2.1393,
"step": 245000
},
{
"epoch": 0.9172658006506796,
"grad_norm": 2.3979828357696533,
"learning_rate": 5.650751930554011e-05,
"loss": 2.1714,
"step": 245500
},
{
"epoch": 0.9191339591041433,
"grad_norm": 3.1871445178985596,
"learning_rate": 5.6362014540073884e-05,
"loss": 2.1164,
"step": 246000
},
{
"epoch": 0.921002117557607,
"grad_norm": 2.8169736862182617,
"learning_rate": 5.6216454989523906e-05,
"loss": 2.1343,
"step": 246500
},
{
"epoch": 0.9228702760110707,
"grad_norm": 3.2970011234283447,
"learning_rate": 5.607113318609965e-05,
"loss": 2.1403,
"step": 247000
},
{
"epoch": 0.9247384344645344,
"grad_norm": 2.7862350940704346,
"learning_rate": 5.5925467929508655e-05,
"loss": 2.148,
"step": 247500
},
{
"epoch": 0.9266065929179981,
"grad_norm": 2.888575553894043,
"learning_rate": 5.5779751647058663e-05,
"loss": 2.184,
"step": 248000
},
{
"epoch": 0.9284747513714618,
"grad_norm": 2.52675199508667,
"learning_rate": 5.56339855935533e-05,
"loss": 2.078,
"step": 248500
},
{
"epoch": 0.9303429098249255,
"grad_norm": 2.9500951766967773,
"learning_rate": 5.54881710242247e-05,
"loss": 2.1206,
"step": 249000
},
{
"epoch": 0.9322110682783893,
"grad_norm": 2.5412566661834717,
"learning_rate": 5.5342309194722885e-05,
"loss": 2.1395,
"step": 249500
},
{
"epoch": 0.934079226731853,
"grad_norm": 2.3108468055725098,
"learning_rate": 5.519640136110478e-05,
"loss": 2.1498,
"step": 250000
},
{
"epoch": 0.9359473851853166,
"grad_norm": 2.373042345046997,
"learning_rate": 5.505044877982351e-05,
"loss": 2.1532,
"step": 250500
},
{
"epoch": 0.9378155436387804,
"grad_norm": 2.997445821762085,
"learning_rate": 5.490474474242996e-05,
"loss": 2.1451,
"step": 251000
},
{
"epoch": 0.939683702092244,
"grad_norm": 2.837625741958618,
"learning_rate": 5.4758706519924406e-05,
"loss": 2.1425,
"step": 251500
},
{
"epoch": 0.9415518605457077,
"grad_norm": 2.954401731491089,
"learning_rate": 5.461262731886816e-05,
"loss": 2.1568,
"step": 252000
},
{
"epoch": 0.9434200189991715,
"grad_norm": 3.2825334072113037,
"learning_rate": 5.446650839719003e-05,
"loss": 2.15,
"step": 252500
},
{
"epoch": 0.9452881774526352,
"grad_norm": 3.196861505508423,
"learning_rate": 5.4320643365477844e-05,
"loss": 2.1278,
"step": 253000
},
{
"epoch": 0.9471563359060989,
"grad_norm": 2.7488534450531006,
"learning_rate": 5.417444885085084e-05,
"loss": 2.1859,
"step": 253500
},
{
"epoch": 0.9490244943595626,
"grad_norm": 2.5847301483154297,
"learning_rate": 5.4028218388879116e-05,
"loss": 2.1445,
"step": 254000
},
{
"epoch": 0.9508926528130263,
"grad_norm": 3.6500895023345947,
"learning_rate": 5.388195323879396e-05,
"loss": 2.1439,
"step": 254500
},
{
"epoch": 0.95276081126649,
"grad_norm": 2.848147392272949,
"learning_rate": 5.373594728980722e-05,
"loss": 2.1709,
"step": 255000
},
{
"epoch": 0.9546289697199537,
"grad_norm": 2.592301368713379,
"learning_rate": 5.35899092980915e-05,
"loss": 2.1306,
"step": 255500
},
{
"epoch": 0.9564971281734174,
"grad_norm": 1.9539679288864136,
"learning_rate": 5.344354776311128e-05,
"loss": 2.115,
"step": 256000
},
{
"epoch": 0.9583652866268811,
"grad_norm": 3.211258888244629,
"learning_rate": 5.329715657477968e-05,
"loss": 2.166,
"step": 256500
},
{
"epoch": 0.9602334450803448,
"grad_norm": 2.754812240600586,
"learning_rate": 5.31507369937121e-05,
"loss": 2.1639,
"step": 257000
},
{
"epoch": 0.9621016035338086,
"grad_norm": 2.349533796310425,
"learning_rate": 5.300458320043379e-05,
"loss": 2.155,
"step": 257500
},
{
"epoch": 0.9639697619872722,
"grad_norm": 3.3088858127593994,
"learning_rate": 5.285811066719044e-05,
"loss": 2.1429,
"step": 258000
},
{
"epoch": 0.9658379204407359,
"grad_norm": 3.420562505722046,
"learning_rate": 5.2711613521958034e-05,
"loss": 2.133,
"step": 258500
},
{
"epoch": 0.9677060788941997,
"grad_norm": 2.4579176902770996,
"learning_rate": 5.256509302626437e-05,
"loss": 2.1483,
"step": 259000
},
{
"epoch": 0.9695742373476633,
"grad_norm": 3.574404239654541,
"learning_rate": 5.241855044183839e-05,
"loss": 2.1599,
"step": 259500
},
{
"epoch": 0.971442395801127,
"grad_norm": 2.763312816619873,
"learning_rate": 5.227198703059918e-05,
"loss": 2.1175,
"step": 260000
},
{
"epoch": 0.9733105542545908,
"grad_norm": 3.4662206172943115,
"learning_rate": 5.2125404054645224e-05,
"loss": 2.1439,
"step": 260500
},
{
"epoch": 0.9751787127080545,
"grad_norm": 2.4736666679382324,
"learning_rate": 5.197880277624344e-05,
"loss": 2.166,
"step": 261000
},
{
"epoch": 0.9770468711615182,
"grad_norm": 2.448014974594116,
"learning_rate": 5.1832184457818365e-05,
"loss": 2.1184,
"step": 261500
},
{
"epoch": 0.9789150296149819,
"grad_norm": 2.605496644973755,
"learning_rate": 5.168584364503971e-05,
"loss": 2.0694,
"step": 262000
},
{
"epoch": 0.9807831880684456,
"grad_norm": 2.6576755046844482,
"learning_rate": 5.153919506218703e-05,
"loss": 2.1525,
"step": 262500
},
{
"epoch": 0.9826513465219093,
"grad_norm": 3.0602567195892334,
"learning_rate": 5.139253322489586e-05,
"loss": 2.12,
"step": 263000
},
{
"epoch": 0.984519504975373,
"grad_norm": 2.233271598815918,
"learning_rate": 5.124585939611224e-05,
"loss": 2.124,
"step": 263500
},
{
"epoch": 0.9863876634288367,
"grad_norm": 3.0819501876831055,
"learning_rate": 5.109946821786733e-05,
"loss": 2.1361,
"step": 264000
},
{
"epoch": 0.9882558218823004,
"grad_norm": 2.7308757305145264,
"learning_rate": 5.0952774213009e-05,
"loss": 2.1196,
"step": 264500
},
{
"epoch": 0.9901239803357641,
"grad_norm": 2.309229612350464,
"learning_rate": 5.080607200354588e-05,
"loss": 2.071,
"step": 265000
},
{
"epoch": 0.9919921387892279,
"grad_norm": 3.331204652786255,
"learning_rate": 5.065965627716091e-05,
"loss": 2.0675,
"step": 265500
},
{
"epoch": 0.9938602972426915,
"grad_norm": 3.6821019649505615,
"learning_rate": 5.051294145852407e-05,
"loss": 2.1329,
"step": 266000
},
{
"epoch": 0.9957284556961552,
"grad_norm": 1.9205609560012817,
"learning_rate": 5.036622222280509e-05,
"loss": 2.1563,
"step": 266500
},
{
"epoch": 0.997596614149619,
"grad_norm": 3.6985223293304443,
"learning_rate": 5.021949983344428e-05,
"loss": 2.139,
"step": 267000
},
{
"epoch": 0.9994647726030826,
"grad_norm": 3.8483798503875732,
"learning_rate": 5.007277555390912e-05,
"loss": 2.1531,
"step": 267500
},
{
"epoch": 1.0013329310565464,
"grad_norm": 2.758868932723999,
"learning_rate": 4.992605064768335e-05,
"loss": 2.0257,
"step": 268000
},
{
"epoch": 1.00320108951001,
"grad_norm": 2.7047057151794434,
"learning_rate": 4.9779619825319616e-05,
"loss": 1.9918,
"step": 268500
},
{
"epoch": 1.0050692479634737,
"grad_norm": 3.4775989055633545,
"learning_rate": 4.963289745111303e-05,
"loss": 1.9841,
"step": 269000
},
{
"epoch": 1.0069374064169374,
"grad_norm": 3.1174392700195312,
"learning_rate": 4.9486178238129e-05,
"loss": 1.9998,
"step": 269500
},
{
"epoch": 1.008805564870401,
"grad_norm": 3.418029546737671,
"learning_rate": 4.933946344980765e-05,
"loss": 2.0305,
"step": 270000
},
{
"epoch": 1.010673723323865,
"grad_norm": 4.21517276763916,
"learning_rate": 4.919275434955098e-05,
"loss": 1.9349,
"step": 270500
},
{
"epoch": 1.0125418817773286,
"grad_norm": 3.2260196208953857,
"learning_rate": 4.904605220071203e-05,
"loss": 1.9659,
"step": 271000
},
{
"epoch": 1.0144100402307923,
"grad_norm": 2.354206085205078,
"learning_rate": 4.889935826658396e-05,
"loss": 1.9459,
"step": 271500
},
{
"epoch": 1.016278198684256,
"grad_norm": 2.399245262145996,
"learning_rate": 4.8752967169003024e-05,
"loss": 1.9669,
"step": 272000
},
{
"epoch": 1.0181463571377196,
"grad_norm": 2.836991786956787,
"learning_rate": 4.8606293431139685e-05,
"loss": 1.9754,
"step": 272500
},
{
"epoch": 1.0200145155911835,
"grad_norm": 2.369506597518921,
"learning_rate": 4.845963169487281e-05,
"loss": 1.9748,
"step": 273000
},
{
"epoch": 1.0218826740446472,
"grad_norm": 4.3176140785217285,
"learning_rate": 4.831298322314752e-05,
"loss": 1.9874,
"step": 273500
},
{
"epoch": 1.0237508324981108,
"grad_norm": 2.473726749420166,
"learning_rate": 4.8166349278794803e-05,
"loss": 1.9784,
"step": 274000
},
{
"epoch": 1.0256189909515745,
"grad_norm": 3.3185558319091797,
"learning_rate": 4.8019731124520506e-05,
"loss": 2.0007,
"step": 274500
},
{
"epoch": 1.0274871494050382,
"grad_norm": 3.276498317718506,
"learning_rate": 4.787313002289445e-05,
"loss": 1.9758,
"step": 275000
},
{
"epoch": 1.029355307858502,
"grad_norm": 3.0989725589752197,
"learning_rate": 4.772654723633967e-05,
"loss": 2.0042,
"step": 275500
},
{
"epoch": 1.0312234663119657,
"grad_norm": 2.4186153411865234,
"learning_rate": 4.7580277133162835e-05,
"loss": 2.0053,
"step": 276000
},
{
"epoch": 1.0330916247654294,
"grad_norm": 2.4179837703704834,
"learning_rate": 4.74340277836311e-05,
"loss": 1.9908,
"step": 276500
},
{
"epoch": 1.034959783218893,
"grad_norm": 3.3896212577819824,
"learning_rate": 4.728750742427794e-05,
"loss": 1.9604,
"step": 277000
},
{
"epoch": 1.0368279416723567,
"grad_norm": 2.6385319232940674,
"learning_rate": 4.714101042295578e-05,
"loss": 1.9896,
"step": 277500
},
{
"epoch": 1.0386961001258204,
"grad_norm": 3.6427805423736572,
"learning_rate": 4.6994538041191235e-05,
"loss": 2.0044,
"step": 278000
},
{
"epoch": 1.0405642585792843,
"grad_norm": 3.0906810760498047,
"learning_rate": 4.684809154029888e-05,
"loss": 2.0074,
"step": 278500
},
{
"epoch": 1.042432417032748,
"grad_norm": 3.357675313949585,
"learning_rate": 4.67019649921625e-05,
"loss": 2.0337,
"step": 279000
},
{
"epoch": 1.0443005754862116,
"grad_norm": 3.163966655731201,
"learning_rate": 4.655557397799212e-05,
"loss": 1.9936,
"step": 279500
},
{
"epoch": 1.0461687339396752,
"grad_norm": 2.073416233062744,
"learning_rate": 4.640921262473603e-05,
"loss": 1.9917,
"step": 280000
},
{
"epoch": 1.048036892393139,
"grad_norm": 4.012736797332764,
"learning_rate": 4.626288219275275e-05,
"loss": 1.9811,
"step": 280500
},
{
"epoch": 1.0499050508466028,
"grad_norm": 3.065397262573242,
"learning_rate": 4.611658394213446e-05,
"loss": 2.0052,
"step": 281000
},
{
"epoch": 1.0517732093000665,
"grad_norm": 3.3266775608062744,
"learning_rate": 4.597061162810362e-05,
"loss": 1.997,
"step": 281500
},
{
"epoch": 1.0536413677535301,
"grad_norm": 2.940035820007324,
"learning_rate": 4.582438144871442e-05,
"loss": 1.9267,
"step": 282000
},
{
"epoch": 1.0555095262069938,
"grad_norm": 3.5627119541168213,
"learning_rate": 4.567818722674258e-05,
"loss": 1.973,
"step": 282500
},
{
"epoch": 1.0573776846604575,
"grad_norm": 2.702580213546753,
"learning_rate": 4.553203022110738e-05,
"loss": 1.9818,
"step": 283000
},
{
"epoch": 1.0592458431139213,
"grad_norm": 3.027751922607422,
"learning_rate": 4.538591169040759e-05,
"loss": 2.0195,
"step": 283500
},
{
"epoch": 1.061114001567385,
"grad_norm": 2.598694086074829,
"learning_rate": 4.5239832892910685e-05,
"loss": 1.9988,
"step": 284000
},
{
"epoch": 1.0629821600208487,
"grad_norm": 2.5287024974823,
"learning_rate": 4.5093795086541985e-05,
"loss": 1.9794,
"step": 284500
},
{
"epoch": 1.0648503184743123,
"grad_norm": 2.937054395675659,
"learning_rate": 4.494779952887383e-05,
"loss": 1.9804,
"step": 285000
},
{
"epoch": 1.066718476927776,
"grad_norm": 2.625366687774658,
"learning_rate": 4.48021393369639e-05,
"loss": 2.002,
"step": 285500
},
{
"epoch": 1.0685866353812399,
"grad_norm": 2.97308349609375,
"learning_rate": 4.465623195716817e-05,
"loss": 1.974,
"step": 286000
},
{
"epoch": 1.0704547938347035,
"grad_norm": 2.940298080444336,
"learning_rate": 4.4510370594051275e-05,
"loss": 1.9722,
"step": 286500
},
{
"epoch": 1.0723229522881672,
"grad_norm": 2.5476973056793213,
"learning_rate": 4.436455650366615e-05,
"loss": 2.0061,
"step": 287000
},
{
"epoch": 1.0741911107416309,
"grad_norm": 3.88171124458313,
"learning_rate": 4.4218790941658633e-05,
"loss": 1.9859,
"step": 287500
},
{
"epoch": 1.0760592691950945,
"grad_norm": 2.958958864212036,
"learning_rate": 4.407307516325668e-05,
"loss": 1.9929,
"step": 288000
},
{
"epoch": 1.0779274276485582,
"grad_norm": 3.2626969814300537,
"learning_rate": 4.3927410423259555e-05,
"loss": 2.0427,
"step": 288500
},
{
"epoch": 1.079795586102022,
"grad_norm": 2.726310968399048,
"learning_rate": 4.378208914789977e-05,
"loss": 1.9826,
"step": 289000
},
{
"epoch": 1.0816637445554858,
"grad_norm": 3.683236598968506,
"learning_rate": 4.36365301389968e-05,
"loss": 2.006,
"step": 289500
},
{
"epoch": 1.0835319030089494,
"grad_norm": 3.4819111824035645,
"learning_rate": 4.349102592770976e-05,
"loss": 1.9865,
"step": 290000
},
{
"epoch": 1.085400061462413,
"grad_norm": 3.417532444000244,
"learning_rate": 4.334557776701607e-05,
"loss": 1.9988,
"step": 290500
},
{
"epoch": 1.0872682199158767,
"grad_norm": 2.9879865646362305,
"learning_rate": 4.3200477633104895e-05,
"loss": 1.9888,
"step": 291000
},
{
"epoch": 1.0891363783693406,
"grad_norm": 2.8864903450012207,
"learning_rate": 4.305514521222923e-05,
"loss": 1.9602,
"step": 291500
},
{
"epoch": 1.0910045368228043,
"grad_norm": 3.8783183097839355,
"learning_rate": 4.290987259543744e-05,
"loss": 2.0115,
"step": 292000
},
{
"epoch": 1.092872695276268,
"grad_norm": 3.2339043617248535,
"learning_rate": 4.2764661033712623e-05,
"loss": 2.016,
"step": 292500
},
{
"epoch": 1.0947408537297316,
"grad_norm": 3.942629337310791,
"learning_rate": 4.261951177751206e-05,
"loss": 1.9975,
"step": 293000
},
{
"epoch": 1.0966090121831953,
"grad_norm": 5.084557056427002,
"learning_rate": 4.2474426076756546e-05,
"loss": 1.9484,
"step": 293500
},
{
"epoch": 1.098477170636659,
"grad_norm": 3.621943473815918,
"learning_rate": 4.2329405180819554e-05,
"loss": 1.9364,
"step": 294000
},
{
"epoch": 1.1003453290901228,
"grad_norm": 3.5090487003326416,
"learning_rate": 4.2184450338516527e-05,
"loss": 2.0112,
"step": 294500
},
{
"epoch": 1.1022134875435865,
"grad_norm": 4.1997246742248535,
"learning_rate": 4.204014221253661e-05,
"loss": 1.9631,
"step": 295000
},
{
"epoch": 1.1040816459970502,
"grad_norm": 3.7712690830230713,
"learning_rate": 4.189532294497906e-05,
"loss": 1.9428,
"step": 295500
},
{
"epoch": 1.1059498044505138,
"grad_norm": 4.392169952392578,
"learning_rate": 4.175057346905878e-05,
"loss": 2.0024,
"step": 296000
},
{
"epoch": 1.1078179629039775,
"grad_norm": 3.103431463241577,
"learning_rate": 4.160589503125397e-05,
"loss": 1.9671,
"step": 296500
},
{
"epoch": 1.1096861213574414,
"grad_norm": 2.2490739822387695,
"learning_rate": 4.1461288877431045e-05,
"loss": 1.9978,
"step": 297000
},
{
"epoch": 1.111554279810905,
"grad_norm": 3.9997470378875732,
"learning_rate": 4.1317045243873654e-05,
"loss": 1.9756,
"step": 297500
},
{
"epoch": 1.1134224382643687,
"grad_norm": 3.8243391513824463,
"learning_rate": 4.117258724232387e-05,
"loss": 1.9927,
"step": 298000
},
{
"epoch": 1.1152905967178324,
"grad_norm": 3.207801342010498,
"learning_rate": 4.102820525609035e-05,
"loss": 1.9807,
"step": 298500
},
{
"epoch": 1.117158755171296,
"grad_norm": 2.981112480163574,
"learning_rate": 4.08839005284867e-05,
"loss": 1.9757,
"step": 299000
},
{
"epoch": 1.11902691362476,
"grad_norm": 2.8603618144989014,
"learning_rate": 4.0739674302161204e-05,
"loss": 1.9882,
"step": 299500
},
{
"epoch": 1.1208950720782236,
"grad_norm": 3.422062635421753,
"learning_rate": 4.059552781908619e-05,
"loss": 1.9883,
"step": 300000
},
{
"epoch": 1.1227632305316873,
"grad_norm": 3.2499775886535645,
"learning_rate": 4.045146232054726e-05,
"loss": 1.9715,
"step": 300500
},
{
"epoch": 1.124631388985151,
"grad_norm": 3.5448482036590576,
"learning_rate": 4.030776693079458e-05,
"loss": 1.9895,
"step": 301000
},
{
"epoch": 1.1264995474386146,
"grad_norm": 3.52693510055542,
"learning_rate": 4.016386695421753e-05,
"loss": 1.9936,
"step": 301500
},
{
"epoch": 1.1283677058920785,
"grad_norm": 3.247986078262329,
"learning_rate": 4.002005167932884e-05,
"loss": 1.9916,
"step": 302000
},
{
"epoch": 1.1302358643455421,
"grad_norm": 3.287041425704956,
"learning_rate": 3.987632234456198e-05,
"loss": 1.971,
"step": 302500
},
{
"epoch": 1.1321040227990058,
"grad_norm": 2.758507251739502,
"learning_rate": 3.9732680187610403e-05,
"loss": 2.0091,
"step": 303000
},
{
"epoch": 1.1339721812524695,
"grad_norm": 2.9558610916137695,
"learning_rate": 3.958912644541679e-05,
"loss": 2.0046,
"step": 303500
},
{
"epoch": 1.1358403397059331,
"grad_norm": 3.0163705348968506,
"learning_rate": 3.944566235416254e-05,
"loss": 1.9902,
"step": 304000
},
{
"epoch": 1.1377084981593968,
"grad_norm": 2.4738314151763916,
"learning_rate": 3.9302289149256985e-05,
"loss": 1.969,
"step": 304500
},
{
"epoch": 1.1395766566128607,
"grad_norm": 3.352306604385376,
"learning_rate": 3.915929453473775e-05,
"loss": 1.9639,
"step": 305000
},
{
"epoch": 1.1414448150663243,
"grad_norm": 3.9805781841278076,
"learning_rate": 3.9016106617675985e-05,
"loss": 1.9703,
"step": 305500
},
{
"epoch": 1.143312973519788,
"grad_norm": 2.410222291946411,
"learning_rate": 3.8873013285987326e-05,
"loss": 1.9836,
"step": 306000
},
{
"epoch": 1.1451811319732517,
"grad_norm": 3.830815076828003,
"learning_rate": 3.873030167047204e-05,
"loss": 1.9474,
"step": 306500
},
{
"epoch": 1.1470492904267153,
"grad_norm": 3.884229898452759,
"learning_rate": 3.858740101002805e-05,
"loss": 1.9912,
"step": 307000
},
{
"epoch": 1.1489174488801792,
"grad_norm": 3.097529172897339,
"learning_rate": 3.8444598626660855e-05,
"loss": 1.9851,
"step": 307500
},
{
"epoch": 1.1507856073336429,
"grad_norm": 3.3618969917297363,
"learning_rate": 3.8301895750081664e-05,
"loss": 1.9897,
"step": 308000
},
{
"epoch": 1.1526537657871065,
"grad_norm": 2.846202850341797,
"learning_rate": 3.8159293609144794e-05,
"loss": 1.9649,
"step": 308500
},
{
"epoch": 1.1545219242405702,
"grad_norm": 3.3975071907043457,
"learning_rate": 3.801679343183709e-05,
"loss": 1.9611,
"step": 309000
},
{
"epoch": 1.1563900826940339,
"grad_norm": 3.390746831893921,
"learning_rate": 3.787468113544101e-05,
"loss": 1.9809,
"step": 309500
},
{
"epoch": 1.1582582411474975,
"grad_norm": 3.883208990097046,
"learning_rate": 3.773238835577244e-05,
"loss": 1.9741,
"step": 310000
},
{
"epoch": 1.1601263996009614,
"grad_norm": 2.655240535736084,
"learning_rate": 3.7590201215933385e-05,
"loss": 1.9929,
"step": 310500
},
{
"epoch": 1.161994558054425,
"grad_norm": 3.561328649520874,
"learning_rate": 3.7448120940337014e-05,
"loss": 1.9941,
"step": 311000
},
{
"epoch": 1.1638627165078888,
"grad_norm": 4.378994464874268,
"learning_rate": 3.7306148752476284e-05,
"loss": 1.9692,
"step": 311500
},
{
"epoch": 1.1657308749613524,
"grad_norm": 2.515988826751709,
"learning_rate": 3.716428587491332e-05,
"loss": 1.9721,
"step": 312000
},
{
"epoch": 1.1675990334148163,
"grad_norm": 2.2535147666931152,
"learning_rate": 3.702253352926898e-05,
"loss": 1.9904,
"step": 312500
},
{
"epoch": 1.16946719186828,
"grad_norm": 3.65279483795166,
"learning_rate": 3.688117610505848e-05,
"loss": 1.8969,
"step": 313000
},
{
"epoch": 1.1713353503217436,
"grad_norm": 3.5840914249420166,
"learning_rate": 3.6739648257134945e-05,
"loss": 1.9981,
"step": 313500
},
{
"epoch": 1.1732035087752073,
"grad_norm": 4.6728973388671875,
"learning_rate": 3.659823459780314e-05,
"loss": 2.0034,
"step": 314000
},
{
"epoch": 1.175071667228671,
"grad_norm": 3.8465287685394287,
"learning_rate": 3.6456936344815585e-05,
"loss": 1.9575,
"step": 314500
},
{
"epoch": 1.1769398256821346,
"grad_norm": 3.005547046661377,
"learning_rate": 3.631603696099265e-05,
"loss": 1.9799,
"step": 315000
},
{
"epoch": 1.1788079841355985,
"grad_norm": 3.0555107593536377,
"learning_rate": 3.617497293307507e-05,
"loss": 1.9681,
"step": 315500
},
{
"epoch": 1.1806761425890622,
"grad_norm": 3.1861069202423096,
"learning_rate": 3.6034027956326125e-05,
"loss": 2.0004,
"step": 316000
},
{
"epoch": 1.1825443010425258,
"grad_norm": 3.5906646251678467,
"learning_rate": 3.589320324446236e-05,
"loss": 1.984,
"step": 316500
},
{
"epoch": 1.1844124594959895,
"grad_norm": 3.118577480316162,
"learning_rate": 3.5752500010164694e-05,
"loss": 2.0166,
"step": 317000
},
{
"epoch": 1.1862806179494532,
"grad_norm": 3.639019727706909,
"learning_rate": 3.561220050290951e-05,
"loss": 1.9152,
"step": 317500
},
{
"epoch": 1.188148776402917,
"grad_norm": 2.516979455947876,
"learning_rate": 3.547174360858504e-05,
"loss": 1.9838,
"step": 318000
},
{
"epoch": 1.1900169348563807,
"grad_norm": 4.030247688293457,
"learning_rate": 3.5331411821133284e-05,
"loss": 1.9957,
"step": 318500
},
{
"epoch": 1.1918850933098444,
"grad_norm": 2.944655656814575,
"learning_rate": 3.519120634899048e-05,
"loss": 1.9557,
"step": 319000
},
{
"epoch": 1.193753251763308,
"grad_norm": 2.9035158157348633,
"learning_rate": 3.505112839950505e-05,
"loss": 1.9852,
"step": 319500
},
{
"epoch": 1.1956214102167717,
"grad_norm": 4.2154364585876465,
"learning_rate": 3.491117917892734e-05,
"loss": 1.9863,
"step": 320000
},
{
"epoch": 1.1974895686702354,
"grad_norm": 3.7261621952056885,
"learning_rate": 3.4771359892399204e-05,
"loss": 1.9478,
"step": 320500
},
{
"epoch": 1.1993577271236993,
"grad_norm": 4.7101240158081055,
"learning_rate": 3.463195098856492e-05,
"loss": 1.9688,
"step": 321000
},
{
"epoch": 1.201225885577163,
"grad_norm": 3.4447665214538574,
"learning_rate": 3.44923949151937e-05,
"loss": 1.9768,
"step": 321500
},
{
"epoch": 1.2030940440306266,
"grad_norm": 2.6960058212280273,
"learning_rate": 3.4352972382140294e-05,
"loss": 1.9639,
"step": 322000
},
{
"epoch": 1.2049622024840903,
"grad_norm": 3.2135891914367676,
"learning_rate": 3.421368459001103e-05,
"loss": 2.0298,
"step": 322500
},
{
"epoch": 1.206830360937554,
"grad_norm": 3.953632116317749,
"learning_rate": 3.4074532738252e-05,
"loss": 2.0028,
"step": 323000
},
{
"epoch": 1.2086985193910178,
"grad_norm": 3.091557025909424,
"learning_rate": 3.393551802513865e-05,
"loss": 1.9353,
"step": 323500
},
{
"epoch": 1.2105666778444815,
"grad_norm": 3.2774996757507324,
"learning_rate": 3.379664164776548e-05,
"loss": 1.9976,
"step": 324000
},
{
"epoch": 1.2124348362979451,
"grad_norm": 4.057534694671631,
"learning_rate": 3.365790480203579e-05,
"loss": 1.9577,
"step": 324500
},
{
"epoch": 1.2143029947514088,
"grad_norm": 3.725080728530884,
"learning_rate": 3.351958573365166e-05,
"loss": 1.9619,
"step": 325000
},
{
"epoch": 1.2161711532048725,
"grad_norm": 2.542310953140259,
"learning_rate": 3.338140801561512e-05,
"loss": 1.9413,
"step": 325500
},
{
"epoch": 1.2180393116583361,
"grad_norm": 3.8798625469207764,
"learning_rate": 3.324309635334674e-05,
"loss": 1.9272,
"step": 326000
},
{
"epoch": 1.2199074701118,
"grad_norm": 2.8388006687164307,
"learning_rate": 3.310492898945492e-05,
"loss": 1.9717,
"step": 326500
},
{
"epoch": 1.2217756285652637,
"grad_norm": 3.845374822616577,
"learning_rate": 3.296690711373742e-05,
"loss": 1.9995,
"step": 327000
},
{
"epoch": 1.2236437870187273,
"grad_norm": 3.3350958824157715,
"learning_rate": 3.282903191473914e-05,
"loss": 1.9505,
"step": 327500
},
{
"epoch": 1.225511945472191,
"grad_norm": 3.514188289642334,
"learning_rate": 3.2691304579741944e-05,
"loss": 1.9493,
"step": 328000
},
{
"epoch": 1.2273801039256549,
"grad_norm": 4.140675067901611,
"learning_rate": 3.255372629475436e-05,
"loss": 1.9381,
"step": 328500
},
{
"epoch": 1.2292482623791186,
"grad_norm": 3.2821719646453857,
"learning_rate": 3.241629824450141e-05,
"loss": 1.9647,
"step": 329000
},
{
"epoch": 1.2311164208325822,
"grad_norm": 3.671809434890747,
"learning_rate": 3.227929601377734e-05,
"loss": 1.948,
"step": 329500
},
{
"epoch": 1.2329845792860459,
"grad_norm": 4.461349010467529,
"learning_rate": 3.214244577120278e-05,
"loss": 1.9533,
"step": 330000
},
{
"epoch": 1.2348527377395095,
"grad_norm": 4.116054058074951,
"learning_rate": 3.200547490304101e-05,
"loss": 1.9278,
"step": 330500
},
{
"epoch": 1.2367208961929732,
"grad_norm": 3.0734941959381104,
"learning_rate": 3.1868658990759734e-05,
"loss": 1.9038,
"step": 331000
},
{
"epoch": 1.238589054646437,
"grad_norm": 4.233485698699951,
"learning_rate": 3.173199921251894e-05,
"loss": 1.9466,
"step": 331500
},
{
"epoch": 1.2404572130999008,
"grad_norm": 3.6610071659088135,
"learning_rate": 3.159549674513415e-05,
"loss": 1.9437,
"step": 332000
},
{
"epoch": 1.2423253715533644,
"grad_norm": 3.757662773132324,
"learning_rate": 3.145915276406623e-05,
"loss": 1.9695,
"step": 332500
},
{
"epoch": 1.244193530006828,
"grad_norm": 4.0608062744140625,
"learning_rate": 3.1322968443411296e-05,
"loss": 1.9398,
"step": 333000
},
{
"epoch": 1.2460616884602917,
"grad_norm": 3.5959203243255615,
"learning_rate": 3.118694495589054e-05,
"loss": 1.9154,
"step": 333500
},
{
"epoch": 1.2479298469137556,
"grad_norm": 4.01427698135376,
"learning_rate": 3.105135503334797e-05,
"loss": 1.9268,
"step": 334000
},
{
"epoch": 1.2497980053672193,
"grad_norm": 4.18043851852417,
"learning_rate": 3.091565639719372e-05,
"loss": 1.9349,
"step": 334500
},
{
"epoch": 1.251666163820683,
"grad_norm": 3.132768154144287,
"learning_rate": 3.0780122101651435e-05,
"loss": 1.9476,
"step": 335000
},
{
"epoch": 1.2535343222741466,
"grad_norm": 2.99275803565979,
"learning_rate": 3.0644753313844755e-05,
"loss": 1.9625,
"step": 335500
},
{
"epoch": 1.2554024807276103,
"grad_norm": 3.58479380607605,
"learning_rate": 3.0509551199472118e-05,
"loss": 1.9545,
"step": 336000
},
{
"epoch": 1.257270639181074,
"grad_norm": 3.13480544090271,
"learning_rate": 3.0374786823074896e-05,
"loss": 1.9398,
"step": 336500
},
{
"epoch": 1.2591387976345378,
"grad_norm": 3.130760431289673,
"learning_rate": 3.0239921207753986e-05,
"loss": 1.9582,
"step": 337000
},
{
"epoch": 1.2610069560880015,
"grad_norm": 3.4282748699188232,
"learning_rate": 3.0105225751989453e-05,
"loss": 1.9285,
"step": 337500
},
{
"epoch": 1.2628751145414652,
"grad_norm": 3.996558666229248,
"learning_rate": 2.9970701615681463e-05,
"loss": 1.9397,
"step": 338000
},
{
"epoch": 1.2647432729949288,
"grad_norm": 3.9144933223724365,
"learning_rate": 2.9836349957254927e-05,
"loss": 1.9361,
"step": 338500
},
{
"epoch": 1.2666114314483927,
"grad_norm": 2.7201411724090576,
"learning_rate": 2.9702171933649482e-05,
"loss": 1.9221,
"step": 339000
},
{
"epoch": 1.2684795899018564,
"grad_norm": 3.485480785369873,
"learning_rate": 2.956843653156831e-05,
"loss": 1.951,
"step": 339500
},
{
"epoch": 1.27034774835532,
"grad_norm": 4.514249324798584,
"learning_rate": 2.943460888939414e-05,
"loss": 1.9556,
"step": 340000
},
{
"epoch": 1.2722159068087837,
"grad_norm": 3.043680429458618,
"learning_rate": 2.930095834154558e-05,
"loss": 1.9673,
"step": 340500
},
{
"epoch": 1.2740840652622474,
"grad_norm": 2.636143207550049,
"learning_rate": 2.9167486038924823e-05,
"loss": 1.9492,
"step": 341000
},
{
"epoch": 1.275952223715711,
"grad_norm": 3.6190054416656494,
"learning_rate": 2.9034193130899155e-05,
"loss": 1.9648,
"step": 341500
},
{
"epoch": 1.2778203821691747,
"grad_norm": 4.245516777038574,
"learning_rate": 2.890108076529099e-05,
"loss": 1.9589,
"step": 342000
},
{
"epoch": 1.2796885406226386,
"grad_norm": 3.619927406311035,
"learning_rate": 2.876841576763556e-05,
"loss": 1.9439,
"step": 342500
},
{
"epoch": 1.2815566990761023,
"grad_norm": 3.657912015914917,
"learning_rate": 2.863566755729298e-05,
"loss": 1.9564,
"step": 343000
},
{
"epoch": 1.283424857529566,
"grad_norm": 3.4643499851226807,
"learning_rate": 2.8503103321182943e-05,
"loss": 1.9754,
"step": 343500
},
{
"epoch": 1.2852930159830296,
"grad_norm": 4.774941444396973,
"learning_rate": 2.8370724200853072e-05,
"loss": 1.9406,
"step": 344000
},
{
"epoch": 1.2871611744364935,
"grad_norm": 3.5722765922546387,
"learning_rate": 2.8238531336256975e-05,
"loss": 1.9708,
"step": 344500
},
{
"epoch": 1.2890293328899571,
"grad_norm": 3.9576704502105713,
"learning_rate": 2.8106525865744272e-05,
"loss": 1.9503,
"step": 345000
},
{
"epoch": 1.2908974913434208,
"grad_norm": 4.773796558380127,
"learning_rate": 2.7974972371021873e-05,
"loss": 1.967,
"step": 345500
},
{
"epoch": 1.2927656497968845,
"grad_norm": 3.749734401702881,
"learning_rate": 2.784334471679681e-05,
"loss": 1.9484,
"step": 346000
},
{
"epoch": 1.2946338082503481,
"grad_norm": 4.330195903778076,
"learning_rate": 2.7711907859717524e-05,
"loss": 1.9094,
"step": 346500
},
{
"epoch": 1.2965019667038118,
"grad_norm": 3.0685718059539795,
"learning_rate": 2.758066293162346e-05,
"loss": 1.9195,
"step": 347000
},
{
"epoch": 1.2983701251572755,
"grad_norm": 3.8571877479553223,
"learning_rate": 2.7449611062701342e-05,
"loss": 1.9457,
"step": 347500
},
{
"epoch": 1.3002382836107393,
"grad_norm": 3.673949718475342,
"learning_rate": 2.731875338147545e-05,
"loss": 1.9046,
"step": 348000
},
{
"epoch": 1.302106442064203,
"grad_norm": 3.5845327377319336,
"learning_rate": 2.7188091014797774e-05,
"loss": 1.9871,
"step": 348500
},
{
"epoch": 1.3039746005176667,
"grad_norm": 5.045246124267578,
"learning_rate": 2.7057885822898532e-05,
"loss": 1.9445,
"step": 349000
},
{
"epoch": 1.3058427589711303,
"grad_norm": 4.416993141174316,
"learning_rate": 2.692761706288961e-05,
"loss": 1.9242,
"step": 349500
},
{
"epoch": 1.3077109174245942,
"grad_norm": 5.05975341796875,
"learning_rate": 2.6797546985612997e-05,
"loss": 1.9729,
"step": 350000
},
{
"epoch": 1.3095790758780579,
"grad_norm": 3.4689128398895264,
"learning_rate": 2.6667676711138423e-05,
"loss": 1.9479,
"step": 350500
},
{
"epoch": 1.3114472343315215,
"grad_norm": 3.177008628845215,
"learning_rate": 2.6538266495259985e-05,
"loss": 1.9456,
"step": 351000
},
{
"epoch": 1.3133153927849852,
"grad_norm": 3.6939172744750977,
"learning_rate": 2.6408798774518146e-05,
"loss": 1.934,
"step": 351500
},
{
"epoch": 1.3151835512384489,
"grad_norm": 4.592978477478027,
"learning_rate": 2.6279534204197788e-05,
"loss": 1.8931,
"step": 352000
},
{
"epoch": 1.3170517096919125,
"grad_norm": 4.249555587768555,
"learning_rate": 2.6150473897432166e-05,
"loss": 1.9352,
"step": 352500
},
{
"epoch": 1.3189198681453764,
"grad_norm": 3.4636592864990234,
"learning_rate": 2.6021876469757334e-05,
"loss": 1.9227,
"step": 353000
},
{
"epoch": 1.32078802659884,
"grad_norm": 3.9055769443511963,
"learning_rate": 2.5893227608380464e-05,
"loss": 2.0114,
"step": 353500
},
{
"epoch": 1.3226561850523038,
"grad_norm": 3.659078359603882,
"learning_rate": 2.576478633715232e-05,
"loss": 1.9675,
"step": 354000
},
{
"epoch": 1.3245243435057674,
"grad_norm": 4.109720230102539,
"learning_rate": 2.563655376211658e-05,
"loss": 1.9515,
"step": 354500
},
{
"epoch": 1.3263925019592313,
"grad_norm": 3.4679160118103027,
"learning_rate": 2.550853098751974e-05,
"loss": 1.965,
"step": 355000
},
{
"epoch": 1.328260660412695,
"grad_norm": 3.3445444107055664,
"learning_rate": 2.538097452833215e-05,
"loss": 1.9422,
"step": 355500
},
{
"epoch": 1.3301288188661586,
"grad_norm": 4.475471496582031,
"learning_rate": 2.5253374235012317e-05,
"loss": 1.9533,
"step": 356000
},
{
"epoch": 1.3319969773196223,
"grad_norm": 3.064134359359741,
"learning_rate": 2.5125987041797306e-05,
"loss": 1.9263,
"step": 356500
},
{
"epoch": 1.333865135773086,
"grad_norm": 3.313082218170166,
"learning_rate": 2.4998814045653785e-05,
"loss": 1.8802,
"step": 357000
},
{
"epoch": 1.3357332942265496,
"grad_norm": 5.206328392028809,
"learning_rate": 2.4872110041523282e-05,
"loss": 1.8967,
"step": 357500
},
{
"epoch": 1.3376014526800133,
"grad_norm": 4.334334373474121,
"learning_rate": 2.4745368289174596e-05,
"loss": 1.9429,
"step": 358000
},
{
"epoch": 1.3394696111334772,
"grad_norm": 5.680240154266357,
"learning_rate": 2.4618844011511794e-05,
"loss": 1.9209,
"step": 358500
},
{
"epoch": 1.3413377695869408,
"grad_norm": 3.261059284210205,
"learning_rate": 2.449253829807073e-05,
"loss": 1.9251,
"step": 359000
},
{
"epoch": 1.3432059280404045,
"grad_norm": 3.2310187816619873,
"learning_rate": 2.4366704188693773e-05,
"loss": 1.9056,
"step": 359500
},
{
"epoch": 1.3450740864938682,
"grad_norm": 4.145471096038818,
"learning_rate": 2.424083842220842e-05,
"loss": 1.926,
"step": 360000
},
{
"epoch": 1.346942244947332,
"grad_norm": 4.704455852508545,
"learning_rate": 2.411519447505653e-05,
"loss": 1.9485,
"step": 360500
},
{
"epoch": 1.3488104034007957,
"grad_norm": 3.9618282318115234,
"learning_rate": 2.3989773429193175e-05,
"loss": 1.9304,
"step": 361000
},
{
"epoch": 1.3506785618542594,
"grad_norm": 3.921598434448242,
"learning_rate": 2.3864576364654012e-05,
"loss": 1.91,
"step": 361500
},
{
"epoch": 1.352546720307723,
"grad_norm": 4.026153087615967,
"learning_rate": 2.3739604359545953e-05,
"loss": 1.9588,
"step": 362000
},
{
"epoch": 1.3544148787611867,
"grad_norm": 3.6452534198760986,
"learning_rate": 2.3615107755379164e-05,
"loss": 1.9613,
"step": 362500
},
{
"epoch": 1.3562830372146504,
"grad_norm": 3.757392406463623,
"learning_rate": 2.349058864020204e-05,
"loss": 1.9386,
"step": 363000
},
{
"epoch": 1.358151195668114,
"grad_norm": 4.3105902671813965,
"learning_rate": 2.3366297804968707e-05,
"loss": 1.9171,
"step": 363500
},
{
"epoch": 1.360019354121578,
"grad_norm": 4.3953938484191895,
"learning_rate": 2.3242236319982296e-05,
"loss": 1.9274,
"step": 364000
},
{
"epoch": 1.3618875125750416,
"grad_norm": 3.9918718338012695,
"learning_rate": 2.3118652685036857e-05,
"loss": 1.9505,
"step": 364500
},
{
"epoch": 1.3637556710285053,
"grad_norm": 4.170524597167969,
"learning_rate": 2.2995052639511584e-05,
"loss": 1.9666,
"step": 365000
},
{
"epoch": 1.365623829481969,
"grad_norm": 2.33520245552063,
"learning_rate": 2.2871685141129013e-05,
"loss": 1.8909,
"step": 365500
},
{
"epoch": 1.3674919879354328,
"grad_norm": 3.8575286865234375,
"learning_rate": 2.2748551252241096e-05,
"loss": 1.9036,
"step": 366000
},
{
"epoch": 1.3693601463888965,
"grad_norm": 3.738067150115967,
"learning_rate": 2.262589759672201e-05,
"loss": 1.9242,
"step": 366500
},
{
"epoch": 1.3712283048423601,
"grad_norm": 3.2097079753875732,
"learning_rate": 2.2503233633312364e-05,
"loss": 1.9669,
"step": 367000
},
{
"epoch": 1.3730964632958238,
"grad_norm": 4.111919403076172,
"learning_rate": 2.2380806452236224e-05,
"loss": 1.9115,
"step": 367500
},
{
"epoch": 1.3749646217492875,
"grad_norm": 3.6487059593200684,
"learning_rate": 2.2258617107748202e-05,
"loss": 1.9221,
"step": 368000
},
{
"epoch": 1.3768327802027511,
"grad_norm": 3.9140658378601074,
"learning_rate": 2.213666665205488e-05,
"loss": 1.9077,
"step": 368500
},
{
"epoch": 1.378700938656215,
"grad_norm": 4.236271858215332,
"learning_rate": 2.2015199316183162e-05,
"loss": 1.9248,
"step": 369000
},
{
"epoch": 1.3805690971096787,
"grad_norm": 3.9722940921783447,
"learning_rate": 2.189372930344269e-05,
"loss": 1.9075,
"step": 369500
},
{
"epoch": 1.3824372555631423,
"grad_norm": 3.9439289569854736,
"learning_rate": 2.1772501321647675e-05,
"loss": 1.9325,
"step": 370000
},
{
"epoch": 1.384305414016606,
"grad_norm": 3.183210611343384,
"learning_rate": 2.1651516414726137e-05,
"loss": 1.9372,
"step": 370500
},
{
"epoch": 1.38617357247007,
"grad_norm": 4.380889892578125,
"learning_rate": 2.1530775624512915e-05,
"loss": 1.9119,
"step": 371000
},
{
"epoch": 1.3880417309235336,
"grad_norm": 3.137747049331665,
"learning_rate": 2.1410520736652044e-05,
"loss": 1.8852,
"step": 371500
},
{
"epoch": 1.3899098893769972,
"grad_norm": 4.502001762390137,
"learning_rate": 2.129027080352e-05,
"loss": 1.9157,
"step": 372000
},
{
"epoch": 1.3917780478304609,
"grad_norm": 3.3394224643707275,
"learning_rate": 2.1170268097883096e-05,
"loss": 1.9329,
"step": 372500
},
{
"epoch": 1.3936462062839245,
"grad_norm": 3.0865299701690674,
"learning_rate": 2.1050513653118137e-05,
"loss": 1.9178,
"step": 373000
},
{
"epoch": 1.3955143647373882,
"grad_norm": 4.535000324249268,
"learning_rate": 2.0931247261291493e-05,
"loss": 1.9163,
"step": 373500
},
{
"epoch": 1.3973825231908519,
"grad_norm": 3.5877630710601807,
"learning_rate": 2.0811991928172553e-05,
"loss": 1.9437,
"step": 374000
},
{
"epoch": 1.3992506816443158,
"grad_norm": 4.446563243865967,
"learning_rate": 2.0692987941141717e-05,
"loss": 1.9458,
"step": 374500
},
{
"epoch": 1.4011188400977794,
"grad_norm": 3.427525758743286,
"learning_rate": 2.0574236324975526e-05,
"loss": 1.9163,
"step": 375000
},
{
"epoch": 1.402986998551243,
"grad_norm": 4.324997901916504,
"learning_rate": 2.0455974845157404e-05,
"loss": 1.9447,
"step": 375500
},
{
"epoch": 1.4048551570047068,
"grad_norm": 4.460984706878662,
"learning_rate": 2.0337730526503722e-05,
"loss": 1.8936,
"step": 376000
},
{
"epoch": 1.4067233154581706,
"grad_norm": 3.0335512161254883,
"learning_rate": 2.0219741637935503e-05,
"loss": 1.9274,
"step": 376500
},
{
"epoch": 1.4085914739116343,
"grad_norm": 3.983215808868408,
"learning_rate": 2.010200919548798e-05,
"loss": 1.9456,
"step": 377000
},
{
"epoch": 1.410459632365098,
"grad_norm": 4.645228385925293,
"learning_rate": 1.9984534212988126e-05,
"loss": 1.8914,
"step": 377500
},
{
"epoch": 1.4123277908185616,
"grad_norm": 4.4612250328063965,
"learning_rate": 1.986755187644178e-05,
"loss": 1.9379,
"step": 378000
},
{
"epoch": 1.4141959492720253,
"grad_norm": 3.9466419219970703,
"learning_rate": 1.9750594326473332e-05,
"loss": 1.9053,
"step": 378500
},
{
"epoch": 1.416064107725489,
"grad_norm": 3.384223461151123,
"learning_rate": 1.9633897262584083e-05,
"loss": 1.9777,
"step": 379000
},
{
"epoch": 1.4179322661789528,
"grad_norm": 3.591265916824341,
"learning_rate": 1.9517461689685075e-05,
"loss": 1.9357,
"step": 379500
},
{
"epoch": 1.4198004246324165,
"grad_norm": 4.8993730545043945,
"learning_rate": 1.9401520693960035e-05,
"loss": 1.9063,
"step": 380000
},
{
"epoch": 1.4216685830858802,
"grad_norm": 4.398604869842529,
"learning_rate": 1.9285610580773773e-05,
"loss": 1.8615,
"step": 380500
},
{
"epoch": 1.4235367415393438,
"grad_norm": 3.6538774967193604,
"learning_rate": 1.916996495777159e-05,
"loss": 1.9166,
"step": 381000
},
{
"epoch": 1.4254048999928077,
"grad_norm": 3.730799436569214,
"learning_rate": 1.905458482081028e-05,
"loss": 1.8853,
"step": 381500
},
{
"epoch": 1.4272730584462714,
"grad_norm": 5.199082851409912,
"learning_rate": 1.8939701124169172e-05,
"loss": 1.8736,
"step": 382000
},
{
"epoch": 1.429141216899735,
"grad_norm": 4.507551670074463,
"learning_rate": 1.8824854401777008e-05,
"loss": 1.9045,
"step": 382500
},
{
"epoch": 1.4310093753531987,
"grad_norm": 2.917692184448242,
"learning_rate": 1.8710276137269065e-05,
"loss": 1.8737,
"step": 383000
},
{
"epoch": 1.4328775338066624,
"grad_norm": 4.9208221435546875,
"learning_rate": 1.8595967317310803e-05,
"loss": 1.8852,
"step": 383500
},
{
"epoch": 1.434745692260126,
"grad_norm": 4.914313793182373,
"learning_rate": 1.8481928926247323e-05,
"loss": 1.9188,
"step": 384000
},
{
"epoch": 1.4366138507135897,
"grad_norm": 4.2889556884765625,
"learning_rate": 1.836838920853576e-05,
"loss": 1.9626,
"step": 384500
},
{
"epoch": 1.4384820091670536,
"grad_norm": 4.040252208709717,
"learning_rate": 1.8254894073216665e-05,
"loss": 1.9157,
"step": 385000
},
{
"epoch": 1.4403501676205173,
"grad_norm": 4.800929546356201,
"learning_rate": 1.8141672303869356e-05,
"loss": 1.8893,
"step": 385500
},
{
"epoch": 1.442218326073981,
"grad_norm": 3.5540807247161865,
"learning_rate": 1.8028724875478063e-05,
"loss": 1.9504,
"step": 386000
},
{
"epoch": 1.4440864845274446,
"grad_norm": 3.3006908893585205,
"learning_rate": 1.791627782948606e-05,
"loss": 1.9409,
"step": 386500
},
{
"epoch": 1.4459546429809085,
"grad_norm": 2.976499080657959,
"learning_rate": 1.7803881444967192e-05,
"loss": 1.9083,
"step": 387000
},
{
"epoch": 1.4478228014343721,
"grad_norm": 4.687767505645752,
"learning_rate": 1.7691762310215786e-05,
"loss": 1.9419,
"step": 387500
},
{
"epoch": 1.4496909598878358,
"grad_norm": 4.436933517456055,
"learning_rate": 1.7579921390721e-05,
"loss": 1.9205,
"step": 388000
},
{
"epoch": 1.4515591183412995,
"grad_norm": 4.451811790466309,
"learning_rate": 1.7468582493799596e-05,
"loss": 1.9,
"step": 388500
},
{
"epoch": 1.4534272767947631,
"grad_norm": 4.564020156860352,
"learning_rate": 1.7357300330458897e-05,
"loss": 1.8913,
"step": 389000
},
{
"epoch": 1.4552954352482268,
"grad_norm": 3.211652994155884,
"learning_rate": 1.724629926252035e-05,
"loss": 1.8884,
"step": 389500
},
{
"epoch": 1.4571635937016905,
"grad_norm": 4.224535942077637,
"learning_rate": 1.7135580245845107e-05,
"loss": 1.9185,
"step": 390000
},
{
"epoch": 1.4590317521551543,
"grad_norm": 3.9640257358551025,
"learning_rate": 1.7025364822818328e-05,
"loss": 1.9193,
"step": 390500
},
{
"epoch": 1.460899910608618,
"grad_norm": 3.1013686656951904,
"learning_rate": 1.6915212197670978e-05,
"loss": 1.9274,
"step": 391000
},
{
"epoch": 1.4627680690620817,
"grad_norm": 5.020761966705322,
"learning_rate": 1.68053444748701e-05,
"loss": 1.8856,
"step": 391500
},
{
"epoch": 1.4646362275155453,
"grad_norm": 3.306040048599243,
"learning_rate": 1.6695762600517374e-05,
"loss": 1.9403,
"step": 392000
},
{
"epoch": 1.4665043859690092,
"grad_norm": 4.234299182891846,
"learning_rate": 1.658668582157294e-05,
"loss": 1.8777,
"step": 392500
},
{
"epoch": 1.468372544422473,
"grad_norm": 6.068370342254639,
"learning_rate": 1.6477677896163034e-05,
"loss": 1.8937,
"step": 393000
},
{
"epoch": 1.4702407028759366,
"grad_norm": 4.372175216674805,
"learning_rate": 1.636895864082966e-05,
"loss": 1.9034,
"step": 393500
},
{
"epoch": 1.4721088613294002,
"grad_norm": 4.099493980407715,
"learning_rate": 1.6260528991784696e-05,
"loss": 1.9204,
"step": 394000
},
{
"epoch": 1.4739770197828639,
"grad_norm": 3.7667877674102783,
"learning_rate": 1.6152389882746138e-05,
"loss": 1.9014,
"step": 394500
},
{
"epoch": 1.4758451782363275,
"grad_norm": 2.797348976135254,
"learning_rate": 1.60447576486997e-05,
"loss": 1.9077,
"step": 395000
},
{
"epoch": 1.4777133366897914,
"grad_norm": 4.806083679199219,
"learning_rate": 1.593720182508714e-05,
"loss": 1.9239,
"step": 395500
},
{
"epoch": 1.479581495143255,
"grad_norm": 4.35167121887207,
"learning_rate": 1.58299393257415e-05,
"loss": 1.9147,
"step": 396000
},
{
"epoch": 1.4814496535967188,
"grad_norm": 7.256587982177734,
"learning_rate": 1.5722971074330122e-05,
"loss": 1.9101,
"step": 396500
},
{
"epoch": 1.4833178120501824,
"grad_norm": 4.269795894622803,
"learning_rate": 1.5616511042961456e-05,
"loss": 1.9253,
"step": 397000
},
{
"epoch": 1.4851859705036463,
"grad_norm": 3.5930633544921875,
"learning_rate": 1.551013345518685e-05,
"loss": 1.9399,
"step": 397500
},
{
"epoch": 1.48705412895711,
"grad_norm": 4.802802085876465,
"learning_rate": 1.5404052869284143e-05,
"loss": 1.924,
"step": 398000
},
{
"epoch": 1.4889222874105736,
"grad_norm": 5.457955360412598,
"learning_rate": 1.5298270198742908e-05,
"loss": 1.925,
"step": 398500
},
{
"epoch": 1.4907904458640373,
"grad_norm": 4.350592613220215,
"learning_rate": 1.5192997023342925e-05,
"loss": 1.9841,
"step": 399000
},
{
"epoch": 1.492658604317501,
"grad_norm": 3.5578579902648926,
"learning_rate": 1.5087812313349553e-05,
"loss": 1.8914,
"step": 399500
},
{
"epoch": 1.4945267627709646,
"grad_norm": 4.802867412567139,
"learning_rate": 1.4982928241953386e-05,
"loss": 1.8969,
"step": 400000
},
{
"epoch": 1.4963949212244283,
"grad_norm": 4.002582550048828,
"learning_rate": 1.4878345712340435e-05,
"loss": 1.904,
"step": 400500
},
{
"epoch": 1.4982630796778922,
"grad_norm": 4.3025665283203125,
"learning_rate": 1.4774273882839745e-05,
"loss": 1.916,
"step": 401000
},
{
"epoch": 1.5001312381313558,
"grad_norm": 4.821669101715088,
"learning_rate": 1.4670296528381727e-05,
"loss": 1.8837,
"step": 401500
},
{
"epoch": 1.5019993965848195,
"grad_norm": 3.655703067779541,
"learning_rate": 1.456662340786592e-05,
"loss": 1.95,
"step": 402000
},
{
"epoch": 1.5038675550382834,
"grad_norm": 3.852405548095703,
"learning_rate": 1.4463255414050487e-05,
"loss": 1.8723,
"step": 402500
},
{
"epoch": 1.505735713491747,
"grad_norm": 4.878715515136719,
"learning_rate": 1.4360193437066122e-05,
"loss": 1.8876,
"step": 403000
},
{
"epoch": 1.5076038719452107,
"grad_norm": 4.768284320831299,
"learning_rate": 1.4257643567674483e-05,
"loss": 1.9061,
"step": 403500
},
{
"epoch": 1.5094720303986744,
"grad_norm": 4.845045566558838,
"learning_rate": 1.4155195667736094e-05,
"loss": 1.8932,
"step": 404000
},
{
"epoch": 1.511340188852138,
"grad_norm": 3.8661012649536133,
"learning_rate": 1.4053056437417239e-05,
"loss": 1.9518,
"step": 404500
},
{
"epoch": 1.5132083473056017,
"grad_norm": 4.624420166015625,
"learning_rate": 1.3951226756267382e-05,
"loss": 1.8403,
"step": 405000
},
{
"epoch": 1.5150765057590654,
"grad_norm": 3.6633214950561523,
"learning_rate": 1.3849910229293806e-05,
"loss": 1.8943,
"step": 405500
},
{
"epoch": 1.516944664212529,
"grad_norm": 5.2839155197143555,
"learning_rate": 1.3748701650989005e-05,
"loss": 1.8692,
"step": 406000
},
{
"epoch": 1.518812822665993,
"grad_norm": 3.8412556648254395,
"learning_rate": 1.3647805242737227e-05,
"loss": 1.8699,
"step": 406500
},
{
"epoch": 1.5206809811194566,
"grad_norm": 3.3254265785217285,
"learning_rate": 1.3547221873385652e-05,
"loss": 1.8909,
"step": 407000
},
{
"epoch": 1.5225491395729203,
"grad_norm": 3.2033207416534424,
"learning_rate": 1.3446952409085728e-05,
"loss": 1.8986,
"step": 407500
},
{
"epoch": 1.5244172980263841,
"grad_norm": 4.760767459869385,
"learning_rate": 1.334719730796591e-05,
"loss": 1.8756,
"step": 408000
},
{
"epoch": 1.5262854564798478,
"grad_norm": 4.965844631195068,
"learning_rate": 1.3247557609288142e-05,
"loss": 1.8743,
"step": 408500
},
{
"epoch": 1.5281536149333115,
"grad_norm": 4.014163494110107,
"learning_rate": 1.314823439615473e-05,
"loss": 1.9219,
"step": 409000
},
{
"epoch": 1.5300217733867751,
"grad_norm": 4.178042888641357,
"learning_rate": 1.3049228523865536e-05,
"loss": 1.881,
"step": 409500
},
{
"epoch": 1.5318899318402388,
"grad_norm": 4.607501983642578,
"learning_rate": 1.2950737902223226e-05,
"loss": 1.9469,
"step": 410000
},
{
"epoch": 1.5337580902937025,
"grad_norm": 4.652303695678711,
"learning_rate": 1.2852368627651334e-05,
"loss": 1.8881,
"step": 410500
},
{
"epoch": 1.5356262487471661,
"grad_norm": 4.992543697357178,
"learning_rate": 1.2754319241706458e-05,
"loss": 1.9569,
"step": 411000
},
{
"epoch": 1.5374944072006298,
"grad_norm": 3.5058271884918213,
"learning_rate": 1.2656590588719214e-05,
"loss": 1.9032,
"step": 411500
},
{
"epoch": 1.5393625656540937,
"grad_norm": 3.973353147506714,
"learning_rate": 1.2559183510258338e-05,
"loss": 1.8669,
"step": 412000
},
{
"epoch": 1.5412307241075573,
"grad_norm": 4.776645660400391,
"learning_rate": 1.2462292692129003e-05,
"loss": 1.8993,
"step": 412500
},
{
"epoch": 1.543098882561021,
"grad_norm": 4.160543441772461,
"learning_rate": 1.2365530629011917e-05,
"loss": 1.9269,
"step": 413000
},
{
"epoch": 1.544967041014485,
"grad_norm": 4.14699125289917,
"learning_rate": 1.226909264681978e-05,
"loss": 1.9139,
"step": 413500
},
{
"epoch": 1.5468351994679486,
"grad_norm": 4.639766693115234,
"learning_rate": 1.2172979576006998e-05,
"loss": 1.8844,
"step": 414000
},
{
"epoch": 1.5487033579214122,
"grad_norm": 3.771737575531006,
"learning_rate": 1.207719224423004e-05,
"loss": 1.8961,
"step": 414500
},
{
"epoch": 1.550571516374876,
"grad_norm": 4.165931701660156,
"learning_rate": 1.1981922071418567e-05,
"loss": 1.891,
"step": 415000
},
{
"epoch": 1.5524396748283396,
"grad_norm": 5.3882341384887695,
"learning_rate": 1.1886788033865165e-05,
"loss": 1.8854,
"step": 415500
},
{
"epoch": 1.5543078332818032,
"grad_norm": 4.879900932312012,
"learning_rate": 1.1791982199822898e-05,
"loss": 1.8817,
"step": 416000
},
{
"epoch": 1.5561759917352669,
"grad_norm": 4.769500732421875,
"learning_rate": 1.169750538569126e-05,
"loss": 1.9078,
"step": 416500
},
{
"epoch": 1.5580441501887305,
"grad_norm": 5.184789657592773,
"learning_rate": 1.1603546369284646e-05,
"loss": 1.864,
"step": 417000
},
{
"epoch": 1.5599123086421944,
"grad_norm": 3.5462260246276855,
"learning_rate": 1.1509729370737072e-05,
"loss": 1.9012,
"step": 417500
},
{
"epoch": 1.561780467095658,
"grad_norm": 4.478038311004639,
"learning_rate": 1.1416243822658057e-05,
"loss": 1.8541,
"step": 418000
},
{
"epoch": 1.563648625549122,
"grad_norm": 4.2772650718688965,
"learning_rate": 1.1323090530077756e-05,
"loss": 1.9176,
"step": 418500
},
{
"epoch": 1.5655167840025856,
"grad_norm": 4.45164155960083,
"learning_rate": 1.123045560271172e-05,
"loss": 1.9191,
"step": 419000
},
{
"epoch": 1.5673849424560493,
"grad_norm": 4.31321382522583,
"learning_rate": 1.1137968556258127e-05,
"loss": 1.9104,
"step": 419500
},
{
"epoch": 1.569253100909513,
"grad_norm": 3.313171625137329,
"learning_rate": 1.1045816161609301e-05,
"loss": 1.8969,
"step": 420000
},
{
"epoch": 1.5711212593629766,
"grad_norm": 5.630086898803711,
"learning_rate": 1.0953999212315213e-05,
"loss": 1.8921,
"step": 420500
},
{
"epoch": 1.5729894178164403,
"grad_norm": 4.993584632873535,
"learning_rate": 1.0862518499037283e-05,
"loss": 1.8845,
"step": 421000
},
{
"epoch": 1.574857576269904,
"grad_norm": 5.677700996398926,
"learning_rate": 1.077155676004855e-05,
"loss": 1.8988,
"step": 421500
},
{
"epoch": 1.5767257347233676,
"grad_norm": 4.58486795425415,
"learning_rate": 1.068075020279995e-05,
"loss": 1.9101,
"step": 422000
},
{
"epoch": 1.5785938931768315,
"grad_norm": 4.042180061340332,
"learning_rate": 1.0590282234591004e-05,
"loss": 1.9224,
"step": 422500
},
{
"epoch": 1.5804620516302952,
"grad_norm": 3.4549098014831543,
"learning_rate": 1.0500153634466675e-05,
"loss": 1.8885,
"step": 423000
},
{
"epoch": 1.5823302100837588,
"grad_norm": 4.782561302185059,
"learning_rate": 1.0410544415482986e-05,
"loss": 1.9126,
"step": 423500
},
{
"epoch": 1.5841983685372227,
"grad_norm": 4.326170921325684,
"learning_rate": 1.0321096194361922e-05,
"loss": 1.8519,
"step": 424000
},
{
"epoch": 1.5860665269906864,
"grad_norm": 4.411458492279053,
"learning_rate": 1.0231989659361606e-05,
"loss": 1.8756,
"step": 424500
},
{
"epoch": 1.58793468544415,
"grad_norm": 4.059584140777588,
"learning_rate": 1.0143225577803328e-05,
"loss": 1.897,
"step": 425000
},
{
"epoch": 1.5898028438976137,
"grad_norm": 4.62555456161499,
"learning_rate": 1.0054981212748877e-05,
"loss": 1.9044,
"step": 425500
},
{
"epoch": 1.5916710023510774,
"grad_norm": 3.3062992095947266,
"learning_rate": 9.966903639519581e-06,
"loss": 1.8671,
"step": 426000
},
{
"epoch": 1.593539160804541,
"grad_norm": 3.750192880630493,
"learning_rate": 9.879170802462034e-06,
"loss": 1.9024,
"step": 426500
},
{
"epoch": 1.5954073192580047,
"grad_norm": 3.6934866905212402,
"learning_rate": 9.791783457068221e-06,
"loss": 1.8972,
"step": 427000
},
{
"epoch": 1.5972754777114684,
"grad_norm": 4.577314376831055,
"learning_rate": 9.704916092006999e-06,
"loss": 1.9391,
"step": 427500
},
{
"epoch": 1.5991436361649323,
"grad_norm": 4.8952226638793945,
"learning_rate": 9.618221289776025e-06,
"loss": 1.8756,
"step": 428000
},
{
"epoch": 1.601011794618396,
"grad_norm": 5.817446231842041,
"learning_rate": 9.531874226317888e-06,
"loss": 1.8756,
"step": 428500
},
{
"epoch": 1.6028799530718596,
"grad_norm": 3.9412033557891846,
"learning_rate": 9.445875645191288e-06,
"loss": 1.912,
"step": 429000
},
{
"epoch": 1.6047481115253235,
"grad_norm": 4.50702428817749,
"learning_rate": 9.360397236655304e-06,
"loss": 1.8652,
"step": 429500
},
{
"epoch": 1.6066162699787871,
"grad_norm": 4.587414741516113,
"learning_rate": 9.27509713820291e-06,
"loss": 1.9097,
"step": 430000
},
{
"epoch": 1.6084844284322508,
"grad_norm": 6.312617301940918,
"learning_rate": 9.190147733261234e-06,
"loss": 1.8736,
"step": 430500
},
{
"epoch": 1.6103525868857145,
"grad_norm": 5.86572790145874,
"learning_rate": 9.105549753353348e-06,
"loss": 1.8866,
"step": 431000
},
{
"epoch": 1.6122207453391781,
"grad_norm": 4.819661617279053,
"learning_rate": 9.021303926976055e-06,
"loss": 1.8648,
"step": 431500
},
{
"epoch": 1.6140889037926418,
"grad_norm": 4.977511882781982,
"learning_rate": 8.937578412834564e-06,
"loss": 1.8504,
"step": 432000
},
{
"epoch": 1.6159570622461055,
"grad_norm": 3.8270606994628906,
"learning_rate": 8.85403835895094e-06,
"loss": 1.9031,
"step": 432500
},
{
"epoch": 1.6178252206995691,
"grad_norm": 3.582000255584717,
"learning_rate": 8.770852624432785e-06,
"loss": 1.9016,
"step": 433000
},
{
"epoch": 1.619693379153033,
"grad_norm": 4.828258037567139,
"learning_rate": 8.688021925615658e-06,
"loss": 1.9003,
"step": 433500
},
{
"epoch": 1.6215615376064967,
"grad_norm": 4.899356842041016,
"learning_rate": 8.60571157016748e-06,
"loss": 1.902,
"step": 434000
},
{
"epoch": 1.6234296960599606,
"grad_norm": 3.5516891479492188,
"learning_rate": 8.523592365898686e-06,
"loss": 1.8574,
"step": 434500
},
{
"epoch": 1.6252978545134242,
"grad_norm": 4.53317928314209,
"learning_rate": 8.441830326558064e-06,
"loss": 1.8844,
"step": 435000
},
{
"epoch": 1.627166012966888,
"grad_norm": 6.883234977722168,
"learning_rate": 8.360426156221358e-06,
"loss": 1.859,
"step": 435500
},
{
"epoch": 1.6290341714203516,
"grad_norm": 5.441802024841309,
"learning_rate": 8.279542288766052e-06,
"loss": 1.9012,
"step": 436000
},
{
"epoch": 1.6309023298738152,
"grad_norm": 3.1804521083831787,
"learning_rate": 8.198855237101328e-06,
"loss": 1.8847,
"step": 436500
},
{
"epoch": 1.632770488327279,
"grad_norm": 4.132668972015381,
"learning_rate": 8.118528146766863e-06,
"loss": 1.8517,
"step": 437000
},
{
"epoch": 1.6346386467807426,
"grad_norm": 4.795321464538574,
"learning_rate": 8.038561709481684e-06,
"loss": 1.9175,
"step": 437500
},
{
"epoch": 1.6365068052342062,
"grad_norm": 4.67226505279541,
"learning_rate": 7.959115462975215e-06,
"loss": 1.857,
"step": 438000
},
{
"epoch": 1.63837496368767,
"grad_norm": 5.205322742462158,
"learning_rate": 7.879871669780554e-06,
"loss": 1.8824,
"step": 438500
},
{
"epoch": 1.6402431221411338,
"grad_norm": 5.369668960571289,
"learning_rate": 7.800990584772722e-06,
"loss": 1.876,
"step": 439000
},
{
"epoch": 1.6421112805945974,
"grad_norm": 4.469278335571289,
"learning_rate": 7.722472887218802e-06,
"loss": 1.8871,
"step": 439500
},
{
"epoch": 1.6439794390480613,
"grad_norm": 4.810849189758301,
"learning_rate": 7.644319253256577e-06,
"loss": 1.892,
"step": 440000
},
{
"epoch": 1.645847597501525,
"grad_norm": 5.1172027587890625,
"learning_rate": 7.5666855692307025e-06,
"loss": 1.9003,
"step": 440500
},
{
"epoch": 1.6477157559549886,
"grad_norm": 5.264705181121826,
"learning_rate": 7.48926134684001e-06,
"loss": 1.866,
"step": 441000
},
{
"epoch": 1.6495839144084523,
"grad_norm": 3.7140793800354004,
"learning_rate": 7.41220319629074e-06,
"loss": 1.8958,
"step": 441500
},
{
"epoch": 1.651452072861916,
"grad_norm": 4.509251117706299,
"learning_rate": 7.335511781152121e-06,
"loss": 1.8784,
"step": 442000
},
{
"epoch": 1.6533202313153796,
"grad_norm": 4.2154388427734375,
"learning_rate": 7.259340042775581e-06,
"loss": 1.8476,
"step": 442500
},
{
"epoch": 1.6551883897688433,
"grad_norm": 6.030950546264648,
"learning_rate": 7.183383339768157e-06,
"loss": 1.9157,
"step": 443000
},
{
"epoch": 1.657056548222307,
"grad_norm": 4.760791301727295,
"learning_rate": 7.107795342603074e-06,
"loss": 1.8709,
"step": 443500
},
{
"epoch": 1.6589247066757709,
"grad_norm": 4.554337978363037,
"learning_rate": 7.032576702189675e-06,
"loss": 1.8865,
"step": 444000
},
{
"epoch": 1.6607928651292345,
"grad_norm": 5.714734077453613,
"learning_rate": 6.9578773938351495e-06,
"loss": 1.8687,
"step": 444500
},
{
"epoch": 1.6626610235826982,
"grad_norm": 4.749231338500977,
"learning_rate": 6.883398664985902e-06,
"loss": 1.8953,
"step": 445000
},
{
"epoch": 1.664529182036162,
"grad_norm": 2.8103106021881104,
"learning_rate": 6.809291225230813e-06,
"loss": 1.8854,
"step": 445500
},
{
"epoch": 1.6663973404896257,
"grad_norm": 6.017327308654785,
"learning_rate": 6.735555712729713e-06,
"loss": 1.8829,
"step": 446000
},
{
"epoch": 1.6682654989430894,
"grad_norm": 5.306553363800049,
"learning_rate": 6.662339116102778e-06,
"loss": 1.8542,
"step": 446500
},
{
"epoch": 1.670133657396553,
"grad_norm": 5.078936576843262,
"learning_rate": 6.5893486127564465e-06,
"loss": 1.9077,
"step": 447000
},
{
"epoch": 1.6720018158500167,
"grad_norm": 5.262309551239014,
"learning_rate": 6.516731930651387e-06,
"loss": 1.8863,
"step": 447500
},
{
"epoch": 1.6738699743034804,
"grad_norm": 5.343240261077881,
"learning_rate": 6.444489695110101e-06,
"loss": 1.8784,
"step": 448000
},
{
"epoch": 1.675738132756944,
"grad_norm": 4.112715244293213,
"learning_rate": 6.372622528230676e-06,
"loss": 1.8559,
"step": 448500
},
{
"epoch": 1.6776062912104077,
"grad_norm": 3.1489148139953613,
"learning_rate": 6.301273656494144e-06,
"loss": 1.8633,
"step": 449000
},
{
"epoch": 1.6794744496638716,
"grad_norm": 5.503724575042725,
"learning_rate": 6.230157727089419e-06,
"loss": 1.8898,
"step": 449500
},
{
"epoch": 1.6813426081173353,
"grad_norm": 4.443988800048828,
"learning_rate": 6.159418712018961e-06,
"loss": 1.881,
"step": 450000
},
{
"epoch": 1.6832107665707992,
"grad_norm": 3.3895161151885986,
"learning_rate": 6.089057220436195e-06,
"loss": 1.8802,
"step": 450500
},
{
"epoch": 1.6850789250242628,
"grad_norm": 4.960055828094482,
"learning_rate": 6.0192134471937224e-06,
"loss": 1.8593,
"step": 451000
},
{
"epoch": 1.6869470834777265,
"grad_norm": 4.596670150756836,
"learning_rate": 5.949608058974171e-06,
"loss": 1.8924,
"step": 451500
},
{
"epoch": 1.6888152419311901,
"grad_norm": 3.810817003250122,
"learning_rate": 5.8803820009804165e-06,
"loss": 1.8412,
"step": 452000
},
{
"epoch": 1.6906834003846538,
"grad_norm": 6.2422380447387695,
"learning_rate": 5.8115358693374035e-06,
"loss": 1.875,
"step": 452500
},
{
"epoch": 1.6925515588381175,
"grad_norm": 4.921154499053955,
"learning_rate": 5.7432068079726676e-06,
"loss": 1.8729,
"step": 453000
},
{
"epoch": 1.6944197172915811,
"grad_norm": 5.331964015960693,
"learning_rate": 5.675121541510353e-06,
"loss": 1.8726,
"step": 453500
},
{
"epoch": 1.6962878757450448,
"grad_norm": 4.561686038970947,
"learning_rate": 5.607417968953904e-06,
"loss": 1.8597,
"step": 454000
},
{
"epoch": 1.6981560341985087,
"grad_norm": 5.06734037399292,
"learning_rate": 5.5400966733176905e-06,
"loss": 1.8741,
"step": 454500
},
{
"epoch": 1.7000241926519724,
"grad_norm": 6.29988956451416,
"learning_rate": 5.473291728727564e-06,
"loss": 1.9034,
"step": 455000
},
{
"epoch": 1.701892351105436,
"grad_norm": 5.206850051879883,
"learning_rate": 5.406735955363129e-06,
"loss": 1.8556,
"step": 455500
},
{
"epoch": 1.7037605095589,
"grad_norm": 3.8202433586120605,
"learning_rate": 5.340564187047786e-06,
"loss": 1.8677,
"step": 456000
},
{
"epoch": 1.7056286680123636,
"grad_norm": 3.6107611656188965,
"learning_rate": 5.2747769936051125e-06,
"loss": 1.8593,
"step": 456500
},
{
"epoch": 1.7074968264658272,
"grad_norm": 4.204036235809326,
"learning_rate": 5.20937494154699e-06,
"loss": 1.8571,
"step": 457000
},
{
"epoch": 1.709364984919291,
"grad_norm": 5.234120845794678,
"learning_rate": 5.1444882414578675e-06,
"loss": 1.8433,
"step": 457500
},
{
"epoch": 1.7112331433727546,
"grad_norm": 3.4716298580169678,
"learning_rate": 5.079857385347997e-06,
"loss": 1.8765,
"step": 458000
},
{
"epoch": 1.7131013018262182,
"grad_norm": 5.14175271987915,
"learning_rate": 5.015613349129866e-06,
"loss": 1.9206,
"step": 458500
},
{
"epoch": 1.714969460279682,
"grad_norm": 4.21678352355957,
"learning_rate": 4.951756686026798e-06,
"loss": 1.8835,
"step": 459000
},
{
"epoch": 1.7168376187331456,
"grad_norm": 3.8663065433502197,
"learning_rate": 4.888414495895577e-06,
"loss": 1.8974,
"step": 459500
},
{
"epoch": 1.7187057771866094,
"grad_norm": 4.44641637802124,
"learning_rate": 4.825333447862485e-06,
"loss": 1.8963,
"step": 460000
},
{
"epoch": 1.720573935640073,
"grad_norm": 4.290149211883545,
"learning_rate": 4.762641411497825e-06,
"loss": 1.8818,
"step": 460500
},
{
"epoch": 1.722442094093537,
"grad_norm": 3.1460719108581543,
"learning_rate": 4.700338926660225e-06,
"loss": 1.8916,
"step": 461000
},
{
"epoch": 1.7243102525470007,
"grad_norm": 3.602639675140381,
"learning_rate": 4.63842652985379e-06,
"loss": 1.8656,
"step": 461500
},
{
"epoch": 1.7261784110004643,
"grad_norm": 4.454497337341309,
"learning_rate": 4.577027407582085e-06,
"loss": 1.8377,
"step": 462000
},
{
"epoch": 1.728046569453928,
"grad_norm": 4.91801118850708,
"learning_rate": 4.5158960000806275e-06,
"loss": 1.8708,
"step": 462500
},
{
"epoch": 1.7299147279073916,
"grad_norm": 5.951587200164795,
"learning_rate": 4.45515626889988e-06,
"loss": 1.8598,
"step": 463000
},
{
"epoch": 1.7317828863608553,
"grad_norm": 3.9829583168029785,
"learning_rate": 4.394808737086631e-06,
"loss": 1.8637,
"step": 463500
},
{
"epoch": 1.733651044814319,
"grad_norm": 4.84136962890625,
"learning_rate": 4.334973441658552e-06,
"loss": 1.849,
"step": 464000
},
{
"epoch": 1.7355192032677826,
"grad_norm": 5.9698991775512695,
"learning_rate": 4.275411077223152e-06,
"loss": 1.8716,
"step": 464500
},
{
"epoch": 1.7373873617212465,
"grad_norm": 6.253756046295166,
"learning_rate": 4.216242459991293e-06,
"loss": 1.877,
"step": 465000
},
{
"epoch": 1.7392555201747102,
"grad_norm": 4.6036152839660645,
"learning_rate": 4.157468099480438e-06,
"loss": 1.8532,
"step": 465500
},
{
"epoch": 1.7411236786281739,
"grad_norm": 4.482430934906006,
"learning_rate": 4.099204866700346e-06,
"loss": 1.858,
"step": 466000
},
{
"epoch": 1.7429918370816377,
"grad_norm": 4.4797749519348145,
"learning_rate": 4.041219743568814e-06,
"loss": 1.8436,
"step": 466500
},
{
"epoch": 1.7448599955351014,
"grad_norm": 5.49769926071167,
"learning_rate": 3.983630384327791e-06,
"loss": 1.8767,
"step": 467000
},
{
"epoch": 1.746728153988565,
"grad_norm": 5.328680038452148,
"learning_rate": 3.9264372848953125e-06,
"loss": 1.8929,
"step": 467500
},
{
"epoch": 1.7485963124420287,
"grad_norm": 3.2703754901885986,
"learning_rate": 3.869640937777136e-06,
"loss": 1.7657,
"step": 468000
},
{
"epoch": 1.7504644708954924,
"grad_norm": 4.710208892822266,
"learning_rate": 3.813241832062481e-06,
"loss": 1.868,
"step": 468500
},
{
"epoch": 1.752332629348956,
"grad_norm": 3.9908735752105713,
"learning_rate": 3.7572404534197746e-06,
"loss": 1.9306,
"step": 469000
},
{
"epoch": 1.7542007878024197,
"grad_norm": 5.898683071136475,
"learning_rate": 3.701637284092546e-06,
"loss": 1.8756,
"step": 469500
},
{
"epoch": 1.7560689462558834,
"grad_norm": 5.575063705444336,
"learning_rate": 3.6465428136502942e-06,
"loss": 1.8415,
"step": 470000
},
{
"epoch": 1.7579371047093473,
"grad_norm": 3.8220248222351074,
"learning_rate": 3.591736697164866e-06,
"loss": 1.8549,
"step": 470500
},
{
"epoch": 1.759805263162811,
"grad_norm": 4.483773708343506,
"learning_rate": 3.5373302151939625e-06,
"loss": 1.8414,
"step": 471000
},
{
"epoch": 1.7616734216162746,
"grad_norm": 5.593682289123535,
"learning_rate": 3.4833238362470044e-06,
"loss": 1.8729,
"step": 471500
},
{
"epoch": 1.7635415800697385,
"grad_norm": 3.2169010639190674,
"learning_rate": 3.4298248369353582e-06,
"loss": 1.8556,
"step": 472000
},
{
"epoch": 1.7654097385232022,
"grad_norm": 5.516305923461914,
"learning_rate": 3.3766192532610986e-06,
"loss": 1.8855,
"step": 472500
},
{
"epoch": 1.7672778969766658,
"grad_norm": 5.06584358215332,
"learning_rate": 3.3239203637443983e-06,
"loss": 1.8967,
"step": 473000
},
{
"epoch": 1.7691460554301295,
"grad_norm": 4.666677474975586,
"learning_rate": 3.271517404347946e-06,
"loss": 1.8351,
"step": 473500
},
{
"epoch": 1.7710142138835931,
"grad_norm": 5.4451823234558105,
"learning_rate": 3.2195168369637765e-06,
"loss": 1.8405,
"step": 474000
},
{
"epoch": 1.7728823723370568,
"grad_norm": 4.598884582519531,
"learning_rate": 3.1679191093832883e-06,
"loss": 1.8774,
"step": 474500
},
{
"epoch": 1.7747505307905205,
"grad_norm": 5.018040657043457,
"learning_rate": 3.1167246659289217e-06,
"loss": 1.8544,
"step": 475000
},
{
"epoch": 1.7766186892439841,
"grad_norm": 5.349071502685547,
"learning_rate": 3.065933947450339e-06,
"loss": 1.8779,
"step": 475500
},
{
"epoch": 1.778486847697448,
"grad_norm": 4.253110408782959,
"learning_rate": 3.015547391320589e-06,
"loss": 1.8161,
"step": 476000
},
{
"epoch": 1.7803550061509117,
"grad_norm": 3.6783599853515625,
"learning_rate": 2.9655654314323655e-06,
"loss": 1.8395,
"step": 476500
},
{
"epoch": 1.7822231646043756,
"grad_norm": 4.650113582611084,
"learning_rate": 2.916185998547194e-06,
"loss": 1.8573,
"step": 477000
},
{
"epoch": 1.7840913230578392,
"grad_norm": 4.785963535308838,
"learning_rate": 2.8670128962200117e-06,
"loss": 1.839,
"step": 477500
},
{
"epoch": 1.785959481511303,
"grad_norm": 4.258472442626953,
"learning_rate": 2.818245669206393e-06,
"loss": 1.8937,
"step": 478000
},
{
"epoch": 1.7878276399647666,
"grad_norm": 5.702148914337158,
"learning_rate": 2.7698847374545255e-06,
"loss": 1.8767,
"step": 478500
},
{
"epoch": 1.7896957984182302,
"grad_norm": 5.909474849700928,
"learning_rate": 2.7219305174139067e-06,
"loss": 1.8927,
"step": 479000
},
{
"epoch": 1.791563956871694,
"grad_norm": 4.348086357116699,
"learning_rate": 2.6743834220317286e-06,
"loss": 1.8478,
"step": 479500
},
{
"epoch": 1.7934321153251576,
"grad_norm": 4.148903846740723,
"learning_rate": 2.62724386074929e-06,
"loss": 1.855,
"step": 480000
},
{
"epoch": 1.7953002737786212,
"grad_norm": 4.32988977432251,
"learning_rate": 2.580512239498528e-06,
"loss": 1.8551,
"step": 480500
},
{
"epoch": 1.7971684322320851,
"grad_norm": 4.866036415100098,
"learning_rate": 2.534188960698475e-06,
"loss": 1.8938,
"step": 481000
},
{
"epoch": 1.7990365906855488,
"grad_norm": 4.053302764892578,
"learning_rate": 2.4883658441394673e-06,
"loss": 1.8759,
"step": 481500
},
{
"epoch": 1.8009047491390124,
"grad_norm": 5.242681980133057,
"learning_rate": 2.4428596247633885e-06,
"loss": 1.8914,
"step": 482000
},
{
"epoch": 1.8027729075924763,
"grad_norm": 5.018854141235352,
"learning_rate": 2.3977629332031404e-06,
"loss": 1.8592,
"step": 482500
},
{
"epoch": 1.80464106604594,
"grad_norm": 4.828859329223633,
"learning_rate": 2.3530761577989e-06,
"loss": 1.8676,
"step": 483000
},
{
"epoch": 1.8065092244994037,
"grad_norm": 3.3137731552124023,
"learning_rate": 2.3088878265754845e-06,
"loss": 1.8182,
"step": 483500
},
{
"epoch": 1.8083773829528673,
"grad_norm": 6.416788101196289,
"learning_rate": 2.2650212126383242e-06,
"loss": 1.8656,
"step": 484000
},
{
"epoch": 1.810245541406331,
"grad_norm": 4.340769290924072,
"learning_rate": 2.2215656579332167e-06,
"loss": 1.9075,
"step": 484500
},
{
"epoch": 1.8121136998597946,
"grad_norm": 4.634076118469238,
"learning_rate": 2.17852153666806e-06,
"loss": 1.8799,
"step": 485000
},
{
"epoch": 1.8139818583132583,
"grad_norm": 4.349535942077637,
"learning_rate": 2.1359740729170296e-06,
"loss": 1.8522,
"step": 485500
},
{
"epoch": 1.815850016766722,
"grad_norm": 4.439642429351807,
"learning_rate": 2.0937531022739987e-06,
"loss": 1.8578,
"step": 486000
},
{
"epoch": 1.8177181752201859,
"grad_norm": 4.639336585998535,
"learning_rate": 2.051944665700545e-06,
"loss": 1.883,
"step": 486500
},
{
"epoch": 1.8195863336736495,
"grad_norm": 4.625245571136475,
"learning_rate": 2.010549123220773e-06,
"loss": 1.8886,
"step": 487000
},
{
"epoch": 1.8214544921271132,
"grad_norm": 4.0239667892456055,
"learning_rate": 1.9696483832278845e-06,
"loss": 1.8653,
"step": 487500
},
{
"epoch": 1.823322650580577,
"grad_norm": 4.363647937774658,
"learning_rate": 1.92907886722582e-06,
"loss": 1.8718,
"step": 488000
},
{
"epoch": 1.8251908090340407,
"grad_norm": 4.025300025939941,
"learning_rate": 1.8889233033491493e-06,
"loss": 1.8352,
"step": 488500
},
{
"epoch": 1.8270589674875044,
"grad_norm": 6.883707046508789,
"learning_rate": 1.8491820373886358e-06,
"loss": 1.9056,
"step": 489000
},
{
"epoch": 1.828927125940968,
"grad_norm": 5.169373512268066,
"learning_rate": 1.8098554115674292e-06,
"loss": 1.8994,
"step": 489500
},
{
"epoch": 1.8307952843944317,
"grad_norm": 5.691972255706787,
"learning_rate": 1.7710985840431572e-06,
"loss": 1.8602,
"step": 490000
},
{
"epoch": 1.8326634428478954,
"grad_norm": 4.719027042388916,
"learning_rate": 1.7326005889664986e-06,
"loss": 1.8645,
"step": 490500
},
{
"epoch": 1.834531601301359,
"grad_norm": 5.3066816329956055,
"learning_rate": 1.6945182379445534e-06,
"loss": 1.879,
"step": 491000
},
{
"epoch": 1.8363997597548227,
"grad_norm": 5.338113307952881,
"learning_rate": 1.6568518589150705e-06,
"loss": 1.8811,
"step": 491500
},
{
"epoch": 1.8382679182082866,
"grad_norm": 3.351616382598877,
"learning_rate": 1.61960177623377e-06,
"loss": 1.8459,
"step": 492000
},
{
"epoch": 1.8401360766617503,
"grad_norm": 5.075439929962158,
"learning_rate": 1.5827683106715008e-06,
"loss": 1.8515,
"step": 492500
},
{
"epoch": 1.8420042351152142,
"grad_norm": 4.089956283569336,
"learning_rate": 1.5463517794115367e-06,
"loss": 1.8624,
"step": 493000
},
{
"epoch": 1.8438723935686778,
"grad_norm": 6.492163181304932,
"learning_rate": 1.5103524960467908e-06,
"loss": 1.8245,
"step": 493500
},
{
"epoch": 1.8457405520221415,
"grad_norm": 6.452279567718506,
"learning_rate": 1.4748415171010387e-06,
"loss": 1.8406,
"step": 494000
},
{
"epoch": 1.8476087104756052,
"grad_norm": 3.7838053703308105,
"learning_rate": 1.4396768198986554e-06,
"loss": 1.8508,
"step": 494500
},
{
"epoch": 1.8494768689290688,
"grad_norm": 3.706258535385132,
"learning_rate": 1.4049302891993631e-06,
"loss": 1.8484,
"step": 495000
},
{
"epoch": 1.8513450273825325,
"grad_norm": 4.734787940979004,
"learning_rate": 1.3706022242152227e-06,
"loss": 1.8616,
"step": 495500
},
{
"epoch": 1.8532131858359961,
"grad_norm": 5.525266170501709,
"learning_rate": 1.336760321043634e-06,
"loss": 1.8696,
"step": 496000
},
{
"epoch": 1.8550813442894598,
"grad_norm": 3.555717706680298,
"learning_rate": 1.3032692323137307e-06,
"loss": 1.8539,
"step": 496500
},
{
"epoch": 1.8569495027429237,
"grad_norm": 4.906459331512451,
"learning_rate": 1.2701974847307452e-06,
"loss": 1.8555,
"step": 497000
},
{
"epoch": 1.8588176611963874,
"grad_norm": 5.703590393066406,
"learning_rate": 1.2375453630847134e-06,
"loss": 1.8088,
"step": 497500
},
{
"epoch": 1.860685819649851,
"grad_norm": 4.265283107757568,
"learning_rate": 1.2053771937288626e-06,
"loss": 1.8823,
"step": 498000
},
{
"epoch": 1.862553978103315,
"grad_norm": 4.899601936340332,
"learning_rate": 1.1735643232264836e-06,
"loss": 1.8687,
"step": 498500
},
{
"epoch": 1.8644221365567786,
"grad_norm": 4.975470542907715,
"learning_rate": 1.1422342758236281e-06,
"loss": 1.871,
"step": 499000
},
{
"epoch": 1.8662902950102422,
"grad_norm": 4.806349754333496,
"learning_rate": 1.1112617500700973e-06,
"loss": 1.8244,
"step": 499500
},
{
"epoch": 1.868158453463706,
"grad_norm": 5.105782508850098,
"learning_rate": 1.0807102188935214e-06,
"loss": 1.8867,
"step": 500000
},
{
"epoch": 1.8700266119171696,
"grad_norm": 5.97845458984375,
"learning_rate": 1.050579945381669e-06,
"loss": 1.8339,
"step": 500500
},
{
"epoch": 1.8718947703706332,
"grad_norm": 4.778586387634277,
"learning_rate": 1.0208711889947376e-06,
"loss": 1.8423,
"step": 501000
},
{
"epoch": 1.873762928824097,
"grad_norm": 4.4693169593811035,
"learning_rate": 9.915842055631286e-07,
"loss": 1.8629,
"step": 501500
},
{
"epoch": 1.8756310872775606,
"grad_norm": 5.0336222648620605,
"learning_rate": 9.62719247285221e-07,
"loss": 1.8386,
"step": 502000
},
{
"epoch": 1.8774992457310244,
"grad_norm": 4.51587438583374,
"learning_rate": 9.342765627252504e-07,
"loss": 1.8566,
"step": 502500
},
{
"epoch": 1.879367404184488,
"grad_norm": 4.207951068878174,
"learning_rate": 9.062563968110948e-07,
"loss": 1.8517,
"step": 503000
},
{
"epoch": 1.8812355626379518,
"grad_norm": 3.8609273433685303,
"learning_rate": 8.787137635712206e-07,
"loss": 1.8727,
"step": 503500
},
{
"epoch": 1.8831037210914157,
"grad_norm": 4.1626877784729,
"learning_rate": 8.515385089467198e-07,
"loss": 1.89,
"step": 504000
},
{
"epoch": 1.8849718795448793,
"grad_norm": 3.9561331272125244,
"learning_rate": 8.247864854485199e-07,
"loss": 1.8863,
"step": 504500
},
{
"epoch": 1.886840037998343,
"grad_norm": 4.846907138824463,
"learning_rate": 7.98457923445789e-07,
"loss": 1.8208,
"step": 505000
},
{
"epoch": 1.8887081964518067,
"grad_norm": 4.7613911628723145,
"learning_rate": 7.726044364189499e-07,
"loss": 1.8515,
"step": 505500
},
{
"epoch": 1.8905763549052703,
"grad_norm": 5.021259307861328,
"learning_rate": 7.47122625883645e-07,
"loss": 1.8398,
"step": 506000
},
{
"epoch": 1.892444513358734,
"grad_norm": 6.04338264465332,
"learning_rate": 7.220649456289641e-07,
"loss": 1.8433,
"step": 506500
},
{
"epoch": 1.8943126718121976,
"grad_norm": 4.8739094734191895,
"learning_rate": 6.974316114336077e-07,
"loss": 1.8352,
"step": 507000
},
{
"epoch": 1.8961808302656613,
"grad_norm": 4.441490650177002,
"learning_rate": 6.732708291258827e-07,
"loss": 1.8887,
"step": 507500
},
{
"epoch": 1.8980489887191252,
"grad_norm": 3.811279058456421,
"learning_rate": 6.494859700278133e-07,
"loss": 1.8689,
"step": 508000
},
{
"epoch": 1.8999171471725889,
"grad_norm": 2.8529744148254395,
"learning_rate": 6.26126081986883e-07,
"loss": 1.9027,
"step": 508500
},
{
"epoch": 1.9017853056260527,
"grad_norm": 4.631827354431152,
"learning_rate": 6.031913661616207e-07,
"loss": 1.848,
"step": 509000
},
{
"epoch": 1.9036534640795164,
"grad_norm": 3.616713762283325,
"learning_rate": 5.807266140930689e-07,
"loss": 1.8911,
"step": 509500
},
{
"epoch": 1.90552162253298,
"grad_norm": 5.187899112701416,
"learning_rate": 5.586419802097898e-07,
"loss": 1.8309,
"step": 510000
},
{
"epoch": 1.9073897809864437,
"grad_norm": 5.249440670013428,
"learning_rate": 5.369830996666103e-07,
"loss": 1.8542,
"step": 510500
},
{
"epoch": 1.9092579394399074,
"grad_norm": 5.117617607116699,
"learning_rate": 5.157501589742042e-07,
"loss": 1.8459,
"step": 511000
},
{
"epoch": 1.911126097893371,
"grad_norm": 5.904655456542969,
"learning_rate": 4.949433409753679e-07,
"loss": 1.8495,
"step": 511500
},
{
"epoch": 1.9129942563468347,
"grad_norm": 6.1428632736206055,
"learning_rate": 4.7460316030914495e-07,
"loss": 1.8274,
"step": 512000
},
{
"epoch": 1.9148624148002984,
"grad_norm": 4.737666130065918,
"learning_rate": 4.546482684189279e-07,
"loss": 1.8814,
"step": 512500
},
{
"epoch": 1.9167305732537623,
"grad_norm": 5.555963516235352,
"learning_rate": 4.351200253877141e-07,
"loss": 1.8644,
"step": 513000
},
{
"epoch": 1.918598731707226,
"grad_norm": 4.281107425689697,
"learning_rate": 4.160185993786592e-07,
"loss": 1.8685,
"step": 513500
},
{
"epoch": 1.9204668901606896,
"grad_norm": 4.849224090576172,
"learning_rate": 3.973441548794699e-07,
"loss": 1.8921,
"step": 514000
},
{
"epoch": 1.9223350486141535,
"grad_norm": 5.799472332000732,
"learning_rate": 3.791329209122674e-07,
"loss": 1.8326,
"step": 514500
},
{
"epoch": 1.9242032070676172,
"grad_norm": 5.754580020904541,
"learning_rate": 3.613120634338663e-07,
"loss": 1.8677,
"step": 515000
},
{
"epoch": 1.9260713655210808,
"grad_norm": 4.404658317565918,
"learning_rate": 3.4391865855858406e-07,
"loss": 1.8637,
"step": 515500
},
{
"epoch": 1.9279395239745445,
"grad_norm": 4.911507606506348,
"learning_rate": 3.2695285606589856e-07,
"loss": 1.85,
"step": 516000
},
{
"epoch": 1.9298076824280082,
"grad_norm": 4.071664333343506,
"learning_rate": 3.1044745117284056e-07,
"loss": 1.8303,
"step": 516500
},
{
"epoch": 1.9316758408814718,
"grad_norm": 5.3374223709106445,
"learning_rate": 2.9433643213220284e-07,
"loss": 1.8384,
"step": 517000
},
{
"epoch": 1.9335439993349355,
"grad_norm": 5.541077613830566,
"learning_rate": 2.7865344244054625e-07,
"loss": 1.8562,
"step": 517500
},
{
"epoch": 1.9354121577883991,
"grad_norm": 4.992559432983398,
"learning_rate": 2.6339861714849144e-07,
"loss": 1.8563,
"step": 518000
},
{
"epoch": 1.937280316241863,
"grad_norm": 3.9907846450805664,
"learning_rate": 2.486013131539955e-07,
"loss": 1.8736,
"step": 518500
},
{
"epoch": 1.9391484746953267,
"grad_norm": 3.9517438411712646,
"learning_rate": 2.3420235009178893e-07,
"loss": 1.859,
"step": 519000
},
{
"epoch": 1.9410166331487904,
"grad_norm": 4.987946510314941,
"learning_rate": 2.2023193420994125e-07,
"loss": 1.8258,
"step": 519500
},
{
"epoch": 1.9428847916022542,
"grad_norm": 4.550879955291748,
"learning_rate": 2.0669018581160883e-07,
"loss": 1.8678,
"step": 520000
},
{
"epoch": 1.944752950055718,
"grad_norm": 3.339261293411255,
"learning_rate": 1.936030194349736e-07,
"loss": 1.8278,
"step": 520500
},
{
"epoch": 1.9466211085091816,
"grad_norm": 5.5620951652526855,
"learning_rate": 1.8091809424235495e-07,
"loss": 1.8996,
"step": 521000
},
{
"epoch": 1.9484892669626452,
"grad_norm": 3.614462375640869,
"learning_rate": 1.6866217507570114e-07,
"loss": 1.8478,
"step": 521500
},
{
"epoch": 1.950357425416109,
"grad_norm": 4.48366117477417,
"learning_rate": 1.5683536747416184e-07,
"loss": 1.8555,
"step": 522000
},
{
"epoch": 1.9522255838695726,
"grad_norm": 5.737336158752441,
"learning_rate": 1.454601400492306e-07,
"loss": 1.8463,
"step": 522500
},
{
"epoch": 1.9540937423230362,
"grad_norm": 3.779061794281006,
"learning_rate": 1.3449099869505266e-07,
"loss": 1.8293,
"step": 523000
},
{
"epoch": 1.9559619007765,
"grad_norm": 5.098133087158203,
"learning_rate": 1.239512631635298e-07,
"loss": 1.8594,
"step": 523500
},
{
"epoch": 1.9578300592299638,
"grad_norm": 4.416299343109131,
"learning_rate": 1.1384102421526654e-07,
"loss": 1.8593,
"step": 524000
},
{
"epoch": 1.9596982176834274,
"grad_norm": 3.656932830810547,
"learning_rate": 1.0417930144245858e-07,
"loss": 1.836,
"step": 524500
},
{
"epoch": 1.9615663761368913,
"grad_norm": 5.132260322570801,
"learning_rate": 9.492745373296808e-08,
"loss": 1.8943,
"step": 525000
},
{
"epoch": 1.963434534590355,
"grad_norm": 4.663350582122803,
"learning_rate": 8.61053525388622e-08,
"loss": 1.8534,
"step": 525500
},
{
"epoch": 1.9653026930438187,
"grad_norm": 6.682803153991699,
"learning_rate": 7.77130738297216e-08,
"loss": 1.8735,
"step": 526000
},
{
"epoch": 1.9671708514972823,
"grad_norm": 6.555516719818115,
"learning_rate": 6.976618556056025e-08,
"loss": 1.88,
"step": 526500
},
{
"epoch": 1.969039009950746,
"grad_norm": 5.245980739593506,
"learning_rate": 6.223290493156397e-08,
"loss": 1.8565,
"step": 527000
},
{
"epoch": 1.9709071684042097,
"grad_norm": 3.9505879878997803,
"learning_rate": 5.512965235983658e-08,
"loss": 1.8449,
"step": 527500
},
{
"epoch": 1.9727753268576733,
"grad_norm": 6.470322132110596,
"learning_rate": 4.8456489013481986e-08,
"loss": 1.8588,
"step": 528000
},
{
"epoch": 1.974643485311137,
"grad_norm": 5.629650592803955,
"learning_rate": 4.221347235697226e-08,
"loss": 1.8839,
"step": 528500
},
{
"epoch": 1.9765116437646009,
"grad_norm": 3.961327075958252,
"learning_rate": 3.6411852409129475e-08,
"loss": 1.8824,
"step": 529000
},
{
"epoch": 1.9783798022180645,
"grad_norm": 4.475338935852051,
"learning_rate": 3.1028426160295554e-08,
"loss": 1.8725,
"step": 529500
},
{
"epoch": 1.9802479606715282,
"grad_norm": 6.577774524688721,
"learning_rate": 2.607529667921771e-08,
"loss": 1.8575,
"step": 530000
},
{
"epoch": 1.982116119124992,
"grad_norm": 6.510643005371094,
"learning_rate": 2.1552506618677248e-08,
"loss": 1.8503,
"step": 530500
},
{
"epoch": 1.9839842775784557,
"grad_norm": 4.923540115356445,
"learning_rate": 1.746785020741437e-08,
"loss": 1.8607,
"step": 531000
},
{
"epoch": 1.9858524360319194,
"grad_norm": 4.267704486846924,
"learning_rate": 1.3804991262938994e-08,
"loss": 1.8248,
"step": 531500
},
{
"epoch": 1.987720594485383,
"grad_norm": 5.18399715423584,
"learning_rate": 1.0572577402029326e-08,
"loss": 1.8468,
"step": 532000
},
{
"epoch": 1.9895887529388467,
"grad_norm": 4.5753045082092285,
"learning_rate": 7.770636459902836e-09,
"loss": 1.8354,
"step": 532500
},
{
"epoch": 1.9914569113923104,
"grad_norm": 4.492304801940918,
"learning_rate": 5.403505802398234e-09,
"loss": 1.8668,
"step": 533000
},
{
"epoch": 1.993325069845774,
"grad_norm": 6.207240104675293,
"learning_rate": 3.461718322739227e-09,
"loss": 1.8532,
"step": 533500
},
{
"epoch": 1.9951932282992377,
"grad_norm": 6.569146156311035,
"learning_rate": 1.9504649954538156e-09,
"loss": 1.8313,
"step": 534000
},
{
"epoch": 1.9970613867527016,
"grad_norm": 3.274258852005005,
"learning_rate": 8.69758834370904e-10,
"loss": 1.9104,
"step": 534500
},
{
"epoch": 1.9989295452061653,
"grad_norm": 4.226444721221924,
"learning_rate": 2.2047974543304427e-10,
"loss": 1.8681,
"step": 535000
}
],
"logging_steps": 500,
"max_steps": 535286,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4321334103279616e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}