{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994495412844037, "eval_steps": 500, "global_step": 408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014678899082568808, "grad_norm": 2.871569871902466, "learning_rate": 2.439024390243903e-07, "logits/chosen": -1.156640887260437, "logits/rejected": -2.0261764526367188, "logps/chosen": -291.95379638671875, "logps/rejected": -199.91015625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.029357798165137616, "grad_norm": 2.7688803672790527, "learning_rate": 4.878048780487805e-07, "logits/chosen": -1.1512565612792969, "logits/rejected": -1.9958158731460571, "logps/chosen": -313.67742919921875, "logps/rejected": -219.4925537109375, "loss": 0.6952, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0017839791253209114, "rewards/margins": -0.0021596220321953297, "rewards/rejected": 0.003943601623177528, "step": 4 }, { "epoch": 0.044036697247706424, "grad_norm": 2.8300042152404785, "learning_rate": 7.317073170731707e-07, "logits/chosen": -1.217061996459961, "logits/rejected": -2.1603338718414307, "logps/chosen": -318.8204650878906, "logps/rejected": -219.18704223632812, "loss": 0.6906, "rewards/accuracies": 0.515625, "rewards/chosen": 0.003621376119554043, "rewards/margins": 0.007228089962154627, "rewards/rejected": -0.003606713144108653, "step": 6 }, { "epoch": 0.05871559633027523, "grad_norm": 2.636244058609009, "learning_rate": 9.75609756097561e-07, "logits/chosen": -1.359943151473999, "logits/rejected": -2.125555992126465, "logps/chosen": -271.85272216796875, "logps/rejected": -177.42059326171875, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": 0.010891949757933617, "rewards/margins": 0.005428856238722801, "rewards/rejected": 0.005463093984872103, "step": 8 }, { "epoch": 0.07339449541284404, "grad_norm": 3.117539882659912, "learning_rate": 1.2195121951219514e-06, "logits/chosen": -1.1746495962142944, "logits/rejected": -2.142481565475464, "logps/chosen": -329.56201171875, "logps/rejected": -171.868896484375, "loss": 0.6837, "rewards/accuracies": 0.609375, "rewards/chosen": 0.019563177600502968, "rewards/margins": 0.020585114136338234, "rewards/rejected": -0.0010219333926215768, "step": 10 }, { "epoch": 0.08807339449541285, "grad_norm": 3.573014497756958, "learning_rate": 1.4634146341463414e-06, "logits/chosen": -1.1120442152023315, "logits/rejected": -1.9781230688095093, "logps/chosen": -373.2279052734375, "logps/rejected": -240.803955078125, "loss": 0.6932, "rewards/accuracies": 0.453125, "rewards/chosen": 0.010507804341614246, "rewards/margins": 0.00216490775346756, "rewards/rejected": 0.008342898450791836, "step": 12 }, { "epoch": 0.10275229357798166, "grad_norm": 3.1432557106018066, "learning_rate": 1.707317073170732e-06, "logits/chosen": -1.1176837682724, "logits/rejected": -1.9580059051513672, "logps/chosen": -281.2641296386719, "logps/rejected": -181.50938415527344, "loss": 0.6889, "rewards/accuracies": 0.546875, "rewards/chosen": 0.01434221863746643, "rewards/margins": 0.010814160108566284, "rewards/rejected": 0.0035280571319162846, "step": 14 }, { "epoch": 0.11743119266055047, "grad_norm": 3.08245587348938, "learning_rate": 1.951219512195122e-06, "logits/chosen": -1.2329456806182861, "logits/rejected": -2.0007548332214355, "logps/chosen": -292.1178894042969, "logps/rejected": -199.83258056640625, "loss": 0.6897, "rewards/accuracies": 0.625, "rewards/chosen": 0.004180246964097023, "rewards/margins": 0.008593017235398293, "rewards/rejected": -0.004412769805639982, "step": 16 }, { "epoch": 0.13211009174311927, "grad_norm": 3.315281391143799, "learning_rate": 2.1951219512195125e-06, "logits/chosen": -1.1571717262268066, "logits/rejected": -2.041630268096924, "logps/chosen": -337.57818603515625, "logps/rejected": -212.22586059570312, "loss": 0.6881, "rewards/accuracies": 0.609375, "rewards/chosen": 0.012045616284012794, "rewards/margins": 0.011737149208784103, "rewards/rejected": 0.0003084660565946251, "step": 18 }, { "epoch": 0.14678899082568808, "grad_norm": 3.288015127182007, "learning_rate": 2.4390243902439027e-06, "logits/chosen": -1.170533299446106, "logits/rejected": -2.111523389816284, "logps/chosen": -332.5646057128906, "logps/rejected": -171.13861083984375, "loss": 0.6866, "rewards/accuracies": 0.59375, "rewards/chosen": 0.018359623849391937, "rewards/margins": 0.014990389347076416, "rewards/rejected": 0.003369236597791314, "step": 20 }, { "epoch": 0.1614678899082569, "grad_norm": 3.0890462398529053, "learning_rate": 2.682926829268293e-06, "logits/chosen": -1.326155662536621, "logits/rejected": -2.235764265060425, "logps/chosen": -321.82012939453125, "logps/rejected": -199.34010314941406, "loss": 0.6867, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009998606517910957, "rewards/margins": 0.014448178000748158, "rewards/rejected": -0.0044495705515146255, "step": 22 }, { "epoch": 0.1761467889908257, "grad_norm": 3.174973249435425, "learning_rate": 2.926829268292683e-06, "logits/chosen": -1.1311931610107422, "logits/rejected": -2.1738736629486084, "logps/chosen": -394.0300598144531, "logps/rejected": -168.5726776123047, "loss": 0.6941, "rewards/accuracies": 0.390625, "rewards/chosen": -0.001855961512774229, "rewards/margins": -0.00018751714378595352, "rewards/rejected": -0.0016684436704963446, "step": 24 }, { "epoch": 0.1908256880733945, "grad_norm": 2.7846882343292236, "learning_rate": 3.1707317073170736e-06, "logits/chosen": -1.315462589263916, "logits/rejected": -2.179847478866577, "logps/chosen": -349.72467041015625, "logps/rejected": -194.91355895996094, "loss": 0.6842, "rewards/accuracies": 0.59375, "rewards/chosen": 0.020769033581018448, "rewards/margins": 0.020399674773216248, "rewards/rejected": 0.00036935764364898205, "step": 26 }, { "epoch": 0.20550458715596331, "grad_norm": 2.960986852645874, "learning_rate": 3.414634146341464e-06, "logits/chosen": -1.218693733215332, "logits/rejected": -2.219115734100342, "logps/chosen": -303.5213928222656, "logps/rejected": -176.81622314453125, "loss": 0.693, "rewards/accuracies": 0.515625, "rewards/chosen": 0.001343409065157175, "rewards/margins": 0.0020109512843191624, "rewards/rejected": -0.0006675421027466655, "step": 28 }, { "epoch": 0.22018348623853212, "grad_norm": 2.6187989711761475, "learning_rate": 3.6585365853658537e-06, "logits/chosen": -1.2147996425628662, "logits/rejected": -2.09503173828125, "logps/chosen": -311.60198974609375, "logps/rejected": -211.1887664794922, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": 0.027364609763026237, "rewards/margins": 0.022295203059911728, "rewards/rejected": 0.0050694081000983715, "step": 30 }, { "epoch": 0.23486238532110093, "grad_norm": 3.18058180809021, "learning_rate": 3.902439024390244e-06, "logits/chosen": -1.269258975982666, "logits/rejected": -2.129913806915283, "logps/chosen": -310.4969787597656, "logps/rejected": -175.62393188476562, "loss": 0.6784, "rewards/accuracies": 0.65625, "rewards/chosen": 0.02811383828520775, "rewards/margins": 0.031415536999702454, "rewards/rejected": -0.0033016952220350504, "step": 32 }, { "epoch": 0.24954128440366974, "grad_norm": 3.44490647315979, "learning_rate": 4.146341463414634e-06, "logits/chosen": -1.2504366636276245, "logits/rejected": -2.2198028564453125, "logps/chosen": -346.65069580078125, "logps/rejected": -176.64193725585938, "loss": 0.6835, "rewards/accuracies": 0.578125, "rewards/chosen": 0.027392717078328133, "rewards/margins": 0.02168484590947628, "rewards/rejected": 0.0057078697718679905, "step": 34 }, { "epoch": 0.26422018348623855, "grad_norm": 2.8181567192077637, "learning_rate": 4.390243902439025e-06, "logits/chosen": -1.2708137035369873, "logits/rejected": -2.0570731163024902, "logps/chosen": -332.41156005859375, "logps/rejected": -219.01556396484375, "loss": 0.6776, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04396076127886772, "rewards/margins": 0.0344666913151741, "rewards/rejected": 0.009494070895016193, "step": 36 }, { "epoch": 0.27889908256880735, "grad_norm": 3.29911208152771, "learning_rate": 4.634146341463416e-06, "logits/chosen": -1.2899575233459473, "logits/rejected": -2.1684398651123047, "logps/chosen": -316.49993896484375, "logps/rejected": -214.9636688232422, "loss": 0.6867, "rewards/accuracies": 0.59375, "rewards/chosen": 0.022868501022458076, "rewards/margins": 0.014677047729492188, "rewards/rejected": 0.008191454224288464, "step": 38 }, { "epoch": 0.29357798165137616, "grad_norm": 2.80910325050354, "learning_rate": 4.8780487804878055e-06, "logits/chosen": -1.1400400400161743, "logits/rejected": -2.0709128379821777, "logps/chosen": -368.51824951171875, "logps/rejected": -194.36216735839844, "loss": 0.6702, "rewards/accuracies": 0.703125, "rewards/chosen": 0.04931124299764633, "rewards/margins": 0.04868461191654205, "rewards/rejected": 0.0006266293348744512, "step": 40 }, { "epoch": 0.30825688073394497, "grad_norm": 3.187028169631958, "learning_rate": 4.999908404322799e-06, "logits/chosen": -1.142716646194458, "logits/rejected": -2.20780348777771, "logps/chosen": -343.4991760253906, "logps/rejected": -184.4697265625, "loss": 0.6621, "rewards/accuracies": 0.859375, "rewards/chosen": 0.06981995701789856, "rewards/margins": 0.06430794298648834, "rewards/rejected": 0.005512019619345665, "step": 42 }, { "epoch": 0.3229357798165138, "grad_norm": 2.664074659347534, "learning_rate": 4.999175679175577e-06, "logits/chosen": -1.209214448928833, "logits/rejected": -2.1323928833007812, "logps/chosen": -270.0044860839844, "logps/rejected": -171.32073974609375, "loss": 0.656, "rewards/accuracies": 0.828125, "rewards/chosen": 0.07609987258911133, "rewards/margins": 0.07787147164344788, "rewards/rejected": -0.0017715932335704565, "step": 44 }, { "epoch": 0.3376146788990826, "grad_norm": 2.661236047744751, "learning_rate": 4.997710443643461e-06, "logits/chosen": -1.235365629196167, "logits/rejected": -2.0518736839294434, "logps/chosen": -279.3170166015625, "logps/rejected": -219.13522338867188, "loss": 0.6659, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0708962082862854, "rewards/margins": 0.05834145471453667, "rewards/rejected": 0.012554753571748734, "step": 46 }, { "epoch": 0.3522935779816514, "grad_norm": 3.9819839000701904, "learning_rate": 4.995513127188151e-06, "logits/chosen": -1.1877082586288452, "logits/rejected": -2.2009482383728027, "logps/chosen": -392.36041259765625, "logps/rejected": -197.9148406982422, "loss": 0.661, "rewards/accuracies": 0.765625, "rewards/chosen": 0.10185055434703827, "rewards/margins": 0.06925681233406067, "rewards/rejected": 0.0325937457382679, "step": 48 }, { "epoch": 0.3669724770642202, "grad_norm": 3.3627212047576904, "learning_rate": 4.992584373844853e-06, "logits/chosen": -1.3079514503479004, "logits/rejected": -2.1042516231536865, "logps/chosen": -367.0893859863281, "logps/rejected": -195.80905151367188, "loss": 0.6609, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10206526517868042, "rewards/margins": 0.06879469007253647, "rewards/rejected": 0.03327057510614395, "step": 50 }, { "epoch": 0.381651376146789, "grad_norm": 3.4364843368530273, "learning_rate": 4.98892504203351e-06, "logits/chosen": -1.3703242540359497, "logits/rejected": -2.135772228240967, "logps/chosen": -305.8392639160156, "logps/rejected": -170.4441680908203, "loss": 0.6426, "rewards/accuracies": 0.921875, "rewards/chosen": 0.13675755262374878, "rewards/margins": 0.10614188760519028, "rewards/rejected": 0.030615665018558502, "step": 52 }, { "epoch": 0.3963302752293578, "grad_norm": 3.014284372329712, "learning_rate": 4.9845362043071925e-06, "logits/chosen": -1.1213593482971191, "logits/rejected": -2.040038585662842, "logps/chosen": -311.7105712890625, "logps/rejected": -176.02438354492188, "loss": 0.6448, "rewards/accuracies": 0.796875, "rewards/chosen": 0.13455447554588318, "rewards/margins": 0.10230613499879837, "rewards/rejected": 0.03224834054708481, "step": 54 }, { "epoch": 0.41100917431192663, "grad_norm": 3.0536396503448486, "learning_rate": 4.97941914703774e-06, "logits/chosen": -1.2472190856933594, "logits/rejected": -2.175790309906006, "logps/chosen": -310.2051086425781, "logps/rejected": -214.69712829589844, "loss": 0.6303, "rewards/accuracies": 0.859375, "rewards/chosen": 0.18810473382472992, "rewards/margins": 0.13538572192192078, "rewards/rejected": 0.052719030529260635, "step": 56 }, { "epoch": 0.42568807339449544, "grad_norm": 3.687453031539917, "learning_rate": 4.973575370038718e-06, "logits/chosen": -1.161484956741333, "logits/rejected": -2.056807518005371, "logps/chosen": -331.156005859375, "logps/rejected": -206.752685546875, "loss": 0.6109, "rewards/accuracies": 0.84375, "rewards/chosen": 0.22813093662261963, "rewards/margins": 0.178737074136734, "rewards/rejected": 0.04939386993646622, "step": 58 }, { "epoch": 0.44036697247706424, "grad_norm": 2.6800389289855957, "learning_rate": 4.967006586125827e-06, "logits/chosen": -1.3047680854797363, "logits/rejected": -2.1053338050842285, "logps/chosen": -320.47052001953125, "logps/rejected": -198.96849060058594, "loss": 0.5949, "rewards/accuracies": 0.984375, "rewards/chosen": 0.25883767008781433, "rewards/margins": 0.21324561536312103, "rewards/rejected": 0.0455920584499836, "step": 60 }, { "epoch": 0.45504587155963305, "grad_norm": 3.319866180419922, "learning_rate": 4.959714720614871e-06, "logits/chosen": -1.2463948726654053, "logits/rejected": -2.2376761436462402, "logps/chosen": -343.1983642578125, "logps/rejected": -197.24610900878906, "loss": 0.5745, "rewards/accuracies": 0.90625, "rewards/chosen": 0.3166658878326416, "rewards/margins": 0.26522931456565857, "rewards/rejected": 0.05143657326698303, "step": 62 }, { "epoch": 0.46972477064220186, "grad_norm": 2.6847336292266846, "learning_rate": 4.951701910757446e-06, "logits/chosen": -1.252946138381958, "logits/rejected": -2.0270633697509766, "logps/chosen": -273.5660400390625, "logps/rejected": -200.33984375, "loss": 0.5721, "rewards/accuracies": 0.953125, "rewards/chosen": 0.3213706910610199, "rewards/margins": 0.2726665437221527, "rewards/rejected": 0.04870418459177017, "step": 64 }, { "epoch": 0.48440366972477067, "grad_norm": 3.1882617473602295, "learning_rate": 4.942970505114514e-06, "logits/chosen": -1.1212793588638306, "logits/rejected": -2.0485286712646484, "logps/chosen": -329.2900390625, "logps/rejected": -188.50067138671875, "loss": 0.551, "rewards/accuracies": 0.921875, "rewards/chosen": 0.37985220551490784, "rewards/margins": 0.32368168234825134, "rewards/rejected": 0.056170523166656494, "step": 66 }, { "epoch": 0.4990825688073395, "grad_norm": 2.6009716987609863, "learning_rate": 4.933523062868033e-06, "logits/chosen": -1.1749910116195679, "logits/rejected": -2.1656789779663086, "logps/chosen": -290.49560546875, "logps/rejected": -177.18348693847656, "loss": 0.5495, "rewards/accuracies": 0.9375, "rewards/chosen": 0.399168461561203, "rewards/margins": 0.3291959762573242, "rewards/rejected": 0.0699724480509758, "step": 68 }, { "epoch": 0.5137614678899083, "grad_norm": 2.7933292388916016, "learning_rate": 4.923362353070859e-06, "logits/chosen": -0.9930830597877502, "logits/rejected": -2.1664011478424072, "logps/chosen": -308.12164306640625, "logps/rejected": -170.24810791015625, "loss": 0.5133, "rewards/accuracies": 0.953125, "rewards/chosen": 0.4771941602230072, "rewards/margins": 0.4229365289211273, "rewards/rejected": 0.05425760895013809, "step": 70 }, { "epoch": 0.5284403669724771, "grad_norm": 2.4665513038635254, "learning_rate": 4.912491353835138e-06, "logits/chosen": -1.2331562042236328, "logits/rejected": -2.0544230937957764, "logps/chosen": -277.6913757324219, "logps/rejected": -196.8771209716797, "loss": 0.5365, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4595210552215576, "rewards/margins": 0.3660896122455597, "rewards/rejected": 0.09343138337135315, "step": 72 }, { "epoch": 0.5431192660550459, "grad_norm": 2.463873863220215, "learning_rate": 4.900913251459418e-06, "logits/chosen": -1.1638422012329102, "logits/rejected": -2.0549814701080322, "logps/chosen": -280.3222961425781, "logps/rejected": -182.9549560546875, "loss": 0.5144, "rewards/accuracies": 0.953125, "rewards/chosen": 0.49483105540275574, "rewards/margins": 0.4287148714065552, "rewards/rejected": 0.06611625105142593, "step": 74 }, { "epoch": 0.5577981651376147, "grad_norm": 2.5419061183929443, "learning_rate": 4.8886314394947396e-06, "logits/chosen": -1.0577822923660278, "logits/rejected": -2.03446364402771, "logps/chosen": -299.0617980957031, "logps/rejected": -196.64585876464844, "loss": 0.4634, "rewards/accuracies": 0.984375, "rewards/chosen": 0.6822911500930786, "rewards/margins": 0.5850739479064941, "rewards/rejected": 0.09721729159355164, "step": 76 }, { "epoch": 0.5724770642201835, "grad_norm": 2.5450778007507324, "learning_rate": 4.875649517749985e-06, "logits/chosen": -1.0982365608215332, "logits/rejected": -2.1213526725769043, "logps/chosen": -301.862548828125, "logps/rejected": -203.84742736816406, "loss": 0.4663, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6690702438354492, "rewards/margins": 0.5742719769477844, "rewards/rejected": 0.0947982519865036, "step": 78 }, { "epoch": 0.5871559633027523, "grad_norm": 2.306406259536743, "learning_rate": 4.861971291236772e-06, "logits/chosen": -1.243112325668335, "logits/rejected": -2.0873706340789795, "logps/chosen": -346.6309509277344, "logps/rejected": -203.404052734375, "loss": 0.4685, "rewards/accuracies": 0.890625, "rewards/chosen": 0.7503749132156372, "rewards/margins": 0.5946022868156433, "rewards/rejected": 0.15577253699302673, "step": 80 }, { "epoch": 0.6018348623853211, "grad_norm": 2.5219640731811523, "learning_rate": 4.847600769054201e-06, "logits/chosen": -1.2759498357772827, "logits/rejected": -2.1124911308288574, "logps/chosen": -385.54498291015625, "logps/rejected": -234.3006591796875, "loss": 0.425, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8981677293777466, "rewards/margins": 0.7219379544258118, "rewards/rejected": 0.1762298047542572, "step": 82 }, { "epoch": 0.6165137614678899, "grad_norm": 2.297736406326294, "learning_rate": 4.832542163213787e-06, "logits/chosen": -1.1348319053649902, "logits/rejected": -2.198058605194092, "logps/chosen": -278.73016357421875, "logps/rejected": -165.02432250976562, "loss": 0.4082, "rewards/accuracies": 0.953125, "rewards/chosen": 0.8287730813026428, "rewards/margins": 0.7731601595878601, "rewards/rejected": 0.05561291426420212, "step": 84 }, { "epoch": 0.6311926605504588, "grad_norm": 2.225381851196289, "learning_rate": 4.816799887404911e-06, "logits/chosen": -1.299065351486206, "logits/rejected": -2.1710290908813477, "logps/chosen": -316.4049377441406, "logps/rejected": -197.56884765625, "loss": 0.4463, "rewards/accuracies": 0.984375, "rewards/chosen": 0.8337810039520264, "rewards/margins": 0.6480390429496765, "rewards/rejected": 0.18574194610118866, "step": 86 }, { "epoch": 0.6458715596330276, "grad_norm": 2.1773393154144287, "learning_rate": 4.800378555701168e-06, "logits/chosen": -1.145480751991272, "logits/rejected": -2.0223851203918457, "logps/chosen": -370.3527526855469, "logps/rejected": -196.968505859375, "loss": 0.4075, "rewards/accuracies": 0.96875, "rewards/chosen": 0.898174524307251, "rewards/margins": 0.7861010432243347, "rewards/rejected": 0.11207354068756104, "step": 88 }, { "epoch": 0.6605504587155964, "grad_norm": 2.2705249786376953, "learning_rate": 4.783282981207979e-06, "logits/chosen": -1.191556453704834, "logits/rejected": -2.307077407836914, "logps/chosen": -312.4258728027344, "logps/rejected": -179.88075256347656, "loss": 0.3893, "rewards/accuracies": 0.953125, "rewards/chosen": 1.0026105642318726, "rewards/margins": 0.8912415504455566, "rewards/rejected": 0.11136899888515472, "step": 90 }, { "epoch": 0.6752293577981652, "grad_norm": 1.855381965637207, "learning_rate": 4.765518174651864e-06, "logits/chosen": -1.1708786487579346, "logits/rejected": -2.0928103923797607, "logps/chosen": -301.8147277832031, "logps/rejected": -201.09478759765625, "loss": 0.3757, "rewards/accuracies": 0.953125, "rewards/chosen": 0.9942986369132996, "rewards/margins": 0.8888772130012512, "rewards/rejected": 0.10542140901088715, "step": 92 }, { "epoch": 0.689908256880734, "grad_norm": 2.0521061420440674, "learning_rate": 4.747089342911793e-06, "logits/chosen": -1.011386513710022, "logits/rejected": -2.1828246116638184, "logps/chosen": -308.777099609375, "logps/rejected": -185.42471313476562, "loss": 0.3329, "rewards/accuracies": 0.984375, "rewards/chosen": 1.1515001058578491, "rewards/margins": 1.04723060131073, "rewards/rejected": 0.10426945239305496, "step": 94 }, { "epoch": 0.7045871559633028, "grad_norm": 1.8322721719741821, "learning_rate": 4.728001887493048e-06, "logits/chosen": -1.0440161228179932, "logits/rejected": -2.2036566734313965, "logps/chosen": -317.36346435546875, "logps/rejected": -204.70556640625, "loss": 0.3371, "rewards/accuracies": 0.953125, "rewards/chosen": 1.257871150970459, "rewards/margins": 1.1031622886657715, "rewards/rejected": 0.15470871329307556, "step": 96 }, { "epoch": 0.7192660550458716, "grad_norm": 1.716375708580017, "learning_rate": 4.708261402944036e-06, "logits/chosen": -1.1383062601089478, "logits/rejected": -2.189666271209717, "logps/chosen": -333.7127380371094, "logps/rejected": -198.931884765625, "loss": 0.2993, "rewards/accuracies": 0.984375, "rewards/chosen": 1.3938922882080078, "rewards/margins": 1.2491440773010254, "rewards/rejected": 0.14474821090698242, "step": 98 }, { "epoch": 0.7339449541284404, "grad_norm": 1.7844756841659546, "learning_rate": 4.687873675216522e-06, "logits/chosen": -1.0265507698059082, "logits/rejected": -1.989030122756958, "logps/chosen": -318.661865234375, "logps/rejected": -211.2397918701172, "loss": 0.3127, "rewards/accuracies": 0.953125, "rewards/chosen": 1.453789472579956, "rewards/margins": 1.268122673034668, "rewards/rejected": 0.1856667846441269, "step": 100 }, { "epoch": 0.7486238532110092, "grad_norm": 1.7730361223220825, "learning_rate": 4.666844679969765e-06, "logits/chosen": -1.3037304878234863, "logits/rejected": -2.2598671913146973, "logps/chosen": -312.95440673828125, "logps/rejected": -219.03636169433594, "loss": 0.3016, "rewards/accuracies": 1.0, "rewards/chosen": 1.2438600063323975, "rewards/margins": 1.2158725261688232, "rewards/rejected": 0.02798762172460556, "step": 102 }, { "epoch": 0.763302752293578, "grad_norm": 1.6278932094573975, "learning_rate": 4.6451805808190464e-06, "logits/chosen": -1.1335176229476929, "logits/rejected": -2.17392635345459, "logps/chosen": -299.39410400390625, "logps/rejected": -186.06622314453125, "loss": 0.2634, "rewards/accuracies": 0.984375, "rewards/chosen": 1.4063000679016113, "rewards/margins": 1.4068892002105713, "rewards/rejected": -0.0005892012268304825, "step": 104 }, { "epoch": 0.7779816513761468, "grad_norm": 1.5209800004959106, "learning_rate": 4.622887727529104e-06, "logits/chosen": -1.1014411449432373, "logits/rejected": -2.1214916706085205, "logps/chosen": -271.7640075683594, "logps/rejected": -216.89988708496094, "loss": 0.2627, "rewards/accuracies": 0.984375, "rewards/chosen": 1.386069416999817, "rewards/margins": 1.4243448972702026, "rewards/rejected": -0.03827540576457977, "step": 106 }, { "epoch": 0.7926605504587156, "grad_norm": 1.5802730321884155, "learning_rate": 4.599972654153018e-06, "logits/chosen": -0.9640820026397705, "logits/rejected": -2.146678924560547, "logps/chosen": -315.3819885253906, "logps/rejected": -184.68304443359375, "loss": 0.2601, "rewards/accuracies": 0.953125, "rewards/chosen": 1.5327023267745972, "rewards/margins": 1.4815881252288818, "rewards/rejected": 0.05111423879861832, "step": 108 }, { "epoch": 0.8073394495412844, "grad_norm": 1.6033107042312622, "learning_rate": 4.5764420771170735e-06, "logits/chosen": -0.9946492910385132, "logits/rejected": -2.0975136756896973, "logps/chosen": -292.52398681640625, "logps/rejected": -202.83602905273438, "loss": 0.2738, "rewards/accuracies": 0.96875, "rewards/chosen": 1.4424684047698975, "rewards/margins": 1.4901291131973267, "rewards/rejected": -0.04766057804226875, "step": 110 }, { "epoch": 0.8220183486238533, "grad_norm": 1.6146634817123413, "learning_rate": 4.552302893252166e-06, "logits/chosen": -1.2488244771957397, "logits/rejected": -2.2399239540100098, "logps/chosen": -319.301025390625, "logps/rejected": -215.10731506347656, "loss": 0.2432, "rewards/accuracies": 1.0, "rewards/chosen": 1.492225170135498, "rewards/margins": 1.5009602308273315, "rewards/rejected": -0.008735168725252151, "step": 112 }, { "epoch": 0.8366972477064221, "grad_norm": 1.879619836807251, "learning_rate": 4.52756217777234e-06, "logits/chosen": -1.2845666408538818, "logits/rejected": -2.2133727073669434, "logps/chosen": -325.5247497558594, "logps/rejected": -219.1314697265625, "loss": 0.2626, "rewards/accuracies": 1.0, "rewards/chosen": 1.571778655052185, "rewards/margins": 1.5141938924789429, "rewards/rejected": 0.05758478865027428, "step": 114 }, { "epoch": 0.8513761467889909, "grad_norm": 1.5598102807998657, "learning_rate": 4.502227182201035e-06, "logits/chosen": -0.9802009463310242, "logits/rejected": -2.0259878635406494, "logps/chosen": -275.55816650390625, "logps/rejected": -185.3338165283203, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 1.5974626541137695, "rewards/margins": 1.620395541191101, "rewards/rejected": -0.022933142259716988, "step": 116 }, { "epoch": 0.8660550458715597, "grad_norm": 1.277979850769043, "learning_rate": 4.476305332245662e-06, "logits/chosen": -1.1266419887542725, "logits/rejected": -2.322726249694824, "logps/chosen": -327.810302734375, "logps/rejected": -161.39149475097656, "loss": 0.1984, "rewards/accuracies": 1.0, "rewards/chosen": 1.5967400074005127, "rewards/margins": 1.8176448345184326, "rewards/rejected": -0.2209048718214035, "step": 118 }, { "epoch": 0.8807339449541285, "grad_norm": 1.6042323112487793, "learning_rate": 4.449804225621116e-06, "logits/chosen": -1.0289760828018188, "logits/rejected": -2.102262496948242, "logps/chosen": -291.6026611328125, "logps/rejected": -190.6699676513672, "loss": 0.249, "rewards/accuracies": 0.984375, "rewards/chosen": 1.5607997179031372, "rewards/margins": 1.628572702407837, "rewards/rejected": -0.06777279078960419, "step": 120 }, { "epoch": 0.8954128440366973, "grad_norm": 1.430982232093811, "learning_rate": 4.422731629822887e-06, "logits/chosen": -0.9640188217163086, "logits/rejected": -2.000277519226074, "logps/chosen": -327.7152404785156, "logps/rejected": -205.96337890625, "loss": 0.2425, "rewards/accuracies": 0.953125, "rewards/chosen": 1.6806552410125732, "rewards/margins": 1.685612440109253, "rewards/rejected": -0.004957253113389015, "step": 122 }, { "epoch": 0.9100917431192661, "grad_norm": 1.513214111328125, "learning_rate": 4.395095479850396e-06, "logits/chosen": -0.972959578037262, "logits/rejected": -1.9764440059661865, "logps/chosen": -299.74847412109375, "logps/rejected": -197.39337158203125, "loss": 0.2516, "rewards/accuracies": 0.953125, "rewards/chosen": 1.5539629459381104, "rewards/margins": 1.6879091262817383, "rewards/rejected": -0.13394607603549957, "step": 124 }, { "epoch": 0.9247706422018349, "grad_norm": 1.2460252046585083, "learning_rate": 4.366903875881243e-06, "logits/chosen": -1.1148145198822021, "logits/rejected": -2.3518619537353516, "logps/chosen": -287.5447692871094, "logps/rejected": -175.43360900878906, "loss": 0.19, "rewards/accuracies": 1.0, "rewards/chosen": 1.5850166082382202, "rewards/margins": 1.998946189880371, "rewards/rejected": -0.4139295220375061, "step": 126 }, { "epoch": 0.9394495412844037, "grad_norm": 1.4544743299484253, "learning_rate": 4.3381650808970365e-06, "logits/chosen": -1.0423675775527954, "logits/rejected": -1.992466926574707, "logps/chosen": -265.5049743652344, "logps/rejected": -196.2741241455078, "loss": 0.2207, "rewards/accuracies": 0.984375, "rewards/chosen": 1.5719702243804932, "rewards/margins": 1.7563403844833374, "rewards/rejected": -0.18437033891677856, "step": 128 }, { "epoch": 0.9541284403669725, "grad_norm": 1.5302927494049072, "learning_rate": 4.308887518261507e-06, "logits/chosen": -0.8528121113777161, "logits/rejected": -1.961355447769165, "logps/chosen": -288.3016357421875, "logps/rejected": -206.49557495117188, "loss": 0.2075, "rewards/accuracies": 0.96875, "rewards/chosen": 1.7642028331756592, "rewards/margins": 1.9030241966247559, "rewards/rejected": -0.13882134854793549, "step": 130 }, { "epoch": 0.9688073394495413, "grad_norm": 1.4101622104644775, "learning_rate": 4.279079769251617e-06, "logits/chosen": -1.2729012966156006, "logits/rejected": -2.241056203842163, "logps/chosen": -362.6707458496094, "logps/rejected": -222.91549682617188, "loss": 0.1861, "rewards/accuracies": 0.96875, "rewards/chosen": 1.8280537128448486, "rewards/margins": 2.065840005874634, "rewards/rejected": -0.23778626322746277, "step": 132 }, { "epoch": 0.9834862385321101, "grad_norm": 1.1177998781204224, "learning_rate": 4.248750570542373e-06, "logits/chosen": -1.0287914276123047, "logits/rejected": -2.1009342670440674, "logps/chosen": -281.2322998046875, "logps/rejected": -189.8081512451172, "loss": 0.1931, "rewards/accuracies": 1.0, "rewards/chosen": 1.5686054229736328, "rewards/margins": 1.9290703535079956, "rewards/rejected": -0.360464870929718, "step": 134 }, { "epoch": 0.998165137614679, "grad_norm": 1.2145086526870728, "learning_rate": 4.21790881164611e-06, "logits/chosen": -0.9554519653320312, "logits/rejected": -2.0969762802124023, "logps/chosen": -292.5300598144531, "logps/rejected": -207.0960235595703, "loss": 0.1734, "rewards/accuracies": 0.984375, "rewards/chosen": 1.7437012195587158, "rewards/margins": 2.259512424468994, "rewards/rejected": -0.5158110857009888, "step": 136 }, { "epoch": 1.0128440366972478, "grad_norm": 1.095413327217102, "learning_rate": 4.186563532306957e-06, "logits/chosen": -0.9077868461608887, "logits/rejected": -2.1029911041259766, "logps/chosen": -300.1116943359375, "logps/rejected": -180.19322204589844, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 1.7422677278518677, "rewards/margins": 2.208810567855835, "rewards/rejected": -0.4665430784225464, "step": 138 }, { "epoch": 1.0275229357798166, "grad_norm": 1.3220425844192505, "learning_rate": 4.154723919851291e-06, "logits/chosen": -1.077134132385254, "logits/rejected": -2.1211631298065186, "logps/chosen": -300.9671325683594, "logps/rejected": -185.4986114501953, "loss": 0.2096, "rewards/accuracies": 0.984375, "rewards/chosen": 1.5104981660842896, "rewards/margins": 1.9692846536636353, "rewards/rejected": -0.4587865471839905, "step": 140 }, { "epoch": 1.0422018348623854, "grad_norm": 0.9982088208198547, "learning_rate": 4.122399306494918e-06, "logits/chosen": -1.1294522285461426, "logits/rejected": -2.265366792678833, "logps/chosen": -348.11553955078125, "logps/rejected": -211.96484375, "loss": 0.1527, "rewards/accuracies": 0.984375, "rewards/chosen": 1.8556833267211914, "rewards/margins": 2.2423272132873535, "rewards/rejected": -0.38664379715919495, "step": 142 }, { "epoch": 1.0568807339449542, "grad_norm": 0.9512726068496704, "learning_rate": 4.089599166607794e-06, "logits/chosen": -1.0260741710662842, "logits/rejected": -2.078310489654541, "logps/chosen": -301.7906494140625, "logps/rejected": -200.17333984375, "loss": 0.132, "rewards/accuracies": 0.984375, "rewards/chosen": 1.709304690361023, "rewards/margins": 2.428473949432373, "rewards/rejected": -0.7191690802574158, "step": 144 }, { "epoch": 1.071559633027523, "grad_norm": 0.9233289957046509, "learning_rate": 4.05633311393708e-06, "logits/chosen": -0.9745887517929077, "logits/rejected": -2.032710313796997, "logps/chosen": -267.1161804199219, "logps/rejected": -185.32769775390625, "loss": 0.1508, "rewards/accuracies": 0.984375, "rewards/chosen": 1.708602786064148, "rewards/margins": 2.2974541187286377, "rewards/rejected": -0.5888515710830688, "step": 146 }, { "epoch": 1.0862385321100918, "grad_norm": 0.9916685223579407, "learning_rate": 4.022610898789349e-06, "logits/chosen": -0.9556669592857361, "logits/rejected": -2.117856979370117, "logps/chosen": -277.4543762207031, "logps/rejected": -200.896728515625, "loss": 0.1394, "rewards/accuracies": 1.0, "rewards/chosen": 1.785478115081787, "rewards/margins": 2.374891519546509, "rewards/rejected": -0.5894135236740112, "step": 148 }, { "epoch": 1.1009174311926606, "grad_norm": 1.2182554006576538, "learning_rate": 3.988442405172755e-06, "logits/chosen": -0.8240389823913574, "logits/rejected": -2.0166051387786865, "logps/chosen": -293.0532531738281, "logps/rejected": -215.48983764648438, "loss": 0.121, "rewards/accuracies": 1.0, "rewards/chosen": 2.0409958362579346, "rewards/margins": 2.6529250144958496, "rewards/rejected": -0.6119292378425598, "step": 150 }, { "epoch": 1.1155963302752294, "grad_norm": 1.0240944623947144, "learning_rate": 3.953837647900031e-06, "logits/chosen": -0.899176836013794, "logits/rejected": -2.119375705718994, "logps/chosen": -283.8042907714844, "logps/rejected": -211.6457977294922, "loss": 0.1437, "rewards/accuracies": 0.984375, "rewards/chosen": 1.990633249282837, "rewards/margins": 2.620699405670166, "rewards/rejected": -0.6300662159919739, "step": 152 }, { "epoch": 1.1302752293577982, "grad_norm": 1.21559476852417, "learning_rate": 3.918806769653135e-06, "logits/chosen": -0.8191251754760742, "logits/rejected": -2.017087459564209, "logps/chosen": -331.17724609375, "logps/rejected": -209.400146484375, "loss": 0.152, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9586645364761353, "rewards/margins": 2.562222480773926, "rewards/rejected": -0.6035579442977905, "step": 154 }, { "epoch": 1.144954128440367, "grad_norm": 1.2314106225967407, "learning_rate": 3.88336003801042e-06, "logits/chosen": -0.9168681502342224, "logits/rejected": -2.054666519165039, "logps/chosen": -264.9989013671875, "logps/rejected": -192.7652587890625, "loss": 0.1509, "rewards/accuracies": 1.0, "rewards/chosen": 1.7247517108917236, "rewards/margins": 2.3017380237579346, "rewards/rejected": -0.5769862532615662, "step": 156 }, { "epoch": 1.1596330275229358, "grad_norm": 0.9996971487998962, "learning_rate": 3.847507842437205e-06, "logits/chosen": -0.788710355758667, "logits/rejected": -2.0527966022491455, "logps/chosen": -306.01373291015625, "logps/rejected": -187.79794311523438, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 1.9181973934173584, "rewards/margins": 2.8037965297698975, "rewards/rejected": -0.8855991959571838, "step": 158 }, { "epoch": 1.1743119266055047, "grad_norm": 0.9679911732673645, "learning_rate": 3.811260691240604e-06, "logits/chosen": -0.8132730722427368, "logits/rejected": -2.0696139335632324, "logps/chosen": -351.917236328125, "logps/rejected": -204.27964782714844, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 2.159268856048584, "rewards/margins": 2.8514890670776367, "rewards/rejected": -0.6922197937965393, "step": 160 }, { "epoch": 1.1889908256880735, "grad_norm": 0.9500184059143066, "learning_rate": 3.774629208489547e-06, "logits/chosen": -0.9215357899665833, "logits/rejected": -2.1160709857940674, "logps/chosen": -253.12631225585938, "logps/rejected": -187.95811462402344, "loss": 0.1173, "rewards/accuracies": 1.0, "rewards/chosen": 1.8172812461853027, "rewards/margins": 2.5532562732696533, "rewards/rejected": -0.7359753251075745, "step": 162 }, { "epoch": 1.2036697247706423, "grad_norm": 1.0494946241378784, "learning_rate": 3.7376241309008433e-06, "logits/chosen": -1.0810823440551758, "logits/rejected": -2.151219606399536, "logps/chosen": -337.37255859375, "logps/rejected": -198.8866424560547, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 2.12947154045105, "rewards/margins": 2.8516111373901367, "rewards/rejected": -0.7221395373344421, "step": 164 }, { "epoch": 1.218348623853211, "grad_norm": 0.822201669216156, "learning_rate": 3.7002563046922502e-06, "logits/chosen": -1.0325469970703125, "logits/rejected": -2.2076807022094727, "logps/chosen": -337.1971435546875, "logps/rejected": -189.85455322265625, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 1.9719089269638062, "rewards/margins": 3.0117526054382324, "rewards/rejected": -1.0398434400558472, "step": 166 }, { "epoch": 1.2330275229357799, "grad_norm": 0.681236982345581, "learning_rate": 3.6625366824034337e-06, "logits/chosen": -0.7656459212303162, "logits/rejected": -2.049311399459839, "logps/chosen": -289.5611877441406, "logps/rejected": -224.8207550048828, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": 2.0607948303222656, "rewards/margins": 3.2540061473846436, "rewards/rejected": -1.1932108402252197, "step": 168 }, { "epoch": 1.2477064220183487, "grad_norm": 0.9994679689407349, "learning_rate": 3.6244763196857714e-06, "logits/chosen": -0.9609106183052063, "logits/rejected": -2.1387076377868652, "logps/chosen": -307.74798583984375, "logps/rejected": -199.2579345703125, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 2.0705361366271973, "rewards/margins": 3.1018238067626953, "rewards/rejected": -1.0312877893447876, "step": 170 }, { "epoch": 1.2623853211009175, "grad_norm": 1.2497354745864868, "learning_rate": 3.5860863720619333e-06, "logits/chosen": -0.9625377058982849, "logits/rejected": -2.073275089263916, "logps/chosen": -297.9329833984375, "logps/rejected": -200.70681762695312, "loss": 0.1191, "rewards/accuracies": 0.984375, "rewards/chosen": 2.188173532485962, "rewards/margins": 2.902965784072876, "rewards/rejected": -0.7147922515869141, "step": 172 }, { "epoch": 1.2770642201834863, "grad_norm": 0.862918496131897, "learning_rate": 3.547378091656186e-06, "logits/chosen": -0.7778910994529724, "logits/rejected": -2.1054413318634033, "logps/chosen": -304.24798583984375, "logps/rejected": -189.96273803710938, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 1.911596655845642, "rewards/margins": 3.0262608528137207, "rewards/rejected": -1.1146641969680786, "step": 174 }, { "epoch": 1.2917431192660551, "grad_norm": 0.7902020812034607, "learning_rate": 3.5083628238963913e-06, "logits/chosen": -1.0238415002822876, "logits/rejected": -1.960688829421997, "logps/chosen": -243.5750274658203, "logps/rejected": -191.24264526367188, "loss": 0.1312, "rewards/accuracies": 1.0, "rewards/chosen": 1.8321638107299805, "rewards/margins": 2.780503988265991, "rewards/rejected": -0.948340117931366, "step": 176 }, { "epoch": 1.306422018348624, "grad_norm": 0.9199721813201904, "learning_rate": 3.4690520041886473e-06, "logits/chosen": -0.7949679493904114, "logits/rejected": -2.0139424800872803, "logps/chosen": -287.1697082519531, "logps/rejected": -230.3143310546875, "loss": 0.1007, "rewards/accuracies": 0.984375, "rewards/chosen": 2.025575876235962, "rewards/margins": 3.0686216354370117, "rewards/rejected": -1.0430455207824707, "step": 178 }, { "epoch": 1.3211009174311927, "grad_norm": 0.6183698773384094, "learning_rate": 3.4294571545655653e-06, "logits/chosen": -0.8391042947769165, "logits/rejected": -2.1887526512145996, "logps/chosen": -302.6844482421875, "logps/rejected": -199.70486450195312, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 2.0722954273223877, "rewards/margins": 3.3604629039764404, "rewards/rejected": -1.2881678342819214, "step": 180 }, { "epoch": 1.3357798165137615, "grad_norm": 0.6749584674835205, "learning_rate": 3.38958988030915e-06, "logits/chosen": -1.1391972303390503, "logits/rejected": -2.056378126144409, "logps/chosen": -285.07562255859375, "logps/rejected": -243.91146850585938, "loss": 0.1161, "rewards/accuracies": 0.984375, "rewards/chosen": 1.9815781116485596, "rewards/margins": 3.0620830059051514, "rewards/rejected": -1.0805050134658813, "step": 182 }, { "epoch": 1.3504587155963304, "grad_norm": 0.9916686415672302, "learning_rate": 3.3494618665492833e-06, "logits/chosen": -0.974543571472168, "logits/rejected": -1.9790008068084717, "logps/chosen": -265.7524719238281, "logps/rejected": -210.968994140625, "loss": 0.1316, "rewards/accuracies": 1.0, "rewards/chosen": 1.7159056663513184, "rewards/margins": 2.7169814109802246, "rewards/rejected": -1.0010758638381958, "step": 184 }, { "epoch": 1.3651376146788992, "grad_norm": 0.7534170746803284, "learning_rate": 3.3090848748388042e-06, "logits/chosen": -0.9359984993934631, "logits/rejected": -2.1165120601654053, "logps/chosen": -365.70556640625, "logps/rejected": -213.4051513671875, "loss": 0.0804, "rewards/accuracies": 1.0, "rewards/chosen": 1.9640876054763794, "rewards/margins": 3.4107747077941895, "rewards/rejected": -1.4466872215270996, "step": 186 }, { "epoch": 1.379816513761468, "grad_norm": 0.7047733068466187, "learning_rate": 3.2684707397061887e-06, "logits/chosen": -1.0234425067901611, "logits/rejected": -2.067413806915283, "logps/chosen": -304.1073913574219, "logps/rejected": -191.95208740234375, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 2.091536045074463, "rewards/margins": 3.2847490310668945, "rewards/rejected": -1.1932129859924316, "step": 188 }, { "epoch": 1.3944954128440368, "grad_norm": 0.915761411190033, "learning_rate": 3.2276313651868364e-06, "logits/chosen": -0.8797706365585327, "logits/rejected": -2.130256414413452, "logps/chosen": -307.41839599609375, "logps/rejected": -180.803466796875, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 1.8479622602462769, "rewards/margins": 3.153357982635498, "rewards/rejected": -1.3053958415985107, "step": 190 }, { "epoch": 1.4091743119266056, "grad_norm": 0.7284132838249207, "learning_rate": 3.1865787213339926e-06, "logits/chosen": -0.8553410768508911, "logits/rejected": -2.044377565383911, "logps/chosen": -292.92144775390625, "logps/rejected": -205.9124298095703, "loss": 0.0818, "rewards/accuracies": 1.0, "rewards/chosen": 2.0033276081085205, "rewards/margins": 3.4773898124694824, "rewards/rejected": -1.474062442779541, "step": 192 }, { "epoch": 1.4238532110091744, "grad_norm": 0.7230023145675659, "learning_rate": 3.1453248407103156e-06, "logits/chosen": -0.8956843614578247, "logits/rejected": -2.0704410076141357, "logps/chosen": -297.47418212890625, "logps/rejected": -189.42091369628906, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": 1.829613208770752, "rewards/margins": 3.2233619689941406, "rewards/rejected": -1.3937489986419678, "step": 194 }, { "epoch": 1.4385321100917432, "grad_norm": 1.092043161392212, "learning_rate": 3.1038818148611178e-06, "logits/chosen": -0.9160604476928711, "logits/rejected": -1.9689029455184937, "logps/chosen": -323.4578552246094, "logps/rejected": -202.6251220703125, "loss": 0.0932, "rewards/accuracies": 0.984375, "rewards/chosen": 1.9870991706848145, "rewards/margins": 3.379544973373413, "rewards/rejected": -1.3924458026885986, "step": 196 }, { "epoch": 1.453211009174312, "grad_norm": 0.849423348903656, "learning_rate": 3.062261790770331e-06, "logits/chosen": -0.8054502010345459, "logits/rejected": -2.017672061920166, "logps/chosen": -268.9284973144531, "logps/rejected": -201.11390686035156, "loss": 0.1081, "rewards/accuracies": 1.0, "rewards/chosen": 1.7870291471481323, "rewards/margins": 2.959620952606201, "rewards/rejected": -1.1725919246673584, "step": 198 }, { "epoch": 1.4678899082568808, "grad_norm": 0.5849136710166931, "learning_rate": 3.0204769673002123e-06, "logits/chosen": -0.8214648365974426, "logits/rejected": -2.103921890258789, "logps/chosen": -343.6684265136719, "logps/rejected": -218.4034423828125, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 1.9067230224609375, "rewards/margins": 3.2361087799072266, "rewards/rejected": -1.3293852806091309, "step": 200 }, { "epoch": 1.4825688073394496, "grad_norm": 0.8638609647750854, "learning_rate": 2.978539591615848e-06, "logits/chosen": -0.9360217452049255, "logits/rejected": -1.8377161026000977, "logps/chosen": -310.77203369140625, "logps/rejected": -217.95361328125, "loss": 0.0893, "rewards/accuracies": 1.0, "rewards/chosen": 1.8223047256469727, "rewards/margins": 3.374411106109619, "rewards/rejected": -1.5521066188812256, "step": 202 }, { "epoch": 1.4972477064220184, "grad_norm": 0.7469986081123352, "learning_rate": 2.936461955595501e-06, "logits/chosen": -0.9148820638656616, "logits/rejected": -2.0849192142486572, "logps/chosen": -309.4117736816406, "logps/rejected": -211.26283264160156, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": 2.0167510509490967, "rewards/margins": 3.318629503250122, "rewards/rejected": -1.301878571510315, "step": 204 }, { "epoch": 1.5119266055045872, "grad_norm": 0.48730000853538513, "learning_rate": 2.8942563922278487e-06, "logits/chosen": -0.8627596497535706, "logits/rejected": -1.997396469116211, "logps/chosen": -297.6988220214844, "logps/rejected": -219.91180419921875, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 1.9595637321472168, "rewards/margins": 3.744344711303711, "rewards/rejected": -1.7847814559936523, "step": 206 }, { "epoch": 1.526605504587156, "grad_norm": 1.1042286157608032, "learning_rate": 2.8519352719971783e-06, "logits/chosen": -0.9377632141113281, "logits/rejected": -2.024191379547119, "logps/chosen": -327.47027587890625, "logps/rejected": -223.6087646484375, "loss": 0.1017, "rewards/accuracies": 0.984375, "rewards/chosen": 2.0964934825897217, "rewards/margins": 3.4565787315368652, "rewards/rejected": -1.3600847721099854, "step": 208 }, { "epoch": 1.5412844036697249, "grad_norm": 0.7358872294425964, "learning_rate": 2.8095109992575824e-06, "logits/chosen": -0.9008034467697144, "logits/rejected": -2.1022136211395264, "logps/chosen": -340.1212158203125, "logps/rejected": -223.19918823242188, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 2.2666172981262207, "rewards/margins": 3.668931007385254, "rewards/rejected": -1.4023137092590332, "step": 210 }, { "epoch": 1.5559633027522937, "grad_norm": 0.823003888130188, "learning_rate": 2.7669960085972407e-06, "logits/chosen": -0.8504350185394287, "logits/rejected": -2.14664888381958, "logps/chosen": -363.5140075683594, "logps/rejected": -241.92892456054688, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": 2.1337637901306152, "rewards/margins": 3.620941638946533, "rewards/rejected": -1.4871773719787598, "step": 212 }, { "epoch": 1.5706422018348625, "grad_norm": 0.9012424349784851, "learning_rate": 2.7244027611938247e-06, "logits/chosen": -0.6944912672042847, "logits/rejected": -1.8317877054214478, "logps/chosen": -261.44049072265625, "logps/rejected": -243.61410522460938, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 1.750416874885559, "rewards/margins": 3.3300774097442627, "rewards/rejected": -1.5796607732772827, "step": 214 }, { "epoch": 1.5853211009174313, "grad_norm": 0.9822458028793335, "learning_rate": 2.6817437411621194e-06, "logits/chosen": -0.8393555283546448, "logits/rejected": -1.9610698223114014, "logps/chosen": -357.4717102050781, "logps/rejected": -259.9384765625, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 2.1349892616271973, "rewards/margins": 3.471536636352539, "rewards/rejected": -1.336547613143921, "step": 216 }, { "epoch": 1.6, "grad_norm": 0.7191787958145142, "learning_rate": 2.639031451894923e-06, "logits/chosen": -0.8827037811279297, "logits/rejected": -1.878009557723999, "logps/chosen": -341.8013916015625, "logps/rejected": -246.149169921875, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 2.039233446121216, "rewards/margins": 3.67873215675354, "rewards/rejected": -1.6394988298416138, "step": 218 }, { "epoch": 1.614678899082569, "grad_norm": 0.7397493124008179, "learning_rate": 2.5962784123982843e-06, "logits/chosen": -0.9270643591880798, "logits/rejected": -2.148819923400879, "logps/chosen": -318.17242431640625, "logps/rejected": -221.3958282470703, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 1.9221253395080566, "rewards/margins": 3.66496205329895, "rewards/rejected": -1.7428367137908936, "step": 220 }, { "epoch": 1.6293577981651377, "grad_norm": 0.5408302545547485, "learning_rate": 2.5534971536221804e-06, "logits/chosen": -0.7174456715583801, "logits/rejected": -1.9402276277542114, "logps/chosen": -280.41265869140625, "logps/rejected": -214.61036682128906, "loss": 0.0679, "rewards/accuracies": 0.984375, "rewards/chosen": 1.8169009685516357, "rewards/margins": 3.658087730407715, "rewards/rejected": -1.841186761856079, "step": 222 }, { "epoch": 1.6440366972477065, "grad_norm": 0.6373718976974487, "learning_rate": 2.5107002147876814e-06, "logits/chosen": -0.8338260650634766, "logits/rejected": -1.8052666187286377, "logps/chosen": -274.18408203125, "logps/rejected": -231.13385009765625, "loss": 0.0677, "rewards/accuracies": 1.0, "rewards/chosen": 1.8962229490280151, "rewards/margins": 3.6884357929229736, "rewards/rejected": -1.792212724685669, "step": 224 }, { "epoch": 1.6587155963302753, "grad_norm": 1.006023645401001, "learning_rate": 2.467900139711693e-06, "logits/chosen": -0.8586325645446777, "logits/rejected": -1.8590312004089355, "logps/chosen": -284.29498291015625, "logps/rejected": -219.96942138671875, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 1.610360860824585, "rewards/margins": 3.314877986907959, "rewards/rejected": -1.7045170068740845, "step": 226 }, { "epoch": 1.6733944954128441, "grad_norm": 0.4218728542327881, "learning_rate": 2.4251094731303586e-06, "logits/chosen": -0.7588306665420532, "logits/rejected": -2.020467758178711, "logps/chosen": -301.7962341308594, "logps/rejected": -201.7028350830078, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 2.0787746906280518, "rewards/margins": 3.6262543201446533, "rewards/rejected": -1.547479271888733, "step": 228 }, { "epoch": 1.688073394495413, "grad_norm": 0.44578853249549866, "learning_rate": 2.3823407570221812e-06, "logits/chosen": -0.681371808052063, "logits/rejected": -2.0245919227600098, "logps/chosen": -310.5913391113281, "logps/rejected": -196.76724243164062, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 1.959742784500122, "rewards/margins": 3.5932304859161377, "rewards/rejected": -1.633487582206726, "step": 230 }, { "epoch": 1.7027522935779817, "grad_norm": 0.6186323761940002, "learning_rate": 2.3396065269319655e-06, "logits/chosen": -0.8481271862983704, "logits/rejected": -2.065420150756836, "logps/chosen": -310.2200012207031, "logps/rejected": -194.04493713378906, "loss": 0.0628, "rewards/accuracies": 1.0, "rewards/chosen": 2.0263564586639404, "rewards/margins": 3.842787265777588, "rewards/rejected": -1.8164305686950684, "step": 232 }, { "epoch": 1.7174311926605506, "grad_norm": 0.5135802626609802, "learning_rate": 2.2969193082966353e-06, "logits/chosen": -0.7080973386764526, "logits/rejected": -2.007819652557373, "logps/chosen": -296.1108093261719, "logps/rejected": -216.99868774414062, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 1.9229261875152588, "rewards/margins": 3.921962022781372, "rewards/rejected": -1.9990354776382446, "step": 234 }, { "epoch": 1.7321100917431194, "grad_norm": 0.9138413071632385, "learning_rate": 2.2542916127740194e-06, "logits/chosen": -0.6951168775558472, "logits/rejected": -1.6621724367141724, "logps/chosen": -323.9538269042969, "logps/rejected": -260.28900146484375, "loss": 0.0728, "rewards/accuracies": 1.0, "rewards/chosen": 2.0283045768737793, "rewards/margins": 3.7805428504943848, "rewards/rejected": -1.7522385120391846, "step": 236 }, { "epoch": 1.7467889908256882, "grad_norm": 0.62326979637146, "learning_rate": 2.211735934575674e-06, "logits/chosen": -0.7624643445014954, "logits/rejected": -2.0803322792053223, "logps/chosen": -293.841552734375, "logps/rejected": -189.69631958007812, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 1.6327762603759766, "rewards/margins": 3.612248420715332, "rewards/rejected": -1.979472279548645, "step": 238 }, { "epoch": 1.761467889908257, "grad_norm": 0.5615968108177185, "learning_rate": 2.1692647468048235e-06, "logits/chosen": -0.8942849636077881, "logits/rejected": -1.9355003833770752, "logps/chosen": -318.2629699707031, "logps/rejected": -235.68296813964844, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": 1.87472403049469, "rewards/margins": 4.362048149108887, "rewards/rejected": -2.4873242378234863, "step": 240 }, { "epoch": 1.7761467889908258, "grad_norm": 0.6113856434822083, "learning_rate": 2.126890497800477e-06, "logits/chosen": -0.9161121845245361, "logits/rejected": -1.843569040298462, "logps/chosen": -309.8831787109375, "logps/rejected": -226.34967041015625, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": 1.9025682210922241, "rewards/margins": 3.5658414363861084, "rewards/rejected": -1.6632736921310425, "step": 242 }, { "epoch": 1.7908256880733946, "grad_norm": 0.7386473417282104, "learning_rate": 2.084625607488816e-06, "logits/chosen": -0.7687922716140747, "logits/rejected": -1.982967734336853, "logps/chosen": -285.9901428222656, "logps/rejected": -213.30564880371094, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": 2.0006887912750244, "rewards/margins": 4.181596279144287, "rewards/rejected": -2.180907726287842, "step": 244 }, { "epoch": 1.8055045871559634, "grad_norm": 0.620130717754364, "learning_rate": 2.0424824637428995e-06, "logits/chosen": -0.7613787651062012, "logits/rejected": -2.176778554916382, "logps/chosen": -278.2284851074219, "logps/rejected": -196.99716186523438, "loss": 0.0636, "rewards/accuracies": 0.984375, "rewards/chosen": 1.8024476766586304, "rewards/margins": 3.8208680152893066, "rewards/rejected": -2.0184202194213867, "step": 246 }, { "epoch": 1.8201834862385322, "grad_norm": 0.964788556098938, "learning_rate": 2.0004734187517744e-06, "logits/chosen": -0.9343721270561218, "logits/rejected": -1.8525314331054688, "logps/chosen": -328.5677795410156, "logps/rejected": -199.83840942382812, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 1.8486145734786987, "rewards/margins": 3.6849093437194824, "rewards/rejected": -1.8362950086593628, "step": 248 }, { "epoch": 1.834862385321101, "grad_norm": 0.3955663740634918, "learning_rate": 1.9586107854000327e-06, "logits/chosen": -0.9676373600959778, "logits/rejected": -2.1090657711029053, "logps/chosen": -307.66302490234375, "logps/rejected": -193.6895751953125, "loss": 0.057, "rewards/accuracies": 1.0, "rewards/chosen": 1.8102517127990723, "rewards/margins": 3.7792747020721436, "rewards/rejected": -1.9690231084823608, "step": 250 }, { "epoch": 1.8495412844036698, "grad_norm": 0.6646362543106079, "learning_rate": 1.916906833658899e-06, "logits/chosen": -0.7113239169120789, "logits/rejected": -1.9650328159332275, "logps/chosen": -337.66107177734375, "logps/rejected": -242.595703125, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 1.94010329246521, "rewards/margins": 4.105426788330078, "rewards/rejected": -2.165323495864868, "step": 252 }, { "epoch": 1.8642201834862386, "grad_norm": 0.6509953737258911, "learning_rate": 1.8753737869898921e-06, "logits/chosen": -0.794485330581665, "logits/rejected": -1.901089072227478, "logps/chosen": -258.750732421875, "logps/rejected": -213.0238037109375, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 1.602518916130066, "rewards/margins": 4.075562477111816, "rewards/rejected": -2.473043441772461, "step": 254 }, { "epoch": 1.8788990825688074, "grad_norm": 0.4499273896217346, "learning_rate": 1.8340238187621185e-06, "logits/chosen": -0.7047321200370789, "logits/rejected": -1.8908119201660156, "logps/chosen": -273.62322998046875, "logps/rejected": -198.1600341796875, "loss": 0.0727, "rewards/accuracies": 0.984375, "rewards/chosen": 1.7582603693008423, "rewards/margins": 3.6687071323394775, "rewards/rejected": -1.9104465246200562, "step": 256 }, { "epoch": 1.8935779816513763, "grad_norm": 0.8414962291717529, "learning_rate": 1.7928690486842438e-06, "logits/chosen": -0.871714174747467, "logits/rejected": -2.0030646324157715, "logps/chosen": -264.42059326171875, "logps/rejected": -185.0747833251953, "loss": 0.0654, "rewards/accuracies": 1.0, "rewards/chosen": 1.900803565979004, "rewards/margins": 3.7862067222595215, "rewards/rejected": -1.8854031562805176, "step": 258 }, { "epoch": 1.908256880733945, "grad_norm": 0.8687112927436829, "learning_rate": 1.7519215392522026e-06, "logits/chosen": -0.8036646246910095, "logits/rejected": -2.0354790687561035, "logps/chosen": -293.50811767578125, "logps/rejected": -191.86962890625, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 1.9246323108673096, "rewards/margins": 4.019070148468018, "rewards/rejected": -2.094437599182129, "step": 260 }, { "epoch": 1.9229357798165139, "grad_norm": 0.4811760485172272, "learning_rate": 1.7111932922136715e-06, "logits/chosen": -0.7815529108047485, "logits/rejected": -1.7573397159576416, "logps/chosen": -263.450927734375, "logps/rejected": -229.46728515625, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": 1.6779381036758423, "rewards/margins": 3.870631217956543, "rewards/rejected": -2.1926932334899902, "step": 262 }, { "epoch": 1.9376146788990827, "grad_norm": 0.5513655543327332, "learning_rate": 1.6706962450503408e-06, "logits/chosen": -0.6383249759674072, "logits/rejected": -1.9680360555648804, "logps/chosen": -293.52130126953125, "logps/rejected": -217.34693908691406, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 1.9342231750488281, "rewards/margins": 4.369998931884766, "rewards/rejected": -2.4357762336730957, "step": 264 }, { "epoch": 1.9522935779816515, "grad_norm": 0.7187495827674866, "learning_rate": 1.630442267479034e-06, "logits/chosen": -0.6566349267959595, "logits/rejected": -1.9347317218780518, "logps/chosen": -277.68890380859375, "logps/rejected": -224.22335815429688, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 2.052359104156494, "rewards/margins": 4.0643744468688965, "rewards/rejected": -2.0120151042938232, "step": 266 }, { "epoch": 1.9669724770642203, "grad_norm": 0.33258092403411865, "learning_rate": 1.5904431579726837e-06, "logits/chosen": -0.7657849192619324, "logits/rejected": -2.030609369277954, "logps/chosen": -306.955322265625, "logps/rejected": -190.61703491210938, "loss": 0.0564, "rewards/accuracies": 0.984375, "rewards/chosen": 1.6874788999557495, "rewards/margins": 4.016414165496826, "rewards/rejected": -2.328935146331787, "step": 268 }, { "epoch": 1.981651376146789, "grad_norm": 0.5519306659698486, "learning_rate": 1.5507106403021897e-06, "logits/chosen": -0.7592746019363403, "logits/rejected": -2.0932528972625732, "logps/chosen": -341.2933349609375, "logps/rejected": -232.65756225585938, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 2.3383517265319824, "rewards/margins": 4.453994274139404, "rewards/rejected": -2.115642547607422, "step": 270 }, { "epoch": 1.996330275229358, "grad_norm": 0.5155956745147705, "learning_rate": 1.511256360100171e-06, "logits/chosen": -0.7073550224304199, "logits/rejected": -2.0184946060180664, "logps/chosen": -306.38116455078125, "logps/rejected": -217.38668823242188, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 1.8283504247665405, "rewards/margins": 4.175046920776367, "rewards/rejected": -2.346696376800537, "step": 272 }, { "epoch": 2.0110091743119267, "grad_norm": 0.7801055908203125, "learning_rate": 1.4720918814476234e-06, "logits/chosen": -0.9376870393753052, "logits/rejected": -2.1091787815093994, "logps/chosen": -266.37811279296875, "logps/rejected": -209.93710327148438, "loss": 0.0743, "rewards/accuracies": 1.0, "rewards/chosen": 1.6686122417449951, "rewards/margins": 4.310929775238037, "rewards/rejected": -2.642317771911621, "step": 274 }, { "epoch": 2.0256880733944955, "grad_norm": 0.6762734055519104, "learning_rate": 1.4332286834844792e-06, "logits/chosen": -0.9745014309883118, "logits/rejected": -2.0172030925750732, "logps/chosen": -297.90997314453125, "logps/rejected": -215.2535400390625, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 1.7038819789886475, "rewards/margins": 3.7982983589172363, "rewards/rejected": -2.0944161415100098, "step": 276 }, { "epoch": 2.0403669724770643, "grad_norm": 0.6311278939247131, "learning_rate": 1.3946781570450563e-06, "logits/chosen": -0.792485773563385, "logits/rejected": -2.0446367263793945, "logps/chosen": -316.6257019042969, "logps/rejected": -225.79873657226562, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 2.1141061782836914, "rewards/margins": 4.177621841430664, "rewards/rejected": -2.0635154247283936, "step": 278 }, { "epoch": 2.055045871559633, "grad_norm": 0.4802553653717041, "learning_rate": 1.3564516013194023e-06, "logits/chosen": -0.5846218466758728, "logits/rejected": -1.8708997964859009, "logps/chosen": -278.2353515625, "logps/rejected": -215.2820587158203, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 1.7591543197631836, "rewards/margins": 4.34114933013916, "rewards/rejected": -2.5819950103759766, "step": 280 }, { "epoch": 2.069724770642202, "grad_norm": 0.6416748762130737, "learning_rate": 1.3185602205414894e-06, "logits/chosen": -0.7558883428573608, "logits/rejected": -1.8708809614181519, "logps/chosen": -280.8486633300781, "logps/rejected": -198.6562042236328, "loss": 0.0654, "rewards/accuracies": 1.0, "rewards/chosen": 1.9279158115386963, "rewards/margins": 3.8500123023986816, "rewards/rejected": -1.9220962524414062, "step": 282 }, { "epoch": 2.0844036697247708, "grad_norm": 0.8262112736701965, "learning_rate": 1.2810151207052465e-06, "logits/chosen": -0.8148822784423828, "logits/rejected": -1.9564712047576904, "logps/chosen": -348.1204833984375, "logps/rejected": -250.0408172607422, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": 1.6666010618209839, "rewards/margins": 3.944627046585083, "rewards/rejected": -2.2780258655548096, "step": 284 }, { "epoch": 2.0990825688073396, "grad_norm": 0.4954426884651184, "learning_rate": 1.2438273063093811e-06, "logits/chosen": -0.6735963225364685, "logits/rejected": -1.8776307106018066, "logps/chosen": -291.0019836425781, "logps/rejected": -194.4311981201172, "loss": 0.0706, "rewards/accuracies": 0.984375, "rewards/chosen": 1.7643864154815674, "rewards/margins": 3.9881067276000977, "rewards/rejected": -2.223719835281372, "step": 286 }, { "epoch": 2.1137614678899084, "grad_norm": 0.8490874171257019, "learning_rate": 1.2070076771319536e-06, "logits/chosen": -0.9455384612083435, "logits/rejected": -1.8131248950958252, "logps/chosen": -365.147705078125, "logps/rejected": -228.15090942382812, "loss": 0.0704, "rewards/accuracies": 1.0, "rewards/chosen": 1.8452998399734497, "rewards/margins": 3.8042831420898438, "rewards/rejected": -1.958983063697815, "step": 288 }, { "epoch": 2.128440366972477, "grad_norm": 0.7720925807952881, "learning_rate": 1.1705670250356417e-06, "logits/chosen": -0.6748377084732056, "logits/rejected": -1.9302213191986084, "logps/chosen": -322.6198425292969, "logps/rejected": -224.2333526611328, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 2.065561532974243, "rewards/margins": 4.38578987121582, "rewards/rejected": -2.320228338241577, "step": 290 }, { "epoch": 2.143119266055046, "grad_norm": 0.4491863548755646, "learning_rate": 1.1345160308046413e-06, "logits/chosen": -0.7005204558372498, "logits/rejected": -2.1741456985473633, "logps/chosen": -398.5745849609375, "logps/rejected": -235.8988800048828, "loss": 0.0613, "rewards/accuracies": 0.984375, "rewards/chosen": 1.9305046796798706, "rewards/margins": 4.522059440612793, "rewards/rejected": -2.591554641723633, "step": 292 }, { "epoch": 2.157798165137615, "grad_norm": 0.6360311508178711, "learning_rate": 1.0988652610141154e-06, "logits/chosen": -0.7096176147460938, "logits/rejected": -1.7769296169281006, "logps/chosen": -288.8116760253906, "logps/rejected": -241.828369140625, "loss": 0.0548, "rewards/accuracies": 0.984375, "rewards/chosen": 1.8457667827606201, "rewards/margins": 3.990344524383545, "rewards/rejected": -2.144577980041504, "step": 294 }, { "epoch": 2.1724770642201836, "grad_norm": 0.3716106116771698, "learning_rate": 1.063625164933124e-06, "logits/chosen": -0.6774280667304993, "logits/rejected": -1.9684358835220337, "logps/chosen": -345.6900939941406, "logps/rejected": -240.35296630859375, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 2.1445114612579346, "rewards/margins": 4.849350452423096, "rewards/rejected": -2.704838752746582, "step": 296 }, { "epoch": 2.1871559633027524, "grad_norm": 0.525005578994751, "learning_rate": 1.0288060714619359e-06, "logits/chosen": -0.9460769891738892, "logits/rejected": -2.1344943046569824, "logps/chosen": -330.6282043457031, "logps/rejected": -195.7637481689453, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 2.1682145595550537, "rewards/margins": 4.502593040466309, "rewards/rejected": -2.334378480911255, "step": 298 }, { "epoch": 2.2018348623853212, "grad_norm": 0.458524227142334, "learning_rate": 9.944181861046188e-07, "logits/chosen": -0.7203876376152039, "logits/rejected": -1.8515840768814087, "logps/chosen": -347.7017517089844, "logps/rejected": -233.29393005371094, "loss": 0.0642, "rewards/accuracies": 0.984375, "rewards/chosen": 1.7970588207244873, "rewards/margins": 4.392740249633789, "rewards/rejected": -2.5956814289093018, "step": 300 }, { "epoch": 2.21651376146789, "grad_norm": 0.43879008293151855, "learning_rate": 9.604715879777986e-07, "logits/chosen": -0.7226991057395935, "logits/rejected": -2.0477523803710938, "logps/chosen": -290.74530029296875, "logps/rejected": -181.2649688720703, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 1.9678212404251099, "rewards/margins": 4.382383346557617, "rewards/rejected": -2.4145617485046387, "step": 302 }, { "epoch": 2.231192660550459, "grad_norm": 0.4379405081272125, "learning_rate": 9.269762268564616e-07, "logits/chosen": -0.8170676231384277, "logits/rejected": -2.0070619583129883, "logps/chosen": -265.16571044921875, "logps/rejected": -186.93112182617188, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 1.8528399467468262, "rewards/margins": 4.132846355438232, "rewards/rejected": -2.2800064086914062, "step": 304 }, { "epoch": 2.2458715596330276, "grad_norm": 0.8870872855186462, "learning_rate": 8.939419202576694e-07, "logits/chosen": -0.5970391631126404, "logits/rejected": -1.7150076627731323, "logps/chosen": -268.1172180175781, "logps/rejected": -207.63734436035156, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": 1.653077244758606, "rewards/margins": 3.3603720664978027, "rewards/rejected": -1.7072948217391968, "step": 306 }, { "epoch": 2.2605504587155965, "grad_norm": 0.737343966960907, "learning_rate": 8.61378350563033e-07, "logits/chosen": -0.7202005386352539, "logits/rejected": -1.8895469903945923, "logps/chosen": -262.6046447753906, "logps/rejected": -221.35336303710938, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 1.750601053237915, "rewards/margins": 3.905064582824707, "rewards/rejected": -2.154463529586792, "step": 308 }, { "epoch": 2.2752293577981653, "grad_norm": 0.5748594403266907, "learning_rate": 8.292950621808022e-07, "logits/chosen": -0.7942256927490234, "logits/rejected": -1.9462255239486694, "logps/chosen": -297.3062438964844, "logps/rejected": -220.7073516845703, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 1.907147765159607, "rewards/margins": 4.309232711791992, "rewards/rejected": -2.402085065841675, "step": 310 }, { "epoch": 2.289908256880734, "grad_norm": 0.5790998339653015, "learning_rate": 7.977014587483925e-07, "logits/chosen": -0.8033642768859863, "logits/rejected": -1.9477308988571167, "logps/chosen": -285.6184997558594, "logps/rejected": -257.89910888671875, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 1.8432916402816772, "rewards/margins": 4.054279804229736, "rewards/rejected": -2.2109880447387695, "step": 312 }, { "epoch": 2.304587155963303, "grad_norm": 0.6188729405403137, "learning_rate": 7.666068003761684e-07, "logits/chosen": -0.7408751249313354, "logits/rejected": -1.9631062746047974, "logps/chosen": -308.1776123046875, "logps/rejected": -198.61410522460938, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.7822659015655518, "rewards/margins": 4.317424297332764, "rewards/rejected": -2.535158634185791, "step": 314 }, { "epoch": 2.3192660550458717, "grad_norm": 0.5603534579277039, "learning_rate": 7.360202009332993e-07, "logits/chosen": -0.8284570574760437, "logits/rejected": -2.0091702938079834, "logps/chosen": -307.47088623046875, "logps/rejected": -215.27903747558594, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 1.6950868368148804, "rewards/margins": 4.28853178024292, "rewards/rejected": -2.59344482421875, "step": 316 }, { "epoch": 2.3339449541284405, "grad_norm": 0.3757495582103729, "learning_rate": 7.059506253764773e-07, "logits/chosen": -0.7530102729797363, "logits/rejected": -1.9654746055603027, "logps/chosen": -326.5684814453125, "logps/rejected": -226.15786743164062, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": 1.8712252378463745, "rewards/margins": 4.424376964569092, "rewards/rejected": -2.553151845932007, "step": 318 }, { "epoch": 2.3486238532110093, "grad_norm": 0.6858806014060974, "learning_rate": 6.764068871222825e-07, "logits/chosen": -0.5249571204185486, "logits/rejected": -1.8156137466430664, "logps/chosen": -298.5492858886719, "logps/rejected": -212.81187438964844, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": 1.8946092128753662, "rewards/margins": 4.014616012573242, "rewards/rejected": -2.120006561279297, "step": 320 }, { "epoch": 2.363302752293578, "grad_norm": 0.4449942409992218, "learning_rate": 6.473976454639608e-07, "logits/chosen": -0.7823415398597717, "logits/rejected": -2.0849199295043945, "logps/chosen": -306.5335693359375, "logps/rejected": -197.4770050048828, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 2.062614679336548, "rewards/margins": 4.463562488555908, "rewards/rejected": -2.4009478092193604, "step": 322 }, { "epoch": 2.377981651376147, "grad_norm": 0.699175238609314, "learning_rate": 6.189314030333796e-07, "logits/chosen": -0.5810756087303162, "logits/rejected": -1.8031431436538696, "logps/chosen": -292.385009765625, "logps/rejected": -252.5111083984375, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 1.7218618392944336, "rewards/margins": 4.281242847442627, "rewards/rejected": -2.5593810081481934, "step": 324 }, { "epoch": 2.3926605504587157, "grad_norm": 0.5212377309799194, "learning_rate": 5.910165033089e-07, "logits/chosen": -0.6628118753433228, "logits/rejected": -2.0212368965148926, "logps/chosen": -328.28729248046875, "logps/rejected": -230.48863220214844, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 2.0009829998016357, "rewards/margins": 4.275434494018555, "rewards/rejected": -2.274451732635498, "step": 326 }, { "epoch": 2.4073394495412845, "grad_norm": 0.5146971344947815, "learning_rate": 5.636611281698956e-07, "logits/chosen": -0.7095816731452942, "logits/rejected": -1.83794367313385, "logps/chosen": -272.989990234375, "logps/rejected": -213.9049835205078, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 1.5980645418167114, "rewards/margins": 3.967491388320923, "rewards/rejected": -2.369426727294922, "step": 328 }, { "epoch": 2.4220183486238533, "grad_norm": 0.6070245504379272, "learning_rate": 5.368732954986389e-07, "logits/chosen": -0.8353590369224548, "logits/rejected": -1.9633159637451172, "logps/chosen": -291.64990234375, "logps/rejected": -226.0115966796875, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 1.7673264741897583, "rewards/margins": 4.178317070007324, "rewards/rejected": -2.4109902381896973, "step": 330 }, { "epoch": 2.436697247706422, "grad_norm": 0.411520391702652, "learning_rate": 5.106608568302504e-07, "logits/chosen": -0.8378889560699463, "logits/rejected": -1.9491535425186157, "logps/chosen": -269.3817443847656, "logps/rejected": -226.02801513671875, "loss": 0.0607, "rewards/accuracies": 0.984375, "rewards/chosen": 1.776644229888916, "rewards/margins": 4.346107006072998, "rewards/rejected": -2.569462537765503, "step": 332 }, { "epoch": 2.451376146788991, "grad_norm": 0.538725733757019, "learning_rate": 4.850314950514124e-07, "logits/chosen": -0.5758827328681946, "logits/rejected": -1.8072640895843506, "logps/chosen": -293.7169189453125, "logps/rejected": -222.18191528320312, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 1.8555824756622314, "rewards/margins": 4.301581382751465, "rewards/rejected": -2.4459989070892334, "step": 334 }, { "epoch": 2.4660550458715598, "grad_norm": 0.6865962147712708, "learning_rate": 4.599927221485034e-07, "logits/chosen": -0.6990569233894348, "logits/rejected": -1.9968361854553223, "logps/chosen": -290.4656066894531, "logps/rejected": -200.7618408203125, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 1.737301230430603, "rewards/margins": 4.213505744934082, "rewards/rejected": -2.4762046337127686, "step": 336 }, { "epoch": 2.4807339449541286, "grad_norm": 0.5077099204063416, "learning_rate": 4.3555187700583175e-07, "logits/chosen": -0.6568117141723633, "logits/rejected": -1.949430227279663, "logps/chosen": -277.690673828125, "logps/rejected": -220.354248046875, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": 1.8167202472686768, "rewards/margins": 4.532853126525879, "rewards/rejected": -2.716132879257202, "step": 338 }, { "epoch": 2.4954128440366974, "grad_norm": 0.49507051706314087, "learning_rate": 4.1171612325460244e-07, "logits/chosen": -0.7259389162063599, "logits/rejected": -1.808924674987793, "logps/chosen": -290.60845947265625, "logps/rejected": -214.2810821533203, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 1.6286503076553345, "rewards/margins": 4.091249942779541, "rewards/rejected": -2.462599515914917, "step": 340 }, { "epoch": 2.510091743119266, "grad_norm": 0.348964124917984, "learning_rate": 3.8849244717325206e-07, "logits/chosen": -0.727351188659668, "logits/rejected": -1.7707502841949463, "logps/chosen": -281.7944030761719, "logps/rejected": -234.1016082763672, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": 1.9929590225219727, "rewards/margins": 4.754191875457764, "rewards/rejected": -2.761232852935791, "step": 342 }, { "epoch": 2.524770642201835, "grad_norm": 0.5973061919212341, "learning_rate": 3.658876556397628e-07, "logits/chosen": -0.8893070816993713, "logits/rejected": -2.0552244186401367, "logps/chosen": -266.6897277832031, "logps/rejected": -199.23097229003906, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 1.8175071477890015, "rewards/margins": 4.262746810913086, "rewards/rejected": -2.445240020751953, "step": 344 }, { "epoch": 2.539449541284404, "grad_norm": 0.669189453125, "learning_rate": 3.4390837413656256e-07, "logits/chosen": -0.7612945437431335, "logits/rejected": -2.0197830200195312, "logps/chosen": -289.519775390625, "logps/rejected": -236.5748748779297, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 1.8634060621261597, "rewards/margins": 4.5830864906311035, "rewards/rejected": -2.7196803092956543, "step": 346 }, { "epoch": 2.5541284403669726, "grad_norm": 0.49376487731933594, "learning_rate": 3.225610448085903e-07, "logits/chosen": -0.703992486000061, "logits/rejected": -1.8440505266189575, "logps/chosen": -282.47967529296875, "logps/rejected": -213.64584350585938, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 1.8914369344711304, "rewards/margins": 4.45714807510376, "rewards/rejected": -2.565711498260498, "step": 348 }, { "epoch": 2.5688073394495414, "grad_norm": 0.6668093204498291, "learning_rate": 3.018519245750989e-07, "logits/chosen": -0.775786817073822, "logits/rejected": -1.7931033372879028, "logps/chosen": -332.7348937988281, "logps/rejected": -254.2784423828125, "loss": 0.0578, "rewards/accuracies": 0.984375, "rewards/chosen": 1.7138545513153076, "rewards/margins": 4.272766590118408, "rewards/rejected": -2.5589118003845215, "step": 350 }, { "epoch": 2.5834862385321102, "grad_norm": 0.46660616993904114, "learning_rate": 2.817870832957459e-07, "logits/chosen": -0.6354199051856995, "logits/rejected": -1.8320108652114868, "logps/chosen": -270.6486511230469, "logps/rejected": -209.35401916503906, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 1.8665962219238281, "rewards/margins": 4.460667133331299, "rewards/rejected": -2.594071388244629, "step": 352 }, { "epoch": 2.598165137614679, "grad_norm": 0.7453739047050476, "learning_rate": 2.6237240199151386e-07, "logits/chosen": -0.7968777418136597, "logits/rejected": -2.040590763092041, "logps/chosen": -278.96051025390625, "logps/rejected": -198.50344848632812, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 1.8018391132354736, "rewards/margins": 3.8879919052124023, "rewards/rejected": -2.0861527919769287, "step": 354 }, { "epoch": 2.612844036697248, "grad_norm": 0.6645973920822144, "learning_rate": 2.436135711209786e-07, "logits/chosen": -1.0428318977355957, "logits/rejected": -2.0885515213012695, "logps/chosen": -291.73846435546875, "logps/rejected": -194.1337890625, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 1.6952629089355469, "rewards/margins": 4.137482166290283, "rewards/rejected": -2.4422197341918945, "step": 356 }, { "epoch": 2.6275229357798167, "grad_norm": 0.6470810174942017, "learning_rate": 2.2551608891243026e-07, "logits/chosen": -1.004224419593811, "logits/rejected": -2.139845609664917, "logps/chosen": -366.6258544921875, "logps/rejected": -242.1497344970703, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 1.7366106510162354, "rewards/margins": 3.9900641441345215, "rewards/rejected": -2.2534537315368652, "step": 358 }, { "epoch": 2.6422018348623855, "grad_norm": 0.4664456248283386, "learning_rate": 2.0808525975233807e-07, "logits/chosen": -0.6308703422546387, "logits/rejected": -1.8344846963882446, "logps/chosen": -294.6444091796875, "logps/rejected": -229.31024169921875, "loss": 0.0686, "rewards/accuracies": 0.984375, "rewards/chosen": 1.5859092473983765, "rewards/margins": 4.006678104400635, "rewards/rejected": -2.4207687377929688, "step": 360 }, { "epoch": 2.6568807339449543, "grad_norm": 0.8631575107574463, "learning_rate": 1.9132619263063144e-07, "logits/chosen": -0.6818079352378845, "logits/rejected": -1.9622324705123901, "logps/chosen": -360.9144287109375, "logps/rejected": -245.8763885498047, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 2.0507702827453613, "rewards/margins": 4.704620838165283, "rewards/rejected": -2.653850555419922, "step": 362 }, { "epoch": 2.671559633027523, "grad_norm": 0.2709774672985077, "learning_rate": 1.7524379964325155e-07, "logits/chosen": -0.7185477614402771, "logits/rejected": -1.9397680759429932, "logps/chosen": -340.63604736328125, "logps/rejected": -234.94650268554688, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 1.689025640487671, "rewards/margins": 4.374544620513916, "rewards/rejected": -2.685518741607666, "step": 364 }, { "epoch": 2.686238532110092, "grad_norm": 0.533819854259491, "learning_rate": 1.5984279455240975e-07, "logits/chosen": -0.8093196153640747, "logits/rejected": -1.8389997482299805, "logps/chosen": -295.1790771484375, "logps/rejected": -222.28091430664062, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 1.990577220916748, "rewards/margins": 4.476598262786865, "rewards/rejected": -2.486021041870117, "step": 366 }, { "epoch": 2.7009174311926607, "grad_norm": 0.49332335591316223, "learning_rate": 1.451276914049818e-07, "logits/chosen": -0.7148327827453613, "logits/rejected": -1.8303236961364746, "logps/chosen": -265.9716491699219, "logps/rejected": -208.4017791748047, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 1.5792350769042969, "rewards/margins": 4.226253509521484, "rewards/rejected": -2.6470184326171875, "step": 368 }, { "epoch": 2.7155963302752295, "grad_norm": 0.44270747900009155, "learning_rate": 1.3110280320943692e-07, "logits/chosen": -0.6963136792182922, "logits/rejected": -2.0225512981414795, "logps/chosen": -283.3634033203125, "logps/rejected": -202.95513916015625, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 1.947371244430542, "rewards/margins": 4.56584358215332, "rewards/rejected": -2.6184728145599365, "step": 370 }, { "epoch": 2.7302752293577983, "grad_norm": 0.43144798278808594, "learning_rate": 1.1777224067169218e-07, "logits/chosen": -0.6372362375259399, "logits/rejected": -1.8398162126541138, "logps/chosen": -290.5965576171875, "logps/rejected": -221.44293212890625, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 1.9076143503189087, "rewards/margins": 4.488263130187988, "rewards/rejected": -2.580648899078369, "step": 372 }, { "epoch": 2.744954128440367, "grad_norm": 0.6024923920631409, "learning_rate": 1.0513991099025872e-07, "logits/chosen": -0.797070324420929, "logits/rejected": -1.9885629415512085, "logps/chosen": -334.8035888671875, "logps/rejected": -221.12759399414062, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 1.713030457496643, "rewards/margins": 3.9955036640167236, "rewards/rejected": -2.28247332572937, "step": 374 }, { "epoch": 2.759633027522936, "grad_norm": 0.42591243982315063, "learning_rate": 9.320951671104194e-08, "logits/chosen": -0.6949442625045776, "logits/rejected": -1.9821323156356812, "logps/chosen": -326.1830749511719, "logps/rejected": -219.29837036132812, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 2.3185112476348877, "rewards/margins": 4.512205600738525, "rewards/rejected": -2.1936943531036377, "step": 376 }, { "epoch": 2.7743119266055047, "grad_norm": 0.5629270076751709, "learning_rate": 8.198455464212108e-08, "logits/chosen": -0.734917402267456, "logits/rejected": -1.9623744487762451, "logps/chosen": -304.7027282714844, "logps/rejected": -207.42129516601562, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": 2.003225326538086, "rewards/margins": 4.722169876098633, "rewards/rejected": -2.7189443111419678, "step": 378 }, { "epoch": 2.7889908256880735, "grad_norm": 0.45993342995643616, "learning_rate": 7.146831482883115e-08, "logits/chosen": -0.5041406750679016, "logits/rejected": -1.9728295803070068, "logps/chosen": -310.0588073730469, "logps/rejected": -202.70693969726562, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 2.020038604736328, "rewards/margins": 4.748332500457764, "rewards/rejected": -2.7282943725585938, "step": 380 }, { "epoch": 2.8036697247706424, "grad_norm": 0.772245466709137, "learning_rate": 6.16638795894492e-08, "logits/chosen": -0.6536301374435425, "logits/rejected": -1.7665328979492188, "logps/chosen": -273.5377197265625, "logps/rejected": -230.2478485107422, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 1.9565868377685547, "rewards/margins": 4.441068649291992, "rewards/rejected": -2.4844815731048584, "step": 382 }, { "epoch": 2.818348623853211, "grad_norm": 0.7546908855438232, "learning_rate": 5.257412261176375e-08, "logits/chosen": -0.8912358283996582, "logits/rejected": -1.845367193222046, "logps/chosen": -286.1430969238281, "logps/rejected": -220.30615234375, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": 1.9721031188964844, "rewards/margins": 4.426693916320801, "rewards/rejected": -2.4545907974243164, "step": 384 }, { "epoch": 2.83302752293578, "grad_norm": 0.8646131157875061, "learning_rate": 4.4201708110795384e-08, "logits/chosen": -0.7442179918289185, "logits/rejected": -1.8478055000305176, "logps/chosen": -304.1729431152344, "logps/rejected": -233.10855102539062, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 1.854830265045166, "rewards/margins": 4.118017673492432, "rewards/rejected": -2.2631874084472656, "step": 386 }, { "epoch": 2.847706422018349, "grad_norm": 0.5237764120101929, "learning_rate": 3.654909004791152e-08, "logits/chosen": -0.7583023309707642, "logits/rejected": -2.0417721271514893, "logps/chosen": -305.7694091796875, "logps/rejected": -214.91346740722656, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 1.7464522123336792, "rewards/margins": 4.34686279296875, "rewards/rejected": -2.600410223007202, "step": 388 }, { "epoch": 2.8623853211009176, "grad_norm": 0.4562954306602478, "learning_rate": 2.9618511411570462e-08, "logits/chosen": -0.8513392210006714, "logits/rejected": -1.93741774559021, "logps/chosen": -298.6360778808594, "logps/rejected": -200.97865295410156, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 1.5848456621170044, "rewards/margins": 4.198357105255127, "rewards/rejected": -2.613511562347412, "step": 390 }, { "epoch": 2.8770642201834864, "grad_norm": 0.8423421382904053, "learning_rate": 2.3412003559898088e-08, "logits/chosen": -0.701295018196106, "logits/rejected": -1.7541186809539795, "logps/chosen": -282.74078369140625, "logps/rejected": -235.9966278076172, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": 1.771613359451294, "rewards/margins": 3.9545040130615234, "rewards/rejected": -2.1828906536102295, "step": 392 }, { "epoch": 2.891743119266055, "grad_norm": 0.7763597369194031, "learning_rate": 1.793138562529634e-08, "logits/chosen": -0.818265438079834, "logits/rejected": -2.0317091941833496, "logps/chosen": -358.9674377441406, "logps/rejected": -211.57412719726562, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 2.096813917160034, "rewards/margins": 4.269334316253662, "rewards/rejected": -2.172520399093628, "step": 394 }, { "epoch": 2.906422018348624, "grad_norm": 0.543138325214386, "learning_rate": 1.317826398125277e-08, "logits/chosen": -0.8907778263092041, "logits/rejected": -2.0098018646240234, "logps/chosen": -306.7756652832031, "logps/rejected": -236.91026306152344, "loss": 0.0441, "rewards/accuracies": 0.984375, "rewards/chosen": 2.1162776947021484, "rewards/margins": 4.927333831787109, "rewards/rejected": -2.81105637550354, "step": 396 }, { "epoch": 2.921100917431193, "grad_norm": 0.40663444995880127, "learning_rate": 9.15403177151275e-09, "logits/chosen": -0.744702160358429, "logits/rejected": -1.7668923139572144, "logps/chosen": -288.4136962890625, "logps/rejected": -249.9727325439453, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 1.962416648864746, "rewards/margins": 4.565864562988281, "rewards/rejected": -2.603447914123535, "step": 398 }, { "epoch": 2.9357798165137616, "grad_norm": 0.4989350736141205, "learning_rate": 5.85986850174608e-09, "logits/chosen": -0.6515053510665894, "logits/rejected": -2.1018004417419434, "logps/chosen": -325.62371826171875, "logps/rejected": -215.43890380859375, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 2.057291030883789, "rewards/margins": 4.5394287109375, "rewards/rejected": -2.4821372032165527, "step": 400 }, { "epoch": 2.9504587155963304, "grad_norm": 0.5903070569038391, "learning_rate": 3.296739693834927e-09, "logits/chosen": -0.936674952507019, "logits/rejected": -1.8789682388305664, "logps/chosen": -317.982666015625, "logps/rejected": -212.38453674316406, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": 1.4384199380874634, "rewards/margins": 3.8503003120422363, "rewards/rejected": -2.4118804931640625, "step": 402 }, { "epoch": 2.9651376146788992, "grad_norm": 0.7236863374710083, "learning_rate": 1.4653966028774225e-09, "logits/chosen": -0.7320691347122192, "logits/rejected": -1.8207372426986694, "logps/chosen": -326.4574890136719, "logps/rejected": -244.78536987304688, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 1.709165096282959, "rewards/margins": 4.418180465698242, "rewards/rejected": -2.7090158462524414, "step": 404 }, { "epoch": 2.979816513761468, "grad_norm": 0.37622901797294617, "learning_rate": 3.6637599699351766e-10, "logits/chosen": -0.6842759847640991, "logits/rejected": -2.033496856689453, "logps/chosen": -302.9255065917969, "logps/rejected": -209.06802368164062, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 1.8500210046768188, "rewards/margins": 4.324655532836914, "rewards/rejected": -2.4746341705322266, "step": 406 }, { "epoch": 2.994495412844037, "grad_norm": 0.5513418316841125, "learning_rate": 0.0, "logits/chosen": -0.8159844279289246, "logits/rejected": -1.814368724822998, "logps/chosen": -333.46990966796875, "logps/rejected": -244.63433837890625, "loss": 0.0625, "rewards/accuracies": 0.984375, "rewards/chosen": 1.8996886014938354, "rewards/margins": 4.328461170196533, "rewards/rejected": -2.4287726879119873, "step": 408 }, { "epoch": 2.994495412844037, "step": 408, "total_flos": 7.837376281021809e+17, "train_loss": 0.2111055671474805, "train_runtime": 8097.1834, "train_samples_per_second": 1.614, "train_steps_per_second": 0.05 } ], "logging_steps": 2, "max_steps": 408, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.837376281021809e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }