{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 68.85501439409333, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.007319450378418, "logits/rejected": -0.9782959222793579, "logps/chosen": -0.2740143835544586, "logps/rejected": -0.27172547578811646, "loss": 3.0362, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.7401437759399414, "rewards/margins": -0.02288895845413208, "rewards/rejected": -2.717254638671875, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 43.462307413510835, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0488415956497192, "logits/rejected": -0.9815706014633179, "logps/chosen": -0.29419511556625366, "logps/rejected": -0.2995131313800812, "loss": 3.1785, "rewards/accuracies": 0.5, "rewards/chosen": -2.941951274871826, "rewards/margins": 0.05318016931414604, "rewards/rejected": -2.995131015777588, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 59.3077193791241, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9633978605270386, "logits/rejected": -0.9824289083480835, "logps/chosen": -0.2639780044555664, "logps/rejected": -0.30077746510505676, "loss": 3.0982, "rewards/accuracies": 0.59375, "rewards/chosen": -2.639780044555664, "rewards/margins": 0.36799436807632446, "rewards/rejected": -3.0077743530273438, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 100.89613137154764, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9631160497665405, "logits/rejected": -0.9371601343154907, "logps/chosen": -0.27738872170448303, "logps/rejected": -0.29155653715133667, "loss": 3.0229, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7738869190216064, "rewards/margins": 0.1416785567998886, "rewards/rejected": -2.9155654907226562, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 58.79708288817627, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.013113260269165, "logits/rejected": -0.9839662313461304, "logps/chosen": -0.27208274602890015, "logps/rejected": -0.2786521315574646, "loss": 3.2744, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.720827341079712, "rewards/margins": 0.06569431722164154, "rewards/rejected": -2.7865219116210938, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 58.05968359177723, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0021781921386719, "logits/rejected": -0.9577069282531738, "logps/chosen": -0.274009644985199, "logps/rejected": -0.2803052067756653, "loss": 3.0005, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -2.7400965690612793, "rewards/margins": 0.06295552104711533, "rewards/rejected": -2.8030521869659424, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 73.78592278395888, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0400080680847168, "logits/rejected": -0.9649320840835571, "logps/chosen": -0.2968520522117615, "logps/rejected": -0.32594868540763855, "loss": 3.0023, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.968520164489746, "rewards/margins": 0.29096680879592896, "rewards/rejected": -3.259486675262451, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 56.93905382011994, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.9891254305839539, "logits/rejected": -0.9455845952033997, "logps/chosen": -0.282151460647583, "logps/rejected": -0.3367989659309387, "loss": 2.9458, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.82151460647583, "rewards/margins": 0.5464746952056885, "rewards/rejected": -3.3679893016815186, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 79.32682008333322, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0468851327896118, "logits/rejected": -1.0032992362976074, "logps/chosen": -0.36789190769195557, "logps/rejected": -0.42787107825279236, "loss": 3.0263, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.6789193153381348, "rewards/margins": 0.5997918248176575, "rewards/rejected": -4.278710842132568, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 91.62537725815142, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.0219460725784302, "logits/rejected": -0.9719876050949097, "logps/chosen": -0.38097304105758667, "logps/rejected": -0.43333888053894043, "loss": 3.0722, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -3.8097305297851562, "rewards/margins": 0.5236578583717346, "rewards/rejected": -4.333388328552246, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 133.17377723672328, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0433048009872437, "logits/rejected": -1.0081396102905273, "logps/chosen": -0.31340470910072327, "logps/rejected": -0.3743906617164612, "loss": 2.8041, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.134047031402588, "rewards/margins": 0.6098597645759583, "rewards/rejected": -3.7439064979553223, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 89.75230108032684, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.0939303636550903, "logits/rejected": -1.0607116222381592, "logps/chosen": -0.32906678318977356, "logps/rejected": -0.3560132384300232, "loss": 2.8477, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.290667772293091, "rewards/margins": 0.2694646716117859, "rewards/rejected": -3.5601325035095215, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 73.19084024449225, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0200589895248413, "logits/rejected": -0.9912623167037964, "logps/chosen": -0.38689833879470825, "logps/rejected": -0.4406129717826843, "loss": 2.8473, "rewards/accuracies": 0.59375, "rewards/chosen": -3.868983030319214, "rewards/margins": 0.5371465682983398, "rewards/rejected": -4.406129360198975, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 46.36345887436274, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0427110195159912, "logits/rejected": -1.017996907234192, "logps/chosen": -0.34935158491134644, "logps/rejected": -0.43221673369407654, "loss": 2.8927, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.493516206741333, "rewards/margins": 0.8286512494087219, "rewards/rejected": -4.32216739654541, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 86.0599966129396, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9983121752738953, "logits/rejected": -0.9283790588378906, "logps/chosen": -0.3482792377471924, "logps/rejected": -0.41999250650405884, "loss": 2.7867, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.482792615890503, "rewards/margins": 0.7171324491500854, "rewards/rejected": -4.199924468994141, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 47.31795000764903, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.974830150604248, "logits/rejected": -0.9607669115066528, "logps/chosen": -0.3474265933036804, "logps/rejected": -0.45176610350608826, "loss": 2.8224, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.4742660522460938, "rewards/margins": 1.0433948040008545, "rewards/rejected": -4.517661094665527, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 73.04066752438544, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9964420199394226, "logits/rejected": -0.9738152623176575, "logps/chosen": -0.3353765606880188, "logps/rejected": -0.4040314555168152, "loss": 2.7119, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.3537659645080566, "rewards/margins": 0.6865488886833191, "rewards/rejected": -4.040314674377441, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 76.692510819006, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0402719974517822, "logits/rejected": -1.0053231716156006, "logps/chosen": -0.4286496043205261, "logps/rejected": -0.519912838935852, "loss": 2.88, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.286496162414551, "rewards/margins": 0.9126325845718384, "rewards/rejected": -5.1991286277771, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 46.68556671183106, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.123992681503296, "logits/rejected": -1.0409624576568604, "logps/chosen": -0.4465855658054352, "logps/rejected": -0.4793754518032074, "loss": 2.83, "rewards/accuracies": 0.53125, "rewards/chosen": -4.465855598449707, "rewards/margins": 0.3278988003730774, "rewards/rejected": -4.793754577636719, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 89.65351102525449, "learning_rate": 9.612209208833646e-07, "logits/chosen": -1.0020520687103271, "logits/rejected": -0.9762676358222961, "logps/chosen": -0.44748955965042114, "logps/rejected": -0.4820302128791809, "loss": 3.012, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.47489595413208, "rewards/margins": 0.34540656208992004, "rewards/rejected": -4.8203020095825195, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 74.62602523369812, "learning_rate": 9.536793472839324e-07, "logits/chosen": -1.025708794593811, "logits/rejected": -0.9719223976135254, "logps/chosen": -0.3887873888015747, "logps/rejected": -0.4932475686073303, "loss": 2.8263, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.887873888015747, "rewards/margins": 1.0446012020111084, "rewards/rejected": -4.9324750900268555, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 68.98493486526344, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9832603335380554, "logits/rejected": -0.921442985534668, "logps/chosen": -0.43601909279823303, "logps/rejected": -0.5775080323219299, "loss": 2.7095, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.360190391540527, "rewards/margins": 1.4148895740509033, "rewards/rejected": -5.775080680847168, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 104.15189984616235, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.0532798767089844, "logits/rejected": -0.991862416267395, "logps/chosen": -0.47051066160202026, "logps/rejected": -0.5247665047645569, "loss": 2.7215, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.705106258392334, "rewards/margins": 0.5425585508346558, "rewards/rejected": -5.247664451599121, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 58.826951355159096, "learning_rate": 9.272941683504808e-07, "logits/chosen": -1.003598928451538, "logits/rejected": -0.9121431112289429, "logps/chosen": -0.47750264406204224, "logps/rejected": -0.6715680956840515, "loss": 2.5484, "rewards/accuracies": 0.71875, "rewards/chosen": -4.775026798248291, "rewards/margins": 1.9406547546386719, "rewards/rejected": -6.715681552886963, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 66.98580734997083, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0721389055252075, "logits/rejected": -1.0289056301116943, "logps/chosen": -0.5379669666290283, "logps/rejected": -0.6219452023506165, "loss": 2.3715, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -5.379669189453125, "rewards/margins": 0.8397828340530396, "rewards/rejected": -6.219452381134033, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 84.77905548021701, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.100857138633728, "logits/rejected": -1.0902975797653198, "logps/chosen": -0.5108389258384705, "logps/rejected": -0.7690914869308472, "loss": 2.3622, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.108389377593994, "rewards/margins": 2.5825250148773193, "rewards/rejected": -7.690914154052734, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 83.79514155786627, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0757827758789062, "logits/rejected": -1.024726152420044, "logps/chosen": -0.5399268269538879, "logps/rejected": -0.6515442728996277, "loss": 2.4957, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.399269104003906, "rewards/margins": 1.1161742210388184, "rewards/rejected": -6.515442848205566, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 84.91769818698891, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.1459643840789795, "logits/rejected": -1.1201996803283691, "logps/chosen": -0.611223578453064, "logps/rejected": -0.7296082973480225, "loss": 2.4149, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.112236022949219, "rewards/margins": 1.183847188949585, "rewards/rejected": -7.296082973480225, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 102.01113634181742, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0526931285858154, "logits/rejected": -1.0242080688476562, "logps/chosen": -0.6204456090927124, "logps/rejected": -0.8014122843742371, "loss": 2.176, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -6.204455375671387, "rewards/margins": 1.8096668720245361, "rewards/rejected": -8.01412296295166, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 74.15980695565004, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.093958854675293, "logits/rejected": -1.0704580545425415, "logps/chosen": -0.649553656578064, "logps/rejected": -0.8072296977043152, "loss": 2.2659, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -6.495536804199219, "rewards/margins": 1.5767608880996704, "rewards/rejected": -8.072297096252441, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 72.62800724311847, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.1008553504943848, "logits/rejected": -1.0772287845611572, "logps/chosen": -0.6936475038528442, "logps/rejected": -0.9287503957748413, "loss": 2.1291, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -6.9364752769470215, "rewards/margins": 2.3510289192199707, "rewards/rejected": -9.287505149841309, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 79.70729288803042, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.1146085262298584, "logits/rejected": -1.0959725379943848, "logps/chosen": -0.7885189056396484, "logps/rejected": -1.1322988271713257, "loss": 2.2489, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.885189056396484, "rewards/margins": 3.4377999305725098, "rewards/rejected": -11.322988510131836, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 69.7876072320057, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.1170759201049805, "logits/rejected": -1.0901795625686646, "logps/chosen": -0.8508628606796265, "logps/rejected": -1.1688997745513916, "loss": 2.149, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -8.508628845214844, "rewards/margins": 3.180368661880493, "rewards/rejected": -11.688997268676758, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 81.33338167428913, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.1123058795928955, "logits/rejected": -1.089231014251709, "logps/chosen": -0.8155210614204407, "logps/rejected": -1.1522643566131592, "loss": 1.9615, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.1552095413208, "rewards/margins": 3.367433547973633, "rewards/rejected": -11.52264404296875, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 148.27715900401293, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.1275430917739868, "logits/rejected": -1.0795894861221313, "logps/chosen": -0.9230238199234009, "logps/rejected": -1.1233540773391724, "loss": 1.8921, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -9.23023796081543, "rewards/margins": 2.0033037662506104, "rewards/rejected": -11.233541488647461, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 106.59570449253785, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.0845016241073608, "logits/rejected": -1.0633010864257812, "logps/chosen": -0.8883851170539856, "logps/rejected": -1.1803685426712036, "loss": 1.8854, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -8.883851051330566, "rewards/margins": 2.9198341369628906, "rewards/rejected": -11.803686141967773, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 121.9704404283126, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.1263529062271118, "logits/rejected": -1.0700018405914307, "logps/chosen": -0.9720249176025391, "logps/rejected": -1.2823493480682373, "loss": 1.8685, "rewards/accuracies": 0.78125, "rewards/chosen": -9.720248222351074, "rewards/margins": 3.1032447814941406, "rewards/rejected": -12.823492050170898, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 76.82499833409383, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.109298586845398, "logits/rejected": -1.1157281398773193, "logps/chosen": -1.0440528392791748, "logps/rejected": -1.4969879388809204, "loss": 1.726, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.440529823303223, "rewards/margins": 4.529348850250244, "rewards/rejected": -14.969879150390625, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 98.31254492588728, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0706043243408203, "logits/rejected": -1.049782633781433, "logps/chosen": -1.0430076122283936, "logps/rejected": -1.383687973022461, "loss": 1.7355, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -10.430075645446777, "rewards/margins": 3.4068026542663574, "rewards/rejected": -13.836878776550293, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 121.71126629656227, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0543218851089478, "logits/rejected": -1.0419845581054688, "logps/chosen": -1.0992997884750366, "logps/rejected": -1.4704090356826782, "loss": 1.8859, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -10.992999076843262, "rewards/margins": 3.711090564727783, "rewards/rejected": -14.704089164733887, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 106.07902118865707, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.1024644374847412, "logits/rejected": -1.052335500717163, "logps/chosen": -1.1648457050323486, "logps/rejected": -1.5202152729034424, "loss": 1.8443, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -11.648456573486328, "rewards/margins": 3.553696870803833, "rewards/rejected": -15.202153205871582, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 119.52331235635606, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.120891809463501, "logits/rejected": -1.1110631227493286, "logps/chosen": -1.2501453161239624, "logps/rejected": -1.6519298553466797, "loss": 1.7963, "rewards/accuracies": 0.84375, "rewards/chosen": -12.50145435333252, "rewards/margins": 4.017845153808594, "rewards/rejected": -16.519298553466797, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 98.00238843896823, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.063614845275879, "logits/rejected": -1.0467475652694702, "logps/chosen": -1.3223060369491577, "logps/rejected": -1.7558891773223877, "loss": 1.7131, "rewards/accuracies": 0.78125, "rewards/chosen": -13.223058700561523, "rewards/margins": 4.335831642150879, "rewards/rejected": -17.55889320373535, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 114.86693788333515, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.1257785558700562, "logits/rejected": -1.1067885160446167, "logps/chosen": -1.3776687383651733, "logps/rejected": -1.8501077890396118, "loss": 1.7429, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -13.776687622070312, "rewards/margins": 4.724389553070068, "rewards/rejected": -18.50107765197754, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 129.99247004695826, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.105886697769165, "logits/rejected": -1.0728118419647217, "logps/chosen": -1.3728220462799072, "logps/rejected": -1.8042919635772705, "loss": 1.7141, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -13.728219985961914, "rewards/margins": 4.314699649810791, "rewards/rejected": -18.042917251586914, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 98.65669520922688, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1883631944656372, "logits/rejected": -1.1372156143188477, "logps/chosen": -1.363006353378296, "logps/rejected": -1.7407630681991577, "loss": 1.6921, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -13.630064010620117, "rewards/margins": 3.7775673866271973, "rewards/rejected": -17.407630920410156, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 94.32058594396345, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.222760796546936, "logits/rejected": -1.1934791803359985, "logps/chosen": -1.3279025554656982, "logps/rejected": -1.7879940271377563, "loss": 1.5978, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -13.279026985168457, "rewards/margins": 4.600913047790527, "rewards/rejected": -17.879940032958984, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 97.68263555717222, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.2020962238311768, "logits/rejected": -1.2033917903900146, "logps/chosen": -1.2446677684783936, "logps/rejected": -1.7023578882217407, "loss": 1.6344, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -12.446677207946777, "rewards/margins": 4.576901435852051, "rewards/rejected": -17.02358055114746, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 202.05423816354624, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.2868934869766235, "logits/rejected": -1.225835919380188, "logps/chosen": -1.2727655172348022, "logps/rejected": -1.7838712930679321, "loss": 1.4597, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -12.727655410766602, "rewards/margins": 5.111058235168457, "rewards/rejected": -17.838712692260742, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 120.34400804479175, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.2153074741363525, "logits/rejected": -1.1993039846420288, "logps/chosen": -1.342670202255249, "logps/rejected": -1.8750245571136475, "loss": 1.5212, "rewards/accuracies": 0.84375, "rewards/chosen": -13.426701545715332, "rewards/margins": 5.323545932769775, "rewards/rejected": -18.750247955322266, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 95.36132768916778, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.265678882598877, "logits/rejected": -1.2382146120071411, "logps/chosen": -1.475931167602539, "logps/rejected": -1.921068549156189, "loss": 1.6491, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.759312629699707, "rewards/margins": 4.4513750076293945, "rewards/rejected": -19.2106876373291, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 117.86214570617697, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.2461557388305664, "logits/rejected": -1.2149869203567505, "logps/chosen": -1.3435603380203247, "logps/rejected": -1.779606580734253, "loss": 1.5681, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -13.435603141784668, "rewards/margins": 4.360462188720703, "rewards/rejected": -17.796064376831055, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 91.7079115328607, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.2090892791748047, "logits/rejected": -1.1903409957885742, "logps/chosen": -1.4390499591827393, "logps/rejected": -1.8992010354995728, "loss": 1.6488, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -14.390498161315918, "rewards/margins": 4.6015119552612305, "rewards/rejected": -18.99201011657715, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 220.87890749833002, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.207067847251892, "logits/rejected": -1.1773267984390259, "logps/chosen": -1.3406851291656494, "logps/rejected": -1.8256925344467163, "loss": 1.6271, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -13.406852722167969, "rewards/margins": 4.850072860717773, "rewards/rejected": -18.25692367553711, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 87.3849854148476, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.1608107089996338, "logits/rejected": -1.1226792335510254, "logps/chosen": -1.3398956060409546, "logps/rejected": -1.7405239343643188, "loss": 1.7113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.398956298828125, "rewards/margins": 4.006283760070801, "rewards/rejected": -17.40523910522461, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 148.2686835393089, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.2560105323791504, "logits/rejected": -1.2310097217559814, "logps/chosen": -1.3750091791152954, "logps/rejected": -1.8952747583389282, "loss": 1.5337, "rewards/accuracies": 0.8125, "rewards/chosen": -13.750091552734375, "rewards/margins": 5.202655792236328, "rewards/rejected": -18.952749252319336, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 98.69515577428142, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.224327564239502, "logits/rejected": -1.1907469034194946, "logps/chosen": -1.431443691253662, "logps/rejected": -1.9771114587783813, "loss": 1.3013, "rewards/accuracies": 0.875, "rewards/chosen": -14.314435958862305, "rewards/margins": 5.4566779136657715, "rewards/rejected": -19.7711124420166, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 141.5275248058356, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.257261872291565, "logits/rejected": -1.213803768157959, "logps/chosen": -1.40240478515625, "logps/rejected": -1.8522781133651733, "loss": 1.5508, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.0240478515625, "rewards/margins": 4.498733997344971, "rewards/rejected": -18.52277946472168, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 114.51767800087, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.2009317874908447, "logits/rejected": -1.1843944787979126, "logps/chosen": -1.482761025428772, "logps/rejected": -1.9243135452270508, "loss": 1.4133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.827609062194824, "rewards/margins": 4.415524482727051, "rewards/rejected": -19.243135452270508, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 133.03386433388292, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.1327836513519287, "logits/rejected": -1.1056478023529053, "logps/chosen": -1.485954999923706, "logps/rejected": -1.863865852355957, "loss": 1.8363, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -14.859548568725586, "rewards/margins": 3.779109239578247, "rewards/rejected": -18.638660430908203, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 91.68870349271927, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.239027738571167, "logits/rejected": -1.1926300525665283, "logps/chosen": -1.4515793323516846, "logps/rejected": -1.8309190273284912, "loss": 1.7083, "rewards/accuracies": 0.75, "rewards/chosen": -14.515792846679688, "rewards/margins": 3.7933971881866455, "rewards/rejected": -18.30919075012207, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 135.3549980063555, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.2658618688583374, "logits/rejected": -1.2451133728027344, "logps/chosen": -1.528917670249939, "logps/rejected": -1.9810943603515625, "loss": 1.6482, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -15.289176940917969, "rewards/margins": 4.521766662597656, "rewards/rejected": -19.810943603515625, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 103.62793682266408, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2625854015350342, "logits/rejected": -1.2041596174240112, "logps/chosen": -1.4363172054290771, "logps/rejected": -1.9117801189422607, "loss": 1.5096, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -14.363171577453613, "rewards/margins": 4.754631042480469, "rewards/rejected": -19.117801666259766, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 124.93488530926095, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.2458070516586304, "logits/rejected": -1.2413748502731323, "logps/chosen": -1.4891000986099243, "logps/rejected": -2.0556411743164062, "loss": 1.431, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -14.891000747680664, "rewards/margins": 5.665411472320557, "rewards/rejected": -20.556411743164062, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 132.66286552485195, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.262664556503296, "logits/rejected": -1.211101770401001, "logps/chosen": -1.6323843002319336, "logps/rejected": -2.2400825023651123, "loss": 1.4653, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.323843002319336, "rewards/margins": 6.076982498168945, "rewards/rejected": -22.40082550048828, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 77.86519610350517, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.1737123727798462, "logits/rejected": -1.1300561428070068, "logps/chosen": -1.5188909769058228, "logps/rejected": -1.950216293334961, "loss": 1.5313, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.188909530639648, "rewards/margins": 4.313254356384277, "rewards/rejected": -19.50216293334961, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 116.83352825706731, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.202048420906067, "logits/rejected": -1.1833049058914185, "logps/chosen": -1.4710850715637207, "logps/rejected": -1.9782819747924805, "loss": 1.3351, "rewards/accuracies": 0.84375, "rewards/chosen": -14.710851669311523, "rewards/margins": 5.071969032287598, "rewards/rejected": -19.782819747924805, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 117.85874090140514, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.2336275577545166, "logits/rejected": -1.1968821287155151, "logps/chosen": -1.4882738590240479, "logps/rejected": -2.008549928665161, "loss": 1.5616, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.88273811340332, "rewards/margins": 5.202761650085449, "rewards/rejected": -20.085498809814453, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 111.48616582145915, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.2667595148086548, "logits/rejected": -1.2406939268112183, "logps/chosen": -1.5598745346069336, "logps/rejected": -2.16188383102417, "loss": 1.5993, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.59874439239502, "rewards/margins": 6.020094394683838, "rewards/rejected": -21.618839263916016, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 133.44639116296503, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.244403600692749, "logits/rejected": -1.24759042263031, "logps/chosen": -1.5018080472946167, "logps/rejected": -2.005552291870117, "loss": 1.5981, "rewards/accuracies": 0.84375, "rewards/chosen": -15.01807975769043, "rewards/margins": 5.037442207336426, "rewards/rejected": -20.055522918701172, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 145.88647972080093, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.1816396713256836, "logits/rejected": -1.195244550704956, "logps/chosen": -1.4624321460723877, "logps/rejected": -1.9389760494232178, "loss": 1.3978, "rewards/accuracies": 0.84375, "rewards/chosen": -14.624320983886719, "rewards/margins": 4.765438079833984, "rewards/rejected": -19.389759063720703, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 89.39232956423915, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.247115135192871, "logits/rejected": -1.2440454959869385, "logps/chosen": -1.4620821475982666, "logps/rejected": -2.070996046066284, "loss": 1.464, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.620820999145508, "rewards/margins": 6.08914041519165, "rewards/rejected": -20.7099609375, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 97.94137628606371, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.2937978506088257, "logits/rejected": -1.2186272144317627, "logps/chosen": -1.5094481706619263, "logps/rejected": -2.0971579551696777, "loss": 1.4916, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.094482421875, "rewards/margins": 5.87709903717041, "rewards/rejected": -20.97157859802246, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 122.16117083194588, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.2087019681930542, "logits/rejected": -1.195441484451294, "logps/chosen": -1.4670366048812866, "logps/rejected": -1.9255189895629883, "loss": 1.5354, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -14.670367240905762, "rewards/margins": 4.5848236083984375, "rewards/rejected": -19.255189895629883, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 118.93381227295681, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.241072416305542, "logits/rejected": -1.2429205179214478, "logps/chosen": -1.5817298889160156, "logps/rejected": -2.1309893131256104, "loss": 1.5269, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.817300796508789, "rewards/margins": 5.492591381072998, "rewards/rejected": -21.309892654418945, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 149.16584822794186, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2636191844940186, "logits/rejected": -1.2404122352600098, "logps/chosen": -1.4529675245285034, "logps/rejected": -1.929680585861206, "loss": 1.5703, "rewards/accuracies": 0.84375, "rewards/chosen": -14.52967643737793, "rewards/margins": 4.767130374908447, "rewards/rejected": -19.29680824279785, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 134.81427667533356, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.2461379766464233, "logits/rejected": -1.2212632894515991, "logps/chosen": -1.488556146621704, "logps/rejected": -2.128378391265869, "loss": 1.3302, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -14.8855619430542, "rewards/margins": 6.398221015930176, "rewards/rejected": -21.283782958984375, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 106.31823929279197, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.262342095375061, "logits/rejected": -1.2427619695663452, "logps/chosen": -1.546651840209961, "logps/rejected": -2.105900526046753, "loss": 1.5107, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.466519355773926, "rewards/margins": 5.5924882888793945, "rewards/rejected": -21.05900764465332, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 82.49774941818607, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.2678358554840088, "logits/rejected": -1.2441434860229492, "logps/chosen": -1.4695792198181152, "logps/rejected": -2.003164768218994, "loss": 1.4509, "rewards/accuracies": 0.8125, "rewards/chosen": -14.695793151855469, "rewards/margins": 5.335854530334473, "rewards/rejected": -20.031646728515625, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 117.67752136496273, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2668910026550293, "logits/rejected": -1.2486127614974976, "logps/chosen": -1.4717615842819214, "logps/rejected": -1.9521501064300537, "loss": 1.4439, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.717615127563477, "rewards/margins": 4.803887367248535, "rewards/rejected": -19.521503448486328, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.484498143196106, "eval_logits/rejected": -1.4926748275756836, "eval_logps/chosen": -1.490028738975525, "eval_logps/rejected": -2.012206554412842, "eval_loss": 1.3685932159423828, "eval_rewards/accuracies": 0.8353658318519592, "eval_rewards/chosen": -14.900286674499512, "eval_rewards/margins": 5.221778392791748, "eval_rewards/rejected": -20.122066497802734, "eval_runtime": 95.1291, "eval_samples_per_second": 20.614, "eval_steps_per_second": 1.293, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 120.99808262565784, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.2351343631744385, "logits/rejected": -1.2513123750686646, "logps/chosen": -1.5402753353118896, "logps/rejected": -1.9941785335540771, "loss": 1.4541, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.402752876281738, "rewards/margins": 4.539034366607666, "rewards/rejected": -19.94178581237793, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 184.47321309186552, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.247280478477478, "logits/rejected": -1.234876036643982, "logps/chosen": -1.4927871227264404, "logps/rejected": -1.9950872659683228, "loss": 1.6499, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -14.927871704101562, "rewards/margins": 5.023002624511719, "rewards/rejected": -19.95087242126465, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 94.29238243376733, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.2503280639648438, "logits/rejected": -1.1914690732955933, "logps/chosen": -1.4084547758102417, "logps/rejected": -1.9929052591323853, "loss": 1.6712, "rewards/accuracies": 0.875, "rewards/chosen": -14.084548950195312, "rewards/margins": 5.844505310058594, "rewards/rejected": -19.929052352905273, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 129.92639778174433, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.280215859413147, "logits/rejected": -1.2256780862808228, "logps/chosen": -1.3968563079833984, "logps/rejected": -1.92953622341156, "loss": 1.514, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -13.968562126159668, "rewards/margins": 5.326799392700195, "rewards/rejected": -19.295360565185547, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 134.3230870539629, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.282026767730713, "logits/rejected": -1.2692543268203735, "logps/chosen": -1.5369470119476318, "logps/rejected": -2.0062360763549805, "loss": 1.6752, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -15.369470596313477, "rewards/margins": 4.6928911209106445, "rewards/rejected": -20.062358856201172, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 234.26732939024393, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.2548713684082031, "logits/rejected": -1.235939383506775, "logps/chosen": -1.565554141998291, "logps/rejected": -2.0692152976989746, "loss": 1.597, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.655542373657227, "rewards/margins": 5.036610126495361, "rewards/rejected": -20.692150115966797, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 86.1878830813907, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.2161545753479004, "logits/rejected": -1.1717766523361206, "logps/chosen": -1.4371172189712524, "logps/rejected": -1.9690968990325928, "loss": 1.4224, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -14.371172904968262, "rewards/margins": 5.319798469543457, "rewards/rejected": -19.69097328186035, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 124.03718973399216, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.2341668605804443, "logits/rejected": -1.1728591918945312, "logps/chosen": -1.4924213886260986, "logps/rejected": -1.9679025411605835, "loss": 1.4177, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.924214363098145, "rewards/margins": 4.754812240600586, "rewards/rejected": -19.679025650024414, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 94.74857172171315, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1919021606445312, "logits/rejected": -1.1725019216537476, "logps/chosen": -1.5140306949615479, "logps/rejected": -2.1036159992218018, "loss": 1.2401, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.140307426452637, "rewards/margins": 5.895852088928223, "rewards/rejected": -21.03615951538086, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 125.47450644355813, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.3033939599990845, "logits/rejected": -1.2800980806350708, "logps/chosen": -1.4365876913070679, "logps/rejected": -1.9429874420166016, "loss": 1.4917, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.365875244140625, "rewards/margins": 5.063995361328125, "rewards/rejected": -19.42987060546875, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 107.55948121005738, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.2646621465682983, "logits/rejected": -1.2269269227981567, "logps/chosen": -1.468353271484375, "logps/rejected": -2.0639400482177734, "loss": 1.1516, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -14.68353271484375, "rewards/margins": 5.955868244171143, "rewards/rejected": -20.639400482177734, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 93.15728602919113, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.2586472034454346, "logits/rejected": -1.2455804347991943, "logps/chosen": -1.5446629524230957, "logps/rejected": -2.104782819747925, "loss": 1.4082, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.446629524230957, "rewards/margins": 5.601197719573975, "rewards/rejected": -21.047826766967773, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 101.92365413278564, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.2412211894989014, "logits/rejected": -1.244315505027771, "logps/chosen": -1.5213388204574585, "logps/rejected": -2.075791835784912, "loss": 1.5253, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.213388442993164, "rewards/margins": 5.544531345367432, "rewards/rejected": -20.757919311523438, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.9910774169693157, "train_runtime": 11451.8288, "train_samples_per_second": 5.229, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }