File size: 31,498 Bytes
ecf66fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9885057471264368,
  "eval_steps": 1,
  "global_step": 43,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022988505747126436,
      "eval_logits/chosen": -0.285895973443985,
      "eval_logits/rejected": -0.2622124254703522,
      "eval_logps/chosen": -265.5323791503906,
      "eval_logps/rejected": -265.8489074707031,
      "eval_loss": 2.506535530090332,
      "eval_nll_loss": 0.7676451206207275,
      "eval_rewards/accuracies": 0.5173913240432739,
      "eval_rewards/chosen": -26.55323600769043,
      "eval_rewards/margins": 0.03165607899427414,
      "eval_rewards/rejected": -26.58489227294922,
      "eval_runtime": 73.2699,
      "eval_samples_per_second": 24.922,
      "eval_steps_per_second": 1.57,
      "step": 1
    },
    {
      "epoch": 0.04597701149425287,
      "eval_logits/chosen": -0.28692948818206787,
      "eval_logits/rejected": -0.2633576989173889,
      "eval_logps/chosen": -265.1800537109375,
      "eval_logps/rejected": -265.50384521484375,
      "eval_loss": 2.505967855453491,
      "eval_nll_loss": 0.766638994216919,
      "eval_rewards/accuracies": 0.52173912525177,
      "eval_rewards/chosen": -26.51800537109375,
      "eval_rewards/margins": 0.03237998113036156,
      "eval_rewards/rejected": -26.550386428833008,
      "eval_runtime": 73.1616,
      "eval_samples_per_second": 24.958,
      "eval_steps_per_second": 1.572,
      "step": 2
    },
    {
      "epoch": 0.06896551724137931,
      "eval_logits/chosen": -0.28926920890808105,
      "eval_logits/rejected": -0.2657304108142853,
      "eval_logps/chosen": -265.0088195800781,
      "eval_logps/rejected": -265.2777404785156,
      "eval_loss": 2.505052328109741,
      "eval_nll_loss": 0.7661022543907166,
      "eval_rewards/accuracies": 0.52173912525177,
      "eval_rewards/chosen": -26.500883102416992,
      "eval_rewards/margins": 0.02689189836382866,
      "eval_rewards/rejected": -26.527772903442383,
      "eval_runtime": 73.4564,
      "eval_samples_per_second": 24.858,
      "eval_steps_per_second": 1.566,
      "step": 3
    },
    {
      "epoch": 0.09195402298850575,
      "eval_logits/chosen": -0.29259422421455383,
      "eval_logits/rejected": -0.26898470520973206,
      "eval_logps/chosen": -263.95184326171875,
      "eval_logps/rejected": -264.3146667480469,
      "eval_loss": 2.498246669769287,
      "eval_nll_loss": 0.7631996870040894,
      "eval_rewards/accuracies": 0.5239130258560181,
      "eval_rewards/chosen": -26.395187377929688,
      "eval_rewards/margins": 0.03628147765994072,
      "eval_rewards/rejected": -26.43147087097168,
      "eval_runtime": 73.5575,
      "eval_samples_per_second": 24.824,
      "eval_steps_per_second": 1.563,
      "step": 4
    },
    {
      "epoch": 0.11494252873563218,
      "eval_logits/chosen": -0.29848381876945496,
      "eval_logits/rejected": -0.27501967549324036,
      "eval_logps/chosen": -262.6512145996094,
      "eval_logps/rejected": -263.0111999511719,
      "eval_loss": 2.489372968673706,
      "eval_nll_loss": 0.7594311237335205,
      "eval_rewards/accuracies": 0.52173912525177,
      "eval_rewards/chosen": -26.26512336730957,
      "eval_rewards/margins": 0.035997405648231506,
      "eval_rewards/rejected": -26.30112075805664,
      "eval_runtime": 73.7594,
      "eval_samples_per_second": 24.756,
      "eval_steps_per_second": 1.559,
      "step": 5
    },
    {
      "epoch": 0.13793103448275862,
      "eval_logits/chosen": -0.30859696865081787,
      "eval_logits/rejected": -0.2858428359031677,
      "eval_logps/chosen": -259.449951171875,
      "eval_logps/rejected": -259.8551330566406,
      "eval_loss": 2.4688832759857178,
      "eval_nll_loss": 0.7501848340034485,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -25.94499397277832,
      "eval_rewards/margins": 0.040517814457416534,
      "eval_rewards/rejected": -25.98551368713379,
      "eval_runtime": 73.5054,
      "eval_samples_per_second": 24.842,
      "eval_steps_per_second": 1.565,
      "step": 6
    },
    {
      "epoch": 0.16091954022988506,
      "eval_logits/chosen": -0.3201945424079895,
      "eval_logits/rejected": -0.297221302986145,
      "eval_logps/chosen": -257.0843200683594,
      "eval_logps/rejected": -257.527099609375,
      "eval_loss": 2.4511067867279053,
      "eval_nll_loss": 0.7433211207389832,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -25.708433151245117,
      "eval_rewards/margins": 0.04427630454301834,
      "eval_rewards/rejected": -25.752708435058594,
      "eval_runtime": 73.716,
      "eval_samples_per_second": 24.771,
      "eval_steps_per_second": 1.56,
      "step": 7
    },
    {
      "epoch": 0.1839080459770115,
      "eval_logits/chosen": -0.348645955324173,
      "eval_logits/rejected": -0.3254188001155853,
      "eval_logps/chosen": -252.2147216796875,
      "eval_logps/rejected": -252.7242431640625,
      "eval_loss": 2.4179530143737793,
      "eval_nll_loss": 0.7291316390037537,
      "eval_rewards/accuracies": 0.532608687877655,
      "eval_rewards/chosen": -25.22147560119629,
      "eval_rewards/margins": 0.05095084756612778,
      "eval_rewards/rejected": -25.27242660522461,
      "eval_runtime": 73.8275,
      "eval_samples_per_second": 24.733,
      "eval_steps_per_second": 1.558,
      "step": 8
    },
    {
      "epoch": 0.20689655172413793,
      "eval_logits/chosen": -0.37005820870399475,
      "eval_logits/rejected": -0.3462548851966858,
      "eval_logps/chosen": -248.8451385498047,
      "eval_logps/rejected": -249.3928985595703,
      "eval_loss": 2.3951992988586426,
      "eval_nll_loss": 0.7191779017448425,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -24.8845157623291,
      "eval_rewards/margins": 0.054776255041360855,
      "eval_rewards/rejected": -24.939287185668945,
      "eval_runtime": 73.7047,
      "eval_samples_per_second": 24.775,
      "eval_steps_per_second": 1.56,
      "step": 9
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 55.518348693847656,
      "learning_rate": 8.684210526315789e-07,
      "logits/chosen": -0.35856884717941284,
      "logits/rejected": -0.3261299431324005,
      "logps/chosen": -264.810302734375,
      "logps/rejected": -258.8919982910156,
      "loss": 2.6865,
      "nll_loss": 0.7651573419570923,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -26.481029510498047,
      "rewards/margins": -0.5918328166007996,
      "rewards/rejected": -25.889196395874023,
      "step": 10
    },
    {
      "epoch": 0.22988505747126436,
      "eval_logits/chosen": -0.384502112865448,
      "eval_logits/rejected": -0.3603852689266205,
      "eval_logps/chosen": -246.21482849121094,
      "eval_logps/rejected": -246.78208923339844,
      "eval_loss": 2.376126766204834,
      "eval_nll_loss": 0.7115476727485657,
      "eval_rewards/accuracies": 0.5347825884819031,
      "eval_rewards/chosen": -24.621484756469727,
      "eval_rewards/margins": 0.05672362819314003,
      "eval_rewards/rejected": -24.678205490112305,
      "eval_runtime": 73.7798,
      "eval_samples_per_second": 24.749,
      "eval_steps_per_second": 1.559,
      "step": 10
    },
    {
      "epoch": 0.25287356321839083,
      "eval_logits/chosen": -0.397601842880249,
      "eval_logits/rejected": -0.3731386959552765,
      "eval_logps/chosen": -244.02699279785156,
      "eval_logps/rejected": -244.7050323486328,
      "eval_loss": 2.3608767986297607,
      "eval_nll_loss": 0.7050958275794983,
      "eval_rewards/accuracies": 0.539130449295044,
      "eval_rewards/chosen": -24.402700424194336,
      "eval_rewards/margins": 0.06780331581830978,
      "eval_rewards/rejected": -24.470500946044922,
      "eval_runtime": 73.0824,
      "eval_samples_per_second": 24.985,
      "eval_steps_per_second": 1.574,
      "step": 11
    },
    {
      "epoch": 0.27586206896551724,
      "eval_logits/chosen": -0.4218127429485321,
      "eval_logits/rejected": -0.3970121443271637,
      "eval_logps/chosen": -240.5603790283203,
      "eval_logps/rejected": -241.30628967285156,
      "eval_loss": 2.3367197513580322,
      "eval_nll_loss": 0.6951096057891846,
      "eval_rewards/accuracies": 0.5347825884819031,
      "eval_rewards/chosen": -24.05603790283203,
      "eval_rewards/margins": 0.074591264128685,
      "eval_rewards/rejected": -24.130634307861328,
      "eval_runtime": 73.2125,
      "eval_samples_per_second": 24.941,
      "eval_steps_per_second": 1.571,
      "step": 12
    },
    {
      "epoch": 0.2988505747126437,
      "eval_logits/chosen": -0.4434413015842438,
      "eval_logits/rejected": -0.4179251492023468,
      "eval_logps/chosen": -236.7858123779297,
      "eval_logps/rejected": -237.64541625976562,
      "eval_loss": 2.310944080352783,
      "eval_nll_loss": 0.6840075850486755,
      "eval_rewards/accuracies": 0.530434787273407,
      "eval_rewards/chosen": -23.6785831451416,
      "eval_rewards/margins": 0.08595678210258484,
      "eval_rewards/rejected": -23.764541625976562,
      "eval_runtime": 73.2236,
      "eval_samples_per_second": 24.937,
      "eval_steps_per_second": 1.571,
      "step": 13
    },
    {
      "epoch": 0.3218390804597701,
      "eval_logits/chosen": -0.4679478406906128,
      "eval_logits/rejected": -0.4422786235809326,
      "eval_logps/chosen": -233.17481994628906,
      "eval_logps/rejected": -234.0310821533203,
      "eval_loss": 2.290565252304077,
      "eval_nll_loss": 0.6733331680297852,
      "eval_rewards/accuracies": 0.5347825884819031,
      "eval_rewards/chosen": -23.317480087280273,
      "eval_rewards/margins": 0.08562804758548737,
      "eval_rewards/rejected": -23.40311050415039,
      "eval_runtime": 73.3905,
      "eval_samples_per_second": 24.881,
      "eval_steps_per_second": 1.567,
      "step": 14
    },
    {
      "epoch": 0.3448275862068966,
      "eval_logits/chosen": -0.49170100688934326,
      "eval_logits/rejected": -0.4659886956214905,
      "eval_logps/chosen": -229.94561767578125,
      "eval_logps/rejected": -230.9332275390625,
      "eval_loss": 2.272915840148926,
      "eval_nll_loss": 0.663709819316864,
      "eval_rewards/accuracies": 0.5347825884819031,
      "eval_rewards/chosen": -22.99456024169922,
      "eval_rewards/margins": 0.09876058995723724,
      "eval_rewards/rejected": -23.093320846557617,
      "eval_runtime": 73.5456,
      "eval_samples_per_second": 24.828,
      "eval_steps_per_second": 1.564,
      "step": 15
    },
    {
      "epoch": 0.367816091954023,
      "eval_logits/chosen": -0.5142260789871216,
      "eval_logits/rejected": -0.4886496365070343,
      "eval_logps/chosen": -227.06649780273438,
      "eval_logps/rejected": -228.05648803710938,
      "eval_loss": 2.257603406906128,
      "eval_nll_loss": 0.6548909544944763,
      "eval_rewards/accuracies": 0.5369565486907959,
      "eval_rewards/chosen": -22.70665168762207,
      "eval_rewards/margins": 0.09899646788835526,
      "eval_rewards/rejected": -22.805648803710938,
      "eval_runtime": 73.494,
      "eval_samples_per_second": 24.846,
      "eval_steps_per_second": 1.565,
      "step": 16
    },
    {
      "epoch": 0.39080459770114945,
      "eval_logits/chosen": -0.5408182144165039,
      "eval_logits/rejected": -0.5151581764221191,
      "eval_logps/chosen": -224.1295928955078,
      "eval_logps/rejected": -225.16580200195312,
      "eval_loss": 2.241145133972168,
      "eval_nll_loss": 0.6459768414497375,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -22.4129581451416,
      "eval_rewards/margins": 0.10362222790718079,
      "eval_rewards/rejected": -22.516578674316406,
      "eval_runtime": 73.7057,
      "eval_samples_per_second": 24.774,
      "eval_steps_per_second": 1.56,
      "step": 17
    },
    {
      "epoch": 0.41379310344827586,
      "eval_logits/chosen": -0.5656267404556274,
      "eval_logits/rejected": -0.5400449633598328,
      "eval_logps/chosen": -221.59368896484375,
      "eval_logps/rejected": -222.6521759033203,
      "eval_loss": 2.230027198791504,
      "eval_nll_loss": 0.6381992697715759,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -22.15936851501465,
      "eval_rewards/margins": 0.10584992170333862,
      "eval_rewards/rejected": -22.265216827392578,
      "eval_runtime": 73.8674,
      "eval_samples_per_second": 24.72,
      "eval_steps_per_second": 1.557,
      "step": 18
    },
    {
      "epoch": 0.4367816091954023,
      "eval_logits/chosen": -0.5914514064788818,
      "eval_logits/rejected": -0.565658688545227,
      "eval_logps/chosen": -219.20506286621094,
      "eval_logps/rejected": -220.354736328125,
      "eval_loss": 2.2169623374938965,
      "eval_nll_loss": 0.6308388113975525,
      "eval_rewards/accuracies": 0.530434787273407,
      "eval_rewards/chosen": -21.92050552368164,
      "eval_rewards/margins": 0.11496546864509583,
      "eval_rewards/rejected": -22.035470962524414,
      "eval_runtime": 73.7719,
      "eval_samples_per_second": 24.752,
      "eval_steps_per_second": 1.559,
      "step": 19
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 51.48088455200195,
      "learning_rate": 6.052631578947368e-07,
      "logits/chosen": -0.48232191801071167,
      "logits/rejected": -0.4643561840057373,
      "logps/chosen": -226.7048797607422,
      "logps/rejected": -228.0491943359375,
      "loss": 2.3904,
      "nll_loss": 0.6598069667816162,
      "rewards/accuracies": 0.546875,
      "rewards/chosen": -22.670488357543945,
      "rewards/margins": 0.13443148136138916,
      "rewards/rejected": -22.804920196533203,
      "step": 20
    },
    {
      "epoch": 0.45977011494252873,
      "eval_logits/chosen": -0.617470383644104,
      "eval_logits/rejected": -0.5920071601867676,
      "eval_logps/chosen": -217.05372619628906,
      "eval_logps/rejected": -218.20924377441406,
      "eval_loss": 2.20650315284729,
      "eval_nll_loss": 0.624081552028656,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -21.705373764038086,
      "eval_rewards/margins": 0.11555319279432297,
      "eval_rewards/rejected": -21.8209285736084,
      "eval_runtime": 73.6034,
      "eval_samples_per_second": 24.809,
      "eval_steps_per_second": 1.562,
      "step": 20
    },
    {
      "epoch": 0.4827586206896552,
      "eval_logits/chosen": -0.6441444754600525,
      "eval_logits/rejected": -0.6189336180686951,
      "eval_logps/chosen": -214.8709716796875,
      "eval_logps/rejected": -216.1072235107422,
      "eval_loss": 2.193157911300659,
      "eval_nll_loss": 0.6171812415122986,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -21.487096786499023,
      "eval_rewards/margins": 0.12362580001354218,
      "eval_rewards/rejected": -21.6107234954834,
      "eval_runtime": 73.1268,
      "eval_samples_per_second": 24.97,
      "eval_steps_per_second": 1.573,
      "step": 21
    },
    {
      "epoch": 0.5057471264367817,
      "eval_logits/chosen": -0.6693909168243408,
      "eval_logits/rejected": -0.6444550156593323,
      "eval_logps/chosen": -212.89871215820312,
      "eval_logps/rejected": -214.12872314453125,
      "eval_loss": 2.1838579177856445,
      "eval_nll_loss": 0.6109142899513245,
      "eval_rewards/accuracies": 0.519565224647522,
      "eval_rewards/chosen": -21.289873123168945,
      "eval_rewards/margins": 0.1229993924498558,
      "eval_rewards/rejected": -21.412874221801758,
      "eval_runtime": 73.3336,
      "eval_samples_per_second": 24.9,
      "eval_steps_per_second": 1.568,
      "step": 22
    },
    {
      "epoch": 0.5287356321839081,
      "eval_logits/chosen": -0.6940123438835144,
      "eval_logits/rejected": -0.6688118577003479,
      "eval_logps/chosen": -210.87289428710938,
      "eval_logps/rejected": -212.1172332763672,
      "eval_loss": 2.17464280128479,
      "eval_nll_loss": 0.6044757962226868,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -21.087289810180664,
      "eval_rewards/margins": 0.12443248927593231,
      "eval_rewards/rejected": -21.21172332763672,
      "eval_runtime": 73.7107,
      "eval_samples_per_second": 24.773,
      "eval_steps_per_second": 1.56,
      "step": 23
    },
    {
      "epoch": 0.5517241379310345,
      "eval_logits/chosen": -0.7184363603591919,
      "eval_logits/rejected": -0.6937569379806519,
      "eval_logps/chosen": -209.13641357421875,
      "eval_logps/rejected": -210.39794921875,
      "eval_loss": 2.1655774116516113,
      "eval_nll_loss": 0.5988763570785522,
      "eval_rewards/accuracies": 0.5239130258560181,
      "eval_rewards/chosen": -20.91364097595215,
      "eval_rewards/margins": 0.12615376710891724,
      "eval_rewards/rejected": -21.039793014526367,
      "eval_runtime": 73.2196,
      "eval_samples_per_second": 24.939,
      "eval_steps_per_second": 1.571,
      "step": 24
    },
    {
      "epoch": 0.5747126436781609,
      "eval_logits/chosen": -0.7364875078201294,
      "eval_logits/rejected": -0.711971640586853,
      "eval_logps/chosen": -207.19107055664062,
      "eval_logps/rejected": -208.48138427734375,
      "eval_loss": 2.155548572540283,
      "eval_nll_loss": 0.5926215052604675,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -20.719106674194336,
      "eval_rewards/margins": 0.12903204560279846,
      "eval_rewards/rejected": -20.8481388092041,
      "eval_runtime": 73.1876,
      "eval_samples_per_second": 24.95,
      "eval_steps_per_second": 1.571,
      "step": 25
    },
    {
      "epoch": 0.5977011494252874,
      "eval_logits/chosen": -0.7545364499092102,
      "eval_logits/rejected": -0.730129599571228,
      "eval_logps/chosen": -205.48521423339844,
      "eval_logps/rejected": -206.7897186279297,
      "eval_loss": 2.1465742588043213,
      "eval_nll_loss": 0.5872200727462769,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -20.548521041870117,
      "eval_rewards/margins": 0.13045117259025574,
      "eval_rewards/rejected": -20.678974151611328,
      "eval_runtime": 73.5262,
      "eval_samples_per_second": 24.835,
      "eval_steps_per_second": 1.564,
      "step": 26
    },
    {
      "epoch": 0.6206896551724138,
      "eval_logits/chosen": -0.7720378041267395,
      "eval_logits/rejected": -0.7476205825805664,
      "eval_logps/chosen": -203.7217559814453,
      "eval_logps/rejected": -205.04006958007812,
      "eval_loss": 2.139249801635742,
      "eval_nll_loss": 0.5815550684928894,
      "eval_rewards/accuracies": 0.5369565486907959,
      "eval_rewards/chosen": -20.37217903137207,
      "eval_rewards/margins": 0.13182921707630157,
      "eval_rewards/rejected": -20.504005432128906,
      "eval_runtime": 73.5829,
      "eval_samples_per_second": 24.816,
      "eval_steps_per_second": 1.563,
      "step": 27
    },
    {
      "epoch": 0.6436781609195402,
      "eval_logits/chosen": -0.781804621219635,
      "eval_logits/rejected": -0.7575309872627258,
      "eval_logps/chosen": -201.85330200195312,
      "eval_logps/rejected": -203.2164306640625,
      "eval_loss": 2.1307995319366455,
      "eval_nll_loss": 0.5756080150604248,
      "eval_rewards/accuracies": 0.532608687877655,
      "eval_rewards/chosen": -20.18532943725586,
      "eval_rewards/margins": 0.13631057739257812,
      "eval_rewards/rejected": -20.321643829345703,
      "eval_runtime": 73.6844,
      "eval_samples_per_second": 24.781,
      "eval_steps_per_second": 1.561,
      "step": 28
    },
    {
      "epoch": 0.6666666666666666,
      "eval_logits/chosen": -0.7925211787223816,
      "eval_logits/rejected": -0.768252432346344,
      "eval_logps/chosen": -199.9458770751953,
      "eval_logps/rejected": -201.3154754638672,
      "eval_loss": 2.1228978633880615,
      "eval_nll_loss": 0.5694720149040222,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -19.994586944580078,
      "eval_rewards/margins": 0.13696083426475525,
      "eval_rewards/rejected": -20.131547927856445,
      "eval_runtime": 73.7355,
      "eval_samples_per_second": 24.764,
      "eval_steps_per_second": 1.56,
      "step": 29
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 55.80259323120117,
      "learning_rate": 3.4210526315789473e-07,
      "logits/chosen": -0.6812049150466919,
      "logits/rejected": -0.6623071432113647,
      "logps/chosen": -199.8437042236328,
      "logps/rejected": -201.27694702148438,
      "loss": 2.3172,
      "nll_loss": 0.5909140706062317,
      "rewards/accuracies": 0.53125,
      "rewards/chosen": -19.984371185302734,
      "rewards/margins": 0.14332275092601776,
      "rewards/rejected": -20.127695083618164,
      "step": 30
    },
    {
      "epoch": 0.6896551724137931,
      "eval_logits/chosen": -0.7975767254829407,
      "eval_logits/rejected": -0.7734904885292053,
      "eval_logps/chosen": -197.8929901123047,
      "eval_logps/rejected": -199.30410766601562,
      "eval_loss": 2.113354206085205,
      "eval_nll_loss": 0.5630350708961487,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -19.78929901123047,
      "eval_rewards/margins": 0.1411115825176239,
      "eval_rewards/rejected": -19.93041229248047,
      "eval_runtime": 73.6357,
      "eval_samples_per_second": 24.798,
      "eval_steps_per_second": 1.562,
      "step": 30
    },
    {
      "epoch": 0.7126436781609196,
      "eval_logits/chosen": -0.7977136969566345,
      "eval_logits/rejected": -0.7735068202018738,
      "eval_logps/chosen": -195.95989990234375,
      "eval_logps/rejected": -197.4013214111328,
      "eval_loss": 2.1055009365081787,
      "eval_nll_loss": 0.5569384098052979,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -19.595989227294922,
      "eval_rewards/margins": 0.1441420167684555,
      "eval_rewards/rejected": -19.74013328552246,
      "eval_runtime": 73.0556,
      "eval_samples_per_second": 24.995,
      "eval_steps_per_second": 1.574,
      "step": 31
    },
    {
      "epoch": 0.735632183908046,
      "eval_logits/chosen": -0.80599045753479,
      "eval_logits/rejected": -0.7817136645317078,
      "eval_logps/chosen": -194.0162811279297,
      "eval_logps/rejected": -195.46153259277344,
      "eval_loss": 2.0985281467437744,
      "eval_nll_loss": 0.5507530570030212,
      "eval_rewards/accuracies": 0.52173912525177,
      "eval_rewards/chosen": -19.401628494262695,
      "eval_rewards/margins": 0.14452561736106873,
      "eval_rewards/rejected": -19.546154022216797,
      "eval_runtime": 73.1881,
      "eval_samples_per_second": 24.949,
      "eval_steps_per_second": 1.571,
      "step": 32
    },
    {
      "epoch": 0.7586206896551724,
      "eval_logits/chosen": -0.8030232787132263,
      "eval_logits/rejected": -0.7785286903381348,
      "eval_logps/chosen": -192.11659240722656,
      "eval_logps/rejected": -193.61715698242188,
      "eval_loss": 2.0903804302215576,
      "eval_nll_loss": 0.5446676015853882,
      "eval_rewards/accuracies": 0.5239130258560181,
      "eval_rewards/chosen": -19.211658477783203,
      "eval_rewards/margins": 0.1500559002161026,
      "eval_rewards/rejected": -19.36171531677246,
      "eval_runtime": 73.4088,
      "eval_samples_per_second": 24.874,
      "eval_steps_per_second": 1.567,
      "step": 33
    },
    {
      "epoch": 0.7816091954022989,
      "eval_logits/chosen": -0.8003183603286743,
      "eval_logits/rejected": -0.7758002281188965,
      "eval_logps/chosen": -190.38067626953125,
      "eval_logps/rejected": -191.8131561279297,
      "eval_loss": 2.08504056930542,
      "eval_nll_loss": 0.539174497127533,
      "eval_rewards/accuracies": 0.5239130258560181,
      "eval_rewards/chosen": -19.038066864013672,
      "eval_rewards/margins": 0.1432473063468933,
      "eval_rewards/rejected": -19.18131446838379,
      "eval_runtime": 73.4902,
      "eval_samples_per_second": 24.847,
      "eval_steps_per_second": 1.565,
      "step": 34
    },
    {
      "epoch": 0.8045977011494253,
      "eval_logits/chosen": -0.796375036239624,
      "eval_logits/rejected": -0.7714610695838928,
      "eval_logps/chosen": -188.9884033203125,
      "eval_logps/rejected": -190.43736267089844,
      "eval_loss": 2.0792500972747803,
      "eval_nll_loss": 0.5345708131790161,
      "eval_rewards/accuracies": 0.5173913240432739,
      "eval_rewards/chosen": -18.898839950561523,
      "eval_rewards/margins": 0.1448965221643448,
      "eval_rewards/rejected": -19.043737411499023,
      "eval_runtime": 73.2997,
      "eval_samples_per_second": 24.911,
      "eval_steps_per_second": 1.569,
      "step": 35
    },
    {
      "epoch": 0.8275862068965517,
      "eval_logits/chosen": -0.7951973080635071,
      "eval_logits/rejected": -0.7701032757759094,
      "eval_logps/chosen": -187.54518127441406,
      "eval_logps/rejected": -188.98013305664062,
      "eval_loss": 2.0720129013061523,
      "eval_nll_loss": 0.5298618078231812,
      "eval_rewards/accuracies": 0.519565224647522,
      "eval_rewards/chosen": -18.754518508911133,
      "eval_rewards/margins": 0.14349476993083954,
      "eval_rewards/rejected": -18.898012161254883,
      "eval_runtime": 73.4171,
      "eval_samples_per_second": 24.872,
      "eval_steps_per_second": 1.566,
      "step": 36
    },
    {
      "epoch": 0.8505747126436781,
      "eval_logits/chosen": -0.7926805019378662,
      "eval_logits/rejected": -0.7679208517074585,
      "eval_logps/chosen": -186.56715393066406,
      "eval_logps/rejected": -188.0532684326172,
      "eval_loss": 2.0663270950317383,
      "eval_nll_loss": 0.526580810546875,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -18.656715393066406,
      "eval_rewards/margins": 0.14861242473125458,
      "eval_rewards/rejected": -18.805326461791992,
      "eval_runtime": 73.5278,
      "eval_samples_per_second": 24.834,
      "eval_steps_per_second": 1.564,
      "step": 37
    },
    {
      "epoch": 0.8735632183908046,
      "eval_logits/chosen": -0.7882456183433533,
      "eval_logits/rejected": -0.7631468176841736,
      "eval_logps/chosen": -185.62677001953125,
      "eval_logps/rejected": -187.13912963867188,
      "eval_loss": 2.0643482208251953,
      "eval_nll_loss": 0.5234898924827576,
      "eval_rewards/accuracies": 0.5239130258560181,
      "eval_rewards/chosen": -18.56267738342285,
      "eval_rewards/margins": 0.15123440325260162,
      "eval_rewards/rejected": -18.713911056518555,
      "eval_runtime": 73.4858,
      "eval_samples_per_second": 24.848,
      "eval_steps_per_second": 1.565,
      "step": 38
    },
    {
      "epoch": 0.896551724137931,
      "eval_logits/chosen": -0.7857053279876709,
      "eval_logits/rejected": -0.7608606815338135,
      "eval_logps/chosen": -185.09970092773438,
      "eval_logps/rejected": -186.60646057128906,
      "eval_loss": 2.0600922107696533,
      "eval_nll_loss": 0.5217379927635193,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -18.509971618652344,
      "eval_rewards/margins": 0.15067508816719055,
      "eval_rewards/rejected": -18.66064453125,
      "eval_runtime": 73.7485,
      "eval_samples_per_second": 24.76,
      "eval_steps_per_second": 1.559,
      "step": 39
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 50.088340759277344,
      "learning_rate": 7.894736842105262e-08,
      "logits/chosen": -0.8007175326347351,
      "logits/rejected": -0.7798112630844116,
      "logps/chosen": -190.50381469726562,
      "logps/rejected": -193.3760223388672,
      "loss": 2.1039,
      "nll_loss": 0.5438653230667114,
      "rewards/accuracies": 0.546875,
      "rewards/chosen": -19.05038070678711,
      "rewards/margins": 0.2872214913368225,
      "rewards/rejected": -19.337600708007812,
      "step": 40
    },
    {
      "epoch": 0.9195402298850575,
      "eval_logits/chosen": -0.785999596118927,
      "eval_logits/rejected": -0.7610748410224915,
      "eval_logps/chosen": -184.6099090576172,
      "eval_logps/rejected": -186.1282958984375,
      "eval_loss": 2.0597591400146484,
      "eval_nll_loss": 0.5201125144958496,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -18.46099090576172,
      "eval_rewards/margins": 0.15183939039707184,
      "eval_rewards/rejected": -18.612829208374023,
      "eval_runtime": 73.6777,
      "eval_samples_per_second": 24.784,
      "eval_steps_per_second": 1.561,
      "step": 40
    },
    {
      "epoch": 0.9425287356321839,
      "eval_logits/chosen": -0.7789402604103088,
      "eval_logits/rejected": -0.754026472568512,
      "eval_logps/chosen": -184.23236083984375,
      "eval_logps/rejected": -185.80072021484375,
      "eval_loss": 2.0538711547851562,
      "eval_nll_loss": 0.5189568400382996,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -18.423233032226562,
      "eval_rewards/margins": 0.15683722496032715,
      "eval_rewards/rejected": -18.5800724029541,
      "eval_runtime": 73.0726,
      "eval_samples_per_second": 24.989,
      "eval_steps_per_second": 1.574,
      "step": 41
    },
    {
      "epoch": 0.9655172413793104,
      "eval_logits/chosen": -0.777718722820282,
      "eval_logits/rejected": -0.7525457739830017,
      "eval_logps/chosen": -183.968994140625,
      "eval_logps/rejected": -185.52581787109375,
      "eval_loss": 2.054420232772827,
      "eval_nll_loss": 0.518138587474823,
      "eval_rewards/accuracies": 0.5282608866691589,
      "eval_rewards/chosen": -18.396900177001953,
      "eval_rewards/margins": 0.15568143129348755,
      "eval_rewards/rejected": -18.552579879760742,
      "eval_runtime": 73.2982,
      "eval_samples_per_second": 24.912,
      "eval_steps_per_second": 1.569,
      "step": 42
    },
    {
      "epoch": 0.9885057471264368,
      "eval_logits/chosen": -0.779742419719696,
      "eval_logits/rejected": -0.755063533782959,
      "eval_logps/chosen": -183.93116760253906,
      "eval_logps/rejected": -185.45208740234375,
      "eval_loss": 2.0544536113739014,
      "eval_nll_loss": 0.5179869532585144,
      "eval_rewards/accuracies": 0.5260869860649109,
      "eval_rewards/chosen": -18.393117904663086,
      "eval_rewards/margins": 0.15209028124809265,
      "eval_rewards/rejected": -18.54520606994629,
      "eval_runtime": 73.5834,
      "eval_samples_per_second": 24.815,
      "eval_steps_per_second": 1.563,
      "step": 43
    },
    {
      "epoch": 0.9885057471264368,
      "step": 43,
      "total_flos": 0.0,
      "train_loss": 2.3523660704146985,
      "train_runtime": 5387.5537,
      "train_samples_per_second": 1.031,
      "train_steps_per_second": 0.008
    }
  ],
  "logging_steps": 10,
  "max_steps": 43,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}