{ "best_metric": 1.6826781034469604, "best_model_checkpoint": "outputs/checkpoint-374", "epoch": 10.724014336917563, "eval_steps": 500, "global_step": 374, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5734767025089605, "grad_norm": 1.254808783531189, "learning_rate": 2.0000000000000003e-06, "loss": 2.9248, "step": 20 }, { "epoch": 0.974910394265233, "eval_loss": 2.816770076751709, "eval_runtime": 12.0371, "eval_samples_per_second": 30.904, "eval_steps_per_second": 3.905, "step": 34 }, { "epoch": 1.146953405017921, "grad_norm": 1.3472272157669067, "learning_rate": 4.000000000000001e-06, "loss": 2.8927, "step": 40 }, { "epoch": 1.7204301075268817, "grad_norm": 1.56429922580719, "learning_rate": 6e-06, "loss": 2.827, "step": 60 }, { "epoch": 1.978494623655914, "eval_loss": 2.5949196815490723, "eval_runtime": 12.0427, "eval_samples_per_second": 30.89, "eval_steps_per_second": 3.903, "step": 69 }, { "epoch": 2.293906810035842, "grad_norm": 2.013805389404297, "learning_rate": 8.000000000000001e-06, "loss": 2.6619, "step": 80 }, { "epoch": 2.867383512544803, "grad_norm": 1.7669492959976196, "learning_rate": 1e-05, "loss": 2.3052, "step": 100 }, { "epoch": 2.982078853046595, "eval_loss": 2.005356550216675, "eval_runtime": 12.033, "eval_samples_per_second": 30.915, "eval_steps_per_second": 3.906, "step": 104 }, { "epoch": 3.4408602150537635, "grad_norm": 0.7907297015190125, "learning_rate": 9.869113551144754e-06, "loss": 2.0129, "step": 120 }, { "epoch": 3.985663082437276, "eval_loss": 1.8571571111679077, "eval_runtime": 12.0327, "eval_samples_per_second": 30.916, "eval_steps_per_second": 3.906, "step": 139 }, { "epoch": 4.014336917562724, "grad_norm": 0.6714785695075989, "learning_rate": 9.48330670957659e-06, "loss": 1.9293, "step": 140 }, { "epoch": 4.587813620071684, "grad_norm": 0.633492648601532, "learning_rate": 8.862778230270276e-06, "loss": 1.8755, "step": 160 }, { "epoch": 4.989247311827957, "eval_loss": 1.7887212038040161, "eval_runtime": 12.0277, "eval_samples_per_second": 30.929, "eval_steps_per_second": 3.908, "step": 174 }, { "epoch": 5.161290322580645, "grad_norm": 0.6426405310630798, "learning_rate": 8.04001562085379e-06, "loss": 1.8144, "step": 180 }, { "epoch": 5.734767025089606, "grad_norm": 0.6501743793487549, "learning_rate": 7.058094271806091e-06, "loss": 1.7905, "step": 200 }, { "epoch": 5.992831541218638, "eval_loss": 1.744368553161621, "eval_runtime": 12.0302, "eval_samples_per_second": 30.922, "eval_steps_per_second": 3.907, "step": 209 }, { "epoch": 6.308243727598566, "grad_norm": 0.6884588599205017, "learning_rate": 5.968422262499983e-06, "loss": 1.7398, "step": 220 }, { "epoch": 6.881720430107527, "grad_norm": 0.6701219081878662, "learning_rate": 4.82804891282148e-06, "loss": 1.7365, "step": 240 }, { "epoch": 6.996415770609319, "eval_loss": 1.7139564752578735, "eval_runtime": 12.0265, "eval_samples_per_second": 30.932, "eval_steps_per_second": 3.908, "step": 244 }, { "epoch": 7.455197132616488, "grad_norm": 0.6416382789611816, "learning_rate": 3.6966779900140193e-06, "loss": 1.7276, "step": 260 }, { "epoch": 8.0, "eval_loss": 1.6951279640197754, "eval_runtime": 12.0247, "eval_samples_per_second": 30.936, "eval_steps_per_second": 3.909, "step": 279 }, { "epoch": 8.028673835125447, "grad_norm": 0.5849926471710205, "learning_rate": 2.633541943047334e-06, "loss": 1.685, "step": 280 }, { "epoch": 8.602150537634408, "grad_norm": 0.7124162912368774, "learning_rate": 1.6943008126564164e-06, "loss": 1.695, "step": 300 }, { "epoch": 8.974910394265233, "eval_loss": 1.6860439777374268, "eval_runtime": 12.028, "eval_samples_per_second": 30.928, "eval_steps_per_second": 3.908, "step": 313 }, { "epoch": 9.175627240143369, "grad_norm": 0.6544684767723083, "learning_rate": 9.281281733115288e-07, "loss": 1.6987, "step": 320 }, { "epoch": 9.74910394265233, "grad_norm": 0.8023189902305603, "learning_rate": 3.751366714022342e-07, "loss": 1.6868, "step": 340 }, { "epoch": 9.978494623655914, "eval_loss": 1.6829451322555542, "eval_runtime": 12.0275, "eval_samples_per_second": 30.929, "eval_steps_per_second": 3.908, "step": 348 }, { "epoch": 10.32258064516129, "grad_norm": 0.6753117442131042, "learning_rate": 6.427794450134529e-08, "loss": 1.692, "step": 360 }, { "epoch": 10.724014336917563, "eval_loss": 1.6826781034469604, "eval_runtime": 12.0235, "eval_samples_per_second": 30.939, "eval_steps_per_second": 3.909, "step": 374 } ], "logging_steps": 20, "max_steps": 374, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 500, "total_flos": 1.968170106536755e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }