{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9892761394101877, "eval_steps": 200, "global_step": 372, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10723860589812333, "grad_norm": 0.8259232640266418, "learning_rate": 4.210526315789473e-06, "loss": 1.7013, "step": 10 }, { "epoch": 0.21447721179624665, "grad_norm": 0.8551827073097229, "learning_rate": 7.999841591871243e-06, "loss": 1.676, "step": 20 }, { "epoch": 0.32171581769436997, "grad_norm": 1.056661605834961, "learning_rate": 7.9808477930786e-06, "loss": 1.6483, "step": 30 }, { "epoch": 0.4289544235924933, "grad_norm": 0.878664493560791, "learning_rate": 7.93034465929166e-06, "loss": 1.5262, "step": 40 }, { "epoch": 0.5361930294906166, "grad_norm": 0.6924058794975281, "learning_rate": 7.848731934545733e-06, "loss": 1.4716, "step": 50 }, { "epoch": 0.6434316353887399, "grad_norm": 0.49218347668647766, "learning_rate": 7.736655602515024e-06, "loss": 1.3935, "step": 60 }, { "epoch": 0.7506702412868632, "grad_norm": 0.48053157329559326, "learning_rate": 7.5950027734017875e-06, "loss": 1.3322, "step": 70 }, { "epoch": 0.8579088471849866, "grad_norm": 0.4853557050228119, "learning_rate": 7.424894662253031e-06, "loss": 1.2899, "step": 80 }, { "epoch": 0.9651474530831099, "grad_norm": 0.5471060276031494, "learning_rate": 7.227677714282947e-06, "loss": 1.284, "step": 90 }, { "epoch": 1.0723860589812333, "grad_norm": 0.4984244406223297, "learning_rate": 7.004912947446291e-06, "loss": 1.2392, "step": 100 }, { "epoch": 1.1796246648793565, "grad_norm": 0.5704559683799744, "learning_rate": 6.758363596618656e-06, "loss": 1.2033, "step": 110 }, { "epoch": 1.2868632707774799, "grad_norm": 0.6068636178970337, "learning_rate": 6.489981157182848e-06, "loss": 1.1701, "step": 120 }, { "epoch": 1.3941018766756033, "grad_norm": 0.5727308392524719, "learning_rate": 6.2018899384896244e-06, "loss": 1.1775, "step": 130 }, { "epoch": 1.5013404825737267, "grad_norm": 0.542986273765564, "learning_rate": 5.89637024945572e-06, "loss": 1.1182, "step": 140 }, { "epoch": 1.6085790884718498, "grad_norm": 0.5344908237457275, "learning_rate": 5.575840349389077e-06, "loss": 1.0901, "step": 150 }, { "epoch": 1.7158176943699732, "grad_norm": 0.625162661075592, "learning_rate": 5.242837306904692e-06, "loss": 1.0817, "step": 160 }, { "epoch": 1.8230563002680964, "grad_norm": 0.5978776812553406, "learning_rate": 4.899996918437182e-06, "loss": 1.0257, "step": 170 }, { "epoch": 1.9302949061662198, "grad_norm": 0.6914421916007996, "learning_rate": 4.550032845299744e-06, "loss": 1.038, "step": 180 }, { "epoch": 2.037533512064343, "grad_norm": 0.6677165627479553, "learning_rate": 4.195715134424498e-06, "loss": 1.012, "step": 190 }, { "epoch": 2.1447721179624666, "grad_norm": 0.7082802057266235, "learning_rate": 3.839848292797568e-06, "loss": 0.9918, "step": 200 }, { "epoch": 2.1447721179624666, "eval_loss": 0.9989345669746399, "eval_runtime": 5.7346, "eval_samples_per_second": 54.755, "eval_steps_per_second": 1.744, "step": 200 }, { "epoch": 2.25201072386059, "grad_norm": 0.6643340587615967, "learning_rate": 3.485249089134823e-06, "loss": 0.9689, "step": 210 }, { "epoch": 2.359249329758713, "grad_norm": 0.6342740058898926, "learning_rate": 3.134724258503188e-06, "loss": 0.9704, "step": 220 }, { "epoch": 2.4664879356568363, "grad_norm": 0.6604453921318054, "learning_rate": 2.7910482863606053e-06, "loss": 0.9555, "step": 230 }, { "epoch": 2.5737265415549597, "grad_norm": 0.6870635747909546, "learning_rate": 2.4569414478591614e-06, "loss": 0.9478, "step": 240 }, { "epoch": 2.680965147453083, "grad_norm": 0.6661774516105652, "learning_rate": 2.1350482762353478e-06, "loss": 0.9435, "step": 250 }, { "epoch": 2.7882037533512065, "grad_norm": 0.7505815625190735, "learning_rate": 1.827916630715212e-06, "loss": 0.9339, "step": 260 }, { "epoch": 2.89544235924933, "grad_norm": 0.6933086514472961, "learning_rate": 1.5379775296167252e-06, "loss": 0.9253, "step": 270 }, { "epoch": 3.002680965147453, "grad_norm": 0.8025710582733154, "learning_rate": 1.2675259082751067e-06, "loss": 0.9055, "step": 280 }, { "epoch": 3.1099195710455763, "grad_norm": 0.7908952236175537, "learning_rate": 1.018702454096546e-06, "loss": 0.9039, "step": 290 }, { "epoch": 3.2171581769436997, "grad_norm": 0.7214229702949524, "learning_rate": 7.934766625201255e-07, "loss": 0.9274, "step": 300 }, { "epoch": 3.324396782841823, "grad_norm": 0.7900909781455994, "learning_rate": 5.936312480039261e-07, "loss": 0.905, "step": 310 }, { "epoch": 3.4316353887399464, "grad_norm": 0.6883360147476196, "learning_rate": 4.207480334259821e-07, "loss": 0.9131, "step": 320 }, { "epoch": 3.53887399463807, "grad_norm": 0.7782545685768127, "learning_rate": 2.761954295887592e-07, "loss": 0.9053, "step": 330 }, { "epoch": 3.646112600536193, "grad_norm": 0.7688133716583252, "learning_rate": 1.6111760392979012e-07, "loss": 0.8879, "step": 340 }, { "epoch": 3.753351206434316, "grad_norm": 0.8082988262176514, "learning_rate": 7.642542417061859e-08, "loss": 0.9101, "step": 350 }, { "epoch": 3.8605898123324396, "grad_norm": 0.7244554162025452, "learning_rate": 2.2789248587207655e-08, "loss": 0.8842, "step": 360 }, { "epoch": 3.967828418230563, "grad_norm": 0.8839091062545776, "learning_rate": 6.33619968457566e-10, "loss": 0.9048, "step": 370 }, { "epoch": 3.9892761394101877, "step": 372, "total_flos": 3.7910142911093146e+17, "train_loss": 1.1117366898444392, "train_runtime": 1080.8388, "train_samples_per_second": 22.064, "train_steps_per_second": 0.344 } ], "logging_steps": 10, "max_steps": 372, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.7910142911093146e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }