{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.054945054945054944, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001098901098901099, "grad_norm": 4.387043476104736, "learning_rate": 5.4945054945054946e-08, "loss": 0.8831, "step": 20 }, { "epoch": 0.002197802197802198, "grad_norm": 3.8907365798950195, "learning_rate": 1.0989010989010989e-07, "loss": 0.8866, "step": 40 }, { "epoch": 0.0032967032967032967, "grad_norm": 3.2492220401763916, "learning_rate": 1.6483516483516484e-07, "loss": 0.8454, "step": 60 }, { "epoch": 0.004395604395604396, "grad_norm": 3.0804762840270996, "learning_rate": 2.1978021978021978e-07, "loss": 0.8267, "step": 80 }, { "epoch": 0.005494505494505495, "grad_norm": 2.037411689758301, "learning_rate": 2.7472527472527475e-07, "loss": 0.7371, "step": 100 }, { "epoch": 0.006593406593406593, "grad_norm": 1.5473365783691406, "learning_rate": 3.296703296703297e-07, "loss": 0.6765, "step": 120 }, { "epoch": 0.007692307692307693, "grad_norm": 1.1062999963760376, "learning_rate": 3.846153846153847e-07, "loss": 0.5946, "step": 140 }, { "epoch": 0.008791208791208791, "grad_norm": 1.363224744796753, "learning_rate": 4.3956043956043957e-07, "loss": 0.5402, "step": 160 }, { "epoch": 0.00989010989010989, "grad_norm": 0.9122905731201172, "learning_rate": 4.945054945054946e-07, "loss": 0.464, "step": 180 }, { "epoch": 0.01098901098901099, "grad_norm": 0.676691472530365, "learning_rate": 5.494505494505495e-07, "loss": 0.4041, "step": 200 }, { "epoch": 0.012087912087912088, "grad_norm": 0.5926629900932312, "learning_rate": 6.043956043956044e-07, "loss": 0.3727, "step": 220 }, { "epoch": 0.013186813186813187, "grad_norm": 0.635013997554779, "learning_rate": 6.593406593406594e-07, "loss": 0.3609, "step": 240 }, { "epoch": 0.014285714285714285, "grad_norm": 0.5836207270622253, "learning_rate": 7.142857142857143e-07, "loss": 0.3331, "step": 260 }, { "epoch": 0.015384615384615385, "grad_norm": 0.548773467540741, "learning_rate": 7.692307692307694e-07, "loss": 0.3201, "step": 280 }, { "epoch": 0.016483516483516484, "grad_norm": 0.5962342023849487, "learning_rate": 8.241758241758242e-07, "loss": 0.2993, "step": 300 }, { "epoch": 0.017582417582417582, "grad_norm": 0.5346819162368774, "learning_rate": 8.791208791208791e-07, "loss": 0.2792, "step": 320 }, { "epoch": 0.01868131868131868, "grad_norm": 0.569210946559906, "learning_rate": 9.340659340659341e-07, "loss": 0.273, "step": 340 }, { "epoch": 0.01978021978021978, "grad_norm": 0.5142342448234558, "learning_rate": 9.890109890109891e-07, "loss": 0.2621, "step": 360 }, { "epoch": 0.020879120879120878, "grad_norm": 0.5067290663719177, "learning_rate": 1.043956043956044e-06, "loss": 0.2641, "step": 380 }, { "epoch": 0.02197802197802198, "grad_norm": 0.44699764251708984, "learning_rate": 1.098901098901099e-06, "loss": 0.2553, "step": 400 }, { "epoch": 0.023076923076923078, "grad_norm": 0.5279501080513, "learning_rate": 1.153846153846154e-06, "loss": 0.2488, "step": 420 }, { "epoch": 0.024175824175824177, "grad_norm": 0.5009133219718933, "learning_rate": 1.2087912087912089e-06, "loss": 0.2499, "step": 440 }, { "epoch": 0.025274725274725275, "grad_norm": 0.5484806895256042, "learning_rate": 1.263736263736264e-06, "loss": 0.2485, "step": 460 }, { "epoch": 0.026373626373626374, "grad_norm": 0.44114941358566284, "learning_rate": 1.3186813186813187e-06, "loss": 0.2499, "step": 480 }, { "epoch": 0.027472527472527472, 
"grad_norm": 0.5557438731193542, "learning_rate": 1.3736263736263736e-06, "loss": 0.2455, "step": 500 }, { "epoch": 0.02857142857142857, "grad_norm": 0.5493007898330688, "learning_rate": 1.4285714285714286e-06, "loss": 0.2396, "step": 520 }, { "epoch": 0.02967032967032967, "grad_norm": 0.5284810662269592, "learning_rate": 1.4835164835164835e-06, "loss": 0.2383, "step": 540 }, { "epoch": 0.03076923076923077, "grad_norm": 0.5073139667510986, "learning_rate": 1.5384615384615387e-06, "loss": 0.2389, "step": 560 }, { "epoch": 0.031868131868131866, "grad_norm": 0.5186975002288818, "learning_rate": 1.5934065934065933e-06, "loss": 0.2372, "step": 580 }, { "epoch": 0.03296703296703297, "grad_norm": 0.5739095211029053, "learning_rate": 1.6483516483516484e-06, "loss": 0.2358, "step": 600 }, { "epoch": 0.03406593406593406, "grad_norm": 0.5144438147544861, "learning_rate": 1.7032967032967032e-06, "loss": 0.2319, "step": 620 }, { "epoch": 0.035164835164835165, "grad_norm": 0.4886190593242645, "learning_rate": 1.7582417582417583e-06, "loss": 0.2297, "step": 640 }, { "epoch": 0.03626373626373627, "grad_norm": 0.6088211536407471, "learning_rate": 1.8131868131868135e-06, "loss": 0.2356, "step": 660 }, { "epoch": 0.03736263736263736, "grad_norm": 0.4712292551994324, "learning_rate": 1.8681318681318681e-06, "loss": 0.2334, "step": 680 }, { "epoch": 0.038461538461538464, "grad_norm": 0.5712177157402039, "learning_rate": 1.9230769230769234e-06, "loss": 0.2302, "step": 700 }, { "epoch": 0.03956043956043956, "grad_norm": 0.5427699089050293, "learning_rate": 1.9780219780219782e-06, "loss": 0.2248, "step": 720 }, { "epoch": 0.04065934065934066, "grad_norm": 0.6642568707466125, "learning_rate": 2.032967032967033e-06, "loss": 0.2291, "step": 740 }, { "epoch": 0.041758241758241756, "grad_norm": 0.5859007239341736, "learning_rate": 2.087912087912088e-06, "loss": 0.227, "step": 760 }, { "epoch": 0.04285714285714286, "grad_norm": 0.6507712602615356, "learning_rate": 2.142857142857143e-06, "loss": 0.227, "step": 780 }, { "epoch": 0.04395604395604396, "grad_norm": 0.5675429105758667, "learning_rate": 2.197802197802198e-06, "loss": 0.2259, "step": 800 }, { "epoch": 0.045054945054945054, "grad_norm": 0.6223055124282837, "learning_rate": 2.252747252747253e-06, "loss": 0.2283, "step": 820 }, { "epoch": 0.046153846153846156, "grad_norm": 0.5504657030105591, "learning_rate": 2.307692307692308e-06, "loss": 0.2246, "step": 840 }, { "epoch": 0.04725274725274725, "grad_norm": 0.48020097613334656, "learning_rate": 2.3626373626373625e-06, "loss": 0.225, "step": 860 }, { "epoch": 0.04835164835164835, "grad_norm": 0.4979713261127472, "learning_rate": 2.4175824175824177e-06, "loss": 0.2212, "step": 880 }, { "epoch": 0.04945054945054945, "grad_norm": 0.49497634172439575, "learning_rate": 2.4725274725274726e-06, "loss": 0.2234, "step": 900 }, { "epoch": 0.05054945054945055, "grad_norm": 0.6207996010780334, "learning_rate": 2.527472527472528e-06, "loss": 0.2256, "step": 920 }, { "epoch": 0.051648351648351645, "grad_norm": 0.530981719493866, "learning_rate": 2.5824175824175822e-06, "loss": 0.2231, "step": 940 }, { "epoch": 0.05274725274725275, "grad_norm": 0.5495067834854126, "learning_rate": 2.6373626373626375e-06, "loss": 0.223, "step": 960 }, { "epoch": 0.05384615384615385, "grad_norm": 0.5651763081550598, "learning_rate": 2.6923076923076928e-06, "loss": 0.2213, "step": 980 }, { "epoch": 0.054945054945054944, "grad_norm": 0.553247332572937, "learning_rate": 2.747252747252747e-06, "loss": 0.2219, "step": 1000 } ], "logging_steps": 20, 
"max_steps": 91000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.336356548608e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }