{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9892761394101877,
  "eval_steps": 200,
  "global_step": 372,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.10723860589812333,
      "grad_norm": 0.8259232640266418,
      "learning_rate": 4.210526315789473e-06,
      "loss": 1.7013,
      "step": 10
    },
    {
      "epoch": 0.21447721179624665,
      "grad_norm": 0.8551827073097229,
      "learning_rate": 7.999841591871243e-06,
      "loss": 1.676,
      "step": 20
    },
    {
      "epoch": 0.32171581769436997,
      "grad_norm": 1.056661605834961,
      "learning_rate": 7.9808477930786e-06,
      "loss": 1.6483,
      "step": 30
    },
    {
      "epoch": 0.4289544235924933,
      "grad_norm": 0.878664493560791,
      "learning_rate": 7.93034465929166e-06,
      "loss": 1.5262,
      "step": 40
    },
    {
      "epoch": 0.5361930294906166,
      "grad_norm": 0.6924058794975281,
      "learning_rate": 7.848731934545733e-06,
      "loss": 1.4716,
      "step": 50
    },
    {
      "epoch": 0.6434316353887399,
      "grad_norm": 0.49218347668647766,
      "learning_rate": 7.736655602515024e-06,
      "loss": 1.3935,
      "step": 60
    },
    {
      "epoch": 0.7506702412868632,
      "grad_norm": 0.48053157329559326,
      "learning_rate": 7.5950027734017875e-06,
      "loss": 1.3322,
      "step": 70
    },
    {
      "epoch": 0.8579088471849866,
      "grad_norm": 0.4853557050228119,
      "learning_rate": 7.424894662253031e-06,
      "loss": 1.2899,
      "step": 80
    },
    {
      "epoch": 0.9651474530831099,
      "grad_norm": 0.5471060276031494,
      "learning_rate": 7.227677714282947e-06,
      "loss": 1.284,
      "step": 90
    },
    {
      "epoch": 1.0723860589812333,
      "grad_norm": 0.4984244406223297,
      "learning_rate": 7.004912947446291e-06,
      "loss": 1.2392,
      "step": 100
    },
    {
      "epoch": 1.1796246648793565,
      "grad_norm": 0.5704559683799744,
      "learning_rate": 6.758363596618656e-06,
      "loss": 1.2033,
      "step": 110
    },
    {
      "epoch": 1.2868632707774799,
      "grad_norm": 0.6068636178970337,
      "learning_rate": 6.489981157182848e-06,
      "loss": 1.1701,
      "step": 120
    },
    {
      "epoch": 1.3941018766756033,
      "grad_norm": 0.5727308392524719,
      "learning_rate": 6.2018899384896244e-06,
      "loss": 1.1775,
      "step": 130
    },
    {
      "epoch": 1.5013404825737267,
      "grad_norm": 0.542986273765564,
      "learning_rate": 5.89637024945572e-06,
      "loss": 1.1182,
      "step": 140
    },
    {
      "epoch": 1.6085790884718498,
      "grad_norm": 0.5344908237457275,
      "learning_rate": 5.575840349389077e-06,
      "loss": 1.0901,
      "step": 150
    },
    {
      "epoch": 1.7158176943699732,
      "grad_norm": 0.625162661075592,
      "learning_rate": 5.242837306904692e-06,
      "loss": 1.0817,
      "step": 160
    },
    {
      "epoch": 1.8230563002680964,
      "grad_norm": 0.5978776812553406,
      "learning_rate": 4.899996918437182e-06,
      "loss": 1.0257,
      "step": 170
    },
    {
      "epoch": 1.9302949061662198,
      "grad_norm": 0.6914421916007996,
      "learning_rate": 4.550032845299744e-06,
      "loss": 1.038,
      "step": 180
    },
    {
      "epoch": 2.037533512064343,
      "grad_norm": 0.6677165627479553,
      "learning_rate": 4.195715134424498e-06,
      "loss": 1.012,
      "step": 190
    },
    {
      "epoch": 2.1447721179624666,
      "grad_norm": 0.7082802057266235,
      "learning_rate": 3.839848292797568e-06,
      "loss": 0.9918,
      "step": 200
    },
    {
      "epoch": 2.1447721179624666,
      "eval_loss": 0.9989345669746399,
      "eval_runtime": 5.7346,
      "eval_samples_per_second": 54.755,
      "eval_steps_per_second": 1.744,
      "step": 200
    },
    {
      "epoch": 2.25201072386059,
      "grad_norm": 0.6643340587615967,
      "learning_rate": 3.485249089134823e-06,
      "loss": 0.9689,
      "step": 210
    },
    {
      "epoch": 2.359249329758713,
      "grad_norm": 0.6342740058898926,
      "learning_rate": 3.134724258503188e-06,
      "loss": 0.9704,
      "step": 220
    },
    {
      "epoch": 2.4664879356568363,
      "grad_norm": 0.6604453921318054,
      "learning_rate": 2.7910482863606053e-06,
      "loss": 0.9555,
      "step": 230
    },
    {
      "epoch": 2.5737265415549597,
      "grad_norm": 0.6870635747909546,
      "learning_rate": 2.4569414478591614e-06,
      "loss": 0.9478,
      "step": 240
    },
    {
      "epoch": 2.680965147453083,
      "grad_norm": 0.6661774516105652,
      "learning_rate": 2.1350482762353478e-06,
      "loss": 0.9435,
      "step": 250
    },
    {
      "epoch": 2.7882037533512065,
      "grad_norm": 0.7505815625190735,
      "learning_rate": 1.827916630715212e-06,
      "loss": 0.9339,
      "step": 260
    },
    {
      "epoch": 2.89544235924933,
      "grad_norm": 0.6933086514472961,
      "learning_rate": 1.5379775296167252e-06,
      "loss": 0.9253,
      "step": 270
    },
    {
      "epoch": 3.002680965147453,
      "grad_norm": 0.8025710582733154,
      "learning_rate": 1.2675259082751067e-06,
      "loss": 0.9055,
      "step": 280
    },
    {
      "epoch": 3.1099195710455763,
      "grad_norm": 0.7908952236175537,
      "learning_rate": 1.018702454096546e-06,
      "loss": 0.9039,
      "step": 290
    },
    {
      "epoch": 3.2171581769436997,
      "grad_norm": 0.7214229702949524,
      "learning_rate": 7.934766625201255e-07,
      "loss": 0.9274,
      "step": 300
    },
    {
      "epoch": 3.324396782841823,
      "grad_norm": 0.7900909781455994,
      "learning_rate": 5.936312480039261e-07,
      "loss": 0.905,
      "step": 310
    },
    {
      "epoch": 3.4316353887399464,
      "grad_norm": 0.6883360147476196,
      "learning_rate": 4.207480334259821e-07,
      "loss": 0.9131,
      "step": 320
    },
    {
      "epoch": 3.53887399463807,
      "grad_norm": 0.7782545685768127,
      "learning_rate": 2.761954295887592e-07,
      "loss": 0.9053,
      "step": 330
    },
    {
      "epoch": 3.646112600536193,
      "grad_norm": 0.7688133716583252,
      "learning_rate": 1.6111760392979012e-07,
      "loss": 0.8879,
      "step": 340
    },
    {
      "epoch": 3.753351206434316,
      "grad_norm": 0.8082988262176514,
      "learning_rate": 7.642542417061859e-08,
      "loss": 0.9101,
      "step": 350
    },
    {
      "epoch": 3.8605898123324396,
      "grad_norm": 0.7244554162025452,
      "learning_rate": 2.2789248587207655e-08,
      "loss": 0.8842,
      "step": 360
    },
    {
      "epoch": 3.967828418230563,
      "grad_norm": 0.8839091062545776,
      "learning_rate": 6.33619968457566e-10,
      "loss": 0.9048,
      "step": 370
    },
    {
      "epoch": 3.9892761394101877,
      "step": 372,
      "total_flos": 3.7910142911093146e+17,
      "train_loss": 1.1117366898444392,
      "train_runtime": 1080.8388,
      "train_samples_per_second": 22.064,
      "train_steps_per_second": 0.344
    }
  ],
  "logging_steps": 10,
  "max_steps": 372,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.7910142911093146e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}