{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9510dd98",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
    "from datasets import load_dataset\n",
    "from transformers import get_scheduler\n",
    "import torch\n",
    "from torch.utils.data import DataLoader\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, AutoModelForTokenClassification\n",
    "from transformers import DataCollatorForSeq2Seq\n",
    "from accelerate import Accelerator\n",
    "import evaluate\n",
    "import datasets\n",
    "\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f1da6c6c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kave/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
      "For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
      "- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
      "- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
      "- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(\"t5-base\")\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(\"t5-base\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a6de1719",
   "metadata": {},
   "outputs": [],
   "source": [
    "# prep dataset\n",
    "dataset = load_dataset(\"tner/mit_restaurant\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8617d7d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "ner_tags = {\n",
    "    \"O\": 0,\n",
    "    \"B-Rating\": 1,\n",
    "    \"I-Rating\": 2,\n",
    "    \"B-Amenity\": 3,\n",
    "    \"I-Amenity\": 4,\n",
    "    \"B-Location\": 5,\n",
    "    \"I-Location\": 6,\n",
    "    \"B-Restaurant_Name\": 7,\n",
    "    \"I-Restaurant_Name\": 8,\n",
    "    \"B-Price\": 9,\n",
    "    \"B-Hours\": 10,\n",
    "    \"I-Hours\": 11,\n",
    "    \"B-Dish\": 12,\n",
    "    \"I-Dish\": 13,\n",
    "    \"B-Cuisine\": 14,\n",
    "    \"I-Price\": 15,\n",
    "    \"I-Cuisine\": 16,\n",
    "}\n",
    "\n",
    "\n",
    "label_names = {v: k for k, v in ner_tags.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "de52b597",
   "metadata": {},
   "outputs": [],
   "source": [
    "def decode_tags(tags, words):\n",
    "    dict_out = {}\n",
    "    word_ = \"\"\n",
    "    for tag, word in zip(tags[::-1], words[::-1]):\n",
    "        if tag == 0:\n",
    "            continue\n",
    "        word_ = word_ + \" \" + word\n",
    "        if label_names[tag].startswith(\"B\"):\n",
    "            tag_name = label_names[tag][2:]\n",
    "            word_ = word_.strip()\n",
    "            if tag_name not in dict_out:\n",
    "                dict_out[tag_name] = [word_]\n",
    "            else:\n",
    "                dict_out[tag_name].append(word_)\n",
    "            word_ = \"\"\n",
    "    return dict_out\n",
    "\n",
    "\n",
    "def format_to_text(decoded):\n",
    "    text = \"\"\n",
    "    for key, value in decoded.items():\n",
    "        text += f\"{key}: {', '.join(value)}\\n\"\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "5da715a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_t5_data(example):\n",
    "    decoded = decode_tags(example[\"tags\"], example[\"tokens\"])\n",
    "    return {\"tokens\": \" \".join(example[\"tokens\"]), \"labels\": format_to_text(decoded)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "57416e20",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
    "import torch\n",
    "\n",
    "# the following 2 hyperparameters are task-specific\n",
    "max_source_length = 512\n",
    "max_target_length = 128\n",
    "\n",
    "# encode the inputs\n",
    "task_prefix = \"What is the user intent?\"\n",
    "\n",
    "\n",
    "def tokenize(example):\n",
    "    tokenized = tokenizer(\n",
    "        task_prefix + example[\"tokens\"],\n",
    "        text_target=example[\"labels\"],\n",
    "        max_length=512,\n",
    "        truncation=True,\n",
    "    )\n",
    "    return tokenized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "137905d7",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "23bafa0f97bc4d4da8a96397f0f3bd5a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/6900 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "tokenized_datasets = dataset.map(generate_t5_data)\n",
    "tokenized_datasets = tokenized_datasets.remove_columns([\"tags\"])\n",
    "tokenized_datasets = tokenized_datasets.map(tokenize)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e2bdf1b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "\n",
    "metric = evaluate.load(\"sacrebleu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "cd9871bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    preds, labels = eval_preds\n",
    "    # In case the model returns more than the prediction logits\n",
    "    if isinstance(preds, tuple):\n",
    "        preds = preds[0]\n",
    "\n",
    "    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n",
    "\n",
    "    # Replace -100s in the labels as we can't decode them\n",
    "    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)\n",
    "    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n",
    "\n",
    "    # Some simple post-processing\n",
    "    decoded_preds = [pred.strip() for pred in decoded_preds]\n",
    "    decoded_labels = [[label.strip()] for label in decoded_labels]\n",
    "\n",
    "    result = metric.compute(predictions=decoded_preds, references=decoded_labels)\n",
    "    return {\"bleu\": result[\"score\"]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "09afe1d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "58e84fd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Seq2SeqTrainingArguments\n",
    "\n",
    "args = Seq2SeqTrainingArguments(\n",
    "    f\"T5 test\",\n",
    "    evaluation_strategy=\"no\",\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=3e-4,\n",
    "    per_device_train_batch_size=64,\n",
    "    per_device_eval_batch_size=32,\n",
    "    weight_decay=0.01,\n",
    "    save_total_limit=3,\n",
    "    num_train_epochs=20,\n",
    "    predict_with_generate=True,\n",
    "    fp16=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "edfcbac1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Seq2SeqTrainer\n",
    "\n",
    "trainer = Seq2SeqTrainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation\"],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e0065364",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='12' max='12' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [12/12 00:15]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trainer is attempting to log a value of \"{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}\" for key \"task_specific_params\" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 6.675447940826416,\n",
       " 'eval_bleu': 0.006728795795564811,\n",
       " 'eval_runtime': 17.5858,\n",
       " 'eval_samples_per_second': 43.217,\n",
       " 'eval_steps_per_second': 0.682}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate(max_length=512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "64ad307b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kave/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='4' max='1080' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [   4/1080 00:01 < 09:22, 1.91 it/s, Epoch 0.06/20]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "OutOfMemoryError",
     "evalue": "CUDA out of memory. Tried to allocate 456.00 MiB (GPU 0; 11.75 GiB total capacity; 10.26 GiB already allocated; 131.12 MiB free; 10.83 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mOutOfMemoryError\u001b[0m                          Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/transformers/trainer.py:1539\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m   1534\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_wrapped \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\n\u001b[1;32m   1536\u001b[0m inner_training_loop \u001b[38;5;241m=\u001b[39m find_executable_batch_size(\n\u001b[1;32m   1537\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_inner_training_loop, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_batch_size, args\u001b[38;5;241m.\u001b[39mauto_find_batch_size\n\u001b[1;32m   1538\u001b[0m )\n\u001b[0;32m-> 1539\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1540\u001b[0m \u001b[43m    \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1541\u001b[0m \u001b[43m    \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1542\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1543\u001b[0m \u001b[43m    \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1544\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/transformers/trainer.py:1809\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m   1806\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m   1808\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 1809\u001b[0m     tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1811\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m   1812\u001b[0m     args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m   1813\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[1;32m   1814\u001b[0m     \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m   1815\u001b[0m ):\n\u001b[1;32m   1816\u001b[0m     \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m   1817\u001b[0m     tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/transformers/trainer.py:2654\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m   2651\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   2653\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 2654\u001b[0m     loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2656\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mn_gpu \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m   2657\u001b[0m     loss \u001b[38;5;241m=\u001b[39m loss\u001b[38;5;241m.\u001b[39mmean()  \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/transformers/trainer.py:2679\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m   2677\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   2678\u001b[0m     labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 2679\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2680\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m   2681\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m   2682\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/torch/nn/modules/module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1496\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m   1497\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m   1498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m   1499\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m   1500\u001b[0m         \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1501\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1502\u001b[0m \u001b[38;5;66;03m# Do not call functions when jit is used\u001b[39;00m\n\u001b[1;32m   1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[38;5;241m=\u001b[39m [], []\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:581\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    580\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 581\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:569\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    568\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 569\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/torch/amp/autocast_mode.py:14\u001b[0m, in \u001b[0;36mautocast_decorator.<locals>.decorate_autocast\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     11\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_autocast\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m     13\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[0;32m---> 14\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:581\u001b[0m, in \u001b[0;36mconvert_outputs_to_fp32.<locals>.forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    580\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 581\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:569\u001b[0m, in \u001b[0;36mConvertOutputsToFp32.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    568\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m--> 569\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mconvert_to_fp32\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:548\u001b[0m, in \u001b[0;36mconvert_to_fp32\u001b[0;34m(tensor)\u001b[0m\n\u001b[1;32m    545\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_is_fp16_bf16_tensor\u001b[39m(tensor):\n\u001b[1;32m    546\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(tensor, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m tensor\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;129;01min\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39mfloat16, torch\u001b[38;5;241m.\u001b[39mbfloat16)\n\u001b[0;32m--> 548\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrecursively_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_convert_to_fp32\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtensor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_is_fp16_bf16_tensor\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:120\u001b[0m, in \u001b[0;36mrecursively_apply\u001b[0;34m(func, data, test_type, error_on_other_type, *args, **kwargs)\u001b[0m\n\u001b[1;32m    109\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m honor_type(\n\u001b[1;32m    110\u001b[0m         data,\n\u001b[1;32m    111\u001b[0m         (\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    116\u001b[0m         ),\n\u001b[1;32m    117\u001b[0m     )\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, Mapping):\n\u001b[1;32m    119\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(data)(\n\u001b[0;32m--> 120\u001b[0m         {\n\u001b[1;32m    121\u001b[0m             k: recursively_apply(\n\u001b[1;32m    122\u001b[0m                 func, v, \u001b[38;5;241m*\u001b[39margs, test_type\u001b[38;5;241m=\u001b[39mtest_type, error_on_other_type\u001b[38;5;241m=\u001b[39merror_on_other_type, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[1;32m    123\u001b[0m             )\n\u001b[1;32m    124\u001b[0m             \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m    125\u001b[0m         }\n\u001b[1;32m    126\u001b[0m     )\n\u001b[1;32m    127\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m test_type(data):\n\u001b[1;32m    128\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m func(data, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:121\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m    109\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m honor_type(\n\u001b[1;32m    110\u001b[0m         data,\n\u001b[1;32m    111\u001b[0m         (\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    116\u001b[0m         ),\n\u001b[1;32m    117\u001b[0m     )\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, Mapping):\n\u001b[1;32m    119\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(data)(\n\u001b[1;32m    120\u001b[0m         {\n\u001b[0;32m--> 121\u001b[0m             k: \u001b[43mrecursively_apply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    122\u001b[0m \u001b[43m                \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtest_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merror_on_other_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merror_on_other_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m    123\u001b[0m \u001b[43m            \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    124\u001b[0m             \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m    125\u001b[0m         }\n\u001b[1;32m    126\u001b[0m     )\n\u001b[1;32m    127\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m test_type(data):\n\u001b[1;32m    128\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m func(data, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:128\u001b[0m, in \u001b[0;36mrecursively_apply\u001b[0;34m(func, data, test_type, error_on_other_type, *args, **kwargs)\u001b[0m\n\u001b[1;32m    119\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(data)(\n\u001b[1;32m    120\u001b[0m         {\n\u001b[1;32m    121\u001b[0m             k: recursively_apply(\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    125\u001b[0m         }\n\u001b[1;32m    126\u001b[0m     )\n\u001b[1;32m    127\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m test_type(data):\n\u001b[0;32m--> 128\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    129\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m error_on_other_type:\n\u001b[1;32m    130\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m    131\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnsupported types (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m) passed to `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`. Only nested list/tuple/dicts of \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    132\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobjects that are valid for `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_type\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` should be passed.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    133\u001b[0m     )\n",
      "File \u001b[0;32m~/miniconda3/envs/afterhours_dev/lib/python3.10/site-packages/accelerate/utils/operations.py:543\u001b[0m, in \u001b[0;36mconvert_to_fp32.<locals>._convert_to_fp32\u001b[0;34m(tensor)\u001b[0m\n\u001b[1;32m    542\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_convert_to_fp32\u001b[39m(tensor):\n\u001b[0;32m--> 543\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtensor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 456.00 MiB (GPU 0; 11.75 GiB total capacity; 10.26 GiB already allocated; 131.12 MiB free; 10.83 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF"
     ]
    }
   ],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6672ff53",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}