[WIP] add support for mixtral #145

Draft · wants to merge 8 commits into main
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,3 +158,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+**/runpod
125 changes: 125 additions & 0 deletions examples/Alpaca_Mixtral_HF.ipynb
@@ -0,0 +1,125 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
"dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
"load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
"alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
"\n",
"### Instruction:\n",
"{}\n",
"\n",
"### Input:\n",
"{}\n",
"\n",
"### Response:\n",
"{}\"\"\"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"mistralai/Mixtral-8x7B-v0.1\", use_fast=True)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN\n",
"def formatting_prompts_func(examples):\n",
" instructions = examples[\"instruction\"]\n",
" inputs = examples[\"input\"]\n",
" outputs = examples[\"output\"]\n",
" texts = []\n",
" for instruction, input, output in zip(instructions, inputs, outputs):\n",
" # Must add EOS_TOKEN, otherwise your generation will go on forever!\n",
" text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN\n",
" texts.append(text)\n",
" return { \"text\" : texts, }\n",
"pass\n",
"\n",
"from datasets import load_dataset\n",
"dataset = load_dataset(\"yahma/alpaca-cleaned\", split = \"train\")\n",
"dataset = dataset.map(formatting_prompts_func, batched = True,)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from trl import SFTTrainer\n",
"from transformers import TrainingArguments\n",
"from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftConfig\n",
"\n",
"compute_dtype = getattr(torch, \"float16\")\n",
"device_map = {\"\": 0}\n",
"bnb_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_quant_type='nf4',\n",
" bnb_4bit_compute_dtype=compute_dtype,\n",
" bnb_4bit_use_double_quant=False,\n",
")\n",
"model_name = \"mistralai/Mixtral-8x7B-v0.1\"\n",
"model = AutoModelForCausalLM.from_pretrained(model_name, \n",
" device_map=device_map,\n",
" quantization_config=bnb_config,\n",
" trust_remote_code=True,\n",
" use_auth_token=False)\n",
"\n",
"model = prepare_model_for_kbit_training(model)\n",
"model.gradient_checkpointing_enable()\n",
"lora_config = LoraConfig(\n",
" r=16,\n",
" lora_alpha=16,\n",
" lora_dropout=0,\n",
" bias=\"none\",\n",
" task_type=\"CAUSAL_LM\",\n",
" target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n",
" # modules_to_save=model_config.lora_modules_to_save,\n",
")\n",
"model = get_peft_model(model, lora_config)\n",
"trainer = SFTTrainer(\n",
" model=model,\n",
" train_dataset=dataset,\n",
" dataset_text_field=\"text\",\n",
" max_seq_length=max_seq_length,\n",
" tokenizer=tokenizer,\n",
" dataset_num_proc = 2,\n",
" packing=False,\n",
" args=TrainingArguments(\n",
" per_device_train_batch_size = 2,\n",
" gradient_accumulation_steps = 4,\n",
" warmup_steps = 5,\n",
" max_steps = 60,\n",
" learning_rate = 2e-4,\n",
" fp16 = not torch.cuda.is_bf16_supported(),\n",
" bf16 = torch.cuda.is_bf16_supported(),\n",
" logging_steps = 1,\n",
" optim = \"adamw_8bit\",\n",
" weight_decay = 0.01,\n",
" lr_scheduler_type = \"linear\",\n",
" seed = 3407,\n",
" output_dir = \"outputs\",\n",
" ),\n",
" )\n",
"trainer.train()"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
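
The diff ends at trainer.train(), so the notebook never exercises the fine-tuned model. A minimal follow-up sketch is below: it saves the LoRA adapter and runs one generation, reusing model, tokenizer, and alpaca_prompt from the cells above. The save path and the sample instruction are hypothetical choices, not part of this PR.

# Sketch (not part of the PR): save the adapter and sanity-check generation.
# Assumes `model`, `tokenizer`, and `alpaca_prompt` from the notebook are in scope;
# the output path "outputs/mixtral-alpaca-lora" is a made-up example.
model.save_pretrained("outputs/mixtral-alpaca-lora")
tokenizer.save_pretrained("outputs/mixtral-alpaca-lora")

model.eval()
prompt = alpaca_prompt.format(
    "Continue the Fibonacci sequence.",  # instruction
    "1, 1, 2, 3, 5, 8",                  # input
    "",                                  # response left empty so the model completes it
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))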