From 3fb04389168b18fce81bc0205243adab2b3cacb4 Mon Sep 17 00:00:00 2001
From: Michael Benayoun
Date: Tue, 11 Feb 2025 15:14:56 +0100
Subject: [PATCH] Restore script for tutorial

---
 .../sft_lora_finetune_llm.py | 39 +++----------
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/docs/source/training_tutorials/sft_lora_finetune_llm.py b/docs/source/training_tutorials/sft_lora_finetune_llm.py
index 79496b352..9c383ff85 100644
--- a/docs/source/training_tutorials/sft_lora_finetune_llm.py
+++ b/docs/source/training_tutorials/sft_lora_finetune_llm.py
@@ -11,7 +11,6 @@
 from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser
 from optimum.neuron import NeuronSFTConfig, NeuronSFTTrainer, NeuronTrainingArguments
 from optimum.neuron.distributed import lazy_load_for_parallelism
-from optimum.neuron.trainers import NeuronTrainer
 
 
 def format_dolly(examples):
@@ -24,13 +23,6 @@ def format_dolly(examples):
         output_text.append(prompt)
     return output_text
 
-def format_dolly(example):
-    instruction = f"### Instruction\n{example['instruction']}"
-    context = f"### Context\n{example['context']}" if len(example["context"]) > 0 else None
-    response = f"### Answer\n{example['response']}"
-    prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
-    return {"text": prompt}
-
 
 def training_function(script_args, training_args):
     dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
@@ -38,24 +30,12 @@ def training_function(script_args, training_args):
     tokenizer = AutoTokenizer.from_pretrained(script_args.model_id)
     tokenizer.pad_token = tokenizer.eos_token
 
-    # TODO: remove, we try to overfit.
-    dataset = dataset.select([0] * 10000)
-    dataset = dataset.map(format_dolly)
-
-    def tokenize(example):
-        output = tokenizer(example["text"], padding="max_length", max_length=1024, truncation=True)
-        output["labels"] = output["input_ids"]
-        return output
-
-    dataset = dataset.map(tokenize)
-
-
     with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):
         model = AutoModelForCausalLM.from_pretrained(script_args.model_id)
 
     config = LoraConfig(
         r=16,
-        lora_alpha=32,
+        lora_alpha=16,
         lora_dropout=0.05,
         target_modules=["q_proj", "gate_proj", "v_proj", "o_proj", "k_proj", "up_proj", "down_proj"],
         bias="none",
@@ -66,24 +46,17 @@ def tokenize(example):
     sft_config = NeuronSFTConfig(
         max_seq_length=1024,
         packing=False,
-        dataset_text_field="text",
         **args,
     )
 
-    trainer = NeuronTrainer(
+    trainer = NeuronSFTTrainer(
+        args=sft_config,
         model=model,
-        args=training_args,
-        train_dataset=dataset,
+        peft_config=config,
         tokenizer=tokenizer,
+        train_dataset=dataset,
+        formatting_func=format_dolly,
     )
-    #trainer = NeuronSFTTrainer(
-    #    args=sft_config,
-    #    model=model,
-    #    # peft_config=config,
-    #    tokenizer=tokenizer,
-    #    train_dataset=dataset,
-    #    # formatting_func=format_dolly,
-    #)
 
     # Start training
     trainer.train()
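
Note (not part of the patch): for readers following the tutorial, the minimal sketch below illustrates the prompt layout that the retained batched format_dolly feeds to NeuronSFTTrainer through formatting_func. The template lines are taken from the per-example variant removed above; the sample record is invented for illustration.

    # Illustration only: prompt layout produced for one Dolly record (sample data is made up).
    example = {
        "instruction": "Name a primary color.",
        "context": "",
        "response": "Blue.",
    }

    instruction = f"### Instruction\n{example['instruction']}"
    context = f"### Context\n{example['context']}" if len(example["context"]) > 0 else None
    response = f"### Answer\n{example['response']}"

    # Sections are joined with blank lines; an empty context section is dropped.
    prompt = "\n\n".join([part for part in [instruction, context, response] if part is not None])
    print(prompt)
    # ### Instruction
    # Name a primary color.
    #
    # ### Answer
    # Blue.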