
Commit

Restore script for tutorial
michaelbenayoun committed Feb 11, 2025
1 parent c09e1b5 commit 3fb0438
Showing 1 changed file with 6 additions and 33 deletions.
39 changes: 6 additions & 33 deletions docs/source/training_tutorials/sft_lora_finetune_llm.py
@@ -11,7 +11,6 @@
 from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser
 from optimum.neuron import NeuronSFTConfig, NeuronSFTTrainer, NeuronTrainingArguments
 from optimum.neuron.distributed import lazy_load_for_parallelism
-from optimum.neuron.trainers import NeuronTrainer
 
 
 def format_dolly(examples):
@@ -24,38 +23,19 @@ def format_dolly(examples):
         output_text.append(prompt)
     return output_text
 
-def format_dolly(example):
-    instruction = f"### Instruction\n{example['instruction']}"
-    context = f"### Context\n{example['context']}" if len(example["context"]) > 0 else None
-    response = f"### Answer\n{example['response']}"
-    prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
-    return {"text": prompt}
-
 
 def training_function(script_args, training_args):
     dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
 
     tokenizer = AutoTokenizer.from_pretrained(script_args.model_id)
     tokenizer.pad_token = tokenizer.eos_token
 
-    # TODO: remove, we try to overfit.
-    dataset = dataset.select([0] * 10000)
-    dataset = dataset.map(format_dolly)
-
-    def tokenize(example):
-        output = tokenizer(example["text"], padding="max_length", max_length=1024, truncation=True)
-        output["labels"] = output["input_ids"]
-        return output
-
-    dataset = dataset.map(tokenize)
-
-
     with lazy_load_for_parallelism(tensor_parallel_size=training_args.tensor_parallel_size):
         model = AutoModelForCausalLM.from_pretrained(script_args.model_id)
 
     config = LoraConfig(
         r=16,
-        lora_alpha=32,
+        lora_alpha=16,
         lora_dropout=0.05,
         target_modules=["q_proj", "gate_proj", "v_proj", "o_proj", "k_proj", "up_proj", "down_proj"],
         bias="none",
@@ -66,24 +46,17 @@ def tokenize(example):
     sft_config = NeuronSFTConfig(
         max_seq_length=1024,
         packing=False,
-        dataset_text_field="text",
         **args,
     )
 
-    trainer = NeuronTrainer(
+    trainer = NeuronSFTTrainer(
+        args=sft_config,
         model=model,
-        args=training_args,
-        train_dataset=dataset,
+        peft_config=config,
         tokenizer=tokenizer,
+        train_dataset=dataset,
+        formatting_func=format_dolly,
     )
-    #trainer = NeuronSFTTrainer(
-    #    args=sft_config,
-    #    model=model,
-    #    # peft_config=config,
-    #    tokenizer=tokenizer,
-    #    train_dataset=dataset,
-    #    # formatting_func=format_dolly,
-    #)
 
     # Start training
     trainer.train()
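
Note on the restored data path: the commit drops the hand-rolled dataset.map/tokenize pipeline and instead passes the raw dataset to NeuronSFTTrainer together with formatting_func=format_dolly. As with TRL's SFTTrainer, the formatting function is called on a batch of examples and is expected to return a list of prompt strings. A minimal, self-contained sketch of that contract (the function name, sample records, and printing are illustrative only; the full body of the script's format_dolly is collapsed in this view):

# Sketch of the batched formatting_func contract; field names match
# databricks/databricks-dolly-15k, but the records themselves are invented.
def format_dolly_sketch(examples):
    output_text = []
    for i in range(len(examples["instruction"])):
        instruction = f"### Instruction\n{examples['instruction'][i]}"
        context = f"### Context\n{examples['context'][i]}" if len(examples["context"][i]) > 0 else None
        response = f"### Answer\n{examples['response'][i]}"
        output_text.append("\n\n".join(p for p in (instruction, context, response) if p is not None))
    return output_text

if __name__ == "__main__":
    batch = {
        "instruction": ["Name a wild relative of the llama.", "Summarize the passage."],
        "context": ["", "Llamas are domesticated camelids long used as pack animals in the Andes."],
        "response": ["The guanaco.", "Llamas are Andean pack animals."],
    }
    for prompt in format_dolly_sketch(batch):
        print(prompt)
        print("---")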

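The hunks shown here only cover the module-level helpers and training_function; the argument-parsing entry point of the script sits outside them. As a rough, hypothetical sketch of how such a script is usually driven (the ScriptArguments fields, the default checkpoint, and the final call are assumptions based on the script_args.model_id usage above, not part of this commit):

# Hypothetical entry point; NeuronHfArgumentParser and NeuronTrainingArguments
# are the only pieces known from the imports in the diff above.
from dataclasses import dataclass, field

from optimum.neuron import NeuronHfArgumentParser as HfArgumentParser
from optimum.neuron import NeuronTrainingArguments


@dataclass
class ScriptArguments:
    # Assumed field: the diff reads script_args.model_id inside training_function.
    model_id: str = field(default="meta-llama/Meta-Llama-3-8B", metadata={"help": "Checkpoint to fine-tune."})


if __name__ == "__main__":
    parser = HfArgumentParser((ScriptArguments, NeuronTrainingArguments))
    script_args, training_args = parser.parse_args_into_dataclasses()
    training_function(script_args, training_args)  # training_function as defined in the diff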