# Model info
model_name : "shuyuej/Llama-3.3-70B-Instruct-GPTQ"
# Dataset info
dataset_hf : "shuyuej/PodGPT-Demo-Data"
# These are my Hugging Face `read` and `write` tokens. Please replace them with yours.
# `read` token: for downloading models
# `write` token: for uploading your models to Hugging Face
# For more information: https://huggingface.co/settings/tokens
hf_read_token : "YOUR_HUGGING_FACE_READ_TOKEN" # Hugging Face `read` Token
hf_write_token : "YOUR_HUGGING_FACE_WRITE_TOKEN" # Hugging Face `write` Token
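# Note (assumption, not required by this config): if you prefer not to keep tokens in this file,
# the huggingface_hub library can also pick up a token from the HF_TOKEN environment variable,
# depending on how the project's loading code passes the token through.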
# Evaluate the original pre-trained model's performance
eval_pretrain : False
# LoRA
# LoRA attention dimension
lora_r : 16
# Alpha parameter for LoRA scaling
lora_alpha : 32
# Dropout probability for LoRA layers
lora_dropout : 0.1
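# Note: with lora_r = 16 and lora_alpha = 32, the effective LoRA scaling factor is
# lora_alpha / lora_r = 2, i.e. the low-rank adapter update is scaled by 2 before
# being added to the frozen base weights.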
# Saving paths
result_dir : "./results"
save_dir : "./save_folder"
data_save_dir : "./save_folder/data"
# Maximum training sequence length and number of generated tokens
train_max_len : 2048
max_new_tokens : 1024
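# Note (assumption about how the training/evaluation code uses these values):
# train_max_len caps the tokenized sequence length seen during training, while
# max_new_tokens caps the number of tokens generated per prompt during evaluation.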
# Batch size
train_batch_size : 1
# Number of training epochs
epochs : 1
# Optimizer, learning-rate schedule, and warm-up ratio
optim : "adamw_torch"
lr_scheduler_type : "cosine"
warmup_ratio : 0.03
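# With warmup_ratio = 0.03 and a cosine schedule, the learning rate ramps up linearly
# over the first 3% of total update steps and then decays following a cosine curve.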
# Activation (gradient) checkpointing
# When enabled, a lot of memory can be freed at the cost of a small decrease in training speed,
# because parts of the graph are recomputed during back-propagation.
gradient_checkpointing : True
# Number of update steps to accumulate gradients before performing a backward/update pass
gradient_accumulation_steps : 1
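# Effective batch size per device = train_batch_size * gradient_accumulation_steps = 1 * 1 = 1;
# increase gradient_accumulation_steps to simulate a larger batch without extra GPU memory.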
# Specify the maximum norm of the gradients for gradient clipping.
# Gradient clipping is used to prevent the exploding gradient problem in deep neural networks.
max_grad_norm : 0.01
# Initial learning rate
learning_rate : 0.000005
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay : 0.01
# Enable fp16/bf16 training (set bf16 to True with an A100)
# Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
fp16 : True
# Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires an Ampere or newer
# NVIDIA architecture, or a CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
# For the quantized model with the LoRA adapter, we cannot use bf16 training due to the following Triton issue:
# AssertionError('First input (bf16) and second input (fp32) must have the same dtype!')
# https://github.com/unslothai/unsloth/issues/1333#issuecomment-2542106253
bf16 : False
# Save the model every given number of steps
save_strategy : "steps"
save_steps : 1000
# Whether to save only the model weights (skipping the optimizer and scheduler states)
save_only_model : True
# Maximum number of saved checkpoints to keep
save_total_limit : 10
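# With save_strategy = "steps", save_steps = 1000, and save_total_limit = 10, a checkpoint
# is written every 1000 update steps and only the 10 most recent checkpoints are kept.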
# Log every X update steps
logging_steps : 1
# Platform for saving training logs
log_save_platform : "tensorboard"
# Device placement; "auto" spreads the model across the available GPUs
device_map : "auto"
# The number of GPUs and GPU utilization for the vLLM Engine
# https://docs.vllm.ai/en/latest/serving/distributed_serving.html
num_gpus_vllm : 4
gpu_utilization_vllm : 0.95
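# Note (assumption): num_gpus_vllm and gpu_utilization_vllm are expected to map to vLLM's
# tensor_parallel_size and gpu_memory_utilization arguments when the engine is created.
#
# A minimal sketch of how this file could be loaded (assuming the project reads it with PyYAML):
#   import yaml
#   with open("config_quantization.yml", "r", encoding="utf-8") as f:
#       config = yaml.safe_load(f)
#   print(config["model_name"], config["num_gpus_vllm"])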