# Model info
model_name : "shuyuej/Llama-3.3-70B-Instruct-GPTQ"
# Dataset info
dataset_hf : "shuyuej/PodGPT-Demo-Data"
# These are my Hugging Face `read` and `write` tokens. Please replace them with yours.
# `read` token: for downloading models
# `write` token: for uploading your models to Hugging Face
# For more information: https://huggingface.co/settings/tokens
hf_read_token : "YOUR_HUGGING_FACE_READ_TOKEN" # Hugging Face `read` Token
hf_write_token : "YOUR_HUGGING_FACE_WRITE_TOKEN" # Hugging Face `write` Token
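# Note (assumption, not required by this config): if you prefer not to keep tokens in this file,
# the huggingface_hub library can also pick up a token from the HF_TOKEN environment variable,
# depending on how the project's loading code passes the token through.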
# Evaluate the original pre-trained model's performance
eval_pretrain : False
# LoRA
# LoRA attention dimension
lora_r : 16
# Alpha parameter for LoRA scaling
lora_alpha : 32
# Dropout probability for LoRA layers
lora_dropout : 0.1
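# Note: with lora_r = 16 and lora_alpha = 32, the effective LoRA scaling factor is
# lora_alpha / lora_r = 2, i.e. the low-rank adapter update is scaled by 2 before
# being added to the frozen base weights.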
# Saving paths
result_dir : "./results"
save_dir : "./save_folder"
data_save_dir : "./save_folder/data"
# Maximum training sequence length and number of generated tokens
train_max_len : 2048
max_new_tokens : 1024
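# Note (assumption about how the training/evaluation code uses these values):
# train_max_len caps the tokenized sequence length seen during training, while
# max_new_tokens caps the number of tokens generated per prompt during evaluation.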
# Batch size
train_batch_size : 1
# Number of training epochs
epochs : 1
# Optimizer, learning-rate schedule, and warm-up ratio
optim : "adamw_torch"
lr_scheduler_type : "cosine"
warmup_ratio : 0.03
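# With warmup_ratio = 0.03 and a cosine schedule, the learning rate ramps up linearly
# over the first 3% of total update steps and then decays following a cosine curve.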
# Activation (gradient) checkpointing
# When enabled, a lot of memory can be freed at the cost of a small decrease in training speed,
# because parts of the graph are recomputed during back-propagation.
gradient_checkpointing : True
# Number of update steps to accumulate gradients before performing a backward/update pass
gradient_accumulation_steps : 1
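# Effective batch size per device = train_batch_size * gradient_accumulation_steps = 1 * 1 = 1;
# increase gradient_accumulation_steps to simulate a larger batch without extra GPU memory.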
# Specify the maximum norm of the gradients for gradient clipping.
# Gradient clipping is used to prevent the exploding gradient problem in deep neural networks.
max_grad_norm : 0.01
# Initial learning rate
learning_rate : 0.000005
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay : 0.01
# Enable fp16/bf16 training (set bf16 to True with an A100)
# Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
fp16 : True
# Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires an Ampere or newer
# NVIDIA architecture, or a CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change.
# For the quantized model with the LoRA adapter, we cannot use bf16 training due to the following Triton issue:
# AssertionError('First input (bf16) and second input (fp32) must have the same dtype!')
# https://github.com/unslothai/unsloth/issues/1333#issuecomment-2542106253
bf16 : False
# Save the model every given number of steps
save_strategy : "steps"
save_steps : 1000
# Whether to save only the model weights (skipping the optimizer and scheduler states)
save_only_model : True
# Maximum number of saved checkpoints to keep
save_total_limit : 10
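# With save_strategy = "steps", save_steps = 1000, and save_total_limit = 10, a checkpoint
# is written every 1000 update steps and only the 10 most recent checkpoints are kept.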
# Log every X update steps
logging_steps : 1
# Platform for saving training logs
log_save_platform : "tensorboard"
# Device placement; "auto" spreads the model across the available GPUs
device_map : "auto"
# The number of GPUs and GPU utilization for the vLLM Engine
# https://docs.vllm.ai/en/latest/serving/distributed_serving.html
num_gpus_vllm : 4
gpu_utilization_vllm : 0.95
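# Note (assumption): num_gpus_vllm and gpu_utilization_vllm are expected to map to vLLM's
# tensor_parallel_size and gpu_memory_utilization arguments when the engine is created.
#
# A minimal sketch of how this file could be loaded (assuming the project reads it with PyYAML):
#   import yaml
#   with open("config_quantization.yml", "r", encoding="utf-8") as f:
#       config = yaml.safe_load(f)
#   print(config["model_name"], config["num_gpus_vllm"])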