RuntimeError: CUDA error: out of memory CUDA #1552

Open
UltraHare opened this issue Jan 17, 2025 · 0 comments
Good afternoon.

I have been having problems fine-tuning the Llama 3.2 3B model. I have tried lowering the batch size from 16 to 2, reducing the maximum sequence length to 1024, and even changing the model, but I still cannot train. I also checked that I have enough resources for training: a Tesla M40 with 24 GB of VRAM. I have been unable to train for a long time because of this problem, which appeared after the latest Unsloth update. I will leave you the code I am running below.
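
For reference, this is roughly what I mean by the reduced settings (a minimal, illustrative sketch only; the full scripts are below, and the parameter values shown here are the ones I described trying):

```python
# Illustrative sketch of the reduced settings described above (not the full script).
from unsloth import FastLanguageModel
from transformers import TrainingArguments

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B",
    max_seq_length = 1024,   # reduced from 2048
    dtype = None,
    load_in_4bit = False,    # as passed from main.py below
)

training_args = TrainingArguments(
    per_device_train_batch_size = 2,   # reduced from 16
    gradient_accumulation_steps = 4,
    output_dir = "outputs",
)
```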

FineTunningLLM.py
```python
# Methods of the TrainLlama class (excerpt; imports and the class statement are omitted here).
def __init__(self, model: str, prompt: str = None, max_seq_length: int = 2048, dtype = None, load_in_4bit: bool = True, device_map: str = "sequential", rank: int = 16,
             target_modules: list[str] = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
             lora_dropout: int = 0, bias: str = "none", use_gradient_checkpointing: str = "unsloth",
             random_state: int = 3407, use_rslora: bool = False, loftq_config: dict = None):

    self.__max_seq_length = max_seq_length
    self.__dtype = dtype
    self.__load_in_4bit = load_in_4bit
    self.__alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}""" if prompt is None else prompt

    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            print("\n\n Especificar dispositivo a usar: \n")
            for i in range(0, torch.cuda.device_count()):
                print(f"{i}. {torch.cuda.get_device_name(i)}\n\n")

            device = int(input("Selecciona un dispostivo por enumerador: "))

            if device <= torch.cuda.device_count() and device >= 0:
                torch.cuda.set_device(device)

        system("clear")

    print(f"CUDA current device: {torch.cuda.current_device()}\n\nCUDA name device: {torch.cuda.get_device_name(torch.cuda.current_device())}\n\n")

    self.__model, self.__tokenizer = FastLanguageModel.from_pretrained(
        model_name = model,
        max_seq_length = self.__max_seq_length,
        dtype = self.__dtype,
        load_in_4bit = self.__load_in_4bit,
        device_map = device_map,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    self.__model = FastLanguageModel.get_peft_model(
        self.__model,
        r = rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = target_modules,
        lora_alpha = rank,
        lora_dropout = lora_dropout, # Supports any, but = 0 is optimized
        bias = bias,    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = use_gradient_checkpointing, # True or "unsloth" for very long context
        random_state = random_state,
        use_rslora = use_rslora,  # We support rank stabilized LoRA
        loftq_config = loftq_config, # And LoftQ
    )

def setConfigModelLLM(self, model: str, max_seq_length: int = 2048, dtype = None, load_in_4bit: bool = True, device_map: str = "sequential", rank: int = 16,
             target_modules: list[str] = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
             lora_dropout: int = 0, bias: str = "none", use_gradient_checkpointing: str = "unsloth",
             random_state: int = 3407, use_rslora: bool = False, loftq_config: dict = None):
    
    self.__max_seq_length = max_seq_length
    self.__dtype = dtype
    self.__load_in_4bit = load_in_4bit

    self.__model, self.__tokenizer = FastLanguageModel.from_pretrained(
        model_name = model,
        max_seq_length = self.__max_seq_length,
        dtype = self.__dtype,
        load_in_4bit = self.__load_in_4bit,
        device_map = device_map,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    self.__model = FastLanguageModel.get_peft_model(
        self.__model,
        r = rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = target_modules,
        lora_alpha = rank,
        lora_dropout = lora_dropout, # Supports any, but = 0 is optimized
        bias = bias,    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = use_gradient_checkpointing, # True or "unsloth" for very long context
        random_state = random_state,
        use_rslora = use_rslora,  # We support rank stabilized LoRA
        loftq_config = loftq_config, # And LoftQ
    )

def setDataSetLLM(self, pathDataSet: str):

    EOS_TOKEN = self.__tokenizer.eos_token # Must add EOS_TOKEN
    def formatting_prompts_func(examples):
        instructions = examples["instruction"]
        inputs       = examples["input"]
        outputs      = examples["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            # Must add EOS_TOKEN, otherwise your generation will go on forever!
            text = self.__alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
            texts.append(text)
        return { "text" : texts, }
    pass

    self.__dataset = load_dataset("data/csv", data_files=path.basename(pathDataSet), split = "train")
    self.__dataset = self.__dataset.map(formatting_prompts_func, batched = True,)

def trainLLM(self, pathModelSave: str, num_train_epochs: int = 10, per_device_train_batch_size: int = 2, gradient_accumulation_steps: int = 4,
             dataset_num_proc: int = 2, packing: bool = False, warmup_steps: int = 5, learning_rate: float = 2e-4, logging_steps: int = 1,
             optim: str = "adamw_8bit", weight_decay: float = 0.01, lr_scheduler_type: str = "linear", seed: int = 3407, output_dir: str = "outputs",
             report_to: str = "none"):
    
    """
    Entrenamiento de modelos LLM / Fine Tunning

    pathModelSave (str, path, entrenado):
        Directorio en donde se guardara el modelo.
        
    num_train_epochs (int, epocas, por defecto es 10):
        Numero total de epocas por entrenamiento

    per_device_train_batch_size (int, batch_size, por defecto es 2):
        Tamaño de batch por GPU/TPU/MPS/NPU core/CPU para entrenar
    
    gradient_accumulation_steps (int, batch_size, por defecto es 4):
        Numero de pasos de actualizacion que deben acumularse antes de realizar una pasada hacia atras/actualizacion. 
    """

    trainer = SFTTrainer(
        model = self.__model,
        tokenizer = self.__tokenizer,
        train_dataset = self.__dataset,
        dataset_text_field = "text",
        max_seq_length = self.__max_seq_length,
        dataset_num_proc = dataset_num_proc,
        packing = packing,
        args = TrainingArguments(
            per_device_train_batch_size = per_device_train_batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_steps = warmup_steps,
            num_train_epochs = num_train_epochs, 
            learning_rate = learning_rate,
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = logging_steps,
            optim = optim,
            weight_decay = weight_decay,
            lr_scheduler_type = lr_scheduler_type,
            seed = seed,
            output_dir = output_dir,
            report_to = report_to
        ),
    )

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    trainer_stats = unsloth_train(trainer)

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
    self.__model.save_pretrained(pathModelSave)
    self.__tokenizer.save_pretrained(pathModelSave)
```

main.py
```python
import os
import argparse

def main():
    parser = argparse.ArgumentParser(
        prog="FineTunning and Inferences LLMs",
        description="SFTTrainer FineTunning and Inferences LLMs"
    )
    parser.add_argument("--mode_run", "-r", choices=["train", "inferences", "convert_ollama", "t", "i", "co"], default="train", help="Select fine-tuning or inference")
    parser.add_argument("--epochs", "-ep", type=float, default=10, help="Set epochs for fine-tuning")
    parser.add_argument("--batch_size", "-batch", type=int, default=16, help="Set batch size for fine-tuning")
    args = parser.parse_args()
    try:
        import FineTunningLLM

        pathModels = "models/llama"
        pathPDF = "data/pdf"
        pathDataSet = "data/csv/data.csv"
        pathJSON = "data/json"
        pathGGUF = "models/ollama"
        pathOutput = "outputs/"

        try:
            os.makedirs(pathModels, exist_ok=True)
            os.makedirs(pathPDF, exist_ok=True)
            os.makedirs(os.path.dirname(pathDataSet), exist_ok=True)
            os.makedirs(pathGGUF, exist_ok=True)
            os.makedirs(pathJSON, exist_ok=True)
        except OSError:
            pass

        print("\nAvailable models: \n\n")

        for name_folder in os.listdir(pathModels):
            print(f"{name_folder}\n\n")
        nameModel = input("Model name: ").strip()
        nameModel = os.path.join(pathModels, nameModel) if nameModel != "" else nameModel
        nameModel = nameModel if os.path.exists(nameModel) else "unsloth/Llama-3.2-3B"

        model = FineTunningLLM.TrainLlama(nameModel, rank=16, load_in_4bit=False)

        if args.mode_run == "train" or args.mode_run == "t":

            print("\n\nGenerating the dataset...\n\n")
            dataSet = FineTunningLLM.DataSetFineTunning()
            listaPDF = dataSet.extractListPDF(20, 20)
            dataSet.generateJSON(listaPDF, dataSet.data_constitucion, dataSet.data_listado)
            dataSet.generateCSV()
            print("\n\nLoading the dataset...\n\n")
            model.setDataSetLLM(pathDataSet)
            print("\n\nLeave blank if you want to overwrite an existing model!\n\n")
            saveModel = input("Name of the model to save: ").strip()
            saveModelOutput = os.path.join(pathOutput, saveModel)
            saveModel = os.path.join(pathModels, saveModel) if saveModel != "" else os.path.join(pathModels, nameModel)
            model.trainLLM(saveModel, num_train_epochs=args.epochs, per_device_train_batch_size=4, gradient_accumulation_steps=4, output_dir=saveModelOutput)
            pathGGUF = os.path.join(pathGGUF, saveModel)
            model.saveGGUF(pathGGUF)

        elif args.mode_run == "inferences" or args.mode_run == "i":

            model.inferencesLLM(pathPDF)

        elif args.mode_run == "convert_ollama" or args.mode_run == "co":

            print("Loading the dataset...\n\n")
            model.setDataSetLLM(pathDataSet)
            print("\n\nLeave blank if you want to overwrite or generate a file with the same name!\n\n")
            nameOllama = input("Modelfile location: ").strip()
            nameOllama = nameOllama if nameOllama != "" else nameModel
            print("\n\nSaving model with llama.cpp...\n\n")
            model.importOllama(nameModel, nameOllama)
            pathOllama = os.path.join(pathGGUF, nameOllama)  # note: pathGGUF assumed here, pathOllama was undefined before this line
            print(f"Model saved at {pathOllama}\n\n")

    except argparse.ArgumentError as e:
        print(f"Invalid argument: {e}")
        exit(0)

if __name__ == "__main__":
    main()
```

```
Traceback (most recent call last):
File "/home/oscar/LLM/main.py", line 80, in <module>
main()
File "/home/oscar/LLM/main.py", line 55, in main
model.trainLLM(saveModel, num_train_epochs=args.epochs, per_device_train_batch_size=4, gradient_accumulation_steps=4, output_dir=saveModelOutput)
File "/home/oscar/LLM/FineTunningLLM.py", line 184, in trainLLM
trainer_stats = unsloth_train(trainer)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/trainer.py", line 45, in unsloth_train
return trainer.train(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "", line 157, in train
File "", line 382, in _fast_inner_training_loop
File "", line 31, in _unsloth_training_step
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/_utils.py", line 1063, in _unsloth_pre_compute_loss
return self._old_compute_loss(model, inputs, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/transformers/trainer.py", line 3708, in compute_loss
outputs = model(**inputs)
^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/accelerate/utils/operations.py", line 823, in forward
return model_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/accelerate/utils/operations.py", line 811, in call
return convert_to_fp32(self.model_forward(*args, **kwargs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/_compile.py", line 32, in inner
return disable_fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/llama.py", line 1126, in PeftModelForCausalLM_fast_forward
return self.base_model(
^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/peft/tuners/tuners_utils.py", line 197, in forward
return self.model.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/llama.py", line 986, in _CausalLM_fast_forward
outputs = self.model(
^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/llama.py", line 817, in LlamaModel_fast_forward
hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 465, in decorate_fwd
return fwd(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth_zoo/gradient_checkpointing.py", line 145, in forward
saved_hidden_states = hidden_states.to("cpu", non_blocking = True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

  0%|          | 0/200 [01:14<?, ?it/s]
```
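
If it helps with debugging, this is how I plan to re-run with the flag the error message suggests (a minimal sketch; the `import main` wrapper is my own assumption, and the equivalent from the shell is `CUDA_LAUNCH_BLOCKING=1 python main.py`):

```python
# Debug run sketch: CUDA_LAUNCH_BLOCKING must be set before torch/unsloth are imported
# so CUDA errors are reported synchronously at the call that caused them.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import main  # assumes this is run from the project directory containing main.py
main.main()
```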
