RuntimeError: CUDA error: out of memory CUDA #1552

Open
UltraHare opened this issue Jan 17, 2025 · 0 comments
Good afternoon.

I have been having problems fine-tuning the Llama 3.2 3B model. I have tried lowering the batch size from 16 to 2, reducing the maximum sequence length to 1024, and even changing the model, but I still cannot train. I also checked that I have enough resources for training: a Tesla M40 with 24 GB of VRAM. I have been unable to train for a long time because of this problem, which appeared after the latest Unsloth update. I will leave you the code I am running below.
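
For reference, this is roughly what I mean by the reduced settings (a minimal, illustrative sketch only; the full scripts are below, and the parameter values shown here are the ones I described trying):

```python
# Illustrative sketch of the reduced settings described above (not the full script).
from unsloth import FastLanguageModel
from transformers import TrainingArguments

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B",
    max_seq_length = 1024,   # reduced from 2048
    dtype = None,
    load_in_4bit = False,    # as passed from main.py below
)

training_args = TrainingArguments(
    per_device_train_batch_size = 2,   # reduced from 16
    gradient_accumulation_steps = 4,
    output_dir = "outputs",
)
```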

FineTunningLLM.py
```python
# Methods of the TrainLlama class (excerpt; imports and the class statement are omitted here).
def __init__(self, model: str, prompt: str = None, max_seq_length: int = 2048, dtype = None, load_in_4bit: bool = True, device_map: str = "sequential", rank: int = 16,
             target_modules: list[str] = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
             lora_dropout: int = 0, bias: str = "none", use_gradient_checkpointing: str = "unsloth",
             random_state: int = 3407, use_rslora: bool = False, loftq_config: dict = None):

    self.__max_seq_length = max_seq_length
    self.__dtype = dtype
    self.__load_in_4bit = load_in_4bit
    self.__alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}""" if prompt is None else prompt

    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            print("\n\n Especificar dispositivo a usar: \n")
            for i in range(0, torch.cuda.device_count()):
                print(f"{i}. {torch.cuda.get_device_name(i)}\n\n")

            device = int(input("Selecciona un dispostivo por enumerador: "))

            if device <= torch.cuda.device_count() and device >= 0:
                torch.cuda.set_device(device)

        system("clear")

    print(f"CUDA current device: {torch.cuda.current_device()}\n\nCUDA name device: {torch.cuda.get_device_name(torch.cuda.current_device())}\n\n")

    self.__model, self.__tokenizer = FastLanguageModel.from_pretrained(
        model_name = model,
        max_seq_length = self.__max_seq_length,
        dtype = self.__dtype,
        load_in_4bit = self.__load_in_4bit,
        device_map = device_map,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    self.__model = FastLanguageModel.get_peft_model(
        self.__model,
        r = rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = target_modules,
        lora_alpha = rank,
        lora_dropout = lora_dropout, # Supports any, but = 0 is optimized
        bias = bias,    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = use_gradient_checkpointing, # True or "unsloth" for very long context
        random_state = random_state,
        use_rslora = use_rslora,  # We support rank stabilized LoRA
        loftq_config = loftq_config, # And LoftQ
    )

def setConfigModelLLM(self, model: str, max_seq_length: int = 2048, dtype = None, load_in_4bit: bool = True, device_map: str = "sequential", rank: int = 16,
             target_modules: list[str] = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
             lora_dropout: int = 0, bias: str = "none", use_gradient_checkpointing: str = "unsloth",
             random_state: int = 3407, use_rslora: bool = False, loftq_config: dict = None):
    
    self.__max_seq_length = max_seq_length
    self.__dtype = dtype
    self.__load_in_4bit = load_in_4bit

    self.__model, self.__tokenizer = FastLanguageModel.from_pretrained(
        model_name = model,
        max_seq_length = self.__max_seq_length,
        dtype = self.__dtype,
        load_in_4bit = self.__load_in_4bit,
        device_map = device_map,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    self.__model = FastLanguageModel.get_peft_model(
        self.__model,
        r = rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = target_modules,
        lora_alpha = rank,
        lora_dropout = lora_dropout, # Supports any, but = 0 is optimized
        bias = bias,    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = use_gradient_checkpointing, # True or "unsloth" for very long context
        random_state = random_state,
        use_rslora = use_rslora,  # We support rank stabilized LoRA
        loftq_config = loftq_config, # And LoftQ
    )

def setDataSetLLM(self, pathDataSet: str):

    EOS_TOKEN = self.__tokenizer.eos_token # Must add EOS_TOKEN
    def formatting_prompts_func(examples):
        instructions = examples["instruction"]
        inputs       = examples["input"]
        outputs      = examples["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            # Must add EOS_TOKEN, otherwise your generation will go on forever!
            text = self.__alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
            texts.append(text)
        return { "text" : texts, }
    pass

    self.__dataset = load_dataset("data/csv", data_files=path.basename(pathDataSet), split = "train")
    self.__dataset = self.__dataset.map(formatting_prompts_func, batched = True,)

def trainLLM(self, pathModelSave: str, num_train_epochs: int = 10, per_device_train_batch_size: int = 2, gradient_accumulation_steps: int = 4,
             dataset_num_proc: int = 2, packing: bool = False, warmup_steps: int = 5, learning_rate: float = 2e-4, logging_steps: int = 1,
             optim: str = "adamw_8bit", weight_decay: float = 0.01, lr_scheduler_type: str = "linear", seed: int = 3407, output_dir: str = "outputs",
             report_to: str = "none"):
    
    """
    Entrenamiento de modelos LLM / Fine Tunning

    pathModelSave (str, path, entrenado):
        Directorio en donde se guardara el modelo.
        
    num_train_epochs (int, epocas, por defecto es 10):
        Numero total de epocas por entrenamiento

    per_device_train_batch_size (int, batch_size, por defecto es 2):
        Tamaño de batch por GPU/TPU/MPS/NPU core/CPU para entrenar
    
    gradient_accumulation_steps (int, batch_size, por defecto es 4):
        Numero de pasos de actualizacion que deben acumularse antes de realizar una pasada hacia atras/actualizacion. 
    """

    trainer = SFTTrainer(
        model = self.__model,
        tokenizer = self.__tokenizer,
        train_dataset = self.__dataset,
        dataset_text_field = "text",
        max_seq_length = self.__max_seq_length,
        dataset_num_proc = dataset_num_proc,
        packing = packing,
        args = TrainingArguments(
            per_device_train_batch_size = per_device_train_batch_size,
            gradient_accumulation_steps = gradient_accumulation_steps,
            warmup_steps = warmup_steps,
            num_train_epochs = num_train_epochs, 
            learning_rate = learning_rate,
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = logging_steps,
            optim = optim,
            weight_decay = weight_decay,
            lr_scheduler_type = lr_scheduler_type,
            seed = seed,
            output_dir = output_dir,
            report_to = report_to
        ),
    )

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    trainer_stats = unsloth_train(trainer)

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    
    self.__model.save_pretrained(pathModelSave)
    self.__tokenizer.save_pretrained(pathModelSave)
```

main.py
```python
import os
import argparse

def main():
    parser = argparse.ArgumentParser(
        prog="FineTunning and Inferences LLMs",
        description="SFTTrainer FineTunning and Inferences LLMs"
    )
    parser.add_argument("--mode_run", "-r", choices=["train", "inferences", "convert_ollama", "t", "i", "co"], default="train", help="Select fine-tuning or inference")
    parser.add_argument("--epochs", "-ep", type=float, default=10, help="Set epochs for fine-tuning")
    parser.add_argument("--batch_size", "-batch", type=int, default=16, help="Set batch size for fine-tuning")
    args = parser.parse_args()
    try:
        import FineTunningLLM

        pathModels = "models/llama"
        pathPDF = "data/pdf"
        pathDataSet = "data/csv/data.csv"
        pathJSON = "data/json"
        pathGGUF = "models/ollama"
        pathOutput = "outputs/"

        try:
            os.makedirs(pathModels, exist_ok=True)
            os.makedirs(pathPDF, exist_ok=True)
            os.makedirs(os.path.dirname(pathDataSet), exist_ok=True)
            os.makedirs(pathGGUF, exist_ok=True)
            os.makedirs(pathJSON, exist_ok=True)
        except OSError:
            pass

        print("\nAvailable models: \n\n")

        for name_folder in os.listdir(pathModels):
            print(f"{name_folder}\n\n")
        nameModel = input("Model name: ").strip()
        nameModel = os.path.join(pathModels, nameModel) if nameModel != "" else nameModel
        nameModel = nameModel if os.path.exists(nameModel) else "unsloth/Llama-3.2-3B"

        model = FineTunningLLM.TrainLlama(nameModel, rank=16, load_in_4bit=False)

        if args.mode_run == "train" or args.mode_run == "t":

            print("\n\nGenerating the dataset...\n\n")
            dataSet = FineTunningLLM.DataSetFineTunning()
            listaPDF = dataSet.extractListPDF(20, 20)
            dataSet.generateJSON(listaPDF, dataSet.data_constitucion, dataSet.data_listado)
            dataSet.generateCSV()
            print("\n\nLoading the dataset...\n\n")
            model.setDataSetLLM(pathDataSet)
            print("\n\nLeave blank if you want to overwrite an existing model!\n\n")
            saveModel = input("Name of the model to save: ").strip()
            saveModelOutput = os.path.join(pathOutput, saveModel)
            saveModel = os.path.join(pathModels, saveModel) if saveModel != "" else os.path.join(pathModels, nameModel)
            model.trainLLM(saveModel, num_train_epochs=args.epochs, per_device_train_batch_size=4, gradient_accumulation_steps=4, output_dir=saveModelOutput)
            pathGGUF = os.path.join(pathGGUF, saveModel)
            model.saveGGUF(pathGGUF)

        elif args.mode_run == "inferences" or args.mode_run == "i":

            model.inferencesLLM(pathPDF)

        elif args.mode_run == "convert_ollama" or args.mode_run == "co":

            print("Loading the dataset...\n\n")
            model.setDataSetLLM(pathDataSet)
            print("\n\nLeave blank if you want to overwrite or generate a file with the same name!\n\n")
            nameOllama = input("Modelfile location: ").strip()
            nameOllama = nameOllama if nameOllama != "" else nameModel
            print("\n\nSaving model with llama.cpp...\n\n")
            model.importOllama(nameModel, nameOllama)
            pathOllama = os.path.join(pathGGUF, nameOllama)  # note: pathGGUF assumed here, pathOllama was undefined before this line
            print(f"Model saved at {pathOllama}\n\n")

    except argparse.ArgumentError as e:
        print(f"Invalid argument: {e}")
        exit(0)

if __name__ == "__main__":
    main()
```

```
Traceback (most recent call last):
File "/home/oscar/LLM/main.py", line 80, in <module>
main()
File "/home/oscar/LLM/main.py", line 55, in main
model.trainLLM(saveModel, num_train_epochs=args.epochs, per_device_train_batch_size=4, gradient_accumulation_steps=4, output_dir=saveModelOutput)
File "/home/oscar/LLM/FineTunningLLM.py", line 184, in trainLLM
trainer_stats = unsloth_train(trainer)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/trainer.py", line 45, in unsloth_train
return trainer.train(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "", line 157, in train
File "", line 382, in _fast_inner_training_loop
File "", line 31, in _unsloth_training_step
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/_utils.py", line 1063, in _unsloth_pre_compute_loss
return self._old_compute_loss(model, inputs, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/transformers/trainer.py", line 3708, in compute_loss
outputs = model(**inputs)
^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/accelerate/utils/operations.py", line 823, in forward
return model_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/accelerate/utils/operations.py", line 811, in call
return convert_to_fp32(self.model_forward(*args, **kwargs))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/_compile.py", line 32, in inner
return disable_fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/llama.py", line 1126, in PeftModelForCausalLM_fast_forward
return self.base_model(
^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/peft/tuners/tuners_utils.py", line 197, in forward
return self.model.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/llama.py", line 986, in _CausalLM_fast_forward
outputs = self.model(
^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth/models/llama.py", line 817, in LlamaModel_fast_forward
hidden_states = Unsloth_Offloaded_Gradient_Checkpointer.apply(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/torch/amp/autocast_mode.py", line 465, in decorate_fwd
return fwd(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^
File "/home/oscar/anaconda3/envs/ML/lib/python3.11/site-packages/unsloth_zoo/gradient_checkpointing.py", line 145, in forward
saved_hidden_states = hidden_states.to("cpu", non_blocking = True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

  0%|          | 0/200 [01:14<?, ?it/s]
```
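
If it helps with debugging, this is how I plan to re-run with the flag the error message suggests (a minimal sketch; the `import main` wrapper is my own assumption, and the equivalent from the shell is `CUDA_LAUNCH_BLOCKING=1 python main.py`):

```python
# Debug run sketch: CUDA_LAUNCH_BLOCKING must be set before torch/unsloth are imported
# so CUDA errors are reported synchronously at the call that caused them.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import main  # assumes this is run from the project directory containing main.py
main.main()
```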
