Fix Llama 3 OOM: skip device conversion when layer-wise quantization is used

Signed-off-by: Kaihui-intel <[email protected]>
intel · Sep 29, 2024 · 44c312d · 44c312d
1 parent dea8512
commit 44c312d
Showing 1 changed file with 5 additions and 0 deletions.
diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py
@@ -525,6 +525,11 @@ def convert_to_quantized_model(model, config, device="cpu"):
     if orig_dtype != torch.float32:
         q_model.to(dtype=orig_dtype)
 
+    if config.use_layer_wise and not (q_model.device == device or q_model.device.type == device):
+        logger.warning(
+            "Do not convert device to avoid out of memory. Recommend using saved quantized model to inference.")
+        return q_model
+
     return q_model.to(device)