Enable Llama 3.1 405B in FP8 (#124)
* add changes for fix

* add keep_module_on_host, modify quant json

* remove buffer check

* add llama 405 checks

* remove hardcoded path, reuse module on host check

* fix: undefined variable

* remove unused import

---------

Co-authored-by: Your Name <[email protected]>
2 people authored and astachowiczhabana committed Feb 5, 2025
1 parent df43900 commit 1e64b1a
Showing 2 changed files with 21 additions and 7 deletions.
@@ -3,5 +3,10 @@
     "mode": "QUANTIZE",
     "observer": "maxabs",
     "scale_method": "unit_scale",
-    "dump_stats_path": "./hqt_output/measure"
+    "whitelist": {"types": [], "names": []},
+    "blacklist": {"types": [], "names": []},
+    "quantize_weight": false,
+    "dump_stats_path": "./results/hk",
+    "ignore_modules_wo_measures": "True",
+    "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx"
 }
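
For orientation, below is a minimal sketch of loading this quantization JSON (its path is not shown in this capture) and printing the keys touched by the commit. The default file path and the QUANT_CONFIG environment-variable convention are assumptions for illustration; the commit itself only shows the JSON contents.

# Minimal sketch (not part of this commit): load the FP8 quantization config
# and print the keys added by this change. The default path below is
# hypothetical; the text-generation examples typically select the config
# through a QUANT_CONFIG environment variable (an assumption here).
import json
import os

config_path = os.environ.get("QUANT_CONFIG", "./quantization_config/unit_scale_quant.json")
with open(config_path) as f:
    quant_config = json.load(f)

# Keys introduced or changed by this commit
for key in ("whitelist", "blacklist", "quantize_weight",
            "dump_stats_path", "ignore_modules_wo_measures", "dump_stats_xlsx_path"):
    print(f"{key}: {quant_config.get(key)}")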
21 changes: 15 additions & 6 deletions examples/text-generation/utils.py
@@ -429,8 +429,13 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
 
     logger.info("DeepSpeed is enabled.")
     deepspeed.init_distributed(dist_backend="hccl")
-    config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs)
-    load_to_meta = model_on_meta(config)
+    config = AutoConfig.from_pretrained(args.model_name_or_path, **model_kwargs)
+
+    keep_module_on_host = False
+    if "Llama-3.1-405B" in args.model_name_or_path:
+        keep_module_on_host = True
+
+    load_to_meta = False if keep_module_on_host else model_on_meta(config)
 
     if args.assistant_model is None:
         assistant_model = None
@@ -439,7 +444,10 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
 
     if load_to_meta:
         # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load
-        with deepspeed.OnDevice(dtype=model_dtype, device="meta"):
+
+        deepspeed_device = "cpu" if keep_module_on_host else "meta"
+
+        with deepspeed.OnDevice(dtype=config.torch_dtype, device=deepspeed_device):
             if (
                 hasattr(config, "rope_scaling")
                 and config.rope_scaling
@@ -469,12 +477,12 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
                 )
     else:
         # TODO: revisit placement on CPU when auto-injection is possible
-        with deepspeed.OnDevice(dtype=model_dtype, device="cpu"):
+        with deepspeed.OnDevice(dtype=config.torch_dtype, device="cpu"):
             if args.peft_model is not None:
                 model = peft_model(args, model_dtype, logger, **model_kwargs)
             else:
                 model = AutoModelForCausalLM.from_pretrained(
-                    args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs
+                    args.model_name_or_path, torch_dtype=config.torch_dtype, **model_kwargs
                 )
     model.eval()
 
@@ -484,7 +492,8 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
         ).eval()
 
     # Initialize the model
-    ds_inference_kwargs = {"dtype": model_dtype}
+    ds_inference_kwargs = {"dtype": config.torch_dtype}
+    ds_inference_kwargs["keep_module_on_host"] = keep_module_on_host
     ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size}
     ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs
     ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config)
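
Taken together, the utils.py changes route Llama 3.1 405B through a host-resident load path instead of meta-device initialization, and propagate the flag to the DeepSpeed inference kwargs. Below is a condensed sketch of the resulting flow; it assumes ds_inference_kwargs is ultimately passed to deepspeed.init_inference() as in the rest of the example script, and it simplifies the model-construction details that are elided from the diff. The helper imports from utils are assumptions about the local module layout.

# Condensed sketch of the flow after this commit (assumptions noted inline);
# the real logic lives in setup_distributed_model() in examples/text-generation/utils.py.
import deepspeed
from transformers import AutoConfig, AutoModelForCausalLM

from utils import get_ds_injection_policy, model_on_meta  # helpers defined alongside setup_distributed_model


def init_deepspeed_inference(args, model_kwargs):
    deepspeed.init_distributed(dist_backend="hccl")
    config = AutoConfig.from_pretrained(args.model_name_or_path, **model_kwargs)

    # Llama 3.1 405B: keep the module in host memory instead of building it on the meta device
    keep_module_on_host = "Llama-3.1-405B" in args.model_name_or_path
    load_to_meta = False if keep_module_on_host else model_on_meta(config)
    # Mirrors the diff; the meta path is skipped entirely when keep_module_on_host is True
    deepspeed_device = "cpu" if keep_module_on_host else "meta"

    if load_to_meta:
        # Fake meta tensors; real weights are loaded during the DeepSpeed checkpoint load
        with deepspeed.OnDevice(dtype=config.torch_dtype, device=deepspeed_device):
            model = AutoModelForCausalLM.from_config(config, torch_dtype=config.torch_dtype)
    else:
        with deepspeed.OnDevice(dtype=config.torch_dtype, device="cpu"):
            model = AutoModelForCausalLM.from_pretrained(
                args.model_name_or_path, torch_dtype=config.torch_dtype, **model_kwargs
            )
    model.eval()

    ds_inference_kwargs = {
        "dtype": config.torch_dtype,
        "keep_module_on_host": keep_module_on_host,  # new in this commit
        "tensor_parallel": {"tp_size": args.world_size},
        "enable_cuda_graph": args.use_hpu_graphs,
        "injection_policy": get_ds_injection_policy(config),
    }
    # Assumption: the kwargs are consumed by deepspeed.init_inference(), as elsewhere in the script
    return deepspeed.init_inference(model, **ds_inference_kwargs).module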
