Enable Llama 3.1 405B in FP8 (#124)
* add changes for fix

* add keep_module_on_host, modify quant json

* remove buffer check

* add llama 405 checks

* remove hardcoded path, reuse module on host check

* fix: undefined variable

* remove unused import

---------

Co-authored-by: Your Name <[email protected]>
2 people authored and astachowiczhabana committed Feb 5, 2025
1 parent df43900 commit 1e64b1a
Showing 2 changed files with 21 additions and 7 deletions.
@@ -3,5 +3,10 @@
     "mode": "QUANTIZE",
     "observer": "maxabs",
     "scale_method": "unit_scale",
-    "dump_stats_path": "./hqt_output/measure"
+    "whitelist": {"types": [], "names": []},
+    "blacklist": {"types": [], "names": []},
+    "quantize_weight": false,
+    "dump_stats_path": "./results/hk",
+    "ignore_modules_wo_measures": "True",
+    "dump_stats_xlsx_path": "./run_outputs/fp8stats.xlsx"
 }
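
For orientation, below is a minimal sketch of loading this quantization JSON (its path is not shown in this capture) and printing the keys touched by the commit. The default file path and the QUANT_CONFIG environment-variable convention are assumptions for illustration; the commit itself only shows the JSON contents.

# Minimal sketch (not part of this commit): load the FP8 quantization config
# and print the keys added by this change. The default path below is
# hypothetical; the text-generation examples typically select the config
# through a QUANT_CONFIG environment variable (an assumption here).
import json
import os

config_path = os.environ.get("QUANT_CONFIG", "./quantization_config/unit_scale_quant.json")
with open(config_path) as f:
    quant_config = json.load(f)

# Keys introduced or changed by this commit
for key in ("whitelist", "blacklist", "quantize_weight",
            "dump_stats_path", "ignore_modules_wo_measures", "dump_stats_xlsx_path"):
    print(f"{key}: {quant_config.get(key)}")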
21 changes: 15 additions & 6 deletions examples/text-generation/utils.py
@@ -429,8 +429,13 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
 
     logger.info("DeepSpeed is enabled.")
     deepspeed.init_distributed(dist_backend="hccl")
-    config = AutoConfig.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs)
-    load_to_meta = model_on_meta(config)
+    config = AutoConfig.from_pretrained(args.model_name_or_path, **model_kwargs)
+
+    keep_module_on_host = False
+    if "Llama-3.1-405B" in args.model_name_or_path:
+        keep_module_on_host = True
+
+    load_to_meta = False if keep_module_on_host else model_on_meta(config)
 
     if args.assistant_model is None:
         assistant_model = None
@@ -439,7 +444,10 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
 
     if load_to_meta:
         # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load
-        with deepspeed.OnDevice(dtype=model_dtype, device="meta"):
+
+        deepspeed_device = "cpu" if keep_module_on_host else "meta"
+
+        with deepspeed.OnDevice(dtype=config.torch_dtype, device=deepspeed_device):
             if (
                 hasattr(config, "rope_scaling")
                 and config.rope_scaling
@@ -469,12 +477,12 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
                 )
     else:
         # TODO: revisit placement on CPU when auto-injection is possible
-        with deepspeed.OnDevice(dtype=model_dtype, device="cpu"):
+        with deepspeed.OnDevice(dtype=config.torch_dtype, device="cpu"):
             if args.peft_model is not None:
                 model = peft_model(args, model_dtype, logger, **model_kwargs)
             else:
                 model = AutoModelForCausalLM.from_pretrained(
-                    args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs
+                    args.model_name_or_path, torch_dtype=config.torch_dtype, **model_kwargs
                 )
     model.eval()
 
@@ -484,7 +492,8 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
         ).eval()
 
     # Initialize the model
-    ds_inference_kwargs = {"dtype": model_dtype}
+    ds_inference_kwargs = {"dtype": config.torch_dtype}
+    ds_inference_kwargs["keep_module_on_host"] = keep_module_on_host
     ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size}
     ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs
     ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config)
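
Taken together, the utils.py changes route Llama 3.1 405B through a host-resident load path instead of meta-device initialization, and propagate the flag to the DeepSpeed inference kwargs. Below is a condensed sketch of the resulting flow; it assumes ds_inference_kwargs is ultimately passed to deepspeed.init_inference() as in the rest of the example script, and it simplifies the model-construction details that are elided from the diff. The helper imports from utils are assumptions about the local module layout.

# Condensed sketch of the flow after this commit (assumptions noted inline);
# the real logic lives in setup_distributed_model() in examples/text-generation/utils.py.
import deepspeed
from transformers import AutoConfig, AutoModelForCausalLM

from utils import get_ds_injection_policy, model_on_meta  # helpers defined alongside setup_distributed_model


def init_deepspeed_inference(args, model_kwargs):
    deepspeed.init_distributed(dist_backend="hccl")
    config = AutoConfig.from_pretrained(args.model_name_or_path, **model_kwargs)

    # Llama 3.1 405B: keep the module in host memory instead of building it on the meta device
    keep_module_on_host = "Llama-3.1-405B" in args.model_name_or_path
    load_to_meta = False if keep_module_on_host else model_on_meta(config)
    # Mirrors the diff; the meta path is skipped entirely when keep_module_on_host is True
    deepspeed_device = "cpu" if keep_module_on_host else "meta"

    if load_to_meta:
        # Fake meta tensors; real weights are loaded during the DeepSpeed checkpoint load
        with deepspeed.OnDevice(dtype=config.torch_dtype, device=deepspeed_device):
            model = AutoModelForCausalLM.from_config(config, torch_dtype=config.torch_dtype)
    else:
        with deepspeed.OnDevice(dtype=config.torch_dtype, device="cpu"):
            model = AutoModelForCausalLM.from_pretrained(
                args.model_name_or_path, torch_dtype=config.torch_dtype, **model_kwargs
            )
    model.eval()

    ds_inference_kwargs = {
        "dtype": config.torch_dtype,
        "keep_module_on_host": keep_module_on_host,  # new in this commit
        "tensor_parallel": {"tp_size": args.world_size},
        "enable_cuda_graph": args.use_hpu_graphs,
        "injection_policy": get_ds_injection_policy(config),
    }
    # Assumption: the kwargs are consumed by deepspeed.init_inference(), as elsewhere in the script
    return deepspeed.init_inference(model, **ds_inference_kwargs).module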
