From a62344a56f643f6ac51081a805772bd7c97424c8 Mon Sep 17 00:00:00 2001
From: Stephen Horvath
Date: Thu, 19 Sep 2024 20:55:21 +1000
Subject: [PATCH] Consolidate CPU & XPU IPEX optimisations, & implement LLM
 optimisations

---
 fastchat/model/model_adapter.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 92e19dbb78..40cae6f9de 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -366,13 +366,6 @@ def load_model(
     # Load model
     model, tokenizer = adapter.load_model(model_path, kwargs)
 
-    if (
-        device == "cpu"
-        and kwargs["torch_dtype"] is torch.bfloat16
-        and CPU_ISA is not None
-    ):
-        model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
-
     if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
         "mps",
         "xpu",
@@ -380,8 +373,15 @@ def load_model(
     ):
         model.to(device)
 
-    if device == "xpu":
-        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
+    if (
+        device == "cpu"
+        and kwargs["torch_dtype"] is torch.bfloat16
+        and CPU_ISA is not None
+    ) or device == "xpu":
+        if "llm" in dir(ipex):
+            model = ipex.llm.optimize(model, dtype=kwargs["torch_dtype"], device=device)
+        else:
+            model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
 
     if debug:
         print(model)
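
Note for reviewers: the new branch feature-detects the LLM-specific IPEX
frontend (ipex.llm.optimize) and falls back to the generic ipex.optimize on
releases that lack the ipex.llm module. Below is a minimal standalone sketch
of that dispatch; the gpt2 checkpoint and the hasattr() spelling are
illustrative choices, not part of the patch.

    import torch
    import intel_extension_for_pytorch as ipex
    from transformers import AutoModelForCausalLM

    # Hypothetical model; the patch optimises whatever adapter.load_model returned.
    model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16)
    model.eval()

    device = "cpu"  # or "xpu" on Intel GPUs

    # Prefer the LLM-specific frontend when present; equivalent to the
    # patch's `"llm" in dir(ipex)` check.
    if hasattr(ipex, "llm"):
        model = ipex.llm.optimize(model, dtype=torch.bfloat16, device=device)
    else:
        model = ipex.optimize(model, dtype=torch.bfloat16)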