From a62344a56f643f6ac51081a805772bd7c97424c8 Mon Sep 17 00:00:00 2001
From: Stephen Horvath
Date: Thu, 19 Sep 2024 20:55:21 +1000
Subject: [PATCH] Consolidate CPU & XPU IPEX optimisations, & implement LLM
 optimisations

---
 fastchat/model/model_adapter.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fastchat/model/model_adapter.py b/fastchat/model/model_adapter.py
index 92e19dbb78..40cae6f9de 100644
--- a/fastchat/model/model_adapter.py
+++ b/fastchat/model/model_adapter.py
@@ -366,13 +366,6 @@ def load_model(
     # Load model
     model, tokenizer = adapter.load_model(model_path, kwargs)
 
-    if (
-        device == "cpu"
-        and kwargs["torch_dtype"] is torch.bfloat16
-        and CPU_ISA is not None
-    ):
-        model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
-
     if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
         "mps",
         "xpu",
@@ -380,8 +373,15 @@ def load_model(
     ):
         model.to(device)
 
-    if device == "xpu":
-        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
+    if (
+        device == "cpu"
+        and kwargs["torch_dtype"] is torch.bfloat16
+        and CPU_ISA is not None
+    ) or device == "xpu":
+        if "llm" in dir(ipex):
+            model = ipex.llm.optimize(model, dtype=kwargs["torch_dtype"], device=device)
+        else:
+            model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
 
     if debug:
         print(model)
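
Note for reviewers: the new branch feature-detects the LLM-specific IPEX
frontend (ipex.llm.optimize) and falls back to the generic ipex.optimize on
releases that lack the ipex.llm module. Below is a minimal standalone sketch
of that dispatch; the gpt2 checkpoint and the hasattr() spelling are
illustrative choices, not part of the patch.

    import torch
    import intel_extension_for_pytorch as ipex
    from transformers import AutoModelForCausalLM

    # Hypothetical model; the patch optimises whatever adapter.load_model returned.
    model = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16)
    model.eval()

    device = "cpu"  # or "xpu" on Intel GPUs

    # Prefer the LLM-specific frontend when present; equivalent to the
    # patch's `"llm" in dir(ipex)` check.
    if hasattr(ipex, "llm"):
        model = ipex.llm.optimize(model, dtype=torch.bfloat16, device=device)
    else:
        model = ipex.optimize(model, dtype=torch.bfloat16)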