Skip to content

Commit

Permalink
Disable `no_weights` on TGI CUDA for now
Browse files — browse the repository at this point in the history
  • Loading branch information
IlyasMoutawwakil committed Dec 16, 2024
1 parent 3b0138b commit a8c4159
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 16 deletions.
3 changes: 1 addition & 2 deletions examples/cuda_tgi_llama.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ backend:
device: cuda
device_ids: 0
cuda_graphs: 0 # remove for better perf but bigger memory footprint
no_weights: true
no_weights: false
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
image: ghcr.io/huggingface/text-generation-inference:2.4.1

scenario:
input_shapes:
Expand Down
11 changes: 6 additions & 5 deletions optimum_benchmark/backends/py_txi/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,12 @@ def download_pretrained_model(self) -> None:

def create_no_weights_model(self) -> None:
self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
filename = os.path.join(self.no_weights_model, "model.safetensors")
os.makedirs(self.no_weights_model, exist_ok=True)

self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model)
if self.config.task in TEXT_GENERATION_TASKS:
self.generation_config.eos_token_id = None
self.generation_config.pad_token_id = None
self.generation_config.save_pretrained(save_directory=self.no_weights_model)

filename = os.path.join(self.no_weights_model, "model.safetensors")
save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"})
with fast_weights_init():
# unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
Expand All @@ -66,6 +62,11 @@ def create_no_weights_model(self) -> None:
del self.pretrained_model
torch.cuda.empty_cache()

if self.config.task in TEXT_GENERATION_TASKS:
self.generation_config.eos_token_id = None
self.generation_config.pad_token_id = None
self.generation_config.save_pretrained(save_directory=self.no_weights_model)

def load_model_with_no_weights(self) -> None:
self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}}
original_model, self.config.model = self.config.model, "/no_weights_model/"
Expand Down
4 changes: 0 additions & 4 deletions tests/configs/cpu_inference_py_txi_gpt2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,3 @@ defaults:
- override backend: py-txi

name: cpu_inference_py_txi_gpt2

backend:
cuda_graphs: 0
image: ghcr.io/huggingface/text-generation-inference:2.4.1
5 changes: 0 additions & 5 deletions tests/configs/cuda_inference_py_txi_gpt2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,8 @@ defaults:
- _base_ # inherits from base config
- _cuda_ # inherits from cuda config
- _inference_ # inherits from inference config
- _no_weights_ # inherits from no weights config
- _gpt2_ # inherits from gpt2 config
- _self_ # hydra 1.1 compatibility
- override backend: py-txi

name: cuda_inference_py_txi_gpt2

backend:
cuda_graphs: 0
image: ghcr.io/huggingface/text-generation-inference:2.4.1

0 comments on commit a8c4159

Please sign in to comment.