From 91e95cd0ebc011d4c46e18941c4c5fa578997f84 Mon Sep 17 00:00:00 2001
From: IlyasMoutawwakil <moutawwakil.ilyas.tsi@gmail.com>
Date: Tue, 10 Dec 2024 16:54:41 +0100
Subject: [PATCH] Delete HF cache before TRT-LLM example tests and remove force_export

---
 .github/workflows/test_cli_cuda_tensorrt_llm.yaml  | 4 +++-
 examples/cuda_trt_llama.yaml                       | 7 +++----
 optimum_benchmark/backends/tensorrt_llm/backend.py | 1 -
 optimum_benchmark/backends/tensorrt_llm/config.py  | 1 -
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
index 60a616d6..c75aac92 100644
--- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
+++ b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -56,7 +56,9 @@ jobs:
           contains( github.event.pull_request.labels.*.name, 'examples')
           }}
         name: Run examples
-        run: pytest tests/test_examples.py -x -s -k "cli and cuda and trt"
+        run: |
+          huggingface-cli delete-cache
+          pytest tests/test_examples.py -x -s -k "cli and cuda and trt"
 
   cli_cuda_tensorrt_llm_multi_gpu_tests:
     if: ${{
diff --git a/examples/cuda_trt_llama.yaml b/examples/cuda_trt_llama.yaml
index 26f35b2c..c483fc2f 100644
--- a/examples/cuda_trt_llama.yaml
+++ b/examples/cuda_trt_llama.yaml
@@ -15,11 +15,10 @@ launcher:
 backend:
   device: cuda
   device_ids: 0
-  force_export: true
-  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-  max_prompt_length: 64
-  max_new_tokens: 32
   max_batch_size: 4
+  max_new_tokens: 32
+  max_prompt_length: 64
+  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 
 scenario:
   input_shapes:
diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py
index f46ce6c8..a05187c3 100644
--- a/optimum_benchmark/backends/tensorrt_llm/backend.py
+++ b/optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -46,7 +46,6 @@ def load_trtmodel_from_pretrained(self) -> None:
             max_batch_size=self.config.max_batch_size,
             max_new_tokens=self.config.max_new_tokens,
             max_beam_width=self.config.max_beam_width,
-            force_export=self.config.force_export,
             **self.config.model_kwargs,
         )
 
diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py
index 4fc83f11..d7f4b1cb 100644
--- a/optimum_benchmark/backends/tensorrt_llm/config.py
+++ b/optimum_benchmark/backends/tensorrt_llm/config.py
@@ -18,7 +18,6 @@ class TRTLLMConfig(BackendConfig):
     pp: int = 1
     use_fp8: bool = False
     dtype: str = "float16"
-    force_export: bool = False
     optimization_level: int = 2
     use_cuda_graph: bool = False