diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml index 126e500b..b48490a5 100644 --- a/.github/workflows/test_api_cpu.yaml +++ b/.github/workflows/test_api_cpu.yaml @@ -47,8 +47,21 @@ jobs: pip install -e .[testing,timm,diffusers,codecarbon] - name: Run tests + run: | + pytest tests/test_api.py -s -k "api and cpu" env: HF_TOKEN: ${{ secrets.HF_TOKEN }} PUSH_REPO_ID: optimum-benchmark/cpu - run: | - pytest tests/test_api.py -s -k "api and cpu" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: | + # pytest tests/test_examples.py -s -k "api and cpu" + # env: + # HF_TOKEN: ${{ secrets.HF_TOKEN }} + # PUSH_REPO_ID: optimum-benchmark/cpu diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml index c8be0ece..d45afa40 100644 --- a/.github/workflows/test_api_cuda.yaml +++ b/.github/workflows/test_api_cuda.yaml @@ -45,8 +45,21 @@ jobs: pip install -e .[testing,timm,diffusers,codecarbon] - name: Run tests + run: | + pytest tests/test_api.py -x -s -k "api and cuda" env: HF_TOKEN: ${{ secrets.HF_TOKEN }} PUSH_REPO_ID: optimum-benchmark/cuda + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples run: | - pytest tests/test_api.py -x -s -k "api and cuda" + pip install -e .[testing,torchao,autoawq,auto-gptq] + pytest tests/test_examples.py -x -s -k "api and cuda and pytorch" + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + PUSH_REPO_ID: optimum-benchmark/cuda diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml index 2da1e7ec..36c26215 100644 --- a/.github/workflows/test_api_misc.yaml +++ b/.github/workflows/test_api_misc.yaml @@ -58,8 +58,8 @@ jobs: UV_SYSTEM_PYTHON: 1 - name: Run tests + run: | + pytest tests/test_api.py -s -k "api and not (cpu or cuda or rocm or mps)" env: HF_TOKEN: ${{ secrets.HF_TOKEN }} PUSH_REPO_ID: optimum-benchmark/misc-${{ matrix.os }}-${{ matrix.python }} - run: | - pytest tests/test_api.py -s -k "api and not (cpu or cuda or rocm or mps)" diff --git a/.github/workflows/test_cli_cpu_ipex.yaml b/.github/workflows/test_cli_cpu_ipex.yaml index d6b94d3e..5bf0be92 100644 --- a/.github/workflows/test_cli_cpu_ipex.yaml +++ b/.github/workflows/test_cli_cpu_ipex.yaml @@ -36,16 +36,17 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install requirements run: | - pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install -e .[testing,ipex,diffusers,timm] - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and ipex" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and ipex" diff --git a/.github/workflows/test_cli_cpu_llama_cpp.yaml b/.github/workflows/test_cli_cpu_llama_cpp.yaml index 05d43683..145c0f83 100644 --- a/.github/workflows/test_cli_cpu_llama_cpp.yaml +++ b/.github/workflows/test_cli_cpu_llama_cpp.yaml @@ -48,4 +48,12 @@ jobs: pip install -e .[testing,llama-cpp] - name: Run tests - run: 
pytest tests/test_cli.py -s -k "llama_cpp" + run: pytest tests/test_cli.py -s -k "cli and cpu and llama_cpp" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and llama_cpp" diff --git a/.github/workflows/test_cli_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml deleted file mode 100644 index 435f4216..00000000 --- a/.github/workflows/test_cli_cpu_neural_compressor.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: CLI CPU Intel Neural Compressor Tests - -on: - workflow_dispatch: - push: - branches: - - main - pull_request: - branches: - - main - types: - - opened - - reopened - - synchronize - - labeled - - unlabeled - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - -jobs: - run_cli_cpu_neural_compressor_tests: - if: ${{ - (github.event_name == 'push') || - (github.event_name == 'workflow_dispatch') || - contains( github.event.pull_request.labels.*.name, 'cli') || - contains( github.event.pull_request.labels.*.name, 'cpu') || - contains( github.event.pull_request.labels.*.name, 'neural_compressor') || - contains( github.event.pull_request.labels.*.name, 'cli_cpu_neural_compressor') - }} - - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Install requirements - run: | - pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install -e .[testing,neural-compressor,diffusers,timm] - - - name: Run tests - run: pytest tests/test_cli.py -s -k "cli and cpu and neural_compressor" diff --git a/.github/workflows/test_cli_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml index 21e65235..ef8482b7 100644 --- a/.github/workflows/test_cli_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml @@ -49,3 +49,11 @@ jobs: - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and onnxruntime" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and onnxruntime" diff --git a/.github/workflows/test_cli_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml index 4612370c..2ef0312e 100644 --- a/.github/workflows/test_cli_cpu_openvino.yaml +++ b/.github/workflows/test_cli_cpu_openvino.yaml @@ -36,16 +36,18 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install requirements run: | - pip install --upgrade pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install -e .[testing,openvino,diffusers,timm] - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and openvino" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and openvino" diff --git a/.github/workflows/test_cli_cpu_py_txi.yaml 
b/.github/workflows/test_cli_cpu_py_txi.yaml index d07f6170..7b1946e7 100644 --- a/.github/workflows/test_cli_cpu_py_txi.yaml +++ b/.github/workflows/test_cli_cpu_py_txi.yaml @@ -45,7 +45,16 @@ jobs: run: | pip install --upgrade pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install -e .[testing,py-txi] + pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and py_txi" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: pytest tests/test_examples.py -s -k "cli and cpu and (tgi or tei)" diff --git a/.github/workflows/test_cli_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml index fef2a772..dab603c7 100644 --- a/.github/workflows/test_cli_cpu_pytorch.yaml +++ b/.github/workflows/test_cli_cpu_pytorch.yaml @@ -49,3 +49,12 @@ jobs: - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and pytorch" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: pytest tests/test_examples.py -s -k "cli and cpu and pytorch" diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml index 0584665c..1351e1b0 100644 --- a/.github/workflows/test_cli_cuda_onnxruntime.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -48,3 +48,12 @@ jobs: - name: Run tests run: | pytest tests/test_cli.py -x -s -k "cli and cuda and onnxruntime" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: pytest tests/test_examples.py -x -s -k "cli and cuda and onnxruntime" diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index 7339b98e..5c090b28 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -45,7 +45,15 @@ jobs: - name: Install requirements run: | pip install --upgrade pip - pip install -e .[testing,py-txi] + pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git - name: Run tests run: pytest tests/test_cli.py -x -s -k "cli and cuda and py_txi" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -x -s -k "cli and cuda and (tgi or tei)" diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 0bc5dfaf..2aa54d5d 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -50,6 +50,14 @@ jobs: run: | pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed)" + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -x -s -k "cli and cuda and 
pytorch" + run_cli_cuda_pytorch_multi_gpu_tests: if: ${{ (github.event_name == 'push') || diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml index acb04fe2..c75aac92 100644 --- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml @@ -50,6 +50,16 @@ jobs: run: | pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm and not (tp or pp)" + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: | + huggingface-cli delete-cache + pytest tests/test_examples.py -x -s -k "cli and cuda and trt" + cli_cuda_tensorrt_llm_multi_gpu_tests: if: ${{ (github.event_name == 'push') || diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml index ee886e8c..7dccafb8 100644 --- a/.github/workflows/test_cli_cuda_torch_ort.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort.yaml @@ -44,13 +44,21 @@ jobs: - name: Install dependencies run: | - pip install -e .[testing,torch-ort,peft] - pip install optimum@git+https://github.com/huggingface/optimum.git + pip install -e .[testing,torch-ort,peft] optimum@git+https://github.com/huggingface/optimum.git@fxi-ort-trainer - name: Run tests run: | pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and not (dp or ddp or device_map) and not (peft)" + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: | + # pytest tests/test_examples.py -x -s -k "cli and cuda and torch_ort" + run_cli_cuda_torch_ort_multi_gpu_tests: if: ${{ (github.event_name == 'push') || @@ -75,8 +83,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[testing,torch-ort,peft] - pip install optimum@git+https://github.com/huggingface/optimum.git + pip install -e .[testing,torch-ort,peft] optimum@git+https://github.com/huggingface/optimum.git@fxi-ort-trainer - name: Run tests run: | diff --git a/.github/workflows/test_cli_cuda_vllm.yaml b/.github/workflows/test_cli_cuda_vllm.yaml index 732513d2..6072dd8c 100644 --- a/.github/workflows/test_cli_cuda_vllm.yaml +++ b/.github/workflows/test_cli_cuda_vllm.yaml @@ -50,6 +50,15 @@ jobs: run: | FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and not (tp or pp)" + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: | + pytest tests/test_examples.py -x -s -k "cli and cuda and vllm" + run_cli_cuda_vllm_multi_gpu_tests: if: ${{ (github.event_name == 'push') || diff --git a/.github/workflows/test_cli_energy_star.yaml b/.github/workflows/test_energy_star.yaml similarity index 84% rename from .github/workflows/test_cli_energy_star.yaml rename to .github/workflows/test_energy_star.yaml index 24c487f6..db9a22cd 100644 --- a/.github/workflows/test_cli_energy_star.yaml +++ b/.github/workflows/test_energy_star.yaml @@ -20,13 +20,11 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} jobs: - run_cli_energy_star_tests: + run_energy_star: if: ${{ (github.event_name == 'push') || (github.event_name == 'workflow_dispatch') || - contains( github.event.pull_request.labels.*.name, 'cli') || - 
contains( github.event.pull_request.labels.*.name, 'energy_star') || - contains( github.event.pull_request.labels.*.name, 'cli_energy_star') + contains( github.event.pull_request.labels.*.name, 'energy_star') }} runs-on: diff --git a/examples/energy_star/_base_.yaml b/energy_star/_base_.yaml similarity index 100% rename from examples/energy_star/_base_.yaml rename to energy_star/_base_.yaml diff --git a/examples/energy_star/automatic_speech_recognition.yaml b/energy_star/automatic_speech_recognition.yaml similarity index 100% rename from examples/energy_star/automatic_speech_recognition.yaml rename to energy_star/automatic_speech_recognition.yaml diff --git a/examples/energy_star/image_classification.yaml b/energy_star/image_classification.yaml similarity index 100% rename from examples/energy_star/image_classification.yaml rename to energy_star/image_classification.yaml diff --git a/examples/energy_star/image_to_text.yaml b/energy_star/image_to_text.yaml similarity index 100% rename from examples/energy_star/image_to_text.yaml rename to energy_star/image_to_text.yaml diff --git a/examples/energy_star/object_detection.yaml b/energy_star/object_detection.yaml similarity index 100% rename from examples/energy_star/object_detection.yaml rename to energy_star/object_detection.yaml diff --git a/examples/energy_star/question_answering.yaml b/energy_star/question_answering.yaml similarity index 100% rename from examples/energy_star/question_answering.yaml rename to energy_star/question_answering.yaml diff --git a/examples/energy_star/sentence_similarity.yaml b/energy_star/sentence_similarity.yaml similarity index 100% rename from examples/energy_star/sentence_similarity.yaml rename to energy_star/sentence_similarity.yaml diff --git a/examples/energy_star/summarization.yaml b/energy_star/summarization.yaml similarity index 100% rename from examples/energy_star/summarization.yaml rename to energy_star/summarization.yaml diff --git a/examples/energy_star/t5_question_answering.yaml b/energy_star/t5_question_answering.yaml similarity index 100% rename from examples/energy_star/t5_question_answering.yaml rename to energy_star/t5_question_answering.yaml diff --git a/examples/energy_star/t5_summarization.yaml b/energy_star/t5_summarization.yaml similarity index 100% rename from examples/energy_star/t5_summarization.yaml rename to energy_star/t5_summarization.yaml diff --git a/examples/energy_star/t5_text_classification.yaml b/energy_star/t5_text_classification.yaml similarity index 100% rename from examples/energy_star/t5_text_classification.yaml rename to energy_star/t5_text_classification.yaml diff --git a/examples/energy_star/t5_text_generation.yaml b/energy_star/t5_text_generation.yaml similarity index 100% rename from examples/energy_star/t5_text_generation.yaml rename to energy_star/t5_text_generation.yaml diff --git a/examples/energy_star/text_classification.yaml b/energy_star/text_classification.yaml similarity index 100% rename from examples/energy_star/text_classification.yaml rename to energy_star/text_classification.yaml diff --git a/examples/energy_star/text_generation.yaml b/energy_star/text_generation.yaml similarity index 100% rename from examples/energy_star/text_generation.yaml rename to energy_star/text_generation.yaml diff --git a/examples/energy_star/text_to_image.yaml b/energy_star/text_to_image.yaml similarity index 100% rename from examples/energy_star/text_to_image.yaml rename to energy_star/text_to_image.yaml diff --git a/examples/ipex_bert.yaml b/examples/cpu_ipex_bert.yaml 
similarity index 59% rename from examples/ipex_bert.yaml rename to examples/cpu_ipex_bert.yaml index e549da0a..0e7ed37b 100644 --- a/examples/ipex_bert.yaml +++ b/examples/cpu_ipex_bert.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: ipex_bert +name: cpu_ipex_bert launcher: numactl: true @@ -14,16 +14,17 @@ launcher: cpunodebind: 0 membind: 0 +backend: + device: cpu + export: true + no_weights: false # because on multi-node machines, initializing weights could harm performance + torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + model: google-bert/bert-base-uncased + scenario: - latency: true memory: true + latency: true + input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: cpu - no_weights: false - export: true - torch_dtype: bfloat16 - model: bert-base-uncased diff --git a/examples/ipex_llama.yaml b/examples/cpu_ipex_llama.yaml similarity index 66% rename from examples/ipex_llama.yaml rename to examples/cpu_ipex_llama.yaml index b564316b..898ed0df 100644 --- a/examples/ipex_llama.yaml +++ b/examples/cpu_ipex_llama.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: ipex_llama +name: cpu_ipex_llama launcher: numactl: true @@ -14,24 +14,21 @@ launcher: cpunodebind: 0 membind: 0 +backend: + device: cpu + export: true + no_weights: false # because on multi-node machines, initializing weights could harm performance + torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + scenario: - latency: true memory: true + latency: true - warmup_runs: 10 - iterations: 10 - duration: 10 - input_shapes: batch_size: 1 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: max_new_tokens: 32 min_new_tokens: 32 - -backend: - device: cpu - export: true - no_weights: false - torch_dtype: bfloat16 - model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 diff --git a/examples/llama_cpp_embedding.yaml b/examples/cpu_llama_cpp_embedding.yaml similarity index 57% rename from examples/llama_cpp_embedding.yaml rename to examples/cpu_llama_cpp_embedding.yaml index bdd86cce..666277c6 100644 --- a/examples/llama_cpp_embedding.yaml +++ b/examples/cpu_llama_cpp_embedding.yaml @@ -1,26 +1,24 @@ defaults: - benchmark - scenario: inference - - launcher: inline - backend: llama_cpp + - launcher: process - _base_ - _self_ -name: llama_cpp_llama +name: cpu_llama_cpp_embedding backend: - device: mps - model: nomic-ai/nomic-embed-text-v1.5-GGUF + device: cpu task: feature-extraction + model: nomic-ai/nomic-embed-text-v1.5-GGUF filename: nomic-embed-text-v1.5.Q4_0.gguf scenario: input_shapes: batch_size: 1 - sequence_length: 256 - vocab_size: 30000 - type_vocab_size: 1 - max_position_embeddings: 512 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/llama_cpp_text_generation.yaml b/examples/cpu_llama_cpp_text_generation.yaml similarity index 61% rename from examples/llama_cpp_text_generation.yaml rename to examples/cpu_llama_cpp_text_generation.yaml index 96def950..2cd55514 100644 --- a/examples/llama_cpp_text_generation.yaml +++ b/examples/cpu_llama_cpp_text_generation.yaml @@ -1,25 +1,23 @@ defaults: - benchmark - scenario: inference - - launcher: inline - backend: llama_cpp + - launcher: process - _base_ - _self_ -name: llama_cpp_llama +name: cpu_llama_cpp_text_generation backend: - device: mps - model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF + device: cpu task: text-generation + model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF filename:
tinyllama-1.1b-chat-v1.0.Q4_0.gguf - scenario: + memory: true + latency: true + input_shapes: batch_size: 1 - sequence_length: 256 - vocab_size: 32000 - generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + sequence_length: 128 diff --git a/examples/onnxruntime_static_quant_vit.yaml b/examples/cpu_onnxruntime_static_quant_vit.yaml similarity index 70% rename from examples/onnxruntime_static_quant_vit.yaml rename to examples/cpu_onnxruntime_static_quant_vit.yaml index 3d298473..97591bcd 100644 --- a/examples/onnxruntime_static_quant_vit.yaml +++ b/examples/cpu_onnxruntime_static_quant_vit.yaml @@ -6,10 +6,11 @@ defaults: - _base_ - _self_ -name: onnxruntime_static_quant_vit +name: cpu_onnxruntime_static_quant_vit backend: device: cpu + export: true no_weights: true model: google/vit-base-patch16-224 quantization: true @@ -17,3 +18,9 @@ backend: is_static: true per_channel: false calibration: true + +scenario: + memory: true + latency: true + input_shapes: + batch_size: 2 diff --git a/examples/onnxruntime_timm.yaml b/examples/cpu_onnxruntime_timm.yaml similarity index 82% rename from examples/onnxruntime_timm.yaml rename to examples/cpu_onnxruntime_timm.yaml index 165fc28a..963f44f0 100644 --- a/examples/onnxruntime_timm.yaml +++ b/examples/cpu_onnxruntime_timm.yaml @@ -10,7 +10,8 @@ name: onnxruntime_timm backend: device: cpu - model: timm/mobilenetv3_large_100.ra_in1k + export: true + model: timm/tiny_vit_21m_224.in1k scenario: memory: true diff --git a/examples/numactl_bert.yaml b/examples/cpu_openvino_8bit_bert.yaml similarity index 57% rename from examples/numactl_bert.yaml rename to examples/cpu_openvino_8bit_bert.yaml index 7add65e7..73ef474d 100644 --- a/examples/numactl_bert.yaml +++ b/examples/cpu_openvino_8bit_bert.yaml @@ -1,27 +1,24 @@ defaults: - benchmark - scenario: inference + - backend: openvino - launcher: process - - backend: pytorch - _base_ - _self_ -name: pytorch_bert +name: openvino_static_quant -launcher: - numactl: true - numactl_kwargs: - cpunodebind: 0 - membind: 0 +backend: + device: cpu + reshape: true + no_weights: true + load_in_8bit: false # enable 8bit on compatible Intel CPU machines + model: google-bert/bert-base-uncased scenario: - latency: true memory: true + latency: true + input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: cpu - no_weights: true - model: bert-base-uncased diff --git a/examples/openvino_diffusion.yaml b/examples/cpu_openvino_diffusion.yaml similarity index 78% rename from examples/openvino_diffusion.yaml rename to examples/cpu_openvino_diffusion.yaml index f0501101..30d21935 100644 --- a/examples/openvino_diffusion.yaml +++ b/examples/cpu_openvino_diffusion.yaml @@ -10,10 +10,9 @@ name: openvino_diffusion backend: device: cpu - model: stabilityai/stable-diffusion-2-1 - reshape: true export: true - half: true + model: stabilityai/stable-diffusion-2-1 + half: false # enable half-precision on compatible Intel CPU machines scenario: input_shapes: diff --git a/examples/pytorch_bert.py b/examples/cuda_pytorch_bert.py similarity index 59% rename from examples/pytorch_bert.py rename to examples/cuda_pytorch_bert.py index 09f62b8d..2a7ddf89 100644 --- a/examples/pytorch_bert.py +++ b/examples/cuda_pytorch_bert.py @@ -1,22 +1,20 @@ import os -from huggingface_hub import whoami - from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig from optimum_benchmark.logging_utils import setup_logging -try: - USERNAME = whoami()["name"] -except Exception as e: - print(f"Failed to 
get username from Hugging Face Hub: {e}") - USERNAME = None +BENCHMARK_NAME = "cuda_pytorch_bert" +MODEL = "google-bert/bert-base-uncased" +PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", None) -BENCHMARK_NAME = "pytorch_bert" +if __name__ == "__main__": + level = os.environ.get("LOG_LEVEL", "INFO") + to_file = os.environ.get("LOG_TO_FILE", "0") == "1" + setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS") -def run_benchmark(): launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") - backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model="bert-base-uncased") + backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model=MODEL) scenario_config = InferenceConfig(memory=True, latency=True, input_shapes={"batch_size": 1, "sequence_length": 128}) benchmark_config = BenchmarkConfig( name=BENCHMARK_NAME, @@ -27,19 +25,9 @@ def run_benchmark(): log_report=True, ) benchmark_report = Benchmark.launch(benchmark_config) - - return benchmark_config, benchmark_report - - -if __name__ == "__main__": - level = os.environ.get("LOG_LEVEL", "INFO") - to_file = os.environ.get("LOG_TO_FILE", "0") == "1" - setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS") - - benchmark_config, benchmark_report = run_benchmark() benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - if USERNAME is not None: - benchmark_config.push_to_hub(repo_id=f"{USERNAME}/benchmarks", subfolder=BENCHMARK_NAME) - benchmark_report.push_to_hub(repo_id=f"{USERNAME}/benchmarks", subfolder=BENCHMARK_NAME) - benchmark.push_to_hub(repo_id=f"{USERNAME}/benchmarks", subfolder=BENCHMARK_NAME) + if PUSH_REPO_ID is not None: + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME) diff --git a/examples/pytorch_bert.yaml b/examples/cuda_pytorch_bert.yaml similarity index 90% rename from examples/pytorch_bert.yaml rename to examples/cuda_pytorch_bert.yaml index 8bb702ca..8ab9b5cb 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/cuda_pytorch_bert.yaml @@ -12,15 +12,16 @@ launcher: device_isolation: true device_isolation_action: warn +backend: + device: cuda + device_ids: 0 + no_weights: true + model: google-bert/bert-base-uncased + scenario: - latency: true memory: true + latency: true + input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: cuda - device_ids: 0 - no_weights: true - model: bert-base-uncased diff --git a/examples/trt_llama.yaml b/examples/cuda_pytorch_llama.yaml similarity index 70% rename from examples/trt_llama.yaml rename to examples/cuda_pytorch_llama.yaml index 30cb600a..1f85bd10 100644 --- a/examples/trt_llama.yaml +++ b/examples/cuda_pytorch_llama.yaml @@ -1,12 +1,12 @@ defaults: - benchmark - - backend: tensorrt-llm - scenario: inference - launcher: process + - backend: pytorch - _base_ - _self_ -name: trt_llama +name: cuda_pytorch_llama launcher: device_isolation: true @@ -16,12 +16,14 @@ backend: device: cuda device_ids: 0 no_weights: true + torch_dtype: float16 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: input_shapes: batch_size: 4 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_llama.py b/examples/cuda_pytorch_llama_quants.py similarity index 81% rename from 
examples/pytorch_llama.py rename to examples/cuda_pytorch_llama_quants.py index fe732bfa..01d492cb 100644 --- a/examples/pytorch_llama.py +++ b/examples/cuda_pytorch_llama_quants.py @@ -1,17 +1,11 @@ import os -from huggingface_hub import whoami - from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig from optimum_benchmark.logging_utils import setup_logging -try: - USERNAME = whoami()["name"] -except Exception as e: - print(f"Failed to get username from Hugging Face Hub: {e}") - USERNAME = None - -BENCHMARK_NAME = "pytorch-llama" +BENCHMARK_NAME = "cuda_pytorch_llama" +MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", None) WEIGHTS_CONFIGS = { "float16": { @@ -40,10 +34,10 @@ def run_benchmark(weight_config: str): launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") backend_config = PyTorchConfig( + model=MODEL, device="cuda", device_ids="0", no_weights=True, - model="gpt2", **WEIGHTS_CONFIGS[weight_config], ) scenario_config = InferenceConfig( @@ -52,7 +46,7 @@ def run_benchmark(weight_config: str): duration=10, iterations=10, warmup_runs=10, - input_shapes={"batch_size": 1, "sequence_length": 128}, + input_shapes={"batch_size": 1, "sequence_length": 64}, generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32}, ) benchmark_config = BenchmarkConfig( @@ -77,7 +71,5 @@ def run_benchmark(weight_config: str): benchmark_config, benchmark_report = run_benchmark(weight_config) benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - if USERNAME is not None: - benchmark.push_to_hub( - repo_id=f"{USERNAME}/benchmarks", filename=f"{weight_config}.json", subfolder=BENCHMARK_NAME - ) + if PUSH_REPO_ID is not None: + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME, filename=f"{weight_config}.json") diff --git a/examples/pytorch_vlm.yaml b/examples/cuda_pytorch_vlm.yaml similarity index 92% rename from examples/pytorch_vlm.yaml rename to examples/cuda_pytorch_vlm.yaml index a39f8c8a..8f1e0f3c 100644 --- a/examples/pytorch_vlm.yaml +++ b/examples/cuda_pytorch_vlm.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: pytorch_vlm +name: cuda_pytorch_vlm launcher: device_isolation: true @@ -30,7 +30,7 @@ scenario: input_shapes: # text batch_size: 1 - sequence_length: 256 + sequence_length: 64 # image num_images: 2 num_channels: 3 diff --git a/examples/tgi_llama.yaml b/examples/cuda_tgi_llama.yaml similarity index 63% rename from examples/tgi_llama.yaml rename to examples/cuda_tgi_llama.yaml index 399667fb..297403c8 100644 --- a/examples/tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: tgi_llama +name: cuda_tgi_llama launcher: device_isolation: true @@ -14,14 +14,15 @@ launcher: backend: device: cuda - device_ids: 4 - # no_weights: true + device_ids: 0 + cuda_graphs: 0 # remove for better perf but bigger memory footprint model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: input_shapes: batch_size: 4 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_llama.yaml b/examples/cuda_trt_llama.yaml similarity index 56% rename from examples/pytorch_llama.yaml rename to examples/cuda_trt_llama.yaml index becd1f2e..c483fc2f 100644 --- a/examples/pytorch_llama.yaml +++ b/examples/cuda_trt_llama.yaml @@ -1,33 +1,30 @@ defaults: - benchmark + - backend: tensorrt-llm - 
scenario: inference - launcher: process - - backend: pytorch - _base_ - _self_ -name: pytorch_llama +name: cuda_trt_llama launcher: device_isolation: true device_isolation_action: warn backend: - model: gpt2 device: cuda - torch_dtype: float16 + device_ids: 0 + max_batch_size: 4 + max_new_tokens: 32 + max_prompt_length: 64 + model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: - memory: true - latency: true - - warmup_runs: 10 - iterations: 10 - duration: 10 - input_shapes: - batch_size: 1 - sequence_length: 256 + batch_size: 4 + sequence_length: 64 + generate_kwargs: max_new_tokens: 32 min_new_tokens: 32 diff --git a/examples/vllm_llama.yaml b/examples/cuda_vllm_llama.yaml similarity index 62% rename from examples/vllm_llama.yaml rename to examples/cuda_vllm_llama.yaml index 8bbb4025..5ec4b5a8 100644 --- a/examples/vllm_llama.yaml +++ b/examples/cuda_vllm_llama.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: vllm_llama +name: cuda_vllm_llama launcher: device_isolation: true @@ -15,16 +15,16 @@ launcher: backend: device: cuda device_ids: 0 - no_weights: false - serving_mode: offline + serving_mode: online # server-like model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 engine_args: - enforce_eager: true + enforce_eager: true # remove for better perf but bigger memory footprint scenario: input_shapes: batch_size: 4 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_bert_mps.yaml b/examples/mps_pytorch_bert.yaml similarity index 67% rename from examples/pytorch_bert_mps.yaml rename to examples/mps_pytorch_bert.yaml index 4d4dc6e3..27368eb1 100644 --- a/examples/pytorch_bert_mps.yaml +++ b/examples/mps_pytorch_bert.yaml @@ -1,15 +1,12 @@ defaults: - benchmark - scenario: inference - - launcher: process # launcher: inline works, + - launcher: inline # mps fails with python multi-processing for some reason - backend: pytorch - _base_ - _self_ -name: pytorch_bert - -# launcher: -# start_method: spawn +name: mps_pytorch_bert scenario: latency: true @@ -19,8 +16,6 @@ scenario: sequence_length: 128 backend: - device: cpu + device: mps no_weights: true model: bert-base-uncased - - diff --git a/examples/neural_compressor_ptq_bert.yaml b/examples/neural_compressor_ptq_bert.yaml deleted file mode 100644 index cbc32590..00000000 --- a/examples/neural_compressor_ptq_bert.yaml +++ /dev/null @@ -1,20 +0,0 @@ -defaults: - - benchmark - - backend: neural-compressor - - scenario: inference - - launcher: process - - _base_ - - _self_ - -name: neural_compressor_ptq_bert - -backend: - device: cpu - no_weights: true - model: bert-base-uncased - ptq_quantization: true - calibration: true - -scenario: - input_shapes: - batch_size: 1 diff --git a/examples/openvino_static_quant_bert.yaml b/examples/openvino_static_quant_bert.yaml deleted file mode 100644 index caa4363a..00000000 --- a/examples/openvino_static_quant_bert.yaml +++ /dev/null @@ -1,21 +0,0 @@ -defaults: - - benchmark - - scenario: inference - - backend: openvino - - launcher: process - - _base_ - - _self_ - -name: openvino_static_quant_bert - -backend: - device: cpu - no_weights: true - model: bert-base-uncased - quantization: true - calibration: true - reshape: true - -scenario: - input_shapes: - batch_size: 1 diff --git a/examples/tei_bge.yaml b/examples/tei_bge.yaml deleted file mode 100644 index dbbab7d5..00000000 --- a/examples/tei_bge.yaml +++ /dev/null @@ -1,21 +0,0 @@ -defaults: - - benchmark - - scenario: inference - - 
launcher: inline - - backend: py-txi - - _self_ - -name: tei_bert - -launcher: - device_isolation: true - device_isolation_action: warn - -backend: - device: cpu - model: BAAI/bge-base-en-v1.5 - -scenario: - input_shapes: - batch_size: 64 - sequence_length: 128 diff --git a/optimum_benchmark/__init__.py b/optimum_benchmark/__init__.py index 313fb22a..7be4c4c3 100644 --- a/optimum_benchmark/__init__.py +++ b/optimum_benchmark/__init__.py @@ -1,9 +1,7 @@ from .backends import ( BackendConfig, - INCConfig, IPEXConfig, LlamaCppConfig, - LLMSwarmConfig, ORTConfig, OVConfig, PyTorchConfig, @@ -26,10 +24,8 @@ "EnergyStarConfig", "InferenceConfig", "IPEXConfig", - "INCConfig", "InlineConfig", "LauncherConfig", - "LLMSwarmConfig", "ORTConfig", "OVConfig", "ProcessConfig", diff --git a/optimum_benchmark/backends/__init__.py b/optimum_benchmark/backends/__init__.py index ec146f0b..2019270a 100644 --- a/optimum_benchmark/backends/__init__.py +++ b/optimum_benchmark/backends/__init__.py @@ -1,8 +1,6 @@ from .config import BackendConfig from .ipex.config import IPEXConfig from .llama_cpp.config import LlamaCppConfig -from .llm_swarm.config import LLMSwarmConfig -from .neural_compressor.config import INCConfig from .onnxruntime.config import ORTConfig from .openvino.config import OVConfig from .py_txi.config import PyTXIConfig @@ -18,9 +16,7 @@ "OVConfig", "TorchORTConfig", "TRTLLMConfig", - "INCConfig", "PyTXIConfig", - "LLMSwarmConfig", "BackendConfig", "VLLMConfig", "LlamaCppConfig", diff --git a/optimum_benchmark/backends/llm_swarm/__init__.py b/optimum_benchmark/backends/llm_swarm/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/optimum_benchmark/backends/llm_swarm/backend.py b/optimum_benchmark/backends/llm_swarm/backend.py deleted file mode 100644 index 8139e4ea..00000000 --- a/optimum_benchmark/backends/llm_swarm/backend.py +++ /dev/null @@ -1,85 +0,0 @@ -import asyncio -from typing import Any, Dict, List - -import torch -from huggingface_hub import AsyncInferenceClient -from llm_swarm import LLMSwarm -from llm_swarm import LLMSwarmConfig as LLMSwarmCfg - -from ...task_utils import TEXT_GENERATION_TASKS -from ..base import Backend -from .config import LLMSwarmConfig - - -class LLMSwarmBackend(Backend[LLMSwarmConfig]): - NAME: str = "llm-swarm" - - def __init__(self, config: LLMSwarmConfig) -> None: - super().__init__(config) - - if self.config.task not in TEXT_GENERATION_TASKS: - raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}") - - def load(self) -> None: - self.logger.info("\t+ Downloading pretrained model") - self.download_pretrained_model() - self.logger.info("\t+ Preparing generation config") - self.prepare_generation_config() - self.logger.info("\t+ Loading pretrained model") - self.load_model_from_pretrained() - - def load_model_from_pretrained(self) -> None: - self.llm_swarm_config = LLMSwarmCfg( - gpus=self.config.gpus, - model=self.config.model, - instances=self.config.instances, - inference_engine=self.config.inference_engine, - slurm_template_path=self.config.slurm_template_path, - load_balancer_template_path=self.config.load_balancer_template_path, - per_instance_max_parallel_requests=self.config.per_instance_max_parallel_requests, - revision=self.config.model_kwargs.get("revision", "main"), - debug_endpoint=self.config.debug_endpoint, - ) - self.llm_swarm = LLMSwarm(self.llm_swarm_config).__enter__() - self.client = AsyncInferenceClient(self.llm_swarm.endpoint) - - def download_pretrained_model(self) -> None: - with 
torch.device("meta"): - self.auto_model_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def prepare_generation_config(self) -> None: - self.generation_config.eos_token_id = -100 - self.generation_config.pad_token_id = -100 - - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = f"{self.config.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" - snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving new pretrained generation config") - self.generation_config.save_pretrained(save_directory=model_snapshot_path) - - def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - if "inputs" in inputs: - inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["inputs"].tolist())} - elif "input_ids" in inputs: - inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} - else: - raise ValueError("inputs must contain either input_ids or inputs") - - return inputs - - async def single_client_call(self, prompt: str, kwargs: Dict[str, Any]) -> str: - return await self.client.text_generation(prompt, max_new_tokens=kwargs.get("max_new_tokens", 1)) - - async def batch_client_call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return await asyncio.gather(*(self.single_client_call(p, kwargs) for p in inputs["prompt"])) - - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return asyncio.run(self.batch_client_call(inputs, kwargs)) - - def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return asyncio.run(self.batch_client_call(inputs, kwargs)) - - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return asyncio.run(self.batch_client_call(inputs, kwargs)) diff --git a/optimum_benchmark/backends/llm_swarm/config.py b/optimum_benchmark/backends/llm_swarm/config.py deleted file mode 100644 index 745cdd3f..00000000 --- a/optimum_benchmark/backends/llm_swarm/config.py +++ /dev/null @@ -1,31 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - -from ...import_utils import llm_swarm_version -from ..config import BackendConfig - - -@dataclass -class LLMSwarmConfig(BackendConfig): - name: str = "llm-swarm" - version: Optional[str] = llm_swarm_version() - _target_: str = "optimum_benchmark.backends.llm_swarm.backend.LLMSwarmBackend" - - # optimum benchmark specific - no_weights: bool = False - - # llm-swarm specific - gpus: int = 8 - instances: int = 1 - inference_engine: str = "tgi" - volume: str = "/fsx/ilyas/.cache" - per_instance_max_parallel_requests: int = 500 - slurm_template_path: str = "/fsx/ilyas/swarm-templates/tgi_h100.template.slurm" - load_balancer_template_path: str = "/fsx/ilyas/swarm-templates/nginx.template.conf" - debug_endpoint: Optional[str] = None - - def __post_init__(self): - super().__post_init__() - - # so that downloaded artifacts are stored in the same place - self.hub_kwargs["cache_dir"] = self.volume diff --git a/optimum_benchmark/backends/neural_compressor/__init__.py b/optimum_benchmark/backends/neural_compressor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py deleted file mode 100644 index c180a5ba..00000000 --- 
a/optimum_benchmark/backends/neural_compressor/backend.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -from collections import OrderedDict -from tempfile import TemporaryDirectory -from typing import Any, Dict - -import torch -from hydra.utils import get_class -from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion -from optimum.intel.neural_compressor.quantization import INCQuantizer - -from ...generators.dataset_generator import DatasetGenerator -from ..base import Backend -from ..transformers_utils import fast_weights_init -from .config import INCConfig -from .utils import TASKS_TO_INCMODELS - - -class INCBackend(Backend[INCConfig]): - NAME: str = "neural-compressor" - - def __init__(self, config: INCConfig): - super().__init__(config) - - if self.config.task in TASKS_TO_INCMODELS: - self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) - self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}") - else: - raise NotImplementedError(f"INCBackend does not support task {self.config.task}") - - def load(self) -> None: - self.logger.info("\t+ Creating backend temporary directory") - self.tmpdir = TemporaryDirectory() - - if self.config.ptq_quantization: - if self.config.no_weights: - self.logger.info("\t+ Creating no weights AutoModel") - self.create_no_weights_model() - self.logger.info("\t+ Loading no weights AutoModel") - self.load_automodel_with_no_weights() - else: - self.logger.info("\t+ Loading pretrained AutoModel") - self.load_automodel_from_pretrained() - self.logger.info("\t+ Applying post-training quantization") - self.quantize_automodel() - self.logger.info("\t+ Loading quantized INCModel") - original_model, self.config.model = self.config.model, self.quantized_model - self.load_incmodel_from_pretrained() - self.config.model = original_model - elif self.config.no_weights: - self.logger.info("\t+ Creating no weights INCModel") - self.create_no_weights_model() - self.logger.info("\t+ Loading no weights INCModel") - self.load_incmodel_with_no_weights() - else: - self.logger.info("\t+ Loading pretrained INCModel") - self.load_incmodel_from_pretrained() - - self.tmpdir.cleanup() - - def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self.load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model - - def load_incmodel_from_pretrained(self) -> None: - self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) - - def load_incmodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self.load_incmodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.model.tie_weights() - - self.config.model = original_model - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ 
Saving no weights model pytorch_model.bin") - torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - - def quantize_automodel(self) -> None: - self.quantized_model = f"{self.tmpdir.name}/quantized_model" - self.logger.info("\t+ Processing quantization config") - ptq_quantization_config = self.config.ptq_quantization_config.copy() - ptq_quantization_config["accuracy_criterion"] = AccuracyCriterion( - **ptq_quantization_config["accuracy_criterion"] - ) - ptq_quantization_config["tuning_criterion"] = TuningCriterion(**ptq_quantization_config["tuning_criterion"]) - ptq_quantization_config = PostTrainingQuantConfig(**ptq_quantization_config) - self.logger.info("\t+ Creating quantizer") - quantizer = INCQuantizer.from_pretrained( - model=self.pretrained_model, - task=self.config.task, - seed=self.config.seed, - # TODO: add support for these - calibration_fn=None, - eval_fn=None, - ) - - if self.config.calibration: - self.logger.info("\t+ Generating calibration dataset") - dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} - calibration_dataset = DatasetGenerator( - task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes - )() - columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns)) - calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) - else: - calibration_dataset = None - - self.logger.info("\t+ Quantizing model") - quantizer.quantize( - save_directory=self.quantized_model, - calibration_dataset=calibration_dataset, - quantization_config=ptq_quantization_config, - # TODO: add support for these - remove_unused_columns=True, - data_collator=None, - file_name=None, - batch_size=1, - ) - - @torch.inference_mode() - def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: - return self.pretrained_model(**input, **kwargs) - - @torch.inference_mode() - def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: - return self.pretrained_model.generate(**inputs, **kwargs) - - @torch.inference_mode() - def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: - return self.pretrained_model.generate(**input, **kwargs) diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py deleted file mode 100644 index 8aea5964..00000000 --- a/optimum_benchmark/backends/neural_compressor/config.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataclasses import dataclass, field -from typing import Any, Dict, Optional - -from omegaconf import OmegaConf - -from ...import_utils import neural_compressor_version -from ..config import BackendConfig - -# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490 -ACCURACY_CRITERION_CONFIG = {"higher_is_better": True, "criterion": "relative", "tolerable_loss": 0.01} - -# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593 -TUNING_CRITERION_CONFIG = { - "strategy": "basic", - "strategy_kwargs": None, - "timeout": 0, - "max_trials": 100, - "objective": "performance", -} - -# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L1242 -PTQ_QUANTIZATION_CONFIG = { - "device": "cpu", - "backend": "default", - "domain": "auto", - "recipes": {}, - 
"quant_format": "default", - "inputs": [], - "outputs": [], - "approach": "static", - "calibration_sampling_size": [100], - "op_type_dict": None, - "op_name_dict": None, - "reduce_range": None, - "example_inputs": None, - "excluded_precisions": [], - "quant_level": "auto", - "accuracy_criterion": ACCURACY_CRITERION_CONFIG, - "tuning_criterion": TUNING_CRITERION_CONFIG, -} - - -@dataclass -class INCConfig(BackendConfig): - name: str = "neural-compressor" - version: Optional[str] = neural_compressor_version() - _target_: str = "optimum_benchmark.backends.neural_compressor.backend.INCBackend" - - # load options - no_weights: bool = False - - # post-training quantization options - ptq_quantization: bool = False - ptq_quantization_config: Dict[str, Any] = field(default_factory=dict) - - # calibration options - calibration: bool = False - calibration_config: Dict[str, Any] = field(default_factory=dict) - - def __post_init__(self): - super().__post_init__() - - if self.device != "cpu": - raise ValueError(f"INCBackend only supports CPU devices, got {self.device}") - - if self.ptq_quantization: - self.ptq_quantization_config = OmegaConf.to_object( - OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config) - ) - if self.ptq_quantization_config["approach"] == "static" and not self.calibration: - raise ValueError("Calibration must be enabled when using static quantization.") diff --git a/optimum_benchmark/backends/neural_compressor/utils.py b/optimum_benchmark/backends/neural_compressor/utils.py deleted file mode 100644 index beb99977..00000000 --- a/optimum_benchmark/backends/neural_compressor/utils.py +++ /dev/null @@ -1,5 +0,0 @@ -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS - -TASKS_TO_INCMODELS = { - task: f"optimum.intel.neural_compressor.{incmodel_name}" for task, incmodel_name in _HEAD_TO_AUTOMODELS.items() -} diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index e42161e6..73b75b75 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -50,7 +50,7 @@ class PyTXIConfig(BackendConfig): quantize: Optional[str] = None num_shard: Optional[int] = None speculate: Optional[int] = None - cuda_graphs: Optional[bool] = None + cuda_graphs: Optional[int] = None disable_custom_kernels: Optional[bool] = None trust_remote_code: Optional[bool] = None diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index 4b26266b..5af0723b 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -10,12 +10,10 @@ Benchmark, BenchmarkConfig, EnergyStarConfig, - INCConfig, InferenceConfig, InlineConfig, IPEXConfig, LlamaCppConfig, - LLMSwarmConfig, ORTConfig, OVConfig, ProcessConfig, @@ -43,9 +41,7 @@ cs.store(group="backend", name=ORTConfig.name, node=ORTConfig) cs.store(group="backend", name=TorchORTConfig.name, node=TorchORTConfig) cs.store(group="backend", name=TRTLLMConfig.name, node=TRTLLMConfig) -cs.store(group="backend", name=INCConfig.name, node=INCConfig) cs.store(group="backend", name=PyTXIConfig.name, node=PyTXIConfig) -cs.store(group="backend", name=LLMSwarmConfig.name, node=LLMSwarmConfig) cs.store(group="backend", name=VLLMConfig.name, node=VLLMConfig) cs.store(group="backend", name=LlamaCppConfig.name, node=LlamaCppConfig) # scenarios configurations diff --git a/setup.py b/setup.py index 03bbdf07..46a1ed60 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ "py-txi": ["py-txi"], "vllm": ["vllm"], # optional dependencies + "torchao": 
["torchao"], "autoawq": ["autoawq"], "auto-gptq": ["optimum", "auto-gptq"], "sentence-transformers": ["sentence-transformers"], diff --git a/tests/test_energy_star.py b/tests/test_energy_star.py index bbb83f55..f2520932 100644 --- a/tests/test_energy_star.py +++ b/tests/test_energy_star.py @@ -9,12 +9,16 @@ LOGGER = getLogger("test-cli") -TEST_CONFIG_DIR = Path(__file__).parent.parent / "examples/energy_star" +TEST_CONFIG_DIR = Path(__file__).parent.parent / "energy_star" + TEST_CONFIG_NAMES = [ config.split(".")[0] for config in os.listdir(TEST_CONFIG_DIR) if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) ] +TEST_SCRIPT_PATHS = [ + str(TEST_CONFIG_DIR / filename) for filename in os.listdir(TEST_CONFIG_DIR) if filename.endswith(".py") +] ROCR_VISIBLE_DEVICES = os.environ.get("ROCR_VISIBLE_DEVICES", None) CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) @@ -42,3 +46,11 @@ def test_cli_configs(config_name): popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}" + + +@pytest.mark.parametrize("script_path", TEST_SCRIPT_PATHS) +def test_api_scripts(script_path): + args = ["python", script_path] + + popen = run_subprocess_and_log_stream_output(LOGGER, args) + assert popen.returncode == 0, f"Failed to run {script_path}" diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 00000000..13cf3cff --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,46 @@ +import os +from logging import getLogger +from pathlib import Path + +import pytest + +from optimum_benchmark.logging_utils import run_subprocess_and_log_stream_output + +LOGGER = getLogger("test-examples") + + +TEST_CONFIG_DIR = Path(__file__).parent.parent / "examples" + +TEST_CONFIG_NAMES = [ + config.split(".")[0] + for config in os.listdir(TEST_CONFIG_DIR) + if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) +] + +TEST_SCRIPT_PATHS = [ + str(TEST_CONFIG_DIR / filename) for filename in os.listdir(TEST_CONFIG_DIR) if filename.endswith(".py") +] + +ROCR_VISIBLE_DEVICES = os.environ.get("ROCR_VISIBLE_DEVICES", None) +CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + +@pytest.mark.parametrize("config_name", TEST_CONFIG_NAMES) +def test_cli_configs(config_name): + args = ["optimum-benchmark", "--config-dir", TEST_CONFIG_DIR, "--config-name", config_name] + + if ROCR_VISIBLE_DEVICES is not None: + args += [f'backend.device_ids="{ROCR_VISIBLE_DEVICES}"'] + elif CUDA_VISIBLE_DEVICES is not None: + args += [f'backend.device_ids="{CUDA_VISIBLE_DEVICES}"'] + + popen = run_subprocess_and_log_stream_output(LOGGER, args) + assert popen.returncode == 0, f"Failed to run {config_name}" + + +@pytest.mark.parametrize("script_path", TEST_SCRIPT_PATHS) +def test_api_scripts(script_path): + args = ["python", script_path] + + popen = run_subprocess_and_log_stream_output(LOGGER, args) + assert popen.returncode == 0, f"Failed to run {script_path}"