From 8029a711d880be393f91c43d85ee2ddab838a2d5 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 19 Feb 2024 08:25:33 +0100 Subject: [PATCH] [feature][refactor] Better Metrics and Trackers (#124) --- .github/workflows/test_api_cuda.yaml | 15 +- .github/workflows/test_api_rocm.yaml | 6 +- .../workflows/test_cli_cuda_onnxruntime.yaml | 10 +- .github/workflows/test_cli_cuda_pytorch.yaml | 15 +- .../workflows/test_cli_cuda_torch_ort.yaml | 10 +- .../workflows/test_cli_rocm_onnxruntime.yaml | 10 +- .github/workflows/test_cli_rocm_pytorch.yaml | 6 +- .github/workflows/test_cli_tensorrt_llm.yaml | 14 +- .../test_cli_tensorrt_onnxruntime.yaml | 9 +- .gitignore | 1 + Makefile | 61 ++- README.md | 115 +++--- docker/cpu.dockerfile | 1 - docker/cuda.dockerfile | 4 +- docker/rocm-ort.dockerfile | 5 +- docker/tensorrt.dockerfile | 2 +- examples/api_launch.py | 21 ++ examples/pytorch_bert.yaml | 6 +- optimum_benchmark/backends/base.py | 24 +- optimum_benchmark/backends/config.py | 54 +-- optimum_benchmark/backends/diffusers_utils.py | 2 +- .../backends/neural_compressor/backend.py | 32 +- .../backends/neural_compressor/config.py | 10 +- .../backends/onnxruntime/backend.py | 60 ++- .../backends/onnxruntime/config.py | 21 +- .../backends/onnxruntime/utils.py | 8 +- .../backends/openvino/backend.py | 34 +- optimum_benchmark/backends/openvino/config.py | 2 +- optimum_benchmark/backends/openvino/utils.py | 6 +- optimum_benchmark/backends/peft_utils.py | 11 +- optimum_benchmark/backends/pytorch/backend.py | 61 ++- optimum_benchmark/backends/pytorch/config.py | 15 +- .../backends/tensorrt_llm/backend.py | 10 +- .../backends/tensorrt_llm/config.py | 5 +- .../text_generation_inference/backend.py | 37 +- optimum_benchmark/backends/timm_utils.py | 2 +- .../backends/torch_ort/backend.py | 24 +- .../backends/torch_ort/config.py | 2 +- .../backends/transformers_utils.py | 20 +- optimum_benchmark/benchmarks/base.py | 2 +- optimum_benchmark/benchmarks/config.py | 5 +- .../benchmarks/inference/benchmark.py | 218 ++++++----- .../benchmarks/inference/callback.py | 25 -- .../benchmarks/inference/config.py | 19 +- .../benchmarks/inference/report.py | 353 ------------------ optimum_benchmark/benchmarks/report.py | 130 ++++++- .../benchmarks/training/benchmark.py | 90 +++-- .../benchmarks/training/callback.py | 43 --- .../benchmarks/training/config.py | 9 +- .../benchmarks/training/report.py | 169 --------- optimum_benchmark/benchmarks/utils.py | 1 - optimum_benchmark/cli.py | 47 +-- optimum_benchmark/env_utils.py | 175 --------- optimum_benchmark/experiment.py | 161 ++++---- .../generators/input_generator.py | 20 +- .../generators/task_generator.py | 90 +---- optimum_benchmark/import_utils.py | 57 ++- optimum_benchmark/launchers/base.py | 5 +- optimum_benchmark/launchers/config.py | 4 +- .../launchers/inline/launcher.py | 17 +- .../launchers/isolation_utils.py | 75 ++-- .../launchers/process/launcher.py | 65 ++-- .../launchers/torchrun/config.py | 2 +- .../launchers/torchrun/launcher.py | 85 ++--- optimum_benchmark/logging_utils.py | 27 +- optimum_benchmark/system_utils.py | 219 +++++++++++ optimum_benchmark/task_utils.py | 52 +-- optimum_benchmark/trackers/energy.py | 115 ++++-- optimum_benchmark/trackers/latency.py | 252 ++++++++++--- optimum_benchmark/trackers/memory.py | 338 +++++++++++------ pyproject.toml | 13 + setup.py | 27 +- tests/configs/_base_.yaml | 13 +- tests/configs/_bert_sweep_.yaml | 3 +- ..._lm_naive_mp_.yaml => _gpt_naive_mp_.yaml} | 6 +- 
.../{_lm_peft_.yaml => _gpt_peft_.yaml} | 0 .../{_lm_sweep_.yaml => _gpt_sweep_.yaml} | 1 - ...nference_neural_compressor_gpt_sweep.yaml} | 4 +- ... cpu_inference_onnxruntime_gpt_sweep.yaml} | 4 +- ... => cpu_inference_openvino_gpt_sweep.yaml} | 4 +- ...l => cpu_inference_pytorch_gpt_sweep.yaml} | 4 +- ...ml => cpu_training_pytorch_gpt_sweep.yaml} | 4 +- ...cuda_inference_onnxruntime_gpt_sweep.yaml} | 4 +- ... cuda_inference_pytorch_gpt_naive_mp.yaml} | 4 +- ... => cuda_inference_pytorch_gpt_sweep.yaml} | 4 +- ...> cuda_training_pytorch_gpt_naive_mp.yaml} | 4 +- ...ml => cuda_training_pytorch_gpt_peft.yaml} | 4 +- ...l => cuda_training_pytorch_gpt_sweep.yaml} | 4 +- ... => cuda_training_torch_ort_gpt_peft.yaml} | 4 +- ...=> cuda_training_torch_ort_gpt_sweep.yaml} | 4 +- ...rocm_inference_onnxruntime_gpt_sweep.yaml} | 4 +- tests/test_api.py | 138 +++---- tests/test_cli.py | 11 +- 93 files changed, 1863 insertions(+), 2026 deletions(-) create mode 100644 examples/api_launch.py delete mode 100644 optimum_benchmark/benchmarks/inference/callback.py delete mode 100644 optimum_benchmark/benchmarks/inference/report.py delete mode 100644 optimum_benchmark/benchmarks/training/callback.py delete mode 100644 optimum_benchmark/benchmarks/training/report.py delete mode 100644 optimum_benchmark/benchmarks/utils.py delete mode 100644 optimum_benchmark/env_utils.py create mode 100644 optimum_benchmark/system_utils.py rename tests/configs/{_lm_naive_mp_.yaml => _gpt_naive_mp_.yaml} (100%) rename tests/configs/{_lm_peft_.yaml => _gpt_peft_.yaml} (100%) rename tests/configs/{_lm_sweep_.yaml => _gpt_sweep_.yaml} (81%) rename tests/configs/{cpu_inference_neural_compressor_lm_sweep.yaml => cpu_inference_neural_compressor_gpt_sweep.yaml} (70%) rename tests/configs/{cpu_inference_onnxruntime_lm_sweep.yaml => cpu_inference_onnxruntime_gpt_sweep.yaml} (71%) rename tests/configs/{cpu_inference_openvino_lm_sweep.yaml => cpu_inference_openvino_gpt_sweep.yaml} (71%) rename tests/configs/{cpu_inference_pytorch_lm_sweep.yaml => cpu_inference_pytorch_gpt_sweep.yaml} (72%) rename tests/configs/{cpu_training_pytorch_lm_sweep.yaml => cpu_training_pytorch_gpt_sweep.yaml} (72%) rename tests/configs/{cuda_inference_onnxruntime_lm_sweep.yaml => cuda_inference_onnxruntime_gpt_sweep.yaml} (71%) rename tests/configs/{cuda_inference_pytorch_lm_naive_mp.yaml => cuda_inference_pytorch_gpt_naive_mp.yaml} (70%) rename tests/configs/{cuda_inference_pytorch_lm_sweep.yaml => cuda_inference_pytorch_gpt_sweep.yaml} (72%) rename tests/configs/{cuda_training_pytorch_lm_naive_mp.yaml => cuda_training_pytorch_gpt_naive_mp.yaml} (70%) rename tests/configs/{cuda_training_pytorch_lm_peft.yaml => cuda_training_pytorch_gpt_peft.yaml} (69%) rename tests/configs/{cuda_training_pytorch_lm_sweep.yaml => cuda_training_pytorch_gpt_sweep.yaml} (69%) rename tests/configs/{cuda_training_torch_ort_lm_peft.yaml => cuda_training_torch_ort_gpt_peft.yaml} (69%) rename tests/configs/{cuda_training_torch_ort_lm_sweep.yaml => cuda_training_torch_ort_gpt_sweep.yaml} (69%) rename tests/configs/{rocm_inference_onnxruntime_lm_sweep.yaml => rocm_inference_onnxruntime_gpt_sweep.yaml} (71%) diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml index fe08f29d..28d9b435 100644 --- a/.github/workflows/test_api_cuda.yaml +++ b/.github/workflows/test_api_cuda.yaml @@ -18,11 +18,11 @@ jobs: matrix: image: [ - { torch_cuda: cu121, cuda_version: 12.1.1 }, - { torch_cuda: cu118, cuda_version: 11.8.0 }, + { torch_cuda: cu118, torch_pre_release: 0, 
cuda_version: 11.8.0 }, + { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 }, ] - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -37,17 +37,20 @@ jobs: --tag opt-bench-cuda:${{ matrix.image.cuda_version }} . + - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} -c "pip install -e .[testing,timm,diffusers] && pytest -k 'api and cuda' -x" diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml index 31328eb3..7e2bf63a 100644 --- a/.github/workflows/test_api_rocm.yaml +++ b/.github/workflows/test_api_rocm.yaml @@ -19,10 +19,10 @@ jobs: image: [ { torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 }, - { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 }, + { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 }, ] - runs-on: hf-amd-mi210-dev + runs-on: amd-gpu steps: - name: Checkout code uses: actions/checkout@v3 @@ -41,11 +41,9 @@ jobs: - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_ROCM="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark --device /dev/kfd diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml index 0b03608e..adb31be3 100644 --- a/.github/workflows/test_cli_cuda_onnxruntime.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_cuda_onnxruntime_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -28,16 +28,20 @@ jobs: --tag opt-bench-cuda:11.8.0 . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" --entrypoint /bin/bash + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' opt-bench-cuda:11.8.0 -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x" diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 1b3fd99f..204722db 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -18,11 +18,11 @@ jobs: matrix: image: [ - { torch_cuda: cu121, cuda_version: 12.1.1 }, - { torch_cuda: cu118, cuda_version: 11.8.0 }, + { torch_cuda: cu118, torch_pre_release: 0, cuda_version: 11.8.0 }, + { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 }, ] - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -37,17 +37,20 @@ jobs: --tag opt-bench-cuda:${{ matrix.image.cuda_version }} . + - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x" diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml index 71bfd33e..680f3f0f 100644 --- a/.github/workflows/test_cli_cuda_torch_ort.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_cuda_torch_ort_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -28,16 +28,20 @@ jobs: --tag opt-bench-cuda:11.8.0 . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" --entrypoint /bin/bash + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' opt-bench-cuda:11.8.0 -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x" diff --git a/.github/workflows/test_cli_rocm_onnxruntime.yaml b/.github/workflows/test_cli_rocm_onnxruntime.yaml index fcd0f53d..8be58292 100644 --- a/.github/workflows/test_cli_rocm_onnxruntime.yaml +++ b/.github/workflows/test_cli_rocm_onnxruntime.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_rocm_onnxruntime_tests: - runs-on: hf-amd-mi210-dev + runs-on: amd-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -21,7 +21,7 @@ jobs: - name: Check if image exists id: check_image run: | - if [[ "$(docker images -q opt-bench-rocm-ort:5.7 2> /dev/null)" == "" ]]; then + if [[ "$(docker images -q opt-bench-rocm-ort:latest 2> /dev/null)" == "" ]]; then echo "::set-output name=exists::false" else echo "::set-output name=exists::true" @@ -33,14 +33,12 @@ jobs: --file docker/rocm-ort.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) - --build-arg ROCM_VERSION=5.7 - --tag opt-bench-rocm-ort:5.7 + --tag opt-bench-rocm-ort:latest . - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_ROCM="1" @@ -50,5 +48,5 @@ jobs: --device /dev/dri/renderD128 --device /dev/dri/renderD129 --entrypoint /bin/bash - opt-bench-rocm-ort:5.7 + opt-bench-rocm-ort:latest -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x" diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index 11c9e77a..c4ae7139 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -19,10 +19,10 @@ jobs: image: [ { torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 }, - { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 }, + { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 }, ] - runs-on: hf-amd-mi210-dev + runs-on: [amd-gpu] steps: - name: Checkout code uses: actions/checkout@v3 @@ -41,11 +41,9 @@ jobs: - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_ROCM="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark --device /dev/kfd diff --git a/.github/workflows/test_cli_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml index 0169fca5..40438055 100644 --- a/.github/workflows/test_cli_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_tensorrt_llm.yaml @@ -13,7 +13,7 @@ concurrency: jobs: pull_image_and_run_cli_tensorrt_llm_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -26,18 +26,20 @@ jobs: --tag opt-bench-tensorrt-llm:latest . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" - --env USER_ID=$(id -u) - --env GROUP_ID=$(id -g) + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-tensorrt-llm:latest - -c "pip install -e .[testing] && pytest -k 'cli and tensorrt_llm' -x" + -c "pip install -e .[testing] && pip uninstall -y nvidia-ml-py && pytest -k 'cli and tensorrt_llm' -x" diff --git a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml index 92f425e7..a98bfc15 100644 --- a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml +++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_tensorrt_onnxruntime_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -23,15 +23,12 @@ jobs: --file docker/tensorrt.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) - --build-arg TENSORRT_VERSION=22.12 - --build-arg TORCH_CUDA=cu118 - --tag opt-bench-tensorrt:22.12 + --tag opt-bench-tensorrt:latest . - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" @@ -39,5 +36,5 @@ jobs: --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' --entrypoint /bin/bash - opt-bench-tensorrt:22.12 + opt-bench-tensorrt:latest -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x" diff --git a/.gitignore b/.gitignore index 12c19326..a8e86c83 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,4 @@ actions-runner/ experiments/ examples/ .engine/ +amdsmi \ No newline at end of file diff --git a/Makefile b/Makefile index 55e44e1e..0253c183 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,9 @@ # List of targets that are not associated with files -.PHONY: quality style install install_dev_cpu install_dev_gpu +.PHONY: quality style install \ + build_docker_cpu, build_docker_cuda, build_docker_rocm, \ + test_cli_cpu_pytorch, test_cli_rocm_pytorch, \ + test_cli_cpu_neural_compressor, test_cli_cpu_onnxruntime, test_cli_cpu_openvino, \ + test_api_cpu, test_api_cuda, test_api_rocm, test_api_misc quality: ruff check . @@ -13,13 +17,13 @@ install: pip install -e . build_docker_cpu: - docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t opt-bench-cpu:latest . + docker build -f docker/cpu.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-cpu:latest . build_docker_cuda: - docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_CUDA=cu118 --build-arg CUDA_VERSION=11.8.0 -t opt-bench-cuda:11.8.0 . + docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-cuda:latest . build_docker_rocm: - docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_ROCM=rocm5.6 --build-arg ROCM_VERSION=5.6.1 -t opt-bench-rocm:5.6.1 . 
+ docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-rocm:latest . test_cli_cpu_neural_compressor: docker run \ @@ -27,23 +31,23 @@ test_cli_cpu_neural_compressor: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor,diffusers,timm] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" -test_cli_cpu_openvino: +test_cli_cpu_onnxruntime: docker run \ --rm \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" -test_cli_cpu_onnxruntime: +test_cli_cpu_openvino: docker run \ --rm \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers,timm] && pytest tests/ -k 'cli and cpu and openvino' -x" test_cli_cpu_pytorch: docker run \ @@ -53,13 +57,34 @@ test_cli_cpu_pytorch: --workdir /workspace \ opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x" +test_cli_rocm_pytorch: + docker run \ + --rm \ + --device=/dev/kfd \ + --device /dev/dri/renderD128 \ + --device /dev/dri/renderD129 \ + --group-add video \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-rocm:latest -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest tests/ -k 'cli and cuda and pytorch' -x" + +test_cli_cuda_pytorch: + docker run \ + --rm \ + --gpus '"device=0,1"' \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cuda:latest -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest tests/ -k 'cli and cuda and pytorch' -x" + test_api_cpu: docker run \ --rm \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cpu' -x" test_api_cuda: docker run \ @@ -68,7 +93,19 @@ test_api_cuda: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x" + opt-bench-cuda:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x" + +test_api_rocm: + docker run \ + --rm \ + --device=/dev/kfd \ + --device /dev/dri/renderD128 \ + --device /dev/dri/renderD129 \ + --group-add video \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-rocm:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x" test_api_misc: docker run \ @@ -76,4 +113,4 @@ test_api_misc: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] 
&& pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" diff --git a/README.md b/README.md index e338b888..49889327 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,42 @@ -

-[Optimum-Benchmark Logo]
+[Optimum-Benchmark Logo]

All benchmarks are wrong, some will cost you less than the others.

Optimum-Benchmark 🏋️

-Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-).
+Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-), in the most correct and scalable way possible (no need to even download model weights).
-## Motivation 🤔
+*News* 📰
+- PYPI release soon.
+- Added a simple Python API to run benchmarks with all isolation and tracking features supported by the CLI.
+*Motivations* 🤔
- HF hardware partners wanting to know how their hardware performs compared to another hardware on the same models.
- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc compared to another model.
- Experimenting with hardware & backend specific optimizations & quantization schemes that can be applied to models and improve their computational/memory/energy efficiency.
+*Notes* 📝
+- If you were using `optimum-benchmark` before and want to keep using the old CLI only version, you can still do so by installing from this branch [`0.0.1`](https://github.com/huggingface/optimum-benchmark/tree/0.0.1).
+ ## Current status ๐Ÿ“ˆ ### API - [![CPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml) [![CUDA](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml) [![ROCM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml) +[![MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml) ### CLI - -[![CPU Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml) -[![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml) -[![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml) -[![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml) -[![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml) -[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml) -[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml) -[![TensorRT OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml) -[![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml) -[![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml) -[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml) +[![CPU Pytorch 
Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml)
+[![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
+[![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
+[![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
+[![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch.yaml)
+[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml)
+[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml)
+[![TensorRT OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_onnxruntime.yaml)
+[![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_llm.yaml)
+[![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml)
+[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_onnxruntime.yaml)
+[![MISC Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml)
## Quickstart 🚀
@@ -64,46 +69,36 @@ Depending on the backends you want to use, you might need to install some extra
### Running benchmarks from Python API 🧪
-You can run benchmarks from the Python API, using the `launch` function from the `optimum_benchmark.experiment` module. Here's an example of how to run a benchmark using the `pytorch` backend, `process` launcher and `inference` benchmark.
+You can run benchmarks from the Python API, using the `launch` function. Here's an example of how to run a benchmark using the `pytorch` backend, `torchrun` launcher and `inference` benchmark.
```python from optimum_benchmark.logging_utils import setup_logging from optimum_benchmark.experiment import launch, ExperimentConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig -from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.launchers.torchrun.config import TorchrunConfig from optimum_benchmark.benchmarks.inference.config import InferenceConfig - if __name__ == "__main__": setup_logging(level="INFO") - benchmark_config = InferenceConfig(latency=False, memory=True, energy=True) - launcher_config = ProcessConfig() - backend_config = PyTorchConfig( - device="cuda", - no_weights=True, - device_ids="0,1", - device_map="auto", - model="IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm", - ) + launcher_config = TorchrunConfig(nproc_per_node=2) + benchmark_config = InferenceConfig(latency=True, memory=True) + backend_config = PyTorchConfig(model="gpt2", device="cuda", device_ids="0,1", no_weights=True) experiment_config = ExperimentConfig( - experiment_name="python-api-launch-experiment", + experiment_name="api-launch", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config, ) benchmark_report = launch(experiment_config) - benchmark_report.log_all() - # or - print(benchmark_report.to_dict()) - # or - benchmark_report.push_to_hub("IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm") + experiment_config.push_to_hub("IlyasMoutawwakil/benchmarks") # pushes experiment_config.json to the hub + benchmark_report.push_to_hub("IlyasMoutawwakil/benchmarks") # pushes benchmark_report.json to the hub ``` -Yep, it's that simple! Check the supported backends, launchers and benchmarks in the [features](#features-) section. +Yep, it's that simple! Check the supported backends, launchers and benchmarks matrix in the [features](#features-) section. ### Running benchmarks from CLI ๐Ÿƒโ€โ™‚๏ธ -You can run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension. +You can also run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension. ```bash optimum-benchmark --config-dir examples/ --config-name pytorch_bert @@ -111,11 +106,11 @@ optimum-benchmark --config-dir examples/ --config-name pytorch_bert This will run the benchmark using the configuration in [`examples/pytorch_bert.yaml`](examples/pytorch_bert.yaml) and store the results in `runs/pytorch_bert`. -The result files are `benchmark_report.json`, the program's logs `experiment.log` and the configuration that's been used `experiment_config.yaml`, including backend, launcher, benchmark and environment configurations. +The result files are `benchmark_report.json`, the program's logs `cli.log` and the configuration that's been used `experiment_config.json`, including backend, launcher, benchmark and environment configurations. The directory for storing these results can be changed by setting `hydra.run.dir` (and/or `hydra.sweep.dir` in case of a multirun) in the command line or in the config file. 
-### Configuration overrides 🎛️
+#### Configuration overrides 🎛️
It's easy to override the default behavior of a benchmark from the command line.
@@ -123,40 +118,17 @@ It's easy to override the default behavior of a benchmark from the command line.
optimum-benchmark --config-dir examples/ --config-name pytorch_bert backend.model=gpt2 backend.device=cuda
```
-### Configuration multirun sweeps 🧹
+#### Configuration multirun sweeps 🧹
You can easily run configuration sweeps using the `-m` or `--multirun` option. By default, configurations will be executed serially but other kinds of executions are supported with hydra's launcher plugins : `=submitit`, `hydra/launcher=rays`, etc.
-Note that the hydra launcher `hydra/launcher` is different than our own `launcher`, specifically `hydra/launcher` can only be used in `--multirun` mode, and will only handle the inter-run behavior.
```bash
optimum-benchmark --config-dir examples --config-name pytorch_bert -m backend.device=cpu,cuda
```
-Also, for integer parameters like `batch_size`, one can specify a range of values to sweep over:
-
-```bash
-optimum-benchmark --config-dir examples --config-name pytorch_bert -m device=cpu,cuda benchmark.input_shapes.batch_size='range(1,10,step=2)'
-```
-
### Configurations structure 📁
-You can create custom configuration files following the [examples here](examples).
-You can also use `hydra`'s [composition](https://hydra.cc/docs/0.11/tutorial/composition/) with a base configuration ([`examples/pytorch_bert.yaml`](examples/pytorch_bert.yaml) for example) and override/define parameters.
-
-To create a configuration that uses a `wav2vec2` model and `onnxruntime` backend, it's as easy as:
-
-```yaml
-defaults:
-  - pytorch_bert
-  - _self_
-  - override backend: onnxruntime
-
-experiment_name: onnxruntime_wav2vec2
-model: bookbot/distil-wav2vec2-adult-child-cls-37m
-device: cpu
-```
-
-Other than the [examples](examples), you can also check [tests](tests/configs/).
+You can create custom configuration files following the [examples here]([examples](https://github.com/IlyasMoutawwakil/optimum-benchmark-examples)).
## Features 🎨
@@ -171,9 +143,9 @@ Everything else is optional or inferred at runtime, but can be configured to you
### Launchers 🚀
+- [x] Distributed inference/training (`launcher=torchrun`)
- [x] Process isolation between consecutive runs (`launcher=process`)
- [x] Assert GPU devices (NVIDIA & AMD) isolation (`launcher.device_isolation=true`)
-- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`)
### Backends & Devices 📱
@@ -191,19 +163,18 @@ Everything else is optional or inferred at runtime, but can be configured to you
### Benchmarking 🏋️
- [x] Memory tracking (`benchmark.memory=true`)
-- [x] Latency and throughput tracking of forward pass (default)
+- [x] Energy and efficiency tracking (`benchmark.energy=true`)
+- [x] Latency and throughput tracking (`benchmark.latency=true`)
- [x] Warm up runs before inference (`benchmark.warmup_runs=20`)
- [x] Warm up steps during training (`benchmark.warmup_steps=20`)
-- [x] Energy and carbon emissions tracking (`benchmark.energy=true`)
- [x] Inputs shapes control (e.g. `benchmark.input_shapes.sequence_length=128`)
- [x] Dataset shapes control (e.g.
`benchmark.dataset_shapes.dataset_size=1000`) -- [x] Latancy and throughput tracking of generation pass (auto-enabled for generative models) -- [x] Prefill latency and Decoding throughput deduced from generation and forward pass (auto-enabled for generative models) -- [x] Forward and Generation pass control (e.g. for an LLM `benchmark.generate_kwargs.max_new_tokens=100`, for a diffusion model `benchmark.forward_kwargs.num_images_per_prompt=4`) +- [x] Prefill latency and Decoding throughput deduced from Generate and Forward pass (auto-enabled for text generation models) +- [x] Forward, Call and Generate pass kwargs control (e.g. for an LLM `benchmark.generate_kwargs.max_new_tokens=100`, for a diffusion model `benchmark.call_kwargs.num_images_per_prompt=4`) ### Backend features ๐Ÿงฐ -- [x] Random weights initialization (`backend.no_weights=true` for fast model instantiation without downloading weights) +- [x] "No weights" to benchmark models without downloading their weights (`backend.no_weights=true`) - [x] Onnxruntime Quantization and AutoQuantization (`backend.quantization=true` or `backend.auto_quantization=avx2`, etc) - [x] Onnxruntime Calibration for Static Quantization (`backend.quantization_config.is_static=true`, etc) - [x] Onnxruntime Optimization and AutoOptimization (`backend.optimization=true` or `backend.auto_optimization=O4`, etc) diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile index 371a89c8..f15db72f 100644 --- a/docker/cpu.dockerfile +++ b/docker/cpu.dockerfile @@ -1,6 +1,5 @@ FROM ubuntu:latest - # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive diff --git a/docker/cuda.dockerfile b/docker/cuda.dockerfile index a2270ffa..664895d1 100644 --- a/docker/cuda.dockerfile +++ b/docker/cuda.dockerfile @@ -13,12 +13,12 @@ # limitations under the License. ARG CUDNN_VERSION=8 -ARG CUDA_VERSION=12.1.1 +ARG CUDA_VERSION=11.8.0 ARG UBUNTU_VERSION=22.04 FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION} -ARG TORCH_CUDA=cu121 +ARG TORCH_CUDA=cu118 ARG TORCH_PRE_RELEASE=0 # Ignore interactive questions during `docker build` diff --git a/docker/rocm-ort.dockerfile b/docker/rocm-ort.dockerfile index 1dafd137..5309962f 100644 --- a/docker/rocm-ort.dockerfile +++ b/docker/rocm-ort.dockerfile @@ -13,10 +13,11 @@ # limitations under the License. 
ARG ROCM_VERSION=5.7 -ARG UBUNTU_VERSION=22.04 ARG PYTHON_VERSION=3.10 +ARG UBUNTU_VERSION=22.04 +ARG PYTORCH_VERSION=2.0.1 -FROM rocm/pytorch:rocm${ROCM_VERSION}_ubuntu${UBUNTU_VERSION}_py${PYTHON_VERSION}_pytorch_2.0.1 +FROM rocm/pytorch:rocm${ROCM_VERSION}_ubuntu${UBUNTU_VERSION}_py${PYTHON_VERSION}_pytorch_${PYTORCH_VERSION} # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive diff --git a/docker/tensorrt.dockerfile b/docker/tensorrt.dockerfile index 1e2b8603..35c84a63 100644 --- a/docker/tensorrt.dockerfile +++ b/docker/tensorrt.dockerfile @@ -16,7 +16,7 @@ ARG TENSORRT_VERSION=23.09 FROM nvcr.io/nvidia/tensorrt:${TENSORRT_VERSION}-py3 -ARG TORCH_CUDA=cu121 +ARG TORCH_CUDA=cu118 # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive diff --git a/examples/api_launch.py b/examples/api_launch.py new file mode 100644 index 00000000..987ec8c9 --- /dev/null +++ b/examples/api_launch.py @@ -0,0 +1,21 @@ +from optimum_benchmark.backends.pytorch.config import PyTorchConfig +from optimum_benchmark.benchmarks.inference.config import InferenceConfig +from optimum_benchmark.experiment import ExperimentConfig, launch +from optimum_benchmark.launchers.torchrun.config import TorchrunConfig +from optimum_benchmark.logging_utils import setup_logging + + +if __name__ == "__main__": + setup_logging(level="INFO") + launcher_config = TorchrunConfig(nproc_per_node=2) + benchmark_config = InferenceConfig(latency=True, memory=True) + backend_config = PyTorchConfig(model="gpt2", device="cuda", device_ids="0,1", no_weights=True) + experiment_config = ExperimentConfig( + experiment_name="api-launch", + benchmark=benchmark_config, + launcher=launcher_config, + backend=backend_config, + ) + benchmark_report = launch(experiment_config) + experiment_config.push_to_hub("IlyasMoutawwakil/benchmarks") + benchmark_report.push_to_hub("IlyasMoutawwakil/benchmarks") diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml index 5a36147c..e3b08e87 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/pytorch_bert.yaml @@ -9,8 +9,12 @@ defaults: experiment_name: pytorch_bert +benchmark: + latency: true + memory: true + backend: - device: cpu + device: cuda device_ids: 0 model: bert-base-uncased diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index cf0f5087..2be47a11 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,26 +1,25 @@ import gc import random from abc import ABC -from logging import getLogger from collections import OrderedDict -from typing import Optional, ClassVar, Generic, Dict, Any +from logging import getLogger +from typing import Any, ClassVar, Dict, Generic, Optional -from .config import BackendConfigT -from ..task_utils import get_automodel_class_for_task +import numpy as np +from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState +from ..task_utils import get_automodel_class_for_task +from .config import BackendConfigT from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config -from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config, get_timm_pre_processor +from .timm_utils import extract_timm_shapes_from_config, get_timm_pre_processor, get_timm_pretrained_config from .transformers_utils import ( + PretrainedProcessor, extract_transformers_shapes_from_artifacts, get_transformers_generation_config, - get_transformers_pretrained_config, 
get_transformers_pre_processor, - PretrainedProcessor, + get_transformers_pretrained_config, ) -import numpy as np -from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState - LOGGER = getLogger("backend") @@ -62,10 +61,7 @@ def __init__(self, config: BackendConfigT): self.model_type = self.pretrained_config.model_type self.automodel_class = get_automodel_class_for_task( - model_type=self.model_type, - library=self.config.library, - task=self.config.task, - framework="pt", + model_type=self.model_type, library=self.config.library, task=self.config.task, framework="pt" ) def seed(self) -> None: diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index a4919c15..e8c9c231 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -1,24 +1,17 @@ import os from abc import ABC -from logging import getLogger from dataclasses import dataclass, field -from typing import Optional, TypeVar, Dict, Any +from logging import getLogger +from typing import Any, Dict, Optional, TypeVar -from ..import_utils import is_psutil_available -from ..env_utils import get_cuda_device_ids, is_nvidia_system, is_rocm_system -from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path +from psutil import cpu_count -if is_psutil_available(): - from psutil import cpu_count +from ..system_utils import get_gpu_device_ids, is_nvidia_system, is_rocm_system +from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path LOGGER = getLogger("backend") -HUB_KWARGS = { - "revision": "main", - "force_download": False, - "local_files_only": False, - "trust_remote_code": False, -} +HUB_KWARGS = {"revision": "main", "force_download": False, "local_files_only": False, "trust_remote_code": False} @dataclass @@ -31,10 +24,10 @@ class BackendConfig(ABC): model: Optional[str] = None device: Optional[str] = None - # yes we use a string here instead of a list - # it's easier to pass in a yaml or from cli - # also it's consistent with CUDA_VISIBLE_DEVICES device_ids: Optional[str] = None + # yes we use a string here instead of a list + # because it's easier to pass in a yaml or from cli + # and it's consistent with GPU environment variables task: Optional[str] = None library: Optional[str] = None @@ -48,36 +41,49 @@ def __post_init__(self): if self.model is None: raise ValueError("`model` must be specified.") + if self.task is None: + self.task = infer_task_from_model_name_or_path(self.model) + if self.device is None: self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" + LOGGER.warning(f"`device` is not specified, defaulting to {self.device} based on system configuration.") + + if self.device not in ["cuda", "cpu", "mps", "xla"]: + raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") if ":" in self.device: - # using device index + # support pytorch device index notation self.device = self.device.split(":")[0] self.device_ids = self.device.split(":")[1] if self.device == "cuda": if self.device_ids is None: - self.device_ids = get_cuda_device_ids() + self.device_ids = get_gpu_device_ids() os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids - # TODO: add rocm specific environment variables ? 
- if self.device not in ["cuda", "cpu", "mps", "xla"]: - raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") - - if self.task is None: - self.task = infer_task_from_model_name_or_path(self.model) + if is_rocm_system(): + # https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html + os.environ["GPU_DEVICE_ORDINAL"] = self.device_ids + os.environ["HIP_VISIBLE_DEVICES"] = self.device_ids + os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids if self.library is None: self.library = infer_library_from_model_name_or_path(self.model) + if self.library not in ["transformers", "diffusers", "timm"]: + raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}") + if self.inter_op_num_threads is not None: + if not isinstance(self.inter_op_num_threads, int): + raise ValueError(f"`inter_op_num_threads` must be an integer, but got {self.inter_op_num_threads}") if self.inter_op_num_threads == -1: self.inter_op_num_threads = cpu_count() if self.intra_op_num_threads is not None: + if not isinstance(self.intra_op_num_threads, int): + raise ValueError(f"`intra_op_num_threads` must be an integer, but got {self.intra_op_num_threads}") if self.intra_op_num_threads == -1: self.intra_op_num_threads = cpu_count() diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index 705436d3..5b0f56ce 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -5,7 +5,7 @@ from ..import_utils import is_diffusers_available if is_diffusers_available(): - import diffusers + import diffusers # type: ignore def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index dd2a7a82..cb70fdfc 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -1,22 +1,22 @@ -import os import gc -from typing import Any, Dict +import os from logging import getLogger from tempfile import TemporaryDirectory - -from ...generators.dataset_generator import DatasetGenerator -from ..transformers_utils import randomize_weights -from .utils import TASKS_TO_INCMODELS -from .config import INCConfig -from ..base import Backend +from typing import Any, Dict import torch from hydra.utils import get_class -from transformers.utils import ModelOutput +from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion +from optimum.intel.neural_compressor.quantization import INCQuantizer from transformers.modeling_utils import no_init_weights +from transformers.utils import ModelOutput from transformers.utils.logging import set_verbosity_error -from optimum.intel.neural_compressor.quantization import INCQuantizer -from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion, TuningCriterion + +from ...generators.dataset_generator import DatasetGenerator +from ..base import Backend +from ..transformers_utils import randomize_weights +from .config import INCConfig +from .utils import TASKS_TO_INCMODELS # disable transformers logging set_verbosity_error() @@ -128,15 +128,9 @@ def quantize_automodel(self) -> None: if self.config.calibration: LOGGER.info("\t+ Generating calibration dataset") - dataset_shapes = { - "dataset_size": 1, - "sequence_length": 1, - **self.model_shapes, - } + 
dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} calibration_dataset = DatasetGenerator( - task=self.config.task, - dataset_shapes=dataset_shapes, - model_shapes=self.model_shapes, + task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py index 22becfe6..09623e47 100644 --- a/optimum_benchmark/backends/neural_compressor/config.py +++ b/optimum_benchmark/backends/neural_compressor/config.py @@ -1,17 +1,13 @@ -from typing import Any, Dict, Optional from dataclasses import dataclass, field +from typing import Any, Dict, Optional from omegaconf import OmegaConf -from ..config import BackendConfig from ...import_utils import neural_compressor_version +from ..config import BackendConfig # https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490 -ACCURACY_CRITERION_CONFIG = { - "higher_is_better": True, - "criterion": "relative", - "tolerable_loss": 0.01, -} +ACCURACY_CRITERION_CONFIG = {"higher_is_better": True, "criterion": "relative", "tolerable_loss": 0.01} # https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593 TUNING_CRITERION_CONFIG = { diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 07d5d860..0d2fc857 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -1,40 +1,40 @@ import gc import os -from logging import getLogger from collections import OrderedDict +from logging import getLogger from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -from ..base import Backend -from .config import ORTConfig -from ...task_utils import TEXT_GENERATION_TASKS -from ...generators.dataset_generator import DatasetGenerator -from .utils import format_calibration_config, format_quantization_config, TASKS_TO_ORTMODELS, TASKS_TO_ORTSD - import torch from datasets import Dataset from hydra.utils import get_class from onnxruntime import SessionOptions -from safetensors.torch import save_file -from transformers import TrainerCallback -from transformers.modeling_utils import no_init_weights -from transformers.utils.logging import set_verbosity_error -from optimum.onnxruntime.configuration import ( - AutoOptimizationConfig, - AutoQuantizationConfig, - AutoCalibrationConfig, - OptimizationConfig, - QuantizationConfig, - CalibrationConfig, -) from optimum.onnxruntime import ( - ONNX_DECODER_WITH_PAST_NAME, ONNX_DECODER_NAME, - ORTTrainingArguments, + ONNX_DECODER_WITH_PAST_NAME, ORTOptimizer, ORTQuantizer, ORTTrainer, + ORTTrainingArguments, ) +from optimum.onnxruntime.configuration import ( + AutoCalibrationConfig, + AutoOptimizationConfig, + AutoQuantizationConfig, + CalibrationConfig, + OptimizationConfig, + QuantizationConfig, +) +from safetensors.torch import save_file +from transformers import TrainerCallback +from transformers.modeling_utils import no_init_weights +from transformers.utils.logging import set_verbosity_error + +from ...generators.dataset_generator import DatasetGenerator +from ...task_utils import TEXT_GENERATION_TASKS +from ..base import Backend +from .config import ORTConfig +from .utils import 
TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_calibration_config, format_quantization_config # disable transformers logging set_verbosity_error() @@ -199,8 +199,7 @@ def optimize_onnx_files(self) -> None: ) elif self.config.optimization: optimization_config = OptimizationConfig( - optimize_for_gpu=(self.config.device == "cuda"), - **self.config.optimization_config, + optimize_for_gpu=(self.config.device == "cuda"), **self.config.optimization_config ) LOGGER.info("\t+ Creating optimizer") optimizer = ORTOptimizer.from_pretrained(self.config.model, file_names=self.onnx_files_names) @@ -243,15 +242,9 @@ def quantize_onnx_files(self) -> None: if self.is_calibrated: LOGGER.info("\t+ Generating calibration dataset") - dataset_shapes = { - "dataset_size": 1, - "sequence_length": 1, - **self.model_shapes, - } + dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} calibration_dataset = DatasetGenerator( - task=self.config.task, - dataset_shapes=dataset_shapes, - model_shapes=self.model_shapes, + task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) @@ -260,10 +253,7 @@ def quantize_onnx_files(self) -> None: if self.config.auto_calibration is not None: LOGGER.info("\t+ Processing calibration config") auto_calibration_method = getattr(AutoCalibrationConfig, self.config.auto_calibration) - calibration_config = auto_calibration_method( - calibration_dataset, - **self.config.auto_calibration_config, - ) + calibration_config = auto_calibration_method(calibration_dataset, **self.config.auto_calibration_config) elif self.config.calibration: LOGGER.info("\t+ Processing calibration config") calibration_config = format_calibration_config(self.config.calibration_config) diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index e0191b88..19ad747d 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -1,9 +1,9 @@ import os -from typing import Any, Dict, Optional from dataclasses import dataclass, field +from typing import Any, Dict, Optional -from ..config import BackendConfig from ...import_utils import onnxruntime_version +from ..config import BackendConfig from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES QUANTIZATION_CONFIG = { @@ -18,14 +18,11 @@ } AUTO_QUANTIZATION_CONFIG = { - "is_static": False, + "is_static": False # is_static is mandatory } -TRT_PROVIDER_OPTIONS = { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "/tmp/trt_cache", -} +TRT_PROVIDER_OPTIONS = {"trt_engine_cache_enable": True, "trt_engine_cache_path": "/tmp/trt_cache"} IO_BINDING_LIBRARIES = ["transformers", "timm"] IO_BINDING_PROVIDERS = ["CPUExecutionProvider", "CUDAExecutionProvider"] @@ -103,10 +100,7 @@ def __post_init__(self): os.makedirs(self.provider_options["trt_engine_cache_path"], exist_ok=True) if self.quantization: - self.quantization_config = { - **QUANTIZATION_CONFIG, - **self.quantization_config, - } + self.quantization_config = {**QUANTIZATION_CONFIG, **self.quantization_config} # raise ValueError if the quantization is static but calibration is not enabled if self.quantization_config["is_static"] and self.auto_calibration is None and not self.calibration: raise ValueError( @@ -115,10 +109,7 @@ def __post_init__(self): ) if self.auto_quantization is 
not None: - self.auto_quantization_config = { - **AUTO_QUANTIZATION_CONFIG, - **self.auto_quantization_config, - } + self.auto_quantization_config = {**AUTO_QUANTIZATION_CONFIG, **self.auto_quantization_config} if self.auto_quantization_config["is_static"] and self.auto_calibration is None and not self.calibration: raise ValueError( "Quantization is static but calibration is not enabled. " diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py index 759962f1..86eeeed9 100644 --- a/optimum_benchmark/backends/onnxruntime/utils.py +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -1,13 +1,7 @@ from typing import Any, Dict +from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantizationMode, QuantType from optimum.pipelines import ORT_SUPPORTED_TASKS -from onnxruntime.quantization import ( - CalibrationMethod, - QuantizationMode, - QuantFormat, - QuantType, -) - TASKS_TO_ORTSD = { "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index 73cbd63d..e883c3ac 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -1,26 +1,26 @@ import gc -import os import inspect -from typing import Any, Dict -from logging import getLogger +import os from collections import OrderedDict +from logging import getLogger from tempfile import TemporaryDirectory - -from ..base import Backend -from .config import OVConfig -from .utils import TASKS_TO_OVMODEL -from ...task_utils import TEXT_GENERATION_TASKS -from ..transformers_utils import randomize_weights -from ...generators.dataset_generator import DatasetGenerator +from typing import Any, Dict import torch from hydra.utils import get_class from openvino.runtime import properties -from safetensors.torch import save_file +from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict from optimum.intel.openvino import OVQuantizer +from safetensors.torch import save_file from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error -from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict + +from ...generators.dataset_generator import DatasetGenerator +from ...task_utils import TEXT_GENERATION_TASKS +from ..base import Backend +from ..transformers_utils import randomize_weights +from .config import OVConfig +from .utils import TASKS_TO_OVMODEL # disable transformers logging set_verbosity_error() @@ -143,15 +143,9 @@ def quantize_automodel(self) -> None: if self.config.calibration: LOGGER.info("\t+ Generating calibration dataset") - dataset_shapes = { - "dataset_size": 1, - "sequence_length": 1, - **self.model_shapes, - } + dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} calibration_dataset = DatasetGenerator( - task=self.config.task, - dataset_shapes=dataset_shapes, - model_shapes=self.model_shapes, + task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py index 6f4ba460..6b6797eb 100644 --- a/optimum_benchmark/backends/openvino/config.py +++ 
b/optimum_benchmark/backends/openvino/config.py @@ -1,8 +1,8 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional -from ..config import BackendConfig from ...import_utils import openvino_version +from ..config import BackendConfig @dataclass diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py index 8a39824d..b1005f38 100644 --- a/optimum_benchmark/backends/openvino/utils.py +++ b/optimum_benchmark/backends/openvino/utils.py @@ -1,8 +1,4 @@ from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS TASKS_TO_OVMODEL = {task: f"optimum.intel.openvino.{ovmodel}" for task, ovmodel in _HEAD_TO_AUTOMODELS.items()} -TASKS_TO_OVMODEL.update( - { - "feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction", - } -) +TASKS_TO_OVMODEL.update({"feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction"}) diff --git a/optimum_benchmark/backends/peft_utils.py b/optimum_benchmark/backends/peft_utils.py index 1a367120..8ec7d1fa 100644 --- a/optimum_benchmark/backends/peft_utils.py +++ b/optimum_benchmark/backends/peft_utils.py @@ -4,23 +4,16 @@ if is_peft_available(): from peft import ( + AdaLoraConfig, IA3Config, LoraConfig, PeftConfig, - AdaLoraConfig, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, ) -PEFT_TASKS_TYPES = [ - "SEQ_CLS", - "SEQ_2_SEQ_LM", - "CAUSAL_LM", - "TOKEN_CLS", - "QUESTION_ANS", - "FEATURE_EXTRACTION", -] +PEFT_TASKS_TYPES = ["SEQ_CLS", "SEQ_2_SEQ_LM", "CAUSAL_LM", "TOKEN_CLS", "QUESTION_ANS", "FEATURE_EXTRACTION"] PEFT_CONFIG = { "base_model_name_or_path": None, diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index 268f4306..f7fdf7ab 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -1,29 +1,33 @@ import gc import os -from logging import getLogger from collections import OrderedDict +from logging import getLogger from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -from ..base import Backend -from .config import PyTorchConfig -from ..peft_utils import get_peft_config_class -from ..transformers_utils import randomize_weights -from ...import_utils import is_deepspeed_available, is_peft_available - +import datasets.utils.logging as datasets_logging import torch +import transformers.utils.logging as transformers_logging from datasets import Dataset from safetensors.torch import save_file -import datasets.utils.logging as datasets_logging +from transformers import Trainer, TrainerCallback, TrainerState, TrainingArguments from transformers.modeling_utils import no_init_weights -import transformers.utils.logging as transformers_logging -from transformers import TrainerCallback, TrainerState, Trainer, TrainingArguments + +from ...import_utils import is_deepspeed_available, is_peft_available, is_torch_distributed_available +from ..base import Backend +from ..peft_utils import get_peft_config_class +from ..transformers_utils import randomize_weights +from .config import PyTorchConfig if is_peft_available(): - from peft import get_peft_model + from peft import get_peft_model # type: ignore + +if is_torch_distributed_available(): + import torch.distributed if is_deepspeed_available(): - from deepspeed import init_inference + from deepspeed import init_inference # type: ignore + # disable other loggers datasets_logging.set_verbosity_error() @@ -94,14 +98,12 @@ def __init__(self, config: PyTorchConfig): 
LOGGER.info("\t+ Using torch.compile on unet forward pass") # TODO: should we compile vae and/or clip as well ? self.pretrained_model.unet.forward = torch.compile( - self.pretrained_model.unet.forward, - **self.config.torch_compile_config, + self.pretrained_model.unet.forward, **self.config.torch_compile_config ) else: LOGGER.info("\t+ Using torch.compile on forward pass") self.pretrained_model.forward = torch.compile( - self.pretrained_model.forward, - **self.config.torch_compile_config, + self.pretrained_model.forward, **self.config.torch_compile_config ) if self.config.peft_strategy is not None: @@ -176,9 +178,7 @@ def load_model_from_pretrained(self) -> None: LOGGER.info(f"\t+ Loading model directly on device: {self.config.device}") with torch.device(self.config.device): self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - **self.config.hub_kwargs, - **self.automodel_kwargs, + pretrained_model_name_or_path=self.config.model, **self.config.hub_kwargs, **self.automodel_kwargs ) def create_no_weights_model(self) -> None: @@ -233,30 +233,21 @@ def process_quantization_config(self) -> None: from transformers import GPTQConfig self.quantization_config = GPTQConfig( - **dict( - getattr(self.pretrained_config, "quantization_config", {}), - **self.config.quantization_config, - ) + **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) elif self.is_awq_quantized: LOGGER.info("\t+ Processing AWQ config") from transformers import AwqConfig self.quantization_config = AwqConfig( - **dict( - getattr(self.pretrained_config, "quantization_config", {}), - **self.config.quantization_config, - ) + **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) elif self.is_bnb_quantized: LOGGER.info("\t+ Processing BitsAndBytes config") from transformers import BitsAndBytesConfig self.quantization_config = BitsAndBytesConfig( - **dict( - getattr(self.pretrained_config, "quantization_config", {}), - **self.config.quantization_config, - ) + **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) else: self.quantization_config = None @@ -290,8 +281,8 @@ def is_awq_quantized(self) -> bool: def is_exllamav2(self) -> bool: return ( self.is_gptq_quantized - and "exllama_config" in self.quantization_config - and self.quantization_config["exllama_config"].get("version", None) == 2 + and hasattr(self.quantization_config, "exllama_config") + and self.quantization_config.exllama_config.get("version", None) == 2 ) @property @@ -369,6 +360,10 @@ def seed(self): torch.cuda.manual_seed_all(self.config.seed) def clean(self) -> None: + if is_torch_distributed_available() and torch.distributed.is_initialized(): + LOGGER.info("\t+ Waiting for distributed processes to finish before cleaning backend") + torch.distributed.barrier() + super().clean() if hasattr(self, "tmpdir"): diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index d8089f60..7902719d 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -1,20 +1,16 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional -from ..config import BackendConfig -from ...env_utils import is_rocm_system from ...import_utils import torch_version +from ...system_utils import is_rocm_system +from ..config import BackendConfig from ..peft_utils import 
PEFT_CONFIGS, PEFT_TASKS_TYPES DEVICE_MAPS = ["auto", "sequential"] AMP_DTYPES = ["bfloat16", "float16"] TORCH_DTYPES = ["bfloat16", "float16", "float32", "auto"] -QUANTIZATION_CONFIGS = { - "bnb": {"llm_int8_threshold": 0.0}, - "gptq": {}, - "awq": {}, -} +QUANTIZATION_CONFIGS = {"bnb": {"llm_int8_threshold": 0.0}, "gptq": {}, "awq": {}} COMPILE_CONFIG = { "fullgraph": False, "dynamic": False, @@ -89,10 +85,7 @@ def __post_init__(self): if self.quantization_config: QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_scheme] - self.quantization_config = { - **QUANTIZATION_CONFIG, - **self.quantization_config, - } + self.quantization_config = {**QUANTIZATION_CONFIG, **self.quantization_config} if self.peft_strategy is not None: if self.peft_strategy not in PEFT_CONFIGS: diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index 7c86adeb..3beb1387 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,13 +1,13 @@ from logging import getLogger from typing import Any, Dict +from hydra.utils import get_class +from transformers.utils import ModelOutput + from ..base import Backend from .config import TRTLLMConfig from .utils import MODEL_TYPE_TO_TRTLLMMODEL -from hydra.utils import get_class -from transformers.utils import ModelOutput - LOGGER = getLogger("tensorrt-llm") @@ -47,9 +47,7 @@ def load_trtmodel_from_pretrained(self) -> None: def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: return self.pretrained_model.generate( - input_ids=inputs.get("input_ids", None), - attention_mask=inputs.get("attention_mask", None), - max_new_tokens=1, + input_ids=inputs.get("input_ids", None), attention_mask=inputs.get("attention_mask", None), max_new_tokens=1 ) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py index e676accb..d7f4b1cb 100644 --- a/optimum_benchmark/backends/tensorrt_llm/config.py +++ b/optimum_benchmark/backends/tensorrt_llm/config.py @@ -1,9 +1,8 @@ -from typing import Optional from dataclasses import dataclass +from typing import Optional -from ..config import BackendConfig from ...import_utils import tesnorrt_llm_version - +from ..config import BackendConfig SUPPORTED_DTYPES = ["float16", "bfloat16", "float32"] diff --git a/optimum_benchmark/backends/text_generation_inference/backend.py b/optimum_benchmark/backends/text_generation_inference/backend.py index 538de53c..c7ecd5ce 100644 --- a/optimum_benchmark/backends/text_generation_inference/backend.py +++ b/optimum_benchmark/backends/text_generation_inference/backend.py @@ -1,23 +1,24 @@ import gc import os import time +from concurrent.futures import ThreadPoolExecutor from logging import getLogger -from typing import Any, Dict, List from tempfile import TemporaryDirectory -from concurrent.futures import ThreadPoolExecutor - -from ..base import Backend -from .config import TGIConfig -from ...task_utils import TEXT_GENERATION_TASKS -from ..transformers_utils import randomize_weights +from typing import Any, Dict, List import torch -import docker -import docker.types -import docker.errors -from safetensors.torch import save_model from huggingface_hub import InferenceClient, snapshot_download from huggingface_hub.inference._text_generation import TextGenerationResponse +from safetensors.torch import save_model + +import 
docker +import docker.errors +import docker.types + +from ...task_utils import TEXT_GENERATION_TASKS +from ..base import Backend +from ..transformers_utils import randomize_weights +from .config import TGIConfig # bachend logger LOGGER = getLogger("text-generation-inference") @@ -59,12 +60,7 @@ def load_model_from_pretrained(self) -> None: model_cache_path = f"{self.config.volume}/{model_cache_folder}" snapshot_ref = ( - open( - f"{model_cache_path}/refs/{self.config.hub_kwargs.get('revision', 'main')}", - "r", - ) - .read() - .strip() + open(f"{model_cache_path}/refs/{self.config.hub_kwargs.get('revision', 'main')}", "r").read().strip() ) model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" @@ -133,12 +129,7 @@ def start_tgi_server(self) -> None: env["HUGGING_FACE_HUB_TOKEN"] = os.environ["HUGGING_FACE_HUB_TOKEN"] LOGGER.info("\t+ Building TGI command") - self.command = [ - "--model-id", - self.config.model, - "--revision", - self.config.hub_kwargs.get("revision", "main"), - ] + self.command = ["--model-id", self.config.model, "--revision", self.config.hub_kwargs.get("revision", "main")] if self.config.sharded is not None: self.command.extend(["--sharded", str(self.config.sharded).lower()]) diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 9e2924b2..07105003 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional -from ..import_utils import is_timm_available, is_transformers_available, is_torch_available +from ..import_utils import is_timm_available, is_torch_available, is_transformers_available if is_torch_available(): import torch diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index a7515d2f..52bede74 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -4,18 +4,22 @@ from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -from ..transformers_utils import randomize_weights -from ..peft_utils import get_peft_config_class -from .config import TorchORTConfig -from ..base import Backend - import torch from datasets import Dataset +from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments from safetensors.torch import save_file from transformers import TrainerCallback, TrainerState from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error -from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments + +from ...import_utils import is_peft_available +from ..base import Backend +from ..peft_utils import get_peft_config_class +from ..transformers_utils import randomize_weights +from .config import TorchORTConfig + +if is_peft_available(): + from peft import get_peft_model # type: ignore # disable transformers logging set_verbosity_error() @@ -39,9 +43,7 @@ def __init__(self, config: TorchORTConfig): self.load_automodel_from_pretrained() if self.config.peft_strategy is not None: - LOGGER.info("\t+ Applying PEFT") - from peft import get_peft_model - + LOGGER.info("\t+ Using PEFT") peft_config_class = get_peft_config_class(self.config.peft_strategy) peft_config = peft_config_class(**self.config.peft_config) self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config) @@ -87,9 +89,7 @@ def load_automodel_with_no_weights(self) -> None: def load_automodel_from_pretrained(self) -> None: 
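
For reference, a minimal sketch of the PEFT wiring used by the backends above (strategy name -> config class -> wrapped model). The "lora" strategy key, the gpt2 checkpoint and the LoRA kwargs are illustrative assumptions, not taken from this patch:

    from transformers import AutoModelForCausalLM
    from peft import get_peft_model

    from optimum_benchmark.backends.peft_utils import get_peft_config_class

    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # mirrors the backend __init__ above: resolve the strategy, build its config, wrap the model
    peft_config_class = get_peft_config_class("lora")       # assumed key in PEFT_CONFIGS
    peft_config = peft_config_class(task_type="CAUSAL_LM")  # stands in for config.peft_config kwargs
    model = get_peft_model(model, peft_config=peft_config)
    model.print_trainable_parameters()
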
self.pretrained_model = self.automodel_class.from_pretrained( - self.config.model, - **self.automodel_kwargs, - **self.config.hub_kwargs, + self.config.model, **self.automodel_kwargs, **self.config.hub_kwargs ).to(self.config.device) @property diff --git a/optimum_benchmark/backends/torch_ort/config.py b/optimum_benchmark/backends/torch_ort/config.py index ac2de2f7..8559022f 100644 --- a/optimum_benchmark/backends/torch_ort/config.py +++ b/optimum_benchmark/backends/torch_ort/config.py @@ -1,8 +1,8 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional -from ..config import BackendConfig from ...import_utils import torch_ort_version +from ..config import BackendConfig from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 1d7ad410..6835617a 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,29 +1,24 @@ import os from typing import Any, Dict, Optional, Union -from ..import_utils import is_transformers_available, is_torch_available +from ..import_utils import is_torch_available, is_transformers_available if is_torch_available(): import torch if is_transformers_available(): from transformers import ( + AutoConfig, + AutoProcessor, FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, GenerationConfig, + ImageProcessingMixin, PretrainedConfig, + PreTrainedTokenizer, ProcessorMixin, - AutoProcessor, - AutoConfig, ) - PretrainedProcessor = Union[ - FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, - ProcessorMixin, - ] + PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, PreTrainedTokenizer, ProcessorMixin] def get_transformers_cache_dir() -> str: @@ -52,8 +47,7 @@ def get_transformers_pre_processor(model: str, **kwargs) -> Optional["Pretrained def extract_transformers_shapes_from_artifacts( - config: "PretrainedConfig", - processor: Optional["PretrainedProcessor"] = None, + config: "PretrainedConfig", processor: Optional["PretrainedProcessor"] = None ) -> Dict[str, Any]: artifacts_dict = {} diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index 84495a1a..a8c42806 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -3,8 +3,8 @@ from typing import ClassVar, Generic from ..backends.base import Backend -from .report import BenchmarkReport from .config import BenchmarkConfigT +from .report import BenchmarkReport LOGGER = getLogger("benchmark") diff --git a/optimum_benchmark/benchmarks/config.py b/optimum_benchmark/benchmarks/config.py index f3e96348..76d102af 100644 --- a/optimum_benchmark/benchmarks/config.py +++ b/optimum_benchmark/benchmarks/config.py @@ -1,8 +1,7 @@ from abc import ABC -from typing import TypeVar -from logging import getLogger from dataclasses import dataclass - +from logging import getLogger +from typing import TypeVar LOGGER = getLogger("benchmark") diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py index 9cc96ee1..07c4f9ee 100644 --- a/optimum_benchmark/benchmarks/inference/benchmark.py +++ b/optimum_benchmark/benchmarks/inference/benchmark.py @@ -1,26 +1,23 @@ +from dataclasses import dataclass from logging import getLogger -from typing import List, Tuple, Dict -from ..base import Benchmark -from .config import InferenceConfig -from 
...trackers.energy import EnergyTracker -from ...trackers.memory import MemoryTracker -from ...trackers.latency import LatencyTracker from ...backends.base import Backend, BackendConfigT from ...generators.input_generator import InputGenerator from ...import_utils import is_torch_distributed_available -from ...task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS -from .report import InferenceReport, TextGenerationReport, ImageDiffusionReport +from ...task_utils import IMAGE_DIFFUSION_TASKS, TEXT_GENERATION_TASKS +from ...trackers.energy import Efficiency, EnergyTracker +from ...trackers.latency import LatencyTracker, Throughput +from ...trackers.memory import MemoryTracker +from ..base import Benchmark +from ..report import BenchmarkMeasurements, BenchmarkReport +from .config import InferenceConfig if is_torch_distributed_available(): import torch.distributed LOGGER = getLogger("inference") -IMAGE_DIFFUSION_KWARGS = { - "num_inference_steps": 30, - "num_images_per_prompt": 1, -} +IMAGE_DIFFUSION_KWARGS = {"num_inference_steps": 30, "num_images_per_prompt": 1} TEXT_GENERATION_KWARGS = { "num_return_sequences": 1, @@ -33,6 +30,33 @@ "num_beams": 1, } +EFFICIENCY_UNIT = "samples/kWh" +THROUGHPUT_UNIT = "samples/s" + +PREFILL_THROUGHPUT_UNIT = "tokens/s" +DECODE_THROUGHPUT_UNIT = "tokens/s" +CALL_THROUGHPUT_UNIT = "images/s" + +PREFILL_EFFICIENCY_UNIT = "tokens/kWh" +DECODE_EFFICIENCY_UNIT = "tokens/kWh" +CALL_EFFICIENCY_UNIT = "images/kWh" + + +@dataclass +class InferenceReport(BenchmarkReport): + forward: BenchmarkMeasurements + + +@dataclass +class ImageDiffusionReport(BenchmarkReport): + call: BenchmarkMeasurements + + +@dataclass +class TextGenerationReport(BenchmarkReport): + prefill: BenchmarkMeasurements + decode: BenchmarkMeasurements + class InferenceBenchmark(Benchmark[InferenceConfig]): NAME = "inference" @@ -42,17 +66,18 @@ def __init__(self, config: InferenceConfig) -> None: def run(self, backend: Backend[BackendConfigT]) -> None: if is_torch_distributed_available() and torch.distributed.is_initialized(): + LOGGER.info("\t+ Distributing batch size across processes") if self.config.input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( "The batch size must be divisible by the number of processes in a distributed environment" ) self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size() + if backend.config.device == "cuda" and backend.config.task in TEXT_GENERATION_TASKS: + TEXT_GENERATION_TASKS["synced_gpus"] = True LOGGER.info("\t+ Creating input generator") self.input_generator = InputGenerator( - task=backend.config.task, - model_shapes=backend.model_shapes, - input_shapes=self.config.input_shapes, + task=backend.config.task, model_shapes=backend.model_shapes, input_shapes=self.config.input_shapes ) if backend.config.task in TEXT_GENERATION_TASKS: @@ -64,12 +89,7 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Updating Text Generation kwargs with default values") self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs} LOGGER.info("\t+ Initializing Text Generation report") - self.report = TextGenerationReport( - batch_size=self.config.input_shapes["batch_size"], - sequence_length=self.config.input_shapes["sequence_length"], - num_new_tokens=self.config.generate_kwargs["max_new_tokens"], - num_return_sequences=self.config.generate_kwargs["num_return_sequences"], - ) + self.report = TextGenerationReport(prefill=BenchmarkMeasurements(), 
decode=BenchmarkMeasurements()) elif backend.config.task in IMAGE_DIFFUSION_TASKS: LOGGER.info("\t+ Generating and preparing Image Diffusion input") @@ -78,19 +98,14 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Updating Image Diffusion kwargs with default values") self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs} LOGGER.info("\t+ Initializing Image Diffusion report") - self.report = ImageDiffusionReport( - batch_size=self.config.input_shapes["batch_size"], - num_images_per_prompts=self.config.forward_kwargs["num_images_per_prompt"], - ) + self.report = ImageDiffusionReport(call=BenchmarkMeasurements()) else: LOGGER.info("\t+ Generating and preparing Inference input") self.forward_inputs = self.input_generator(mode="forward") self.forward_inputs = backend.prepare_inputs(self.forward_inputs) LOGGER.info("\t+ Initializing Inference report") - self.report = InferenceReport( - batch_size=self.config.input_shapes["batch_size"], - ) + self.report = InferenceReport(forward=BenchmarkMeasurements()) LOGGER.info("\t+ Preparing backend for Inference") backend.prepare_for_inference( @@ -103,11 +118,9 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Warming up backend for Inference") for _ in range(self.config.warmup_runs): if backend.config.task in TEXT_GENERATION_TASKS: - generate_warmup_kwargs = {"max_new_tokens": 2, "min_new_tokens": 2} - _ = backend.generate(self.generate_input, generate_warmup_kwargs) + _ = backend.generate(self.generate_input, {"max_new_tokens": 2, "min_new_tokens": 2}) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - diffuse_warmup_kwargs = {"num_inference_steps": 2} - _ = backend.call(self.diffuse_input, diffuse_warmup_kwargs) + _ = backend.call(self.diffuse_input, {"num_inference_steps": 2}) else: _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) @@ -117,14 +130,11 @@ def run(self, backend: Backend[BackendConfigT]) -> None: backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids ) if backend.config.task in TEXT_GENERATION_TASKS: - forward_memories_dict, generate_memories_dict = self.run_text_generation_memory_tracking(backend) - self.report.populate_memory(forward_memories_dict, generate_memories_dict) + self.run_text_generation_memory_tracking(backend) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - call_memories_dict = self.run_image_diffusion_memory_tracking(backend) - self.report.populate_memory(call_memories_dict) + self.run_image_diffusion_memory_tracking(backend) else: - forward_memories_dict = self.run_inference_memory_tracking(backend) - self.report.populate_memory(forward_memories_dict) + self.run_inference_memory_tracking(backend) self.report.log_memory() @@ -132,146 +142,170 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Creating inference latency tracker") self.latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) if backend.config.task in TEXT_GENERATION_TASKS: - forward_latencies_dict, generate_latencies_dict = self.run_text_generation_latency_tracking(backend) - self.report.populate_latency(forward_latencies_dict, generate_latencies_dict) + self.run_text_generation_latency_tracking(backend) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - call_latencies_dict = self.run_image_diffusion_latency_tracking(backend) - self.report.populate_latency(call_latencies_dict) + self.run_image_diffusion_latency_tracking(backend) else: - 
forward_latencies_dict = self.run_latency_inference_tracking(backend) - self.report.populate_latency(forward_latencies_dict) + self.run_latency_inference_tracking(backend) self.report.log_latency() + self.report.log_throughput() if self.config.energy: LOGGER.info("\t+ Creating inference energy tracker") self.energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) if backend.config.task in TEXT_GENERATION_TASKS: - forward_energies_dict, generate_energies_dict = self.run_text_generation_energy_tracking(backend) - self.report.populate_energy(forward_energies_dict, generate_energies_dict) + self.run_text_generation_energy_tracking(backend) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - call_energies_dict = self.run_image_diffusion_energy_tracking(backend) - self.report.populate_energy(call_energies_dict) + self.run_image_diffusion_energy_tracking(backend) else: - forward_energies_dict = self.run_inference_energy_tracking(backend) - self.report.populate_energy(forward_energies_dict) + self.run_inference_energy_tracking(backend) self.report.log_energy() + self.report.log_efficiency() + + self.report.log() ## Memory tracking - def run_text_generation_memory_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + def run_text_generation_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_memories_dict = self.memory_tracker.get_memories_dict() + self.report.prefill.memory = self.memory_tracker.get_max_memory() self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.generate(self.generate_input, self.config.generate_kwargs) - generate_memories_dict = self.memory_tracker.get_memories_dict() + self.report.decode.memory = self.memory_tracker.get_max_memory() - return forward_memories_dict, generate_memories_dict - - def run_image_diffusion_memory_tracking(self, backend: Backend) -> Dict[str, float]: + def run_image_diffusion_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - call_memories_dict = self.memory_tracker.get_memories_dict() - - return call_memories_dict + self.report.call.memory = self.memory_tracker.get_max_memory() - def run_inference_memory_tracking(self, backend: Backend) -> Dict[str, float]: + def run_inference_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_memories_dict = self.memory_tracker.get_memories_dict() - - return forward_memories_dict + self.report.forward.memory = self.memory_tracker.get_max_memory() ## Latency tracking - def run_text_generation_latency_tracking(self, backend: Backend) -> Tuple[List[float], List[float]]: + def run_text_generation_latency_tracking(self, backend: Backend): LOGGER.info("\t+ Running latency tracking") self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_latencies_list = self.latency_tracker.get_latencies_list() + 
self.report.prefill.latency = self.latency_tracker.get_latency() + self.report.prefill.throughput = self.latency_tracker.get_throughput( + volume=self.prefill_volume, unit=PREFILL_THROUGHPUT_UNIT + ) self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.generate(self.generate_input, self.config.generate_kwargs) - generate_latencies_list = self.latency_tracker.get_latencies_list() - - return forward_latencies_list, generate_latencies_list + self.report.decode.latency = self.latency_tracker.get_latency() - self.report.prefill.latency.mean + self.report.decode.throughput = Throughput.from_latency( + self.report.decode.latency, self.decode_volume, unit=DECODE_THROUGHPUT_UNIT + ) - def run_image_diffusion_latency_tracking(self, backend: Backend) -> List[float]: + def run_image_diffusion_latency_tracking(self, backend: Backend): LOGGER.info("\t+ Running latency tracking") self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - call_latencies_list = self.latency_tracker.get_latencies_list() - - return call_latencies_list + self.report.call.latency = self.latency_tracker.get_latency() + self.report.call.throughput = Throughput.from_latency( + self.report.call.latency, self.call_volume, unit=CALL_THROUGHPUT_UNIT + ) - def run_latency_inference_tracking(self, backend: Backend) -> List[float]: + def run_latency_inference_tracking(self, backend: Backend): LOGGER.info("\t+ Running latency tracking") self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_latencies_list = self.latency_tracker.get_latencies_list() - - return forward_latencies_list + self.report.forward.latency = self.latency_tracker.get_latency() + self.report.forward.throughput = Throughput.from_latency( + self.report.forward.latency, self.forward_volume, unit=THROUGHPUT_UNIT + ) ## Energy tracking - def run_text_generation_energy_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + def run_text_generation_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_energies_dict = self.energy_tracker.get_energies_dict() + self.report.prefill.energy = self.energy_tracker.get_energy() + self.report.prefill.efficiency = Efficiency.from_energy( + self.report.prefill.energy, self.prefill_volume, unit=PREFILL_EFFICIENCY_UNIT + ) self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.generate(self.generate_input, self.config.generate_kwargs) - generate_energies_dict = self.energy_tracker.get_energies_dict() - - return forward_energies_dict, generate_energies_dict + self.report.decode.energy = self.energy_tracker.get_energy() - self.report.prefill.energy + self.report.decode.efficiency = Efficiency.from_energy( + self.report.decode.energy, self.decode_volume, unit=DECODE_EFFICIENCY_UNIT + ) - def 
run_image_diffusion_energy_tracking(self, backend: Backend) -> Dict[str, float]: + def run_image_diffusion_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - call_energies_dict = self.energy_tracker.get_energies_dict() - - return call_energies_dict + self.report.call.energy = self.energy_tracker.get_energy() + self.report.call.efficiency = Efficiency.from_energy( + self.report.call.energy, self.call_volume, unit=CALL_EFFICIENCY_UNIT + ) - def run_inference_energy_tracking(self, backend: Backend) -> Dict[str, float]: + def run_inference_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_energies_dict = self.energy_tracker.get_energies_dict() + self.report.forward.energy = self.energy_tracker.get_energy() + self.report.forward.efficiency = Efficiency.from_energy( + self.report.forward.energy, self.forward_volume, unit=EFFICIENCY_UNIT + ) + + @property + def forward_volume(self) -> int: # in samples + return self.config.input_shapes["batch_size"] + + @property + def prefill_volume(self) -> int: # in tokens + return self.config.input_shapes["batch_size"] * self.config.input_shapes["sequence_length"] - return forward_energies_dict + @property + def call_volume(self) -> int: # in images + return self.config.input_shapes["batch_size"] * self.config.forward_kwargs["num_images_per_prompt"] + + @property + def decode_volume(self) -> int: # in tokens + return ( + self.config.input_shapes["batch_size"] + * self.config.generate_kwargs["num_return_sequences"] + * self.config.generate_kwargs["max_new_tokens"] + ) def get_report(self) -> InferenceReport: return self.report diff --git a/optimum_benchmark/benchmarks/inference/callback.py b/optimum_benchmark/benchmarks/inference/callback.py deleted file mode 100644 index 4871691d..00000000 --- a/optimum_benchmark/benchmarks/inference/callback.py +++ /dev/null @@ -1,25 +0,0 @@ -import time - -from ...import_utils import is_torch_available - -from transformers import LogitsProcessor - -if is_torch_available(): - import torch - - -# TODO: uses this class for more fine-grained latency measurements in text generation -class MeasurementProcessor(LogitsProcessor): - def __init__(self, device: str, backend: str): - self.device = device - self.backend = backend - - self.latencies = [] - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): - """ - Callback to track the time it takes to generate one batch of tokens. 
- """ - self.latencies.append(time.perf_counter_ns()) - - return scores diff --git a/optimum_benchmark/benchmarks/inference/config.py b/optimum_benchmark/benchmarks/inference/config.py index d5c4a0bb..7b6cfd3f 100644 --- a/optimum_benchmark/benchmarks/inference/config.py +++ b/optimum_benchmark/benchmarks/inference/config.py @@ -1,17 +1,13 @@ +from dataclasses import dataclass, field from logging import getLogger from typing import Any, Dict, Optional -from dataclasses import dataclass, field -from ...env_utils import is_rocm_system +from ...system_utils import is_rocm_system from ..config import BenchmarkConfig LOGGER = getLogger("inference") -INPUT_SHAPES = { - "batch_size": 2, - "sequence_length": 16, - "num_choices": 2, -} +INPUT_SHAPES = {"batch_size": 2, "num_choices": 2, "sequence_length": 16} @dataclass @@ -40,16 +36,13 @@ class InferenceConfig(BenchmarkConfig): # methods kwargs forward_kwargs: Dict[str, Any] = field( - default_factory=dict, - metadata={"help": "Keyword arguments to pass to the forward method of the model."}, + default_factory=dict, metadata={"help": "Keyword arguments to pass to the forward method of the model."} ) generate_kwargs: Dict[str, Any] = field( - default_factory=dict, - metadata={"help": "Keyword arguments to pass to the generate method of the model."}, + default_factory=dict, metadata={"help": "Keyword arguments to pass to the generate method of the model."} ) call_kwargs: Dict[str, Any] = field( - default_factory=dict, - metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."}, + default_factory=dict, metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."} ) def __post_init__(self): diff --git a/optimum_benchmark/benchmarks/inference/report.py b/optimum_benchmark/benchmarks/inference/report.py deleted file mode 100644 index 9cd43cfc..00000000 --- a/optimum_benchmark/benchmarks/inference/report.py +++ /dev/null @@ -1,353 +0,0 @@ -from dataclasses import dataclass, field -from statistics import mean, stdev -from typing import Any, Dict, List -from logging import getLogger - -from ..report import BenchmarkReport - -LOGGER = getLogger("report") - - -@dataclass -class InferenceReport(BenchmarkReport): - # Config - batch_size: int - # Metrics - forward: Dict[str, Any] = field(default_factory=dict) - - # POPULATING - def populate_latency(self, forward_latencies_list: List[float]): - ## Latency - self.forward["latency"] = { - "list[s]": forward_latencies_list, - "mean(s)": compute_mean(forward_latencies_list), - "stdev(s)": compute_stdev(forward_latencies_list), - } - ## Throughput - forward_throughputs_list = [self.batch_size / latency for latency in forward_latencies_list] - self.forward["throughput"] = { - "list[samples/s]": forward_throughputs_list, - "mean(samples/s)": compute_mean(forward_throughputs_list), - "stdev(samples/s)": compute_stdev(forward_throughputs_list), - } - - def populate_memory(self, forward_memories_dict: Dict[str, Any]): - self.forward["memory"] = forward_memories_dict - - def populate_energy(self, forward_energies_dict: Dict[str, Any]): - self.forward["energy"] = forward_energies_dict - - # LOGGING - def log_latency(self): - for key, value in self.forward["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ forward.latency.{key}: {value:f} (s)") - for key, value in self.forward["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ forward.throughput.{key}: {value:f} (samples/s)") - - def log_memory(self): - for key, value in 
self.forward["memory"].items(): - LOGGER.info(f"\t+ forward.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.forward["energy"].items(): - LOGGER.info(f"\t+ forward.energy.{key}: {value:f} (kWh)") - - def log_all(self) -> None: - if "latency" in self.forward: - self.log_latency() - if "memory" in self.forward: - self.log_memory() - if "energy" in self.forward: - self.log_energy() - - # add operator to aggregate multiple reports - def __add__(self, other: "InferenceReport") -> "InferenceReport": - agg_report = InferenceReport(batch_size=self.batch_size + other.batch_size) - if "latency" in self.forward and "latency" in other.forward: - agg_forward_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.forward["latency"]["list[s]"], other.forward["latency"]["list[s]"]) - ] - agg_report.populate_latency(agg_forward_latencies_list) - - if "memory" in self.forward and "memory" in other.forward: - agg_forward_memories_dict = {} - for key in self.forward["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_forward_memories_dict[key] = max(self.forward["memory"][key], other.forward["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_forward_memories_dict[key] = self.forward["memory"][key] + other.forward["memory"][key] - - agg_report.populate_memory(agg_forward_memories_dict) - - if "energy" in self.forward and "energy" in other.forward: - agg_forward_energies_dict = {} - for key in self.forward["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_forward_energies_dict[key] = self.forward["energy"][key] + other.forward["energy"][key] - - agg_report.populate_energy(agg_forward_energies_dict) - - return agg_report - - -@dataclass -class ImageDiffusionReport(BenchmarkReport): - # Config - batch_size: int - num_images_per_prompts: int - # Metrics - call: Dict[str, Any] = field(default_factory=dict) - - # POPULATING - def populate_latency(self, call_latencies_list: List[float]): - ## Latency - self.call["latency"] = { - "list[s]": call_latencies_list, - "mean(s)": compute_mean(call_latencies_list), - "stdev(s)": compute_stdev(call_latencies_list), - } - ## Throughput - call_throughputs_list = [ - self.batch_size * self.num_images_per_prompts / latency for latency in call_latencies_list - ] - self.call["throughput"] = { - "list[images/s]": call_throughputs_list, - "mean[images/s]": compute_mean(call_throughputs_list), - "stdev[images/s]": compute_stdev(call_throughputs_list), - } - - def populate_memory(self, call_memories_dict: Dict[str, Any]): - self.call["memory"] = call_memories_dict - - def populate_energy(self, call_energies_dict: Dict[str, Any]): - self.call["energy"] = call_energies_dict - - # LOGGING - def log_latency(self): - for key, value in self.call["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ call.latency.{key}: {value:f} (s)") - for key, value in self.call["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ call.throughput.{key}: {value:f} (images/s)") - - def log_memory(self): - for key, value in self.call["memory"].items(): - LOGGER.info(f"\t+ call.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.call["energy"].items(): - LOGGER.info(f"\t+ call.energy.{key}: {value:f} (kWh)") - - def log_all(self) -> None: - if "latency" in self.call: - self.log_latency() - if "memory" in self.call: - self.log_memory() - if "energy" in self.call: 
- self.log_energy() - - # add operator to aggregate multiple reports - def __add__(self, other: "ImageDiffusionReport") -> "ImageDiffusionReport": - assert self.num_images_per_prompts == other.num_images_per_prompts, "num_images_per_prompts must be the same" - - agg_report = ImageDiffusionReport( - batch_size=self.batch_size + other.batch_size, - num_images_per_prompts=self.num_images_per_prompts, - ) - if "latency" in self.call and "latency" in other.call: - agg_call_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.call["latency"]["list[s]"], other.call["latency"]["list[s]"]) - ] - agg_report.populate_latency(agg_call_latencies_list) - - if "memory" in self.call and "memory" in other.call: - agg_call_memories_dict = {} - for key in self.call["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_call_memories_dict[key] = max(self.call["memory"][key], other.call["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_call_memories_dict[key] = self.call["memory"][key] + other.call["memory"][key] - - agg_report.populate_memory(agg_call_memories_dict) - - if "energy" in self.call and "energy" in other.call: - agg_call_energies_dict = {} - for key in self.call["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_call_energies_dict[key] = self.call["energy"][key] + other.call["energy"][key] - - agg_report.populate_energy(agg_call_energies_dict) - - return agg_report - - -@dataclass -class TextGenerationReport(BenchmarkReport): - # Config - batch_size: int - sequence_length: int - num_new_tokens: int - num_return_sequences: int - # Prefill Metrics - prefill: Dict[str, Any] = field(default_factory=dict) - # Decode Metrics - decode: Dict[str, Any] = field(default_factory=dict) - - def populate_latency(self, forward_latencies_list: List[float], generate_latencies_list: List[float]): - ## Latency - self.prefill["latency"] = { - "list[s]": forward_latencies_list, - "mean(s)": compute_mean(forward_latencies_list), - "stdev(s)": compute_stdev(forward_latencies_list), - } - ## Throughput - prefill_throughputs_list = [ - self.batch_size * self.sequence_length / latency for latency in forward_latencies_list - ] - self.prefill["throughput"] = { - "list[tokens/s]": prefill_throughputs_list, - "mean[tokens/s]": compute_mean(prefill_throughputs_list), - "stdev[tokens/s]": compute_stdev(prefill_throughputs_list), - } - ## Latency - decode_latencies_list = [ - generate_latency - self.prefill["latency"]["mean(s)"] for generate_latency in generate_latencies_list - ] - self.decode["latency"] = { - "list[s]": decode_latencies_list, - "mean(s)": compute_mean(decode_latencies_list), - "stdev(s)": compute_stdev(decode_latencies_list), - } - ## Throughput - decode_throughputs_list = [ - self.batch_size * self.num_new_tokens * self.num_return_sequences / latency - for latency in decode_latencies_list - ] - self.decode["throughput"] = { - "list[tokens/s]": decode_throughputs_list, - "mean[tokens/s]": compute_mean(decode_throughputs_list), - "stdev[tokens/s]": compute_stdev(decode_throughputs_list), - } - - def populate_memory(self, forward_memories_dict: Dict[str, Any], generate_memories_dict: Dict[str, Any]): - self.prefill["memory"] = forward_memories_dict - self.decode["memory"] = generate_memories_dict - - def populate_energy(self, forward_energies_dict: Dict[str, Any], generate_energies_dict: Dict[str, Any]): - self.prefill["energy"] = forward_energies_dict - 
self.decode["energy"] = generate_energies_dict - - # LOGGING - def log_latency(self): - for key, value in self.prefill["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ prefill.latency.{key}: {value:f} (s)") - for key, value in self.prefill["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ prefill.throughput.{key}: {value:f} (tokens/s)") - for key, value in self.decode["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ decode.latency.{key}: {value:f} (s)") - for key, value in self.decode["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ decode.throughput.{key}: {value:f} (tokens/s)") - - def log_memory(self): - for key, value in self.prefill["memory"].items(): - LOGGER.info(f"\t+ prefill.memory.{key}: {value:f} (MB)") - for key, value in self.decode["memory"].items(): - LOGGER.info(f"\t+ decode.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.prefill["energy"].items(): - LOGGER.info(f"\t+ prefill.energy.{key}: {value:f} (kWh)") - for key, value in self.decode["energy"].items(): - LOGGER.info(f"\t+ decode.energy.{key}: {value:f} (kWh)") - - def log_all(self) -> None: - if "latency" in self.prefill: - self.log_latency() - if "memory" in self.prefill: - self.log_memory() - if "energy" in self.prefill: - self.log_energy() - - # add operator to aggregate multiple reports - def __add__(self, other: "TextGenerationReport") -> "TextGenerationReport": - agg_report = TextGenerationReport( - batch_size=self.batch_size + other.batch_size, - sequence_length=self.sequence_length, - num_new_tokens=self.num_new_tokens, - num_return_sequences=self.num_return_sequences, - ) - if "latency" in self.prefill and "latency" in other.prefill: - agg_forward_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.prefill["latency"]["list[s]"], other.prefill["latency"]["list[s]"]) - ] - agg_generate_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.decode["latency"]["list[s]"], other.decode["latency"]["list[s]"]) - ] - agg_report.populate_latency(agg_forward_latencies_list, agg_generate_latencies_list) - - if "memory" in self.prefill and "memory" in other.prefill: - agg_forward_memories_dict = {} - for key in self.prefill["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_forward_memories_dict[key] = max(self.prefill["memory"][key], other.prefill["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_forward_memories_dict[key] = self.prefill["memory"][key] + other.prefill["memory"][key] - - agg_generate_memories_dict = {} - for key in self.decode["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_generate_memories_dict[key] = max(self.decode["memory"][key], other.decode["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_generate_memories_dict[key] = self.decode["memory"][key] + other.decode["memory"][key] - - agg_report.populate_memory(agg_forward_memories_dict, agg_generate_memories_dict) - - if "energy" in self.prefill and "energy" in other.prefill: - agg_forward_energies_dict = {} - for key in self.prefill["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_forward_energies_dict[key] = self.prefill["energy"][key] + other.prefill["energy"][key] - - agg_generate_energies_dict = {} - for key in self.decode["energy"]: - # theoretically, the energies measured by codecarbon 
are process-specific (it's not clear from the code) - agg_generate_energies_dict[key] = self.decode["energy"][key] + other.decode["energy"][key] - - agg_report.populate_energy(agg_forward_energies_dict, agg_generate_energies_dict) - - return agg_report - - -def compute_mean(values: List[float]) -> float: - return mean(values) if len(values) > 0 else 0.0 - - -def compute_stdev(values: List[float]) -> float: - return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/report.py b/optimum_benchmark/benchmarks/report.py index 69491d65..02dbc541 100644 --- a/optimum_benchmark/benchmarks/report.py +++ b/optimum_benchmark/benchmarks/report.py @@ -1,11 +1,61 @@ -from dataclasses import dataclass, asdict -from typing import Union, Optional -from json import dump import os +from dataclasses import asdict, dataclass +from json import dump +from logging import getLogger +from typing import Any, Dict, List, Optional, Union -from transformers.configuration_utils import PushToHubMixin -from flatten_dict import flatten import pandas as pd +from flatten_dict import flatten +from transformers.configuration_utils import PushToHubMixin + +from ..trackers.energy import Efficiency, Energy +from ..trackers.latency import Latency, Throughput +from ..trackers.memory import Memory + +LOGGER = getLogger("report") + +REPORT_FILE_NAME = "benchmark_report.json" + + +@dataclass +class BenchmarkMeasurements: + memory: Optional[Memory] = None + latency: Optional[Latency] = None + throughput: Optional[Throughput] = None + energy: Optional[Energy] = None + efficiency: Optional[Efficiency] = None + + @staticmethod + def aggregate(benchmark_measurements: List["BenchmarkMeasurements"]) -> "BenchmarkMeasurements": + memory = ( + Memory.aggregate([m.memory for m in benchmark_measurements]) + if benchmark_measurements[0].memory is not None + else None + ) + latency = ( + Latency.aggregate([m.latency for m in benchmark_measurements]) + if benchmark_measurements[0].latency is not None + else None + ) + throughput = ( + Throughput.aggregate([m.throughput for m in benchmark_measurements if m.throughput is not None]) + if benchmark_measurements[0].throughput is not None + else None + ) + energy = ( + Energy.aggregate([m.energy for m in benchmark_measurements if m.energy is not None]) + if benchmark_measurements[0].energy is not None + else None + ) + efficiency = ( + Efficiency.aggregate([m.efficiency for m in benchmark_measurements if m.efficiency is not None]) + if benchmark_measurements[0].efficiency is not None + else None + ) + + return BenchmarkMeasurements( + memory=memory, latency=latency, throughput=throughput, energy=energy, efficiency=efficiency + ) @dataclass @@ -22,7 +72,7 @@ def save_pretrained( if use_auth_token is not None: kwargs["token"] = use_auth_token - config_file_name = config_file_name if config_file_name is not None else "benchmark_report.json" + config_file_name = config_file_name if config_file_name is not None else REPORT_FILE_NAME if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") @@ -36,21 +86,17 @@ def save_pretrained( files_timestamps = self._get_files_timestamps(save_directory) output_config_file = os.path.join(save_directory, config_file_name) - self.to_json(output_config_file) + self.to_json(output_config_file, flat=False) if push_to_hub: self._upload_modified_files( - save_directory, - repo_id, - files_timestamps, - commit_message=commit_message, - token=kwargs.get("token"), + save_directory, 
repo_id, files_timestamps, commit_message=commit_message, token=kwargs.get("token") ) - def to_dict(self) -> dict: + def to_dict(self) -> Dict[str, Any]: return asdict(self) - def to_flat_dict(self) -> dict: + def to_flat_dict(self) -> Dict[str, Any]: report_dict = self.to_dict() return flatten(report_dict, reducer="dot") @@ -64,10 +110,60 @@ def to_json(self, path: str, flat: bool = False) -> None: def to_dataframe(self) -> pd.DataFrame: flat_report_dict = self.to_flat_dict() - return pd.DataFrame(flat_report_dict, index=[0]) + return pd.DataFrame.from_dict(flat_report_dict, orient="index") def to_csv(self, path: str) -> None: self.to_dataframe().to_csv(path, index=False) - def log_all(self) -> None: - raise NotImplementedError("`log_all` method must be implemented in the child class") + def log_memory(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.memory is not None: + benchmark_measurements.memory.log(prefix=target) + + def log_latency(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.latency is not None: + benchmark_measurements.latency.log(prefix=target) + + def log_throughput(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.throughput is not None: + benchmark_measurements.throughput.log(prefix=target) + + def log_energy(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.energy is not None: + benchmark_measurements.energy.log(prefix=target) + + def log_efficiency(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.efficiency is not None: + benchmark_measurements.efficiency.log(prefix=target) + + def log(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.memory is not None: + benchmark_measurements.memory.log(prefix=target) + if benchmark_measurements.latency is not None: + benchmark_measurements.latency.log(prefix=target) + if benchmark_measurements.throughput is not None: + benchmark_measurements.throughput.log(prefix=target) + if benchmark_measurements.energy is not None: + benchmark_measurements.energy.log(prefix=target) + if benchmark_measurements.efficiency is not None: + benchmark_measurements.efficiency.log(prefix=target) + + @classmethod + def aggregate(cls, reports: List["BenchmarkReport"]) -> "BenchmarkReport": + aggregated_measurements = {} + for target in reports[0].to_dict().keys(): + benchmark_measurements = [getattr(report, target) for report in reports] + aggregated_measurements[target] = BenchmarkMeasurements.aggregate(benchmark_measurements) + + return cls(**aggregated_measurements) diff --git a/optimum_benchmark/benchmarks/training/benchmark.py b/optimum_benchmark/benchmarks/training/benchmark.py index 90c231d0..950cb0f7 100644 --- a/optimum_benchmark/benchmarks/training/benchmark.py +++ b/optimum_benchmark/benchmarks/training/benchmark.py @@ -1,19 +1,30 @@ -from logging import getLogger from contextlib import ExitStack +from dataclasses import dataclass +from logging import getLogger + +from transformers import default_data_collator -from ..base import Benchmark -from .config import TrainingConfig 
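
A hedged sketch of the new report API exercised above: one BenchmarkMeasurements per target, dot-flattening, and cross-process aggregation. The subclass name is illustrative (it mirrors InferenceReport), and the empty measurements stand in for values the trackers would normally populate:

    from dataclasses import dataclass

    from optimum_benchmark.benchmarks.report import BenchmarkMeasurements, BenchmarkReport


    @dataclass
    class ForwardOnlyReport(BenchmarkReport):  # same shape as InferenceReport above
        forward: BenchmarkMeasurements


    rank_0 = ForwardOnlyReport(forward=BenchmarkMeasurements())
    rank_1 = ForwardOnlyReport(forward=BenchmarkMeasurements())

    merged = ForwardOnlyReport.aggregate([rank_0, rank_1])  # one report per process -> one merged report
    print(merged.to_flat_dict())                            # {"forward.memory": None, "forward.latency": None, ...}
    merged.to_json("benchmark_report.json", flat=False)
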
-from .report import TrainingReport -from ...trackers.memory import MemoryTracker -from ...trackers.energy import EnergyTracker -from .callback import LatencyTrainerCallback from ...backends.base import Backend, BackendConfigT from ...generators.dataset_generator import DatasetGenerator - -from transformers import default_data_collator +from ...trackers.energy import Efficiency, EnergyTracker +from ...trackers.latency import LatencyTrainerCallback, Throughput +from ...trackers.memory import MemoryTracker +from ..base import Benchmark +from ..report import BenchmarkMeasurements, BenchmarkReport +from .config import TrainingConfig LOGGER = getLogger("training") +TRAIN_THROUGHPUT_UNIT = "samples/s" +TRAIN_EFFICIENCY_UNIT = "samples/kWh" + + +@dataclass +class TrainingReport(BenchmarkReport): + overall: BenchmarkMeasurements = BenchmarkMeasurements() + warmup: BenchmarkMeasurements = BenchmarkMeasurements() + train: BenchmarkMeasurements = BenchmarkMeasurements() + class TrainingBenchmark(Benchmark[TrainingConfig]): NAME = "training" @@ -24,21 +35,14 @@ def __init__(self, config: TrainingConfig) -> None: def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Creating dataset generator") dataset_generator = DatasetGenerator( - task=backend.config.task, - model_shapes=backend.model_shapes, - dataset_shapes=self.config.dataset_shapes, + task=backend.config.task, model_shapes=backend.model_shapes, dataset_shapes=self.config.dataset_shapes ) LOGGER.info("\t+ Generating training dataset") training_dataset = dataset_generator() LOGGER.info("\t+ Initializing training report") - self.report = TrainingReport( - max_steps=self.config.max_steps, - warmup_steps=self.config.warmup_steps, - per_process_batch_size=self.config.training_arguments["per_device_train_batch_size"], - gradient_accumulation_steps=self.config.training_arguments["gradient_accumulation_steps"], - ) + self.report = TrainingReport() training_callbackes = [] if self.config.latency: @@ -70,17 +74,51 @@ def run(self, backend: Backend[BackendConfigT]) -> None: training_arguments=self.config.training_arguments, ) - if self.config.latency: - self.report.populate_latency(overall_latencies_list=latency_callback.get_latencies_list()) - self.report.log_latency() - if self.config.memory: - self.report.populate_memory(overall_memories_dict=memory_tracker.get_memories_dict()) - self.report.log_memory() + self.report.overall.memory = memory_tracker.get_max_memory() + self.report.warmup.memory = memory_tracker.get_max_memory() + self.report.train.memory = memory_tracker.get_max_memory() + + if self.config.latency: + self.report.overall.latency = latency_callback.get_latency() + self.report.overall.throughput = Throughput.from_latency( + self.report.overall.latency, volume=self.overall_volume, unit=TRAIN_THROUGHPUT_UNIT + ) + self.report.warmup.latency = self.report.overall.latency[: self.config.warmup_steps] + self.report.warmup.throughput = Throughput.from_latency( + self.report.warmup.latency, volume=self.warmup_volume, unit=TRAIN_THROUGHPUT_UNIT + ) + self.report.train.latency = self.report.overall.latency[self.config.warmup_steps :] + self.report.train.throughput = Throughput.from_latency( + self.report.train.latency, volume=self.train_volume, unit=TRAIN_THROUGHPUT_UNIT + ) if self.config.energy: - self.report.populate_energy(overall_energies_dict=energy_tracker.get_energies_dict()) - self.report.log_energy() + # can only get overall energy consumption + self.report.overall.energy = energy_tracker.get_energy() + 
self.report.overall.efficiency = Efficiency.from_energy( + self.report.overall.energy, volume=self.overall_volume, unit=TRAIN_EFFICIENCY_UNIT + ) + + @property + def overall_volume(self) -> int: + return ( + self.config.max_steps + * self.config.training_arguments["per_device_train_batch_size"] + * self.config.training_arguments["gradient_accumulation_steps"] + ) + + @property + def warmup_volume(self) -> int: + return ( + self.config.warmup_steps + * self.config.training_arguments["per_device_train_batch_size"] + * self.config.training_arguments["gradient_accumulation_steps"] + ) + + @property + def train_volume(self) -> int: + return self.overall_volume - self.warmup_volume def get_report(self) -> TrainingReport: return self.report diff --git a/optimum_benchmark/benchmarks/training/callback.py b/optimum_benchmark/benchmarks/training/callback.py deleted file mode 100644 index 88026d79..00000000 --- a/optimum_benchmark/benchmarks/training/callback.py +++ /dev/null @@ -1,43 +0,0 @@ -import time -from typing import List - -import torch -from transformers import TrainerCallback - - -class LatencyTrainerCallback(TrainerCallback): - def __init__(self, device: str, backend: str) -> None: - self.device = device - self.backend = backend - self.all_latencies_list = [] - - def on_step_begin(self, *args, **kwargs): - # one record per step - if self.device == "cuda" and self.backend == "pytorch": - self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) - self.all_latencies_list[-1].record() - else: - self.all_latencies_list.append(time.perf_counter_ns()) - - def on_train_end(self, *args, **kwargs): - # one last record to measure the time of the last step - if self.device == "cuda" and self.backend == "pytorch": - self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) - self.all_latencies_list[-1].record() - else: - self.all_latencies_list.append(time.perf_counter_ns()) - - def get_latencies_list(self) -> List[float]: - if self.device == "cuda" and self.backend == "pytorch": - torch.cuda.synchronize() # synchronize the device to make sure all events have been recorded - latencies_list = [ - self.all_latencies_list[i - 1].elapsed_time(self.all_latencies_list[i]) * 1e-3 - for i in range(1, len(self.all_latencies_list)) - ] - else: - latencies_list = [ - (self.all_latencies_list[i] - self.all_latencies_list[i - 1]) * 1e-9 - for i in range(1, len(self.all_latencies_list)) - ] - - return latencies_list diff --git a/optimum_benchmark/benchmarks/training/config.py b/optimum_benchmark/benchmarks/training/config.py index e5d19581..6ea9d0b4 100644 --- a/optimum_benchmark/benchmarks/training/config.py +++ b/optimum_benchmark/benchmarks/training/config.py @@ -25,11 +25,7 @@ "ddp_find_unused_parameters": False, } -DATASET_SHAPES = { - "dataset_size": 500, - "sequence_length": 16, - "num_choices": 1, -} +DATASET_SHAPES = {"dataset_size": 500, "sequence_length": 16, "num_choices": 1} @dataclass @@ -63,7 +59,8 @@ def __post_init__(self): if self.max_steps != self.training_arguments["max_steps"]: LOGGER.warning( f"`benchmark.max_steps` ({self.max_steps}) and `benchmark.training_arguments.max_steps` " - f"({self.training_arguments['max_steps']}) are different. Using `benchmark.training_arguments.max_steps`." + f"({self.training_arguments['max_steps']}) are different. " + "Using `benchmark.training_arguments.max_steps`." 
) self.max_steps = self.training_arguments["max_steps"] diff --git a/optimum_benchmark/benchmarks/training/report.py b/optimum_benchmark/benchmarks/training/report.py deleted file mode 100644 index 9eeba211..00000000 --- a/optimum_benchmark/benchmarks/training/report.py +++ /dev/null @@ -1,169 +0,0 @@ -from dataclasses import dataclass, field -from statistics import mean, stdev -from typing import Any, Dict, List -from logging import getLogger - -from ..report import BenchmarkReport - -LOGGER = getLogger("report") - - -@dataclass -class TrainingReport(BenchmarkReport): - max_steps: int - warmup_steps: int - per_process_batch_size: int - gradient_accumulation_steps: int - - overall: Dict[str, Any] = field(default_factory=dict) - training: Dict[str, Any] = field(default_factory=dict) - warmup: Dict[str, Any] = field(default_factory=dict) - - world_size: int = 1 - - # POPULATING - def populate_latency(self, overall_latencies_list: List[float]) -> None: - assert ( - len(overall_latencies_list) == self.max_steps - ), f"Expected {self.max_steps} latencies, but got {len(overall_latencies_list)} latencies" - # Overall - ## Latency - self.overall["latency"] = { - "list[s/step]": overall_latencies_list, - "mean(s/step)": compute_mean(overall_latencies_list), - "stdev(s/step)": compute_stdev(overall_latencies_list), - } - ## Throughput - overall_throughputs_list = [ - self.world_size * self.per_process_batch_size * self.gradient_accumulation_steps / latency - for latency in overall_latencies_list - ] - self.overall["throughput"] = { - "list[samples/s]": overall_throughputs_list, - "mean(samples/s)": compute_mean(overall_throughputs_list), - "stdev(samples/s)": compute_stdev(overall_throughputs_list), - } - # Training - ## Latency - training_latencies_list = overall_latencies_list[self.warmup_steps :] - self.training["latency"] = { - "list[s/step]": training_latencies_list, - "mean(s/step)": compute_mean(training_latencies_list), - "stdev(s/step)": compute_stdev(training_latencies_list), - } - ## Throughput - training_throughputs_list = overall_throughputs_list[self.warmup_steps :] - self.training["throughput"] = { - "list[samples/s]": training_throughputs_list, - "mean(samples/s)": compute_mean(training_throughputs_list), - "stdev(samples/s)": compute_stdev(training_throughputs_list), - } - # Warmup - ## Latency - warmup_latencies_list = overall_latencies_list[: self.warmup_steps] - self.warmup["latency"] = { - "list[s/step]": warmup_latencies_list, - "mean(s/step)": compute_mean(warmup_latencies_list), - "stdev(s/step)": compute_stdev(warmup_latencies_list), - } - ## Throughput - warmup_throughputs_list = overall_throughputs_list[: self.warmup_steps] - self.warmup["throughput"] = { - "list[samples/s]": warmup_throughputs_list, - "mean(samples/s)": compute_mean(warmup_throughputs_list), - "stdev(samples/s)": compute_stdev(warmup_throughputs_list), - } - - def populate_memory(self, overall_memories_dict: Dict[str, float]) -> None: - self.warmup["memory"] = overall_memories_dict - self.overall["memory"] = overall_memories_dict - self.training["memory"] = overall_memories_dict - - def populate_energy(self, overall_energies_dict: Dict[str, float]) -> None: - self.overall["energy"] = overall_energies_dict - # can't get training only or warmup only energies - # self.warmup["energy"] = overall_energies_dict - # self.training["energy"] = overall_energies_dict - # TODO: use a callback for energy instead of a tracker - - # LOGGING - def log_latency(self): - for key, value in self.training["latency"].items(): - 
if "list" in key: - continue - LOGGER.info(f"\t+ training.latency.{key}: {value:f} (s)") - for key, value in self.training["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ training.throughput.{key}: {value:f} (samples/s)") - - def log_memory(self): - for key, value in self.training["memory"].items(): - LOGGER.info(f"\t+ training.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.overall["energy"].items(): - LOGGER.info(f"\t+ overall.energy.{key}: {value:f} (kWh)") - - def log_all(self): - if "latency" in self.training: - self.log_latency() - if "memory" in self.training: - self.log_memory() - if "energy" in self.training: - self.log_energy() - - # LOGIC - def __add__(self, other: "TrainingReport") -> "TrainingReport": - assert self.max_steps == other.max_steps, "Both reports must have the same max_steps" - assert self.warmup_steps == other.warmup_steps, "Both reports must have the same warmup_steps" - assert ( - self.gradient_accumulation_steps == other.gradient_accumulation_steps - ), "Both reports must have the same gradient_accumulation_steps" - - agg_report = TrainingReport( - max_steps=self.max_steps, - warmup_steps=self.warmup_steps, - world_size=self.world_size + other.world_size, - per_process_batch_size=self.per_process_batch_size, - gradient_accumulation_steps=self.gradient_accumulation_steps, - ) - - if "latency" in self.overall: - agg_overall_latencies_list = [ - max(lat_1, lat_2) - for lat_1, lat_2 in zip( - self.overall["latency"]["list[s/step]"], other.overall["latency"]["list[s/step]"] - ) - ] - agg_report.populate_latency(agg_overall_latencies_list) - - if "memory" in self.overall: - agg_overall_memories_dict = {} - for key in self.overall["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_overall_memories_dict[key] = max(self.overall["memory"][key], other.overall["memory"][key]) - else: - # ram and pytorch measures are process-specific (can be accumulated) - agg_overall_memories_dict[key] = self.overall["memory"][key] + other.overall["memory"][key] - - agg_report.populate_memory(agg_overall_memories_dict) - - if "energy" in self.overall: - agg_overall_energies_dict = {} - for key in self.overall["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_overall_energies_dict[key] = self.overall["energy"][key] + other.overall["energy"][key] - - agg_report.populate_energy(agg_overall_energies_dict) - - return agg_report - - -def compute_mean(values: List[float]) -> float: - return mean(values) if len(values) > 0 else 0.0 - - -def compute_stdev(values: List[float]) -> float: - return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py deleted file mode 100644 index 8b137891..00000000 --- a/optimum_benchmark/benchmarks/utils.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index 4961c189..f91a3b2c 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -1,28 +1,25 @@ -import os import glob +import os from logging import getLogger import hydra -from omegaconf import DictConfig, OmegaConf from hydra.core.config_store import ConfigStore +from omegaconf import DictConfig, OmegaConf -from .launchers.inline.config import InlineConfig -from .launchers.process.config import ProcessConfig -from .launchers.torchrun.config import TorchrunConfig - +from .backends.neural_compressor.config 
import INCConfig +from .backends.onnxruntime.config import ORTConfig from .backends.openvino.config import OVConfig from .backends.pytorch.config import PyTorchConfig -from .backends.onnxruntime.config import ORTConfig -from .backends.torch_ort.config import TorchORTConfig from .backends.tensorrt_llm.config import TRTLLMConfig -from .backends.neural_compressor.config import INCConfig from .backends.text_generation_inference.config import TGIConfig - +from .backends.torch_ort.config import TorchORTConfig +from .benchmarks.inference.config import InferenceConfig from .benchmarks.report import BenchmarkReport -from .experiment import launch, ExperimentConfig from .benchmarks.training.config import TrainingConfig -from .benchmarks.inference.config import InferenceConfig - +from .experiment import ExperimentConfig, launch +from .launchers.inline.config import InlineConfig +from .launchers.process.config import ProcessConfig +from .launchers.torchrun.config import TorchrunConfig LOGGER = getLogger("cli") @@ -49,33 +46,17 @@ # optimum-benchmark @hydra.main(version_base=None) def benchmark_cli(experiment_config: DictConfig) -> None: - os.environ["BENCHMARK_CLI"] = "1" + os.environ["BENCHMARK_INTERFACE"] = "CLI" - if glob.glob("*.csv") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1": + if glob.glob("benchmark_report.json") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1": LOGGER.warning( - "Skipping benchmark because results already exist. " - "Set OVERRIDE_BENCHMARKS=1 to override benchmark results." + "Benchmark report already exists. If you want to override it, set the environment variable OVERRIDE_BENCHMARKS=1" ) return - # fix backend until deprecated model and device are removed - if experiment_config.task is not None: - LOGGER.warning("`task` is deprecated in experiment. Use `backend.task` instead.") - experiment_config.backend.task = experiment_config.task - if experiment_config.model is not None: - LOGGER.warning("`model` is deprecated in experiment. Use `backend.model` instead.") - experiment_config.backend.model = experiment_config.model - if experiment_config.device is not None: - LOGGER.warning("`device` is deprecated in experiment. Use `backend.device` instead.") - experiment_config.backend.device = experiment_config.device - if experiment_config.library is not None: - LOGGER.warning("`library` is deprecated in experiment. 
Use `backend.library` instead.") - experiment_config.backend.library = experiment_config.library - # Instantiate the experiment configuration and trigger its __post_init__ experiment_config: ExperimentConfig = OmegaConf.to_object(experiment_config) - OmegaConf.save(experiment_config, "experiment_config.yaml", resolve=True) + experiment_config.to_json("experiment_config.json") benchmark_report: BenchmarkReport = launch(experiment_config=experiment_config) - benchmark_report.to_json("benchmark_report.json") diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py deleted file mode 100644 index ed4b710b..00000000 --- a/optimum_benchmark/env_utils.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import re -import platform -import subprocess -import importlib.util -from typing import Optional, List - -from .import_utils import is_py3nvml_available, is_pyrsmi_available - -import psutil - - -def is_nvidia_system(): - try: - subprocess.check_output("nvidia-smi") - return True - except Exception: - return False - - -def is_rocm_system(): - try: - subprocess.check_output("rocm-smi") - return True - except Exception: - return False - - -def bytes_to_mega_bytes(bytes: int) -> int: - # MB, not MiB - # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units - return int(bytes * 1e-6) - - -def get_cpu() -> Optional[str]: - if platform.system() == "Windows": - return platform.processor() - - elif platform.system() == "Darwin": - command = "sysctl -n machdep.cpu.brand_string" - return str(subprocess.check_output(command, shell=True).decode().strip()) - - elif platform.system() == "Linux": - command = "cat /proc/cpuinfo" - all_info = subprocess.check_output(command, shell=True).decode().strip() - for line in all_info.split("\n"): - if "model name" in line: - return re.sub(".*model name.*:", "", line, 1) - return "Could not find device name" - - else: - raise ValueError(f"Unknown system '{platform.system()}'") - - -def get_cpu_ram_mb(): - return bytes_to_mega_bytes(psutil.virtual_memory().total) - - -def get_gpus(): - if is_nvidia_system(): - if not is_py3nvml_available(): - raise ValueError( - "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) - import py3nvml.py3nvml as nvml - - gpus = [] - nvml.nvmlInit() - device_count = nvml.nvmlDeviceGetCount() - for i in range(device_count): - handle = nvml.nvmlDeviceGetHandleByIndex(i) - gpus.append(nvml.nvmlDeviceGetName(handle)) - nvml.nvmlShutdown() - elif is_rocm_system(): - if not is_pyrsmi_available(): - raise ValueError( - "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " - "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." - ) - from pyrsmi import rocml - - rocml.smi_initialize() - - device_count = rocml.smi_get_device_count() - - gpus = [rocml.smi_get_device_name(index) for index in range(device_count)] - rocml.smi_shutdown() - else: - gpus = [] - - return gpus - - -def get_gpu_vram_mb() -> List[int]: - if is_nvidia_system(): - if not is_py3nvml_available(): - raise ValueError( - "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." 
- ) - import py3nvml.py3nvml as nvml - - nvml.nvmlInit() - device_count = nvml.nvmlDeviceGetCount() - vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)] - nvml.nvmlShutdown() - elif is_rocm_system(): - if not is_pyrsmi_available(): - raise ValueError( - "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " - "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." - ) - - from pyrsmi import rocml - - rocml.smi_initialize() - device_count = rocml.smi_get_device_count() - vrams = [rocml.smi_get_device_memory_total(index) for index in range(device_count)] - rocml.smi_shutdown() - else: - vrams = [] - - return sum(vrams) - - -def get_cuda_device_ids() -> str: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - device_ids = os.environ["CUDA_VISIBLE_DEVICES"] - else: - if is_nvidia_system(): - if not is_py3nvml_available(): - raise ValueError( - "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) - import py3nvml.py3nvml as nvml - - nvml.nvmlInit() - device_ids = list(range(nvml.nvmlDeviceGetCount())) - nvml.nvmlShutdown() - elif is_rocm_system(): - if not is_pyrsmi_available(): - raise ValueError( - "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " - "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." - ) - - from pyrsmi import rocml - - rocml.smi_initialize() - device_ids = list(range(rocml.smi_get_device_count())) - rocml.smi_shutdown() - else: - raise ValueError("No NVIDIA or ROCm GPUs found.") - - return ",".join(str(i) for i in device_ids) - - -def get_git_revision_hash(package_name: str) -> Optional[str]: - """ - Returns the git commit SHA of a package installed from a git repository. 
- """ - - try: - path = importlib.util.find_spec(package_name).origin - except Exception: - return None - - try: - git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() - except Exception: - return None - - return git_hash diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index c9b6d733..c9a556cc 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -1,48 +1,38 @@ import os -import platform +from dataclasses import asdict, dataclass, field from logging import getLogger from tempfile import TemporaryDirectory -from dataclasses import dataclass, field -from typing import Any, Dict, Type, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Optional, Type, Union -from hydra.utils import get_class - -from .benchmarks.report import BenchmarkReport +from .backends.config import BackendConfig from .benchmarks.config import BenchmarkConfig +from .benchmarks.report import BenchmarkReport +from .import_utils import get_hf_libs_info from .launchers.config import LauncherConfig -from .backends.config import BackendConfig -from .import_utils import ( - transformers_version, - accelerate_version, - diffusers_version, - optimum_version, - timm_version, - peft_version, -) -from .env_utils import ( - get_git_revision_hash, - is_nvidia_system, - is_rocm_system, - get_gpu_vram_mb, - get_cpu_ram_mb, - get_gpus, - get_cpu, -) +from .system_utils import get_system_info if TYPE_CHECKING: # avoid importing any torch to be able to set # the CUDA_VISIBLE_DEVICES environment variable # in BackendConfig __post_init__ + from .backends.base import Backend from .benchmarks.base import Benchmark from .launchers.base import Launcher - from .backends.base import Backend +from json import dump + +import pandas as pd +from flatten_dict import flatten +from hydra.utils import get_class +from transformers.configuration_utils import PushToHubMixin LOGGER = getLogger("experiment") +EXPERIMENT_FILE_NAME = "experiment_config.json" + @dataclass -class ExperimentConfig: +class ExperimentConfig(PushToHubMixin): # BACKEND CONFIGURATION backend: Any # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386 # LAUNCHER CONFIGURATION @@ -59,39 +49,62 @@ class ExperimentConfig: library: Optional[str] = None # deprecated # ENVIRONMENT CONFIGURATION - environment: Dict = field( - default_factory=lambda: { - "cpu": get_cpu(), - "cpu_count": os.cpu_count(), - "cpu_ram_mb": get_cpu_ram_mb(), - "system": platform.system(), - "python_version": platform.python_version(), - # libraries - "transformers_version": transformers_version(), - "transformers_commit": get_git_revision_hash("transformers"), - "accelerate_version": accelerate_version(), - "accelerate_commit": get_git_revision_hash("accelerate"), - "diffusers_version": diffusers_version(), - "diffusers_commit": get_git_revision_hash("diffusers"), - "optimum_version": optimum_version(), - "optimum_commit": get_git_revision_hash("optimum"), - "timm_version": timm_version(), - "timm_commit": get_git_revision_hash("timm"), - "peft_version": peft_version(), - "peft_commit": get_git_revision_hash("peft"), - } - ) - - def __post_init__(self): - # adding GPU information to the environment - if is_nvidia_system() or is_rocm_system(): - available_gpus = get_gpus() - if len(available_gpus) > 0: - self.environment["gpu"] = available_gpus[0] - self.environment["gpu_count"] = len(available_gpus) - self.environment["gpu_vram_mb"] = get_gpu_vram_mb() - else: - 
LOGGER.warning("Detected NVIDIA or ROCm system, but no GPUs found.") + environment: Dict = field(default_factory=lambda: {**get_system_info(), **get_hf_libs_info()}) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + def to_flat_dict(self) -> Dict[str, Any]: + report_dict = self.to_dict() + return flatten(report_dict, reducer="dot") + + def to_json(self, path: str, flat: bool = False) -> None: + if flat: + with open(path, "w") as f: + dump(self.to_flat_dict(), f, indent=4) + else: + with open(path, "w") as f: + dump(self.to_dict(), f, indent=4) + + def to_dataframe(self) -> pd.DataFrame: + flat_report_dict = self.to_flat_dict() + return pd.DataFrame.from_dict(flat_report_dict, orient="index") + + def to_csv(self, path: str) -> None: + self.to_dataframe().to_csv(path, index=False) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + config_file_name: Optional[Union[str, os.PathLike]] = None, + push_to_hub: bool = False, + **kwargs, + ): + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + kwargs["token"] = use_auth_token + + config_file_name = config_file_name if config_file_name is not None else EXPERIMENT_FILE_NAME + + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + output_config_file = os.path.join(save_directory, config_file_name) + self.to_json(output_config_file, flat=False) + + if push_to_hub: + self._upload_modified_files( + save_directory, repo_id, files_timestamps, commit_message=commit_message, token=kwargs.get("token") + ) def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> BenchmarkReport: @@ -131,11 +144,27 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Ben def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: - if os.environ.get("BENCHMARK_CLI", "0") == "0": + # fix backend until deprecated model and device are removed + if experiment_config.task is not None: + LOGGER.warning("`task` is deprecated in experiment config. Use `backend.task` instead.") + experiment_config.backend.task = experiment_config.task + if experiment_config.model is not None: + LOGGER.warning("`model` is deprecated in experiment config. Use `backend.model` instead.") + experiment_config.backend.model = experiment_config.model + if experiment_config.device is not None: + LOGGER.warning("`device` is deprecated in experiment config. Use `backend.device` instead.") + experiment_config.backend.device = experiment_config.device + if experiment_config.library is not None: + LOGGER.warning("`library` is deprecated in experiment config. 
Use `backend.library` instead.") + experiment_config.backend.library = experiment_config.library + + original_dir = os.getcwd() + tmpdir = TemporaryDirectory() + + if os.environ.get("BENCHMARK_INTERFACE", "API") == "API": + # to not pollute the user's environment LOGGER.info("Launching experiment in a temporary directory.") - tmep_dir = TemporaryDirectory() - original_dir = os.getcwd() - os.chdir(tmep_dir.name) + os.chdir(tmpdir.name) launcher_config: LauncherConfig = experiment_config.launcher @@ -145,6 +174,7 @@ def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: launcher: Launcher = launcher_factory(launcher_config) except Exception as e: LOGGER.error(f"Error during launcher allocation: {e}") + tmpdir.cleanup() raise e backend_config: BackendConfig = experiment_config.backend @@ -154,10 +184,11 @@ def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: output = launcher.launch(run, benchmark_config, backend_config) except Exception as e: LOGGER.error(f"Error during experiment launching: {e}") + tmpdir.cleanup() raise e - if os.environ.get("BENCHMARK_CLI", "0") == "0": + if os.environ.get("BENCHMARK_INTERFACE", "API") == "API": os.chdir(original_dir) - tmep_dir.cleanup() + tmpdir.cleanup() return output diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index 13f1d9aa..0dfc3050 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -28,27 +28,17 @@ def __call__(self, mode: str) -> Dict[str, Any]: if mode == "generate": if "pixel_values" in task_input: # image input - task_input = { - "inputs": task_input["pixel_values"], - } + task_input = {"inputs": task_input["pixel_values"]} elif "input_values" in task_input: # speech input - task_input = { - "inputs": task_input["input_values"], - } + task_input = {"inputs": task_input["input_values"]} elif "input_features" in task_input: # waveform input - task_input = { - "inputs": task_input["input_features"], - } + task_input = {"inputs": task_input["input_features"]} elif "input_ids" in task_input: # text input - task_input = { - "inputs": task_input["input_ids"], - } + task_input = {"inputs": task_input["input_ids"]} elif mode == "call": - task_input = { - "prompt": task_input["prompt"], - } + task_input = {"prompt": task_input["prompt"]} return task_input diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 1f3e9b23..683d8963 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -43,40 +43,28 @@ def input_ids(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["vocab_size"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def attention_mask(self): return self.generate_random_integers( min_value=1, # avoid sparse attention max_value=2, - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def token_type_ids(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["type_vocab_size"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def position_ids(self): return self.generate_ranges( start=0, stop=self.shapes["sequence_length"], - 
shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def requires_token_type_ids(self): @@ -91,44 +79,28 @@ def pixel_values(self): return self.generate_random_floats( min_value=0, max_value=1, - shape=( - self.shapes["batch_size"], - self.shapes["num_channels"], - self.shapes["height"], - self.shapes["width"], - ), + shape=(self.shapes["batch_size"], self.shapes["num_channels"], self.shapes["height"], self.shapes["width"]), ) class AudioGenerator(TaskGenerator): def input_values(self): return self.generate_random_floats( - min_value=-1, - max_value=1, - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + min_value=-1, max_value=1, shape=(self.shapes["batch_size"], self.shapes["sequence_length"]) ) def input_features(self): return self.generate_random_floats( min_value=-1, max_value=1, - shape=( - self.shapes["batch_size"], - self.shapes["feature_size"], - self.shapes["nb_max_frames"], - ), + shape=(self.shapes["batch_size"], self.shapes["feature_size"], self.shapes["nb_max_frames"]), ) class TextClassificationGenerator(TextGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -154,10 +126,7 @@ def labels(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["num_labels"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def __call__(self): @@ -199,16 +168,12 @@ def __call__(self): class QuestionAnsweringGenerator(TextGenerator): def start_positions(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["sequence_length"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],) ) def end_positions(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["sequence_length"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -247,9 +212,7 @@ def __call__(self): class MultipleChoiceGenerator(TextGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_choices"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_choices"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -283,9 +246,7 @@ def __call__(self): class ImageClassificationGenerator(ImageGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -303,15 +264,9 @@ def labels(self): return [ { "class_labels": self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["num_queries"],), - ), - "boxes": self.generate_random_floats( - min_value=-1, - max_value=1, - shape=(self.shapes["num_queries"], 4), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["num_queries"],) ), + "boxes": self.generate_random_floats(min_value=-1, max_value=1, shape=(self.shapes["num_queries"], 4)), } for _ in 
range(self.shapes["batch_size"]) ] @@ -331,11 +286,7 @@ def labels(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["num_labels"], - shape=( - self.shapes["batch_size"], - self.shapes["height"], - self.shapes["width"], - ), + shape=(self.shapes["batch_size"], self.shapes["height"], self.shapes["width"]), ) def __call__(self): @@ -351,9 +302,7 @@ def __call__(self): class AudioClassificationGenerator(AudioGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -371,10 +320,7 @@ def labels(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["vocab_size"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def __call__(self): diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index f19fbda3..f247eaf3 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -1,6 +1,7 @@ import importlib.metadata import importlib.util - +import subprocess +from typing import Optional _transformers_available = importlib.util.find_spec("transformers") is not None _accelerate_available = importlib.util.find_spec("accelerate") is not None @@ -10,12 +11,11 @@ _onnx_available = importlib.util.find_spec("onnx") is not None _tensorrt_available = importlib.util.find_spec("tensorrt") is not None _peft_available = importlib.util.find_spec("peft") is not None -_py3nvml_available = importlib.util.find_spec("py3nvml") is not None +_pynvml_available = importlib.util.find_spec("pynvml") is not None _torch_distributed_available = importlib.util.find_spec("torch.distributed") is not None _onnxruntime_available = importlib.util.find_spec("onnxruntime") is not None _openvino_available = importlib.util.find_spec("openvino") is not None _neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None -_pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None _codecarbon_available = importlib.util.find_spec("codecarbon") is not None _amdsmi_available = importlib.util.find_spec("amdsmi") is not None _tensorflow_available = importlib.util.find_spec("tensorflow") is not None @@ -25,6 +25,7 @@ _deepspeed_available = importlib.util.find_spec("deepspeed") is not None _tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None _psutil_available = importlib.util.find_spec("psutil") is not None +_optimum_benchmark_available = importlib.util.find_spec("optimum_benchmark") is not None def is_psutil_available(): @@ -83,12 +84,8 @@ def is_onnxruntime_available(): return _onnxruntime_available -def is_py3nvml_available(): - return _py3nvml_available - - -def is_pyrsmi_available(): - return _pyrsmi_available +def is_pynvml_available(): + return _pynvml_available def is_amdsmi_available(): @@ -178,3 +175,45 @@ def peft_version(): def tesnorrt_llm_version(): if _tensorrt_llm_available: return importlib.metadata.version("tensorrt_llm") + + +def optimum_benchmark_version(): + if _optimum_benchmark_available: + return importlib.metadata.version("optimum_benchmark") + + +def get_git_revision_hash(package_name: str) -> Optional[str]: + """ + Returns the git commit SHA of a package installed from a git repository. 
+ """ + + try: + path = importlib.util.find_spec(package_name).origin + except Exception: + return None + + try: + git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() + except Exception: + return None + + return git_hash + + +def get_hf_libs_info(): + return { + "optimum_benchmark_version": optimum_benchmark_version(), + "optimum_benchmark_commit": get_git_revision_hash("optimum_benchmark"), + "transformers_version": transformers_version(), + "transformers_commit": get_git_revision_hash("transformers"), + "accelerate_version": accelerate_version(), + "accelerate_commit": get_git_revision_hash("accelerate"), + "diffusers_version": diffusers_version(), + "diffusers_commit": get_git_revision_hash("diffusers"), + "optimum_version": optimum_version(), + "optimum_commit": get_git_revision_hash("optimum"), + "timm_version": timm_version(), + "timm_commit": get_git_revision_hash("timm"), + "peft_version": peft_version(), + "peft_commit": get_git_revision_hash("peft"), + } diff --git a/optimum_benchmark/launchers/base.py b/optimum_benchmark/launchers/base.py index 91b50da0..4d5323f4 100644 --- a/optimum_benchmark/launchers/base.py +++ b/optimum_benchmark/launchers/base.py @@ -1,7 +1,8 @@ from abc import ABC from logging import getLogger -from typing import Callable, ClassVar, Generic, Dict, Any +from typing import Callable, ClassVar, Generic +from ..benchmarks.report import BenchmarkReport from .config import LauncherConfigT LOGGER = getLogger("launcher") @@ -16,5 +17,5 @@ def __init__(self, config: LauncherConfigT): LOGGER.info(f"ูŽูŽAllocating {self.NAME} launcher") self.config = config - def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: + def launch(self, worker: Callable, *worker_args) -> BenchmarkReport: raise NotImplementedError("Launcher must implement launch method") diff --git a/optimum_benchmark/launchers/config.py b/optimum_benchmark/launchers/config.py index 2d04caa4..938c3c97 100644 --- a/optimum_benchmark/launchers/config.py +++ b/optimum_benchmark/launchers/config.py @@ -1,7 +1,7 @@ from abc import ABC -from typing import TypeVar -from logging import getLogger from dataclasses import dataclass +from logging import getLogger +from typing import TypeVar LOGGER = getLogger("launcher") diff --git a/optimum_benchmark/launchers/inline/launcher.py b/optimum_benchmark/launchers/inline/launcher.py index e5702ba1..64a8002c 100644 --- a/optimum_benchmark/launchers/inline/launcher.py +++ b/optimum_benchmark/launchers/inline/launcher.py @@ -1,10 +1,10 @@ -import os from logging import getLogger -from typing import Callable, Dict, Any +from typing import Callable +from ...benchmarks.report import BenchmarkReport from ..base import Launcher -from .config import InlineConfig from ..isolation_utils import device_isolation +from .config import InlineConfig LOGGER = getLogger("inline") @@ -15,12 +15,9 @@ class InlineLauncher(Launcher[InlineConfig]): def __init__(self, config: InlineConfig): super().__init__(config) - def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: - with device_isolation( - benchmark_pid=os.getpid(), - enabled=self.config.device_isolation, - ): - LOGGER.info("\t+ Launching inline experiment (no process isolation)") - report: Dict[str, Any] = worker(*worker_args) + def launch(self, worker: Callable, *worker_args) -> BenchmarkReport: + with device_isolation(enabled=self.config.device_isolation): + LOGGER.info("\t+ Launching inline worker (no process isolation)") + report = worker(*worker_args) return report 
diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py index 52006bcc..f8a0074c 100644 --- a/optimum_benchmark/launchers/isolation_utils.py +++ b/optimum_benchmark/launchers/isolation_utils.py @@ -1,61 +1,64 @@ import os -import time import signal -from typing import Dict, Set +import time +from contextlib import contextmanager from logging import getLogger from multiprocessing import Process -from contextlib import contextmanager +from typing import Dict, Set +from ..import_utils import is_amdsmi_available, is_psutil_available, is_pynvml_available from ..logging_utils import setup_logging -from ..env_utils import is_nvidia_system, is_rocm_system -from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version, is_psutil_available +from ..system_utils import get_rocm_version, is_nvidia_system, is_rocm_system if is_psutil_available(): import psutil -if is_py3nvml_available(): - import py3nvml.py3nvml as nvml +if is_pynvml_available(): + import pynvml if is_amdsmi_available(): - import amdsmi # type: ignore + import amdsmi LOGGER = getLogger("isolation") def get_nvidia_devices_pids() -> Dict[int, list]: + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to get the pids running on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + devices_pids: Dict[int, list] = {} devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] - if not is_py3nvml_available(): - raise ValueError("get_nvidia_device_pids requires py3nvml. Please install it with `pip install py3nvml`.") - - nvml.nvmlInit() + pynvml.nvmlInit() for device_id in devices_ids: - device_handle = nvml.nvmlDeviceGetHandleByIndex(device_id) - device_processes = nvml.nvmlDeviceGetComputeRunningProcesses(device_handle) + device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle) for device_process in device_processes: if device_id not in devices_pids: devices_pids[device_id] = [] devices_pids[device_id].append(device_process.pid) - nvml.nvmlShutdown() + pynvml.nvmlShutdown() return devices_pids def get_amd_devices_pids() -> Dict[int, list]: - devices_pids: Dict[int, list] = {} - rocm_version = torch_version().split("rocm")[-1] - devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] - if not is_amdsmi_available(): raise ValueError( - "get_amd_devices_pids requires amdsmi. " - "Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master" + "The library amdsmi is required get the pids running on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." 
) + devices_pids: Dict[int, list] = {} + rocm_version = get_rocm_version() + devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] + amdsmi.amdsmi_init() if rocm_version >= "5.7": @@ -115,7 +118,6 @@ def get_amd_devices_pids() -> Dict[int, list]: def get_pids_running_on_system_device() -> Set[int]: """Returns the set of pids running on the system device(s).""" - if is_nvidia_system(): devices_pids = get_nvidia_devices_pids() elif is_rocm_system(): @@ -128,29 +130,28 @@ def get_pids_running_on_system_device() -> Set[int]: return all_devices_pids -def assert_system_devices_isolation(benchmark_pid: int) -> None: +def assert_system_devices_isolation(main_pid: int) -> None: setup_logging("ERROR") - isolation_pid = os.getpid() - while psutil.pid_exists(benchmark_pid): + while psutil.pid_exists(main_pid): child_processes = set() non_permitted_pids = set() all_devices_pids = get_pids_running_on_system_device() for pid in list(all_devices_pids): - if pid == benchmark_pid or pid == isolation_pid: + if pid == main_pid or pid == isolation_pid: continue try: info = psutil.Process(pid) parent_pid = info.ppid() except Exception as e: - LOGGER.error(f"Failed to get info for process {pid} with error {e}") + LOGGER.error(f"Failed to get parent pid for process {pid} with error {e}") parent_pid = None - if parent_pid == benchmark_pid or parent_pid == isolation_pid: + if parent_pid == main_pid or parent_pid == isolation_pid: child_processes.add(pid) else: non_permitted_pids.add(pid) @@ -159,29 +160,25 @@ def assert_system_devices_isolation(benchmark_pid: int) -> None: LOGGER.error(f"Found non-permitted process(es) running on system device(s): {non_permitted_pids}") for pid in child_processes: try: - LOGGER.error(f"Terminating child process {pid}") - os.kill(pid, signal.SIGTERM) + LOGGER.error(f"Interrupting child process {pid} of main process {main_pid}") + os.kill(pid, signal.SIGINT) except Exception as e: LOGGER.error(f"Failed to terminate child process {pid} with error {e}") - LOGGER.error(f"Terminating benchmark process {benchmark_pid}") - os.kill(benchmark_pid, signal.SIGTERM) - break + LOGGER.error(f"Interrupting main process {main_pid}...") + os.kill(main_pid, signal.SIGINT) + exit(1) time.sleep(1) @contextmanager -def device_isolation(benchmark_pid: int, enabled: bool): +def device_isolation(enabled: bool): if not enabled: yield return - isolation_process = Process( - target=assert_system_devices_isolation, - kwargs={"benchmark_pid": benchmark_pid}, - daemon=True, - ) + isolation_process = Process(target=assert_system_devices_isolation, kwargs={"main_pid": os.getpid()}, daemon=True) isolation_process.start() LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}.") diff --git a/optimum_benchmark/launchers/process/launcher.py b/optimum_benchmark/launchers/process/launcher.py index b2619d2f..c08061a5 100644 --- a/optimum_benchmark/launchers/process/launcher.py +++ b/optimum_benchmark/launchers/process/launcher.py @@ -1,13 +1,13 @@ -import os -import multiprocessing as mp from logging import getLogger -from typing import Callable, Dict, Any -from multiprocessing import Process, Queue +from typing import Callable -from ..isolation_utils import device_isolation +import torch.multiprocessing as mp + +from ...benchmarks.report import BenchmarkReport from ...logging_utils import setup_logging -from .config import ProcessConfig from ..base import Launcher +from ..isolation_utils import device_isolation +from .config import ProcessConfig LOGGER = 
getLogger("process") @@ -22,35 +22,44 @@ def __init__(self, config: ProcessConfig): LOGGER.info(f"\t+ Setting multiprocessing start method to {self.config.start_method}.") mp.set_start_method(self.config.start_method, force=True) - def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: - # worker process can't be daemon since it might spawn its own processes - queue = Queue() - current_log_level = getLogger().getEffectiveLevel() - worker_process = Process( - daemon=False, - target=target, - args=(worker, queue, current_log_level, *worker_args), - ) - worker_process.start() - LOGGER.info(f"\t+ Launched worker process with PID {worker_process.pid}.") + def launch(self, worker: Callable, *worker_args) -> BenchmarkReport: + log_level = getLogger().getEffectiveLevel() + + ctx = mp.get_context(self.config.start_method) + queue = ctx.Queue() + lock = ctx.Lock() - with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()): - worker_process.join() + with device_isolation(enabled=self.config.device_isolation): + process_context = mp.start_processes( + entrypoint, + args=(worker, queue, lock, log_level, *worker_args), + start_method=self.config.start_method, + daemon=False, + join=False, + nprocs=1, + ) + LOGGER.info(f"\t+ Launched worker process(es) with PID(s): {process_context.pids()}") + while not process_context.join(): + pass - if worker_process.exitcode != 0: - LOGGER.error(f"\t+ Worker process exited with code {worker_process.exitcode}, forwarding...") - exit(worker_process.exitcode) + # restore the original logging configuration + setup_logging(log_level) - report = queue.get() + report: BenchmarkReport = queue.get() return report -def target(fn, q, log_level, *args): - """This a pickalable function that correctly sets up the logging configuration for the worker process.""" +def entrypoint(i, worker, queue, lock, log_level, *worker_args): + """ + This a pickalable function that correctly sets up the logging configuration for the worker process, + and puts the output of the worker function into a lock-protected queue. 
+ """ - setup_logging(log_level) + setup_logging(log_level, prefix=f"PROC-{i}") - out = fn(*args) + worker_output = worker(*worker_args) - q.put(out) + lock.acquire() + queue.put(worker_output) + lock.release() diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py index 2d87ff03..c1fbfc38 100644 --- a/optimum_benchmark/launchers/torchrun/config.py +++ b/optimum_benchmark/launchers/torchrun/config.py @@ -1,7 +1,7 @@ import uuid +from dataclasses import dataclass, field from logging import getLogger from typing import Any, Dict, Optional -from dataclasses import dataclass, field from ..config import LauncherConfig diff --git a/optimum_benchmark/launchers/torchrun/launcher.py b/optimum_benchmark/launchers/torchrun/launcher.py index f327e85c..d5351a34 100644 --- a/optimum_benchmark/launchers/torchrun/launcher.py +++ b/optimum_benchmark/launchers/torchrun/launcher.py @@ -1,23 +1,17 @@ -import os -import multiprocessing as mp from logging import getLogger -from multiprocessing import Queue -from typing import Callable, Dict, Any +from typing import Any, Callable, Dict, List -from ..base import Launcher -from .config import TorchrunConfig +import torch.distributed +import torch.multiprocessing as mp +from torch.distributed.elastic.multiprocessing import Std +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.launcher.api import LaunchConfig, launch_agent + +from ...benchmarks.report import BenchmarkReport from ...logging_utils import setup_logging +from ..base import Launcher from ..isolation_utils import device_isolation -from ...benchmarks.report import BenchmarkReport -from ...import_utils import is_torch_distributed_available - -if is_torch_distributed_available(): - import torch.distributed - from torch.distributed import FileStore - from torch.distributed.elastic.multiprocessing import Std - from torch.distributed.elastic.multiprocessing.errors import record - from torch.distributed.launcher.api import LaunchConfig, launch_agent - +from .config import TorchrunConfig LOGGER = getLogger("torchrun") @@ -33,6 +27,7 @@ def __init__(self, config: TorchrunConfig): mp.set_start_method(self.config.start_method, force=True) def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: + log_level = getLogger().getEffectiveLevel() launch_config = LaunchConfig( min_nodes=self.config.min_nodes, max_nodes=self.config.max_nodes, @@ -51,55 +46,51 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: local_addr=self.config.local_addr, log_dir=self.config.log_dir, ) - queue = Queue() - current_log_level = getLogger().getEffectiveLevel() - with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()): + ctx = mp.get_context(self.config.start_method) + queue = ctx.Queue() + lock = ctx.Lock() + + with device_isolation(enabled=self.config.device_isolation): LOGGER.info(f"\t+ Launching torchrun agent with {self.config.nproc_per_node} workers processes") launch_agent( - config=launch_config, - entrypoint=entrypoint, - args=(worker, queue, current_log_level, *worker_args), + entrypoint=entrypoint, args=(worker, queue, lock, log_level, *worker_args), config=launch_config ) - outputs = [] + # restore the original logging configuration + setup_logging(log_level) + reports: List[BenchmarkReport] = [] while not queue.empty(): - outputs.append(queue.get()) + reports.append(queue.get()) - if len(outputs) == 1: - report: BenchmarkReport = outputs[0] + if len(reports) > 1: + 
LOGGER.info(f"\t+ Merging benchmark reports from {len(reports)} workers") + report = reports[0].aggregate(reports) + elif len(reports) == 1: + report = reports[0] else: - LOGGER.info(f"\t+ Merging benchmark reports from {len(outputs)} workers") - report: BenchmarkReport = sum(outputs[1:], outputs[0]) - report.log_all() + raise ValueError("No benchmark report was returned by the workers") + + report.log() return report @record -def entrypoint(fn, q, log_level, *args): +def entrypoint(worker, queue, lock, log_level, *worker_args): """ This a pickalable function that correctly sets up the logging configuration """ - if not torch.distributed.is_initialized(): - # initialize the process group if not already initialized - backend = "nccl" if torch.cuda.is_available() else "gloo" - torch.distributed.init_process_group(backend=backend) - rank = torch.distributed.get_rank() - - if torch.cuda.is_available(): - torch.cuda.set_device(rank) + torch.distributed.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo") - if rank == 0: - setup_logging(level=log_level, prefix="RANK-0") - else: - setup_logging(level="ERROR") + rank = torch.distributed.get_rank() + torch.cuda.set_device(rank) if torch.cuda.is_available() else None + setup_logging(level=log_level, prefix=f"RANK-{rank}") if rank == 0 else None - # TODO: use a tcp store instead - store = FileStore("torchrun.filestore") - store.set(f"rank_{rank}", str(os.getpid())) + output = worker(*worker_args) - output = fn(*args) - q.put(output) + lock.acquire() + queue.put(output) + lock.release() diff --git a/optimum_benchmark/logging_utils.py b/optimum_benchmark/logging_utils.py index 72f76889..c4c5ab6a 100644 --- a/optimum_benchmark/logging_utils.py +++ b/optimum_benchmark/logging_utils.py @@ -1,9 +1,9 @@ -import os import logging import logging.config +import os from logging import Logger +from subprocess import PIPE, STDOUT, Popen from typing import Optional -from subprocess import Popen, PIPE, STDOUT from omegaconf import OmegaConf @@ -14,34 +14,19 @@ "colorlog": { "()": "colorlog.ColoredFormatter", "format": "[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s", - "log_colors": { - "DEBUG": "purple", - "INFO": "green", - "WARNING": "yellow", - "CRITICAL": "red", - "ERROR": "red", - }, - }, - }, - "handlers": { - "console": { - "formatter": "colorlog", - "stream": "ext://sys.stdout", - "class": "logging.StreamHandler", + "log_colors": {"DEBUG": "purple", "INFO": "green", "WARNING": "yellow", "CRITICAL": "red", "ERROR": "red"}, }, }, + "handlers": {"console": {"formatter": "colorlog", "stream": "ext://sys.stdout", "class": "logging.StreamHandler"}}, "root": {"level": "INFO", "handlers": ["console"]}, "disable_existing_loggers": False, } def setup_logging(level: str = "INFO", prefix: Optional[str] = None): - if os.environ.get("BENCHMARK_CLI", "0") == "1": + if os.environ.get("BENCHMARK_INTERFACE", "API") == "CLI": hydra_config = OmegaConf.load(".hydra/hydra.yaml") - job_logging = OmegaConf.to_container( - hydra_config.hydra.job_logging, - resolve=True, - ) + job_logging = OmegaConf.to_container(hydra_config.hydra.job_logging, resolve=True) else: job_logging = API_JOB_LOGGING.copy() diff --git a/optimum_benchmark/system_utils.py b/optimum_benchmark/system_utils.py new file mode 100644 index 00000000..52d59383 --- /dev/null +++ b/optimum_benchmark/system_utils.py @@ -0,0 +1,219 @@ +import os +import platform +import re +import subprocess +from typing import List, Optional + +import 
psutil + +from .import_utils import is_amdsmi_available, is_pynvml_available + + +## CPU related stuff +def get_cpu() -> Optional[str]: + if platform.system() == "Windows": + return platform.processor() + + elif platform.system() == "Darwin": + command = "sysctl -n machdep.cpu.brand_string" + return str(subprocess.check_output(command, shell=True).decode().strip()) + + elif platform.system() == "Linux": + command = "cat /proc/cpuinfo" + all_info = subprocess.check_output(command, shell=True).decode().strip() + for line in all_info.split("\n"): + if "model name" in line: + return re.sub(".*model name.*:", "", line, 1) + return "Could not find device name" + + else: + raise ValueError(f"Unknown system '{platform.system()}'") + + +def get_cpu_ram_mb(): + return psutil.virtual_memory().total / 1e6 + + +## GPU related stuff +try: + subprocess.check_output("nvidia-smi") + _nvidia_system = True +except Exception: + _nvidia_system = False + +try: + subprocess.check_output("rocm-smi") + _rocm_system = True +except Exception: + _rocm_system = False + + +def is_nvidia_system(): + return _nvidia_system + + +def is_rocm_system(): + return _rocm_system + + +if is_nvidia_system() and is_pynvml_available(): + import pynvml + +if is_rocm_system() and is_amdsmi_available(): + import amdsmi + + +def get_rocm_version(): + for folder in os.listdir("/opt/"): + if "rocm" in folder and "rocm" != folder: + return folder.split("-")[-1] + raise ValueError("Could not find ROCm version.") + + +def get_gpus(): + if is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + + gpus = [] + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + for i in range(device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpus.append(pynvml.nvmlDeviceGetName(handle)) + pynvml.nvmlShutdown() + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." + ) + + gpus = [] + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + if rocm_version >= "5.7": + devices_handles = amdsmi.amdsmi_get_processor_handles() + for device_handle in devices_handles: + gpus.append(amdsmi.amdsmi_get_gpu_vendor_name(device_handle)) + else: + devices_handles = amdsmi.amdsmi_get_device_handles() + for device_handle in devices_handles: + gpus.append(amdsmi.amdsmi_dev_get_vendor_name(device_handle)) + amdsmi.amdsmi_shut_down() + else: + raise ValueError("No NVIDIA or ROCm GPUs found.") + + return gpus + + +def get_gpu_vram_mb() -> List[int]: + if is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + vrams = [ + pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count) + ] + pynvml.nvmlShutdown() + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. 
" + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." + ) + + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + + if rocm_version >= "5.7": + device_handles = amdsmi.amdsmi_get_processor_handles() + vrams = [ + amdsmi.amdsmi_get_gpu_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM) + for device_handle in device_handles + ] + else: + device_handles = amdsmi.amdsmi_get_device_handles() + vrams = [ + amdsmi.amdsmi_dev_get_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM) + for device_handle in device_handles + ] + + amdsmi.amdsmi_shut_down() + + else: + raise ValueError("No NVIDIA or ROCm GPUs found.") + + return sum(vrams) + + +def get_gpu_device_ids() -> str: + if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"] + elif os.environ.get("GPU_DEVICE_ORDINAL", None) is not None: + device_ids = os.environ["GPU_DEVICE_ORDINAL"] + elif os.environ.get("HIP_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["HIP_VISIBLE_DEVICES"] + elif os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["ROCR_VISIBLE_DEVICES"] + elif is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + + pynvml.nvmlInit() + device_ids = list(range(pynvml.nvmlDeviceGetCount())) + device_ids = ",".join(str(i) for i in device_ids) + pynvml.nvmlShutdown() + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." 
+ ) + + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + + if rocm_version >= "5.7": + device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles()))) + else: + device_ids = list(range(len(amdsmi.amdsmi_get_device_handles()))) + + device_ids = ",".join(str(i) for i in device_ids) + amdsmi.amdsmi_shut_down() + else: + raise ValueError("Couldn't infer GPU device ids.") + + return device_ids + + +## System related stuff +def get_system_info() -> dict: + system_dict = { + "cpu": get_cpu(), + "cpu_count": os.cpu_count(), + "cpu_ram_mb": get_cpu_ram_mb(), + "system": platform.system(), + "machine": platform.machine(), + "platform": platform.platform(), + "processor": platform.processor(), + "python_version": platform.python_version(), + } + + if is_nvidia_system() or is_rocm_system(): + system_dict["gpu"] = get_gpus() + system_dict["gpu_count"] = len(get_gpus()) + system_dict["gpu_vram_mb"] = get_gpu_vram_mb() + + return system_dict diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index e35baae3..bd7d7999 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -1,5 +1,5 @@ -import os import importlib +import os from typing import Optional import huggingface_hub @@ -16,10 +16,7 @@ "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", "image-classification": "AutoModelForImageClassification", - "image-segmentation": ( - "AutoModelForImageSegmentation", - "AutoModelForSemanticSegmentation", - ), + "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), "image-to-image": "AutoModelForImageToImage", "image-to-text": "AutoModelForVision2Seq", "mask-generation": "AutoModel", @@ -64,12 +61,8 @@ "stable-diffusion": "StableDiffusionPipeline", "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", } -_TIMM_TASKS_TO_MODEL_LOADERS = { - "image-classification": "create_model", -} -_LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP = { - "transformers": _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS, -} +_TIMM_TASKS_TO_MODEL_LOADERS = {"image-classification": "create_model"} +_LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP = {"transformers": _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS} _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP = { "transformers": _TRANSFORMERS_TASKS_TO_MODEL_LOADERS, "diffusers": _DIFFUSERS_TASKS_TO_MODEL_LOADERS, @@ -96,35 +89,15 @@ "zero-shot-classification": "text-classification", } _CUSTOM_CLASSES = { - ("pt", "pix2struct", "image-to-text"): ( - "transformers", - "Pix2StructForConditionalGeneration", - ), - ("pt", "pix2struct", "visual-question-answering"): ( - "transformers", - "Pix2StructForConditionalGeneration", - ), - ("pt", "visual-bert", "question-answering"): ( - "transformers", - "VisualBertForQuestionAnswering", - ), - ("pt", "vision-encoder-decoder", "document-question-answering"): ( - "transformers", - "VisionEncoderDecoderModel", - ), + ("pt", "pix2struct", "image-to-text"): ("transformers", "Pix2StructForConditionalGeneration"), + ("pt", "pix2struct", "visual-question-answering"): ("transformers", "Pix2StructForConditionalGeneration"), + ("pt", "visual-bert", "question-answering"): ("transformers", "VisualBertForQuestionAnswering"), + ("pt", "vision-encoder-decoder", "document-question-answering"): ("transformers", "VisionEncoderDecoderModel"), } -IMAGE_DIFFUSION_TASKS = [ - "stable-diffusion", - "stable-diffusion-xl", -] +IMAGE_DIFFUSION_TASKS = ["stable-diffusion", "stable-diffusion-xl"] -TEXT_GENERATION_TASKS = [ - "image-to-text", - "text-generation", - 
"text2text-generation", - "automatic-speech-recognition", -] +TEXT_GENERATION_TASKS = ["image-to-text", "text-generation", "text2text-generation", "automatic-speech-recognition"] def map_from_synonym(task: str) -> str: @@ -166,10 +139,7 @@ def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Option else: pipeline_tag = getattr(model_info, "pipeline_tag", None) # conversational is not a supported task per se, just an alias that may map to text-generaton or text2text-generation - if pipeline_tag is not None and pipeline_tag not in [ - "conversational", - "object-detection", - ]: + if pipeline_tag is not None and pipeline_tag not in ["conversational", "object-detection"]: inferred_task_name = map_from_synonym(model_info.pipeline_tag) else: transformers_info = model_info.transformersInfo diff --git a/optimum_benchmark/trackers/energy.py b/optimum_benchmark/trackers/energy.py index 7d3bb7ad..d5335b5d 100644 --- a/optimum_benchmark/trackers/energy.py +++ b/optimum_benchmark/trackers/energy.py @@ -1,35 +1,99 @@ import os -from logging import getLogger from contextlib import contextmanager -from typing import Optional, Dict +from dataclasses import dataclass +from logging import getLogger +from typing import List, Literal, Optional -from ..env_utils import get_cuda_device_ids -from ..import_utils import is_codecarbon_available +from ..import_utils import is_codecarbon_available, is_torch_distributed_available +from ..system_utils import get_gpu_device_ids -if is_codecarbon_available(): - from codecarbon import EmissionsTracker, OfflineEmissionsTracker +if is_torch_distributed_available(): + import torch.distributed +if is_codecarbon_available(): + from codecarbon import ( + EmissionsTracker, # type: ignore + OfflineEmissionsTracker, + ) LOGGER = getLogger("energy") +ENERGY_UNIT = "kWh" +Energy_Unit_Literal = Literal["kWh"] +Efficiency_Unit_Literal = Literal["samples/kWh", "tokens/kWh", "images/kWh"] + + +@dataclass +class Energy: + unit: Energy_Unit_Literal + + cpu: float + ram: float + gpu: float + total: float + + @staticmethod + def aggregate(energies: List["Energy"]) -> "Energy": + if len(energies) == 0 or all(energy is None for energy in energies): + return None + elif any(energy is None for energy in energies): + raise ValueError("Some energy measurements are missing") + + cpu = sum(energy.cpu for energy in energies) + gpu = sum(energy.gpu for energy in energies) + ram = sum(energy.ram for energy in energies) + total = sum(energy.total for energy in energies) + + return Energy(cpu=cpu, gpu=gpu, ram=ram, total=total, unit=ENERGY_UNIT) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} CPU energy: {self.cpu:f} ({self.unit})") + LOGGER.info(f"\t\t+ {prefix} GPU energy: {self.gpu:f} ({self.unit})") + LOGGER.info(f"\t\t+ {prefix} RAM energy: {self.ram:f} ({self.unit})") + LOGGER.info(f"\t\t+ {prefix} total energy: {self.total:f} ({self.unit})") + + +@dataclass +class Efficiency: + unit: Efficiency_Unit_Literal + + value: float + + @staticmethod + def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency": + if len(efficiencies) == 0: + raise ValueError("No efficiency measurements to aggregate") + elif any(efficiency is None for efficiency in efficiencies): + raise ValueError("Some efficiency measurements are None") + + unit = efficiencies[0].unit + value = sum(efficiency.value for efficiency in efficiencies) / len(efficiencies) + + return Efficiency(value=value, unit=unit) + + @staticmethod + def from_energy(energy: "Energy", volume: int, unit: 
str) -> "Efficiency": + return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} efficiency: {self.value:f} ({self.unit})") + class EnergyTracker: def __init__(self, device: str, device_ids: Optional[str] = None): self.device = device - - self.cpu_energy: float = 0 - self.gpu_energy: float = 0 - self.ram_energy: float = 0 - self.total_energy: float = 0 + self.device_ids = device_ids + self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized() if self.device == "cuda": - if device_ids is None: + if self.device_ids is None: LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. Using all available CUDA devices.") - self.device_ids = list(map(int, get_cuda_device_ids().split(","))) - else: - self.device_ids = list(map(int, device_ids.split(","))) - else: - self.device_ids = [] + self.device_ids = get_gpu_device_ids() + + self.device_ids = list(map(int, self.device_ids.split(","))) + LOGGER.info(f"\t+ Tracking GPU energy on devices {self.device_ids}") + + self.reset() def reset(self): self.cpu_energy = 0 @@ -72,10 +136,16 @@ def track(self, interval=1, file_prefix="method"): country_iso_code=os.environ.get("COUNTRY_ISO_CODE", "FRA"), ) + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + self.emission_tracker.start() yield self.emission_tracker.stop() + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh self.ram_energy = self.emission_tracker._total_ram_energy.kWh @@ -84,10 +154,7 @@ def track(self, interval=1, file_prefix="method"): def get_elapsed_time(self) -> float: return self.emission_tracker._last_measured_time - self.emission_tracker._start_time - def get_energies_dict(self) -> Dict[str, float]: - return { - "cpu_energy(kHh)": self.cpu_energy, - "gpu_energy(kHh)": self.gpu_energy, - "ram_energy(kHh)": self.ram_energy, - "total(kHh)": self.total_energy, - } + def get_energy(self) -> Energy: + return Energy( + unit=ENERGY_UNIT, cpu=self.cpu_energy, gpu=self.gpu_energy, ram=self.ram_energy, total=self.total_energy + ) diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 369c2b70..e076875f 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,91 +1,241 @@ +import time from contextlib import contextmanager +from dataclasses import dataclass from logging import getLogger -from typing import List -import time +from typing import List, Literal, Union -from ..import_utils import is_torch_distributed_available, is_torch_available - -if is_torch_available(): - import torch +from ..import_utils import is_torch_distributed_available if is_torch_distributed_available(): import torch.distributed +import torch +from transformers import LogitsProcessor, TrainerCallback + LOGGER = getLogger("latency") +LATENCY_UNIT = "s" +Latency_Unit_Literal = Literal["s"] +Throughput_Unit_Literal = Literal["samples/s", "tokens/s", "images/s", "steps/s"] + + +@dataclass +class Latency: + unit: Latency_Unit_Literal + + mean: float + stdev: float + values: List[float] + + def __getitem__(self, index: int) -> float: + if isinstance(index, slice): + return Latency.from_values(values=self.values[index], unit=self.unit) 
+ else: + return Latency.from_values(values=[self.values[index]], unit=self.unit) + + def __sub__(self, scalar: float) -> "Latency": + if not isinstance(scalar, (int, float)): + raise ValueError(f"Cannot subtract non-scalar value from latency: {scalar}") + + latencies = [lat - scalar for lat in self.values] + return Latency.from_values(values=latencies, unit=self.unit) + + @staticmethod + def aggregate(latencies: List["Latency"]) -> "Latency": + if len(latencies) == 0 or all(latency is None for latency in latencies): + return None + elif any(latency is None for latency in latencies): + raise ValueError("Some latency measurements are missing") + + unit = latencies[0].unit + values = sum((lat.values for lat in latencies), []) + return Latency.from_values(values=values, unit=unit) + + @staticmethod + def from_values(values: List[float], unit: str) -> "Latency": + mean = sum(values) / len(values) if len(values) > 0 else 0 + stdev = (sum((val - mean) ** 2 for val in values) / len(values)) ** 0.5 if len(values) > 1 else 0 + return Latency(mean=mean, stdev=stdev, values=values, unit=unit) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} latency: {self.mean:f} ยฑ 2 x {self.stdev:f} ({self.unit})") + + +@dataclass +class Throughput: + unit: Throughput_Unit_Literal + + value: float + + @staticmethod + def aggregate(throughputs: List["Throughput"]) -> "Throughput": + if len(throughputs) == 0: + raise ValueError("No throughput measurements to aggregate") + elif any(throughput is None for throughput in throughputs): + raise ValueError("Some throughput measurements are missing") + + unit = throughputs[0].unit + value = sum(throughput.value for throughput in throughputs) + + return Throughput(value=value, unit=unit) + + @staticmethod + def from_latency(latency: Latency, volume: int, unit: str) -> "Throughput": + value = volume / latency.mean if latency.mean > 0 else 0 + return Throughput(value=value, unit=unit) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} throughput: {self.value:f} {self.unit}") + class LatencyTracker: def __init__(self, device: str, backend: str): self.device = device self.backend = backend + self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized() - self.latencies: List[float] = [] - - # this is not in track, because this tracker is used repeatedly - if is_torch_distributed_available() and torch.distributed.is_initialized(): - LOGGER.info("\t+ Tracking Pytorch Distributed latency") - elif self.device == "cuda" and self.backend == "pytorch": + if self.backend == "pytorch" and self.device == "cuda": LOGGER.info("\t+ Tracking Pytorch CUDA latency") else: LOGGER.info("\t+ Tracking CPU latency") + self.reset() + def reset(self): - self.latencies = [] + self.start_events: List[Union[float, torch.cuda.Event]] = [] + self.end_events: List[Union[float, torch.cuda.Event]] = [] + self.start_time: float = time.perf_counter() @contextmanager def track(self): - if is_torch_distributed_available() and torch.distributed.is_initialized(): - yield from self._pytorch_distributed_latency() - elif self.backend == "pytorch" and self.device == "cuda": + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + + if self.backend == "pytorch" and self.device == "cuda": yield from self._pytorch_cuda_latency() else: yield from self._cpu_latency() - def _pytorch_distributed_latency(self): - torch.distributed.barrier() # synchronize before workload - start 
= time.perf_counter_ns() + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + + def _pytorch_cuda_latency(self): + start = torch.cuda.Event(enable_timing=True) + start.record() + self.start_events.append(start) + yield - torch.distributed.barrier() # synchronize after workload - end = time.perf_counter_ns() - latency = (end - start) / 1e9 - self.latencies.append(latency) + end = torch.cuda.Event(enable_timing=True) + end.record() + self.end_events.append(end) - LOGGER.debug(f"\t+ Tracked Pytorch distributed latency: {latency:.2e}s") + def _cpu_latency(self): + start = time.perf_counter() + self.start_events.append(start) - def _pytorch_cuda_latency(self): - # Note: torch.cuda.Event is not used here, - # there's actually no specific need to use cuda events if you're synchronizing - # it's rather a feature that can be used to measure kernel latency without synchronizing, - # allowing us to measure the time it takes to perform an operation without necessarily stalling the GPU. - # An interesting use case is with cuda graphs where synchronization makes us shoot the optimization in the foot. - # details: https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/ - torch.cuda.synchronize() # synchronize before workload - start = time.perf_counter_ns() yield - torch.cuda.synchronize() # synchronize after workload - end = time.perf_counter_ns() - latency = (end - start) / 1e9 - self.latencies.append(latency) + end = time.perf_counter() + self.end_events.append(end) - LOGGER.debug(f"\t+ Tracked Pytorch CUDA latency: {latency:.2e}s") + def get_elapsed_time(self) -> float: + # we measured in cpu to not synchronize all events + return time.perf_counter() - self.start_time - def _cpu_latency(self): - start = time.perf_counter_ns() - yield - end = time.perf_counter_ns() + def get_latency(self) -> Latency: + if self.backend == "pytorch" and self.device == "cuda": + # synchronize the last event to make sure it has been recorded + self.start_events[-1].synchronize() + self.end_events[-1].synchronize() + + latencies_list = [ + self.start_events[i].elapsed_time(self.end_events[i]) / 1e3 for i in range(len(self.start_events)) + ] + else: + latencies_list = [(self.end_events[i] - self.start_events[i]) for i in range(len(self.start_events))] + + return Latency.from_values(latencies_list, unit=LATENCY_UNIT) + + def get_throughput(self, volume: int, unit: str) -> Throughput: + return Throughput.from_latency(self.get_latency(), volume, unit) + + +class LatencyTrainerCallback(TrainerCallback): + def __init__(self, device: str, backend: str) -> None: + self.device = device + self.backend = backend + + self.reset() + + def reset(self): + self.events: List[Union[float, torch.cuda.Event]] = [] - latency = (end - start) / 1e9 - self.latencies.append(latency) + def on_step_begin(self, *args, **kwargs): + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events.append(event) + else: + self.events.append(time.perf_counter()) + + def on_train_end(self, *args, **kwargs): + # one last record to measure the time of the last step + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events.append(event) + else: + self.events.append(time.perf_counter()) - LOGGER.debug(f"\t+ Tracked CPU latency: {latency:.2e}s") + def get_latency(self) -> Latency: + if self.device == "cuda" and 
self.backend == "pytorch": + # synchronize the device to make sure all events have been recorded + torch.cuda.synchronize() + latencies_list = [self.events[i - 1].elapsed_time(self.events[i]) / 1e3 for i in range(1, len(self.events))] + else: + latencies_list = [(self.events[i] - self.events[i - 1]) for i in range(1, len(self.events))] - def get_total_count(self): - return len(self.latencies) + return Latency.from_values(latencies_list, unit=LATENCY_UNIT) + + def get_throughput(self, volume: int, unit: str) -> Throughput: + return Throughput.from_latency(self.get_latency(), volume, unit) + + +class LatencyLogitsProcessor(LogitsProcessor): + def __init__(self, device: str, backend: str): + self.device = device + self.backend = backend + + self.reset() + + def reset(self): + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events = [event] + else: + self.events = [time.perf_counter()] + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events.append(event) + else: + self.events.append(time.perf_counter()) + + return scores + + def get_latency(self) -> Latency: + if self.device == "cuda" and self.backend == "pytorch": + # synchronize the device to make sure all events have been recorded + torch.cuda.synchronize() + latencies_list = [self.events[i - 1].elapsed_time(self.events[i]) / 1e3 for i in range(1, len(self.events))] + else: + latencies_list = [(self.events[i] - self.events[i - 1]) for i in range(1, len(self.events))] - def get_total_latency(self): - return sum(self.latencies) + return Latency.from_values(latencies_list, unit=LATENCY_UNIT) - def get_latencies_list(self) -> List[float]: - return self.latencies + def get_throughput(self, volume: int, unit: str) -> Throughput: + return Throughput.from_latency(self.get_latency(), volume, unit) diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 816f1d5a..017c21fe 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -1,86 +1,110 @@ import os -from logging import getLogger from contextlib import contextmanager -from typing import List, Optional, Dict +from dataclasses import dataclass +from logging import getLogger from multiprocessing import Pipe, Process from multiprocessing.connection import Connection +from typing import List, Literal, Optional -from ..env_utils import bytes_to_mega_bytes, get_cuda_device_ids, is_nvidia_system, is_rocm_system -from ..import_utils import is_py3nvml_available, is_pyrsmi_available, is_torch_available +from ..import_utils import is_amdsmi_available, is_pynvml_available, is_torch_available, is_torch_distributed_available +from ..system_utils import get_gpu_device_ids, get_rocm_version, is_nvidia_system, is_rocm_system -if is_nvidia_system(): - if is_py3nvml_available(): - import py3nvml.py3nvml as nvml - else: - raise ValueError( - "The library py3nvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) +if is_torch_distributed_available(): + import torch.distributed -if is_rocm_system(): - if is_pyrsmi_available(): - from pyrsmi import rocml - else: - raise ValueError( - "The library pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. 
" - "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git." - ) +if is_nvidia_system() and is_pynvml_available(): + import pynvml + +if is_rocm_system() and is_amdsmi_available(): + import amdsmi # type: ignore if is_torch_available(): import torch import psutil - LOGGER = getLogger("memory") +MEMORY_UNIT = "MB" +Memory_Unit_Literal = Literal["MB"] -class MemoryTracker: - """ - Memory tracker to measure max memory usage of CPU or GPU devices. - Args: - device (str): Device to track memory usage. Can be either "cuda" or any other device. - backend (str): Backend to track memory usage. Can be either "pytorch" or any other backend. - device_ids (List[int], optional): List of device IDs to track memory usage. Defaults to None. - """ +@dataclass +class Memory: + unit: Memory_Unit_Literal + + max_ram: float + max_vram: Optional[float] = None + max_reserved: Optional[float] = None + max_allocated: Optional[float] = None + + @staticmethod + def aggregate(memories: List["Memory"]) -> "Memory": + if len(memories) == 0: + raise ValueError("No memory measurements to aggregate") + elif any(memory is None for memory in memories): + raise ValueError("Some memory measurements are missing") + unit = memories[0].unit + max_ram = sum(memory.max_ram for memory in memories) + max_vram = sum(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None + max_reserved = sum(memory.max_reserved for memory in memories) if memories[0].max_reserved is not None else None + max_allocated = ( + sum(memory.max_allocated for memory in memories) if memories[0].max_allocated is not None else None + ) + return Memory( + unit=unit, max_ram=max_ram, max_vram=max_vram, max_reserved=max_reserved, max_allocated=max_allocated + ) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} max RAM memory: {self.max_ram:f} ({self.unit})") + if self.max_vram is not None: + LOGGER.info(f"\t\t+ {prefix} max VRAM memory: {self.max_vram:f} ({self.unit})") + if self.max_reserved is not None: + LOGGER.info(f"\t\t+ {prefix} max reserved memory: {self.max_reserved:f} ({self.unit})") + if self.max_allocated is not None: + LOGGER.info(f"\t\t+ {prefix} max allocated memory: {self.max_allocated:f} ({self.unit})") + + +class MemoryTracker: def __init__(self, device: str, backend: str, device_ids: Optional[str] = None): self.device = device self.backend = backend + self.device_ids = device_ids + self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized() - self.max_memory_used = 0 - self.max_memory_reserved = 0 - self.max_memory_allocated = 0 + LOGGER.info("\t+ Tracking RAM memory") if self.device == "cuda": - if device_ids is None: + if self.device_ids is None: LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. 
Using all available CUDA devices.") - self.device_ids = list(map(int, get_cuda_device_ids().split(","))) - else: - self.device_ids = list(map(int, device_ids.split(","))) + self.device_ids = get_gpu_device_ids() + self.device_ids = list(map(int, self.device_ids.split(","))) LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}") if self.backend == "pytorch": - self.pytorch_device_ids = list(range(torch.cuda.device_count())) - LOGGER.info(f"\t+ Tracking Pytorch memory of Pytorch CUDA devices: {self.pytorch_device_ids}") - - if len(self.device_ids) != len(self.pytorch_device_ids): + num_pytorch_devices = torch.cuda.device_count() + if len(self.device_ids) != num_pytorch_devices: raise ValueError( "The number of CUDA devices and Pytorch CUDA devices must be the same. " - f"Got {len(self.device_ids)} and {len(self.pytorch_device_ids)} respectively." + f"Got {len(self.device_ids)} and {num_pytorch_devices} respectively." ) - else: - LOGGER.info("\t+ Tracking RAM memory") + LOGGER.info(f"\t+ Tracking Allocated/Reserved memory of {num_pytorch_devices} Pytorch CUDA devices") + + self.reset() def reset(self): - self.max_memory_used = 0 - self.max_memory_reserved = 0 - self.max_memory_allocated = 0 + self.max_ram_memory = 0 + self.max_vram_memory = 0 + self.max_reserved_memory = 0 + self.max_allocated_memory = 0 @contextmanager def track(self): + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + if self.device == "cuda" and self.backend == "pytorch": yield from self._cuda_pytorch_memory() elif self.device == "cuda": @@ -88,122 +112,202 @@ def track(self): else: yield from self._cpu_memory() + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + def _cuda_pytorch_memory(self): torch.cuda.empty_cache() - for pytorch_device_index in self.pytorch_device_ids: + + for device in range(torch.cuda.device_count()): try: - torch.cuda.reset_peak_memory_stats(device=pytorch_device_index) + torch.cuda.reset_peak_memory_stats(device=device) except Exception as e: - LOGGER.warning(f"\t+ Could not reset max memory stats for device {pytorch_device_index}: {e}") + LOGGER.warning(f"\t\t+ Could not reset max memory stats for device {device}: {e}") yield from self._cuda_memory() - for pytorch_device_index in self.pytorch_device_ids: - self.max_memory_reserved += torch.cuda.max_memory_reserved(device=pytorch_device_index) - self.max_memory_allocated += torch.cuda.max_memory_allocated(device=pytorch_device_index) + self.max_allocated_memory = sum( + torch.cuda.max_memory_allocated(device=device) / 1e6 for device in range(torch.cuda.device_count()) + ) + self.max_reserved_memory = sum( + torch.cuda.max_memory_reserved(device=device) / 1e6 for device in range(torch.cuda.device_count()) + ) - LOGGER.debug(f"\t+ Pytorch max memory reserved: {self.get_max_memory_reserved_mb()} MB") - LOGGER.debug(f"\t+ Pytorch max memory allocated: {self.get_max_memory_allocated_mb()} MB") + torch.cuda.empty_cache() - def _cuda_memory(self, interval: float = 0.001): + def _cuda_memory(self): child_connection, parent_connection = Pipe() memory_process = Process( - target=monitor_gpu_max_vram_memory, - args=(self.device_ids, child_connection, interval), - daemon=True, + target=monitor_gpu_vram_memory, args=(os.getpid(), self.device_ids, child_connection), daemon=True ) memory_process.start() parent_connection.recv() # wait for memory process to be ready - yield + yield from 
self._cpu_memory() parent_connection.send(True) - self.max_memory_used = parent_connection.recv() - LOGGER.debug(f"\t+ Max memory (VRAM) used: {self.get_max_memory_used_mb()} MB") + self.max_vram_memory = parent_connection.recv() - def _cpu_memory(self, interval: float = 0.001): + def _cpu_memory(self): child_connection, parent_connection = Pipe() - memory_process = Process( - target=monitor_cpu_max_ram_memory, - args=(os.getpid(), child_connection, interval), - daemon=True, - ) + memory_process = Process(target=monitor_cpu_ram_memory, args=(os.getpid(), child_connection), daemon=True) memory_process.start() parent_connection.recv() # wait for memory process to be ready yield parent_connection.send(True) - self.max_memory_used = parent_connection.recv() - LOGGER.debug(f"\t+ Max memory (RAM) used: {self.get_max_memory_used_mb()} MB") - - def get_max_memory_used_mb(self) -> int: - return bytes_to_mega_bytes(self.max_memory_used) - - def get_max_memory_allocated_mb(self) -> int: - return bytes_to_mega_bytes(self.max_memory_allocated) - - def get_max_memory_reserved_mb(self) -> int: - return bytes_to_mega_bytes(self.max_memory_reserved) + self.max_ram_memory = parent_connection.recv() - def get_memories_dict(self) -> Dict[str, int]: + def get_max_memory(self): if self.device == "cuda" and self.backend == "pytorch": - return { - "max_vram_used(MB)": self.get_max_memory_used_mb(), - "max_memory_reserved(MB)": self.get_max_memory_reserved_mb(), - "max_memory_allocated(MB)": self.get_max_memory_allocated_mb(), - } + return Memory( + unit=MEMORY_UNIT, + max_ram=self.max_ram_memory, + max_vram=self.max_vram_memory, + max_reserved=self.max_reserved_memory, + max_allocated=self.max_allocated_memory, + ) elif self.device == "cuda": - return {"max_vram_used(MB)": self.get_max_memory_used_mb()} + return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory, max_vram=self.max_vram_memory) else: - return {"max_ram_used(MB)": self.get_max_memory_used_mb()} + return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory) -def monitor_cpu_max_ram_memory(process_id: int, connection: Connection, interval: float): +def monitor_cpu_ram_memory(process_id: int, connection: Connection, interval: float = 0.001): + stop = False + max_memory = 0 process = psutil.Process(process_id) - max_memory_usage = 0 connection.send(0) - stop = False while not stop: meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" - current_memory_usage = getattr(process, meminfo_attr)()[0] - max_memory_usage = max(max_memory_usage, current_memory_usage) + current_used_memory = getattr(process, meminfo_attr)()[0] + max_memory = max(max_memory, current_used_memory) stop = connection.poll(interval) - connection.send(max_memory_usage) + connection.send(max_memory / 1e6) # convert to MB connection.close() -def monitor_gpu_max_vram_memory(device_ids: List[int], connection: Connection, interval: float): - if is_nvidia_system() and is_py3nvml_available(): - nvml.nvmlInit() - handles = [nvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids] - max_memory_usage = 0 - connection.send(0) - stop = False - - while not stop: - current_memory_usage = sum(nvml.nvmlDeviceGetMemoryInfo(handle).used for handle in handles) - max_memory_usage = max(max_memory_usage, current_memory_usage) - stop = connection.poll(interval) +def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection: Connection, interval: float = 0.01): + stop = False + max_memory = 0 + connection.send(0) - 
connection.send(max_memory_usage) - nvml.nvmlShutdown() - connection.close() - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() - max_memory_usage = 0 - connection.send(0) - stop = False + if is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + pynvml.nvmlInit() + devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids] while not stop: - current_memory_usage = sum(rocml.smi_get_device_memory_used(device_id) for device_id in device_ids) - max_memory_usage = max(max_memory_usage, current_memory_usage) + current_used_memory = 0 + for device_id, device_handle in zip(device_ids, devices_handles): + try: + device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}") + continue + for device_process in device_processes: + if device_process.pid == process_id: + current_used_memory += device_process.usedGpuMemory + else: + try: + cpu_process = psutil.Process(device_process.pid) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process info for process {device_process.pid}: {e}") + continue + if cpu_process.parent() is not None and cpu_process.parent().pid == process_id: + current_used_memory += device_process.usedGpuMemory + + max_memory = max(max_memory, current_used_memory) stop = connection.poll(interval) - connection.send(max_memory_usage) - rocml.smi_shutdown() - connection.close() + pynvml.nvmlShutdown() + + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." 
+ ) + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + + if rocm_version >= "5.7": + devices_handles = amdsmi.amdsmi_get_processor_handles() + while not stop: + current_used_memory = 0 + for device_id in device_ids: + device_handle = devices_handles[device_id] + try: + processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}") + continue + for process_handle in processes_handles: + try: + gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}") + continue + # only memory usage of the monitored process and its children is tracked + if gpu_process_info["pid"] == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + else: + try: + cpu_process_info = psutil.Process(gpu_process_info["pid"]) + except Exception as e: + LOGGER.warning( + f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}" + ) + continue + if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + + max_memory = max(max_memory, current_used_memory) + stop = connection.poll(interval) + else: + devices_handles = amdsmi.amdsmi_get_device_handles() + while not stop: + current_used_memory = 0 + for device_id in device_ids: + device_handle = devices_handles[device_id] + try: + processes_handles = amdsmi.amdsmi_get_process_list(device_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}") + continue + for process_handle in processes_handles: + try: + gpu_process_info = amdsmi.amdsmi_get_process_info(device_handle, process_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}") + continue + # only memory usage of the monitored process and its children is tracked + if gpu_process_info["pid"] == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + else: + try: + cpu_process_info = psutil.Process(gpu_process_info["pid"]) + except Exception as e: + LOGGER.warning( + f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}" + ) + continue + if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + + max_memory = max(max_memory, current_used_memory) + stop = connection.poll(interval) + + amdsmi.amdsmi_shut_down() else: raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.") + + connection.send(max_memory / 1e6) # convert to MB + connection.close() diff --git a/pyproject.toml b/pyproject.toml index e9ce4301..58e5b284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,18 @@ +# [tool.isort] +# profile = "ruff" +# lines_after_imports = 2 +# known_first_party = "optimum_benchmark" + [tool.ruff] line-length = 120 +ignore = ["C901", "E501", "E741", "W605"] +select = ["C", "E", "F", "I", "W", "I001"] + +[tool.ruff.format] +line-ending = "auto" +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false [tool.pytest.ini_options] log_cli = true diff --git a/setup.py b/setup.py index 40504fd3..f993adc4 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import subprocess + from setuptools import 
find_packages, setup MIN_OPTIMUM_VERSION = "1.16.0" @@ -12,13 +13,10 @@ "hydra_colorlog", "hydra-core", "omegaconf", - # Other + # CPU Memory "psutil", - "pandas", # Reporting - "rich", - "tabulate", - "matplotlib", + "pandas", "flatten_dict", ] @@ -28,20 +26,21 @@ USE_ROCM = os.environ.get("USE_ROCM", None) == "1" if USE_CUDA: - INSTALL_REQUIRES.append("py3nvml") + INSTALL_REQUIRES.append("nvidia-ml-py") else: try: subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL) - INSTALL_REQUIRES.append("py3nvml") + INSTALL_REQUIRES.append("nvidia-ml-py") except FileNotFoundError: pass +# we keep this as a check that amdsmi is installed since it's not available on pypi if USE_ROCM: - INSTALL_REQUIRES.append("pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git") + INSTALL_REQUIRES.append("amdsmi") else: try: - subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL) - INSTALL_REQUIRES.append("pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git") + subprocess.run(["rocm-smi"], stdout=subprocess.DEVNULL) + INSTALL_REQUIRES.append("amdsmi") except FileNotFoundError: pass @@ -54,11 +53,7 @@ "onnxruntime": [f"optimum[onnxruntime]>={MIN_OPTIMUM_VERSION}"], "neural-compressor": [f"optimum[neural-compressor]>={MIN_OPTIMUM_VERSION}"], "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={MIN_OPTIMUM_VERSION}"], - "torch-ort": [ - f"optimum>={MIN_OPTIMUM_VERSION}", - "onnxruntime-training", - "torch-ort", - ], + "torch-ort": [f"optimum>={MIN_OPTIMUM_VERSION}", "onnxruntime-training", "torch-ort"], # docker-based backends "text-generation-inference": ["docker"], # specific settings @@ -75,6 +70,6 @@ install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_REQUIRE, packages=find_packages(), - version="0.0.2", + version="0.1.0", entry_points={"console_scripts": ["optimum-benchmark=optimum_benchmark.cli:benchmark_cli"]}, ) diff --git a/tests/configs/_base_.yaml b/tests/configs/_base_.yaml index d983b841..27acb325 100644 --- a/tests/configs/_base_.yaml +++ b/tests/configs/_base_.yaml @@ -2,8 +2,8 @@ defaults: - launcher: process # isolated process launcher - experiment # inheriting experiment schema - _self_ # for hydra 1.1 compatibility - - override hydra/hydra_logging: colorlog # colorful logging - - override hydra/job_logging: colorlog # colorful logging + - override hydra/hydra_logging: colorlog + - override hydra/job_logging: colorlog - override hydra/launcher: joblib # for parallelization experiment_name: ${device}_${benchmark.name}_${backend.name}_${task} @@ -20,13 +20,12 @@ hydra: # change working directory to the run directory chdir: true env_set: - # set environment variable OVERRIDE_BENCHMARKS to 1 - # to not skip benchmarks that have been run before + # to not skip benchmarks if results already exist OVERRIDE_BENCHMARKS: 1 # we are using joblib launcher to parallelize testing since - # we're having ccorrect benchmarks is not important while testing + # having correct benchmark values is not important while testing # to force sequential execution, uncomment the following three lines # launcher: - # n_jobs: 1 # for debugging - # batch_size: 1 # for debugging + # n_jobs: -1 # 1 for debugging + # batch_size: auto # 1 for debugging diff --git a/tests/configs/_bert_sweep_.yaml b/tests/configs/_bert_sweep_.yaml index e6a6c4fc..c4986d0d 100644 --- a/tests/configs/_bert_sweep_.yaml +++ b/tests/configs/_bert_sweep_.yaml @@ -1,6 +1,5 @@ hydra: sweeper: params: - backend.no_weights: false,true + backend.model: hf-internal-testing/tiny-random-bert backend.task: 
fill-mask,text-classification,token-classification,question-answering - backend.model: hf-internal-testing/tiny-random-bert,hf-internal-testing/tiny-random-roberta diff --git a/tests/configs/_lm_naive_mp_.yaml b/tests/configs/_gpt_naive_mp_.yaml similarity index 100% rename from tests/configs/_lm_naive_mp_.yaml rename to tests/configs/_gpt_naive_mp_.yaml index 2ac16fb8..cf2adfd3 100644 --- a/tests/configs/_lm_naive_mp_.yaml +++ b/tests/configs/_gpt_naive_mp_.yaml @@ -1,6 +1,6 @@ backend: - device_ids: 0,1 - device_map: auto + model: gpt2 task: text-generation library: transformers - model: gpt2 + device_ids: 0,1 + device_map: auto diff --git a/tests/configs/_lm_peft_.yaml b/tests/configs/_gpt_peft_.yaml similarity index 100% rename from tests/configs/_lm_peft_.yaml rename to tests/configs/_gpt_peft_.yaml diff --git a/tests/configs/_lm_sweep_.yaml b/tests/configs/_gpt_sweep_.yaml similarity index 81% rename from tests/configs/_lm_sweep_.yaml rename to tests/configs/_gpt_sweep_.yaml index 763d7120..1ff5e2c7 100644 --- a/tests/configs/_lm_sweep_.yaml +++ b/tests/configs/_gpt_sweep_.yaml @@ -2,5 +2,4 @@ hydra: sweeper: params: backend.task: text-generation - backend.no_weights: false,true backend.model: hf-internal-testing/tiny-random-gpt2,IlyasMoutawwakil/tiny-random-llama diff --git a/tests/configs/cpu_inference_neural_compressor_lm_sweep.yaml b/tests/configs/cpu_inference_neural_compressor_gpt_sweep.yaml similarity index 70% rename from tests/configs/cpu_inference_neural_compressor_lm_sweep.yaml rename to tests/configs/cpu_inference_neural_compressor_gpt_sweep.yaml index 21fb30d9..bf2f9d15 100644 --- a/tests/configs/cpu_inference_neural_compressor_lm_sweep.yaml +++ b/tests/configs/cpu_inference_neural_compressor_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_neural_compressor_lm_sweep +experiment_name: cpu_inference_neural_compressor_gpt_sweep diff --git a/tests/configs/cpu_inference_onnxruntime_lm_sweep.yaml b/tests/configs/cpu_inference_onnxruntime_gpt_sweep.yaml similarity index 71% rename from tests/configs/cpu_inference_onnxruntime_lm_sweep.yaml rename to tests/configs/cpu_inference_onnxruntime_gpt_sweep.yaml index 6e3c214c..a958bb55 100644 --- a/tests/configs/cpu_inference_onnxruntime_lm_sweep.yaml +++ b/tests/configs/cpu_inference_onnxruntime_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_onnxruntime_lm_sweep +experiment_name: cpu_inference_onnxruntime_gpt_sweep diff --git a/tests/configs/cpu_inference_openvino_lm_sweep.yaml b/tests/configs/cpu_inference_openvino_gpt_sweep.yaml similarity index 71% rename from tests/configs/cpu_inference_openvino_lm_sweep.yaml rename to tests/configs/cpu_inference_openvino_gpt_sweep.yaml index 8389d7b3..486f9e8f 100644 --- a/tests/configs/cpu_inference_openvino_lm_sweep.yaml +++ b/tests/configs/cpu_inference_openvino_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one 
overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_openvino_lm_sweep +experiment_name: cpu_inference_openvino_gpt_sweep diff --git a/tests/configs/cpu_inference_pytorch_lm_sweep.yaml b/tests/configs/cpu_inference_pytorch_gpt_sweep.yaml similarity index 72% rename from tests/configs/cpu_inference_pytorch_lm_sweep.yaml rename to tests/configs/cpu_inference_pytorch_gpt_sweep.yaml index c30d7b60..b4720e88 100644 --- a/tests/configs/cpu_inference_pytorch_lm_sweep.yaml +++ b/tests/configs/cpu_inference_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_pytorch_lm_sweep +experiment_name: cpu_inference_pytorch_gpt_sweep diff --git a/tests/configs/cpu_training_pytorch_lm_sweep.yaml b/tests/configs/cpu_training_pytorch_gpt_sweep.yaml similarity index 72% rename from tests/configs/cpu_training_pytorch_lm_sweep.yaml rename to tests/configs/cpu_training_pytorch_gpt_sweep.yaml index 8b3fbb83..5f8987b6 100644 --- a/tests/configs/cpu_training_pytorch_lm_sweep.yaml +++ b/tests/configs/cpu_training_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_training_pytorch_lm_sweep +experiment_name: cpu_training_pytorch_gpt_sweep diff --git a/tests/configs/cuda_inference_onnxruntime_lm_sweep.yaml b/tests/configs/cuda_inference_onnxruntime_gpt_sweep.yaml similarity index 71% rename from tests/configs/cuda_inference_onnxruntime_lm_sweep.yaml rename to tests/configs/cuda_inference_onnxruntime_gpt_sweep.yaml index e220b955..f9b38910 100644 --- a/tests/configs/cuda_inference_onnxruntime_lm_sweep.yaml +++ b/tests/configs/cuda_inference_onnxruntime_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_inference_onnxruntime_lm_sweep +experiment_name: cuda_inference_onnxruntime_gpt_sweep diff --git a/tests/configs/cuda_inference_pytorch_lm_naive_mp.yaml b/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml similarity index 70% rename from tests/configs/cuda_inference_pytorch_lm_naive_mp.yaml rename to tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml index a274429f..6e19ba18 100644 --- a/tests/configs/cuda_inference_pytorch_lm_naive_mp.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_naive_mp_ # inherits from lm 
naive mp config + - _gpt_naive_mp_ # inherits from lm naive mp config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_inference_pytorch_lm_naive_mp +experiment_name: cuda_inference_pytorch_gpt_naive_mp diff --git a/tests/configs/cuda_inference_pytorch_lm_sweep.yaml b/tests/configs/cuda_inference_pytorch_gpt_sweep.yaml similarity index 72% rename from tests/configs/cuda_inference_pytorch_lm_sweep.yaml rename to tests/configs/cuda_inference_pytorch_gpt_sweep.yaml index 23b7ace2..8b033a67 100644 --- a/tests/configs/cuda_inference_pytorch_lm_sweep.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_pytorch_lm_sweep +experiment_name: cpu_inference_pytorch_gpt_sweep diff --git a/tests/configs/cuda_training_pytorch_lm_naive_mp.yaml b/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml similarity index 70% rename from tests/configs/cuda_training_pytorch_lm_naive_mp.yaml rename to tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml index 714f8692..ab6d4bc2 100644 --- a/tests/configs/cuda_training_pytorch_lm_naive_mp.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_naive_mp_ # inherits from lm naive mp config + - _gpt_naive_mp_ # inherits from lm naive mp config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_pytorch_lm_naive_mp +experiment_name: cuda_training_pytorch_gpt_naive_mp diff --git a/tests/configs/cuda_training_pytorch_lm_peft.yaml b/tests/configs/cuda_training_pytorch_gpt_peft.yaml similarity index 69% rename from tests/configs/cuda_training_pytorch_lm_peft.yaml rename to tests/configs/cuda_training_pytorch_gpt_peft.yaml index be198ecc..1ee6f473 100644 --- a/tests/configs/cuda_training_pytorch_lm_peft.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_peft.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_peft_ # inherits from language modeling peft config + - _gpt_peft_ # inherits from language modeling peft config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_pytorch_lm_peft +experiment_name: cuda_training_pytorch_gpt_peft diff --git a/tests/configs/cuda_training_pytorch_lm_sweep.yaml b/tests/configs/cuda_training_pytorch_gpt_sweep.yaml similarity index 69% rename from tests/configs/cuda_training_pytorch_lm_sweep.yaml rename to tests/configs/cuda_training_pytorch_gpt_sweep.yaml index 17fefe51..004f1f82 100644 --- a/tests/configs/cuda_training_pytorch_lm_sweep.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_sweep_ # inherits from language modeling sweep config + - _gpt_sweep_ # inherits from language modeling sweep config - _cuda_ # inherits from cpu config - 
_self_ # hydra 1.1 compatibility -experiment_name: cuda_training_pytorch_lm_sweep +experiment_name: cuda_training_pytorch_gpt_sweep diff --git a/tests/configs/cuda_training_torch_ort_lm_peft.yaml b/tests/configs/cuda_training_torch_ort_gpt_peft.yaml similarity index 69% rename from tests/configs/cuda_training_torch_ort_lm_peft.yaml rename to tests/configs/cuda_training_torch_ort_gpt_peft.yaml index 98e347a4..665dec16 100644 --- a/tests/configs/cuda_training_torch_ort_lm_peft.yaml +++ b/tests/configs/cuda_training_torch_ort_gpt_peft.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_peft_ # inherits from language modeling peft config + - _gpt_peft_ # inherits from language modeling peft config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_torch_ort_lm_peft +experiment_name: cuda_training_torch_ort_gpt_peft diff --git a/tests/configs/cuda_training_torch_ort_lm_sweep.yaml b/tests/configs/cuda_training_torch_ort_gpt_sweep.yaml similarity index 69% rename from tests/configs/cuda_training_torch_ort_lm_sweep.yaml rename to tests/configs/cuda_training_torch_ort_gpt_sweep.yaml index 25d4d054..ff8f505f 100644 --- a/tests/configs/cuda_training_torch_ort_lm_sweep.yaml +++ b/tests/configs/cuda_training_torch_ort_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from inference config - - _lm_sweep_ # inherits from language modeling sweep config + - _gpt_sweep_ # inherits from language modeling sweep config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_torch_ort_lm_sweep +experiment_name: cuda_training_torch_ort_gpt_sweep diff --git a/tests/configs/rocm_inference_onnxruntime_lm_sweep.yaml b/tests/configs/rocm_inference_onnxruntime_gpt_sweep.yaml similarity index 71% rename from tests/configs/rocm_inference_onnxruntime_lm_sweep.yaml rename to tests/configs/rocm_inference_onnxruntime_gpt_sweep.yaml index d6630ff1..f53b6612 100644 --- a/tests/configs/rocm_inference_onnxruntime_lm_sweep.yaml +++ b/tests/configs/rocm_inference_onnxruntime_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt sweep config + - _gpt_sweep_ # inherits from gpt sweep config - _rocm_ # inherits from rocm config - _self_ # hydra 1.1 compatibility -experiment_name: rocm_inference_onnxruntime_lm_sweep +experiment_name: rocm_inference_onnxruntime_gpt_sweep diff --git a/tests/test_api.py b/tests/test_api.py index 0bf6ced9..30815d82 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,36 +1,28 @@ -from logging import getLogger +import gc import time +from tempfile import TemporaryDirectory -import torch import pytest +import torch -from optimum_benchmark.trackers.memory import MemoryTracker -from optimum_benchmark.trackers.latency import LatencyTracker -from optimum_benchmark.experiment import ExperimentConfig, launch -from optimum_benchmark.launchers.inline.config import InlineConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig -from optimum_benchmark.launchers.process.config import ProcessConfig -from optimum_benchmark.launchers.torchrun.config import TorchrunConfig -from 
optimum_benchmark.benchmarks.inference.config import INPUT_SHAPES -from optimum_benchmark.benchmarks.training.config import DATASET_SHAPES -from optimum_benchmark.generators.input_generator import InputGenerator -from optimum_benchmark.benchmarks.training.config import TrainingConfig -from optimum_benchmark.benchmarks.inference.config import InferenceConfig -from optimum_benchmark.generators.dataset_generator import DatasetGenerator -from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS from optimum_benchmark.backends.timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config from optimum_benchmark.backends.transformers_utils import ( extract_transformers_shapes_from_artifacts, get_transformers_pretrained_config, ) +from optimum_benchmark.benchmarks.inference.config import INPUT_SHAPES, InferenceConfig +from optimum_benchmark.benchmarks.training.config import DATASET_SHAPES +from optimum_benchmark.experiment import ExperimentConfig, launch +from optimum_benchmark.generators.dataset_generator import DatasetGenerator +from optimum_benchmark.generators.input_generator import InputGenerator +from optimum_benchmark.launchers.inline.config import InlineConfig +from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.launchers.torchrun.config import TorchrunConfig +from optimum_benchmark.task_utils import IMAGE_DIFFUSION_TASKS, TEXT_GENERATION_TASKS +from optimum_benchmark.trackers.latency import LatencyTracker +from optimum_benchmark.trackers.memory import MemoryTracker - -LOGGER = getLogger("test-api") - -DEVICES_BACKENDS = [ - ("cpu", "none"), - ("cuda", "pytorch"), -] LIBRARIES_TASKS_MODELS = [ ("transformers", "fill-mask", "bert-base-uncased"), ("timm", "image-classification", "timm/resnet50.a1_in1k"), @@ -43,18 +35,17 @@ ("transformers", "image-classification", "google/vit-base-patch16-224"), ("transformers", "semantic-segmentation", "google/vit-base-patch16-224"), ] -BENCHMARK_CONFIGS = [ - InferenceConfig(latency=True, memory=True), - TrainingConfig(latency=True, memory=True), -] LAUNCHER_CONFIGS = [ - TorchrunConfig(nproc_per_node=2, device_isolation=False), - ProcessConfig(device_isolation=False), InlineConfig(device_isolation=False), + ProcessConfig(device_isolation=False), + TorchrunConfig(device_isolation=False, nproc_per_node=2), ] +BACKENDS = ["pytorch", "none"] +DEVICES = ["cpu", "cuda"] -@pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("backend", BACKENDS) def test_api_latency_tracker(device, backend): expected_latency = 1 tracker = LatencyTracker(device=device, backend=backend) @@ -63,40 +54,55 @@ def test_api_latency_tracker(device, backend): with tracker.track(): time.sleep(1) - latencies_list = tracker.get_latencies_list() + latency = tracker.get_latency() + latency.log() - assert len(latencies_list) == 2 - assert latencies_list[0] > expected_latency * 0.9 - assert latencies_list[0] < expected_latency * 1.1 + assert latency.mean < expected_latency * 1.1 + assert latency.mean > expected_latency * 0.9 -@pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("backend", BACKENDS) def test_api_memory_tracker(device, backend): tracker = MemoryTracker(device=device, backend=backend) + tracker.reset() with tracker.track(): + time.sleep(1) pass # the process consumes memory that we can't control - if backend == "pytorch": - 
initial_process_memory = tracker.get_max_memory_allocated_mb() - else: - initial_process_memory = tracker.get_max_memory_used_mb() + initial_memory = tracker.get_max_memory() + initial_memory.log() + tracker.reset() with tracker.track(): - array = torch.ones((10000, 10000), dtype=torch.float64, device=device) - expected_memory = array.nbytes / 1e6 # around 800 MB - - if backend == "pytorch": - final_process_memory = tracker.get_max_memory_allocated_mb() + time.sleep(1) + array = torch.randn((10000, 10000), dtype=torch.float64, device=device) + expected_memory = array.nbytes / 1e6 + time.sleep(1) + + final_memory = tracker.get_max_memory() + final_memory.log() + + if device == "cuda": + if backend == "pytorch": + measured_memory = final_memory.max_allocated - initial_memory.max_allocated + else: + measured_memory = final_memory.max_vram - initial_memory.max_vram + if torch.version.hip is not None: + return # skip vram measurement for ROCm else: - final_process_memory = tracker.get_max_memory_used_mb() - - measured_memory = final_process_memory - initial_process_memory + measured_memory = final_memory.max_ram - initial_memory.max_ram assert measured_memory < expected_memory * 1.1 assert measured_memory > expected_memory * 0.9 + del array + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS) def test_api_input_generator(library, task, model): @@ -109,11 +115,7 @@ def test_api_input_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") - generator = InputGenerator( - task=task, - input_shapes=INPUT_SHAPES, - model_shapes=model_shapes, - ) + generator = InputGenerator(task=task, input_shapes=INPUT_SHAPES, model_shapes=model_shapes) if task in TEXT_GENERATION_TASKS: _ = generator(mode="forward") @@ -135,23 +137,31 @@ def test_api_dataset_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") - generator = DatasetGenerator( - task=task, - dataset_shapes=DATASET_SHAPES, - model_shapes=model_shapes, - ) + generator = DatasetGenerator(task=task, dataset_shapes=DATASET_SHAPES, model_shapes=model_shapes) _ = generator() -@pytest.mark.parametrize("benchmark_config", BENCHMARK_CONFIGS) @pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) -def test_api_launch_cpu(benchmark_config, launcher_config): - backend_config = PyTorchConfig(model="bert-base-uncased", no_weights=True, device="cpu") +@pytest.mark.parametrize("device", DEVICES) +def test_api_launch(launcher_config, device): + benchmark_config = InferenceConfig(latency=True, memory=True) + device_ids = ",".join(str(i) for i in range(torch.cuda.device_count())) if device == "cuda" else None + backend_config = PyTorchConfig(model="bert-base-uncased", device_ids=device_ids, no_weights=True, device=device) experiment_config = ExperimentConfig( - experiment_name="", - benchmark=benchmark_config, - launcher=launcher_config, - backend=backend_config, + experiment_name="api-experiment", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config ) - _ = launch(experiment_config) + benchmark_report = launch(experiment_config) + + with TemporaryDirectory() as tempdir: + experiment_config.to_dict() + experiment_config.to_flat_dict() + experiment_config.to_dataframe() + experiment_config.to_csv(f"{tempdir}/experiment_config.csv") + experiment_config.to_json(f"{tempdir}/experiment_config.json") + + benchmark_report.to_dict() + benchmark_report.to_flat_dict() + 
benchmark_report.to_dataframe() + benchmark_report.to_csv(f"{tempdir}/benchmark_report.csv") + benchmark_report.to_json(f"{tempdir}/benchmark_report.json") diff --git a/tests/test_cli.py b/tests/test_cli.py index afae3609..739d0f89 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -11,20 +11,13 @@ TEST_CONFIG_NAMES = [ config.split(".")[0] for config in os.listdir(TEST_CONFIG_DIR) - if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) + if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) # or "ds_tp" in config) ] @pytest.mark.parametrize("config_name", TEST_CONFIG_NAMES) def test_cli_configs(config_name): - args = [ - "optimum-benchmark", - "--config-dir", - TEST_CONFIG_DIR, - "--config-name", - config_name, - "--multirun", - ] + args = ["optimum-benchmark", "--config-dir", TEST_CONFIG_DIR, "--config-name", config_name, "--multirun"] popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}"