[feature][refactor] Better Metrics and Trackers #124

Merged Feb 19, 2024 (41 commits)

Commits
5cdc986
migrated to libs amdsmi and pynvml exclusively (removed pyrsmi and py…
IlyasMoutawwakil Feb 14, 2024
3228066
normalized metrics and trackers api
IlyasMoutawwakil Feb 14, 2024
4af1c06
update tests
IlyasMoutawwakil Feb 14, 2024
21cebb7
fix amdsmi on multiple rocm versions
IlyasMoutawwakil Feb 14, 2024
ef50b83
fix ram tracking
IlyasMoutawwakil Feb 14, 2024
019cbad
fix rocm api memory measurement
IlyasMoutawwakil Feb 14, 2024
b125566
amd gpu memory deallocation
IlyasMoutawwakil Feb 14, 2024
c542546
experiment mixin
IlyasMoutawwakil Feb 14, 2024
45de1f9
fix
IlyasMoutawwakil Feb 14, 2024
b8acfb5
fix rocm version
IlyasMoutawwakil Feb 14, 2024
556664d
fix trt llm pynvml error
IlyasMoutawwakil Feb 14, 2024
73074c5
update workflows
IlyasMoutawwakil Feb 14, 2024
563e012
fix trt llm workflow
IlyasMoutawwakil Feb 15, 2024
f5f7176
fix cache access
IlyasMoutawwakil Feb 15, 2024
47ef546
add more makefile commands for testing
IlyasMoutawwakil Feb 16, 2024
a070d7f
trying to fix deepspeed halting
IlyasMoutawwakil Feb 16, 2024
a98dfc4
fix deepspeed halting
IlyasMoutawwakil Feb 16, 2024
0016da8
fix test
IlyasMoutawwakil Feb 16, 2024
5bbfd0b
fix
IlyasMoutawwakil Feb 16, 2024
583f446
skip deepspeed tensor parallelism tests for now
IlyasMoutawwakil Feb 16, 2024
fdbb264
typo
IlyasMoutawwakil Feb 16, 2024
9fb2d64
update readme and hopefully fix trt-llm
IlyasMoutawwakil Feb 16, 2024
95c9c7a
fix trt llm for good
IlyasMoutawwakil Feb 16, 2024
34b1d37
update readme
IlyasMoutawwakil Feb 16, 2024
fc75c83
fix distributed tensor parallel hanging
IlyasMoutawwakil Feb 18, 2024
81d0de9
style
IlyasMoutawwakil Feb 19, 2024
053d347
isolate workflows network
IlyasMoutawwakil Feb 19, 2024
97ddfad
fix
IlyasMoutawwakil Feb 19, 2024
86e932c
style
IlyasMoutawwakil Feb 19, 2024
a50f072
reduce test matrix
IlyasMoutawwakil Feb 19, 2024
8131e4c
sync processes on each metric tracking
IlyasMoutawwakil Feb 19, 2024
437bf5a
fix cpu communication backend
IlyasMoutawwakil Feb 19, 2024
b38b100
revert to joblib launcher
IlyasMoutawwakil Feb 19, 2024
b67d1f6
use freest gpus for workflows
IlyasMoutawwakil Feb 19, 2024
37a0b32
set default versions
IlyasMoutawwakil Feb 19, 2024
8e0ad46
sync processes before each measure to avoid hanging
IlyasMoutawwakil Feb 19, 2024
ff729ba
use monitored barrier
IlyasMoutawwakil Feb 19, 2024
9db3d5f
use device_ids
IlyasMoutawwakil Feb 19, 2024
8eae31e
use current device only
IlyasMoutawwakil Feb 19, 2024
2c2c288
update readme
IlyasMoutawwakil Feb 19, 2024
7408689
style
IlyasMoutawwakil Feb 19, 2024
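The first commits above move all hardware monitoring onto pynvml (NVIDIA) and amdsmi (AMD). As a rough illustration of what a pynvml-based VRAM reading involves, the sketch below takes a single sample over a set of devices; the function name and structure are assumptions for illustration, not the repository's actual tracker API, which would need to poll continuously to capture a peak.

# Minimal sketch of a pynvml-based VRAM reading (illustrative only, not the
# optimum-benchmark tracker API). Assumes an NVIDIA GPU and the nvidia-ml-py
# package, which provides the pynvml module.
import pynvml

def vram_used_mb(device_ids):
    """Return the VRAM currently used, in MB, summed over device_ids."""
    pynvml.nvmlInit()
    try:
        used = 0
        for index in device_ids:
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            used += pynvml.nvmlDeviceGetMemoryInfo(handle).used  # bytes
        return used / 1e6
    finally:
        pynvml.nvmlShutdown()

print(vram_used_mb([0]))  # one sample; a real tracker polls for the peak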
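Several later commits ("sync processes on each metric tracking", "sync processes before each measure to avoid hanging", "use monitored barrier") are about keeping distributed ranks in lockstep around each measurement. A hedged sketch of that idea with torch.distributed follows; the function name and timeout are assumptions, not the repository's code.

# Sketch of synchronizing ranks before timing a call, so a straggler cannot
# skew the measurement or leave the other ranks hanging. Illustrative only;
# the function name and timeout are assumptions.
from datetime import timedelta
import time

import torch.distributed as dist

def timed(fn):
    if dist.is_initialized():
        # monitored_barrier (gloo backend only) raises a descriptive error if
        # any rank misses the barrier within the timeout, instead of hanging.
        dist.monitored_barrier(timeout=timedelta(seconds=300))
    start = time.perf_counter()
    fn()
    return time.perf_counter() - start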
Files changed
15 changes: 9 additions & 6 deletions .github/workflows/test_api_cuda.yaml
@@ -18,11 +18,11 @@ jobs:
matrix:
image:
[
{ torch_cuda: cu121, cuda_version: 12.1.1 },
{ torch_cuda: cu118, cuda_version: 11.8.0 },
{ torch_cuda: cu118, torch_pre_release: 0, cuda_version: 11.8.0 },
{ torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
]

runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -37,17 +37,20 @@ jobs:
--tag opt-bench-cuda:${{ matrix.image.cuda_version }}
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-cuda:${{ matrix.image.cuda_version }}
-c "pip install -e .[testing,timm,diffusers] && pytest -k 'api and cuda' -x"
6 changes: 2 additions & 4 deletions .github/workflows/test_api_rocm.yaml
@@ -19,10 +19,10 @@ jobs:
image:
[
{ torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 },
]

runs-on: hf-amd-mi210-dev
runs-on: amd-gpu
steps:
- name: Checkout code
uses: actions/checkout@v3
@@ -41,11 +41,9 @@ jobs:
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--device /dev/kfd
10 changes: 7 additions & 3 deletions .github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
build_image_and_run_cli_cuda_onnxruntime_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -28,16 +28,20 @@ jobs:
--tag opt-bench-cuda:11.8.0
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
15 changes: 9 additions & 6 deletions .github/workflows/test_cli_cuda_pytorch.yaml
@@ -18,11 +18,11 @@ jobs:
matrix:
image:
[
{ torch_cuda: cu121, cuda_version: 12.1.1 },
{ torch_cuda: cu118, cuda_version: 11.8.0 },
{ torch_cuda: cu118, torch_pre_release: 0, cuda_version: 11.8.0 },
{ torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
]

runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -37,17 +37,20 @@ jobs:
--tag opt-bench-cuda:${{ matrix.image.cuda_version }}
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-cuda:${{ matrix.image.cuda_version }}
-c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x"
10 changes: 7 additions & 3 deletions .github/workflows/test_cli_cuda_torch_ort.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
build_image_and_run_cli_cuda_torch_ort_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -28,16 +28,20 @@ jobs:
--tag opt-bench-cuda:11.8.0
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
10 changes: 4 additions & 6 deletions .github/workflows/test_cli_rocm_onnxruntime.yaml
@@ -13,15 +13,15 @@ concurrency:

jobs:
build_image_and_run_cli_rocm_onnxruntime_tests:
runs-on: hf-amd-mi210-dev
runs-on: amd-gpu
steps:
- name: Checkout
uses: actions/checkout@v3

- name: Check if image exists
id: check_image
run: |
if [[ "$(docker images -q opt-bench-rocm-ort:5.7 2> /dev/null)" == "" ]]; then
if [[ "$(docker images -q opt-bench-rocm-ort:latest 2> /dev/null)" == "" ]]; then
echo "::set-output name=exists::false"
else
echo "::set-output name=exists::true"
@@ -33,14 +33,12 @@ jobs:
--file docker/rocm-ort.dockerfile
--build-arg USER_ID=$(id -u)
--build-arg GROUP_ID=$(id -g)
--build-arg ROCM_VERSION=5.7
--tag opt-bench-rocm-ort:5.7
--tag opt-bench-rocm-ort:latest
.

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
@@ -50,5 +48,5 @@
--device /dev/dri/renderD128
--device /dev/dri/renderD129
--entrypoint /bin/bash
opt-bench-rocm-ort:5.7
opt-bench-rocm-ort:latest
-c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x"
6 changes: 2 additions & 4 deletions .github/workflows/test_cli_rocm_pytorch.yaml
@@ -19,10 +19,10 @@ jobs:
image:
[
{ torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 },
]

runs-on: hf-amd-mi210-dev
runs-on: [amd-gpu]
steps:
- name: Checkout code
uses: actions/checkout@v3
@@ -41,11 +41,9 @@ jobs:
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--device /dev/kfd
14 changes: 8 additions & 6 deletions .github/workflows/test_cli_tensorrt_llm.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
pull_image_and_run_cli_tensorrt_llm_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -26,18 +26,20 @@ jobs:
--tag opt-bench-tensorrt-llm:latest
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--env USER_ID=$(id -u)
--env GROUP_ID=$(id -g)
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-tensorrt-llm:latest
-c "pip install -e .[testing] && pytest -k 'cli and tensorrt_llm' -x"
-c "pip install -e .[testing] && pip uninstall -y nvidia-ml-py && pytest -k 'cli and tensorrt_llm' -x"
9 changes: 3 additions & 6 deletions .github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
build_image_and_run_cli_tensorrt_onnxruntime_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -23,21 +23,18 @@ jobs:
--file docker/tensorrt.dockerfile
--build-arg USER_ID=$(id -u)
--build-arg GROUP_ID=$(id -g)
--build-arg TENSORRT_VERSION=22.12
--build-arg TORCH_CUDA=cu118
--tag opt-bench-tensorrt:22.12
--tag opt-bench-tensorrt:latest
.

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-tensorrt:22.12
opt-bench-tensorrt:latest
-c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x"
1 change: 1 addition & 0 deletions .gitignore
@@ -171,3 +171,4 @@ actions-runner/
experiments/
examples/
.engine/
amdsmi