Use the GPUs with the most free memory in CI workflows

huggingface · Feb 19, 2024 · b67d1f6
1 parent b38b100
commit b67d1f6
Show file tree

Hide file tree

Showing 7 changed files with 35 additions and 13 deletions.
diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml
@@ -37,15 +37,20 @@ jobs:
           --tag opt-bench-cuda:${{ matrix.image.cuda_version }}
           .
 
+      - name: Get GPUs with most free memory  # selects the (up to) two GPU indices reporting the most free memory
+        id: get_devices  # exposes the result as steps.get_devices.outputs.devices (e.g. "3,1")
+        run: |  # appends to $GITHUB_OUTPUT; the ::set-output workflow command is deprecated
+          echo "devices=$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | paste -sd, -)" >> "$GITHUB_OUTPUT"
+
       - name: Run tests
         run: docker run
           --rm
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
+          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
-          --gpus '"device=0,1"'
           --entrypoint /bin/bash
           opt-bench-cuda:${{ matrix.image.cuda_version }}
           -c "pip install -e .[testing,timm,diffusers] && pytest -k 'api and cuda' -x"
diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -28,15 +28,20 @@ jobs:
           --tag opt-bench-cuda:11.8.0
           .
 
+      - name: Get GPUs with most free memory  # selects the (up to) two GPU indices reporting the most free memory
+        id: get_devices  # exposes the result as steps.get_devices.outputs.devices (e.g. "3,1")
+        run: |  # appends to $GITHUB_OUTPUT; the ::set-output workflow command is deprecated
+          echo "devices=$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | paste -sd, -)" >> "$GITHUB_OUTPUT"
+
       - name: Run tests
         run: docker run
           --rm
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
           --entrypoint /bin/bash
+          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
-          --gpus '"device=0,1"'
           opt-bench-cuda:11.8.0
           -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml
@@ -37,15 +37,20 @@ jobs:
           --tag opt-bench-cuda:${{ matrix.image.cuda_version }}
           .
 
+      - name: Get GPUs with most free memory  # selects the (up to) two GPU indices reporting the most free memory
+        id: get_devices  # exposes the result as steps.get_devices.outputs.devices (e.g. "3,1")
+        run: |  # appends to $GITHUB_OUTPUT; the ::set-output workflow command is deprecated
+          echo "devices=$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | paste -sd, -)" >> "$GITHUB_OUTPUT"
+
       - name: Run tests
         run: docker run
           --rm
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
+          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
-          --gpus '"device=0,1"'
           --entrypoint /bin/bash
           opt-bench-cuda:${{ matrix.image.cuda_version }}
           -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x"
diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml
@@ -28,15 +28,20 @@ jobs:
           --tag opt-bench-cuda:11.8.0
           .
 
+      - name: Get GPUs with most free memory  # selects the (up to) two GPU indices reporting the most free memory
+        id: get_devices  # exposes the result as steps.get_devices.outputs.devices (e.g. "3,1")
+        run: |  # appends to $GITHUB_OUTPUT; the ::set-output workflow command is deprecated
+          echo "devices=$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | paste -sd, -)" >> "$GITHUB_OUTPUT"
+
       - name: Run tests
         run: docker run
           --rm
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
           --entrypoint /bin/bash
+          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
-          --gpus '"device=0,1"'
           opt-bench-cuda:11.8.0
           -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
diff --git a/.github/workflows/test_cli_rocm_onnxruntime.yaml b/.github/workflows/test_cli_rocm_onnxruntime.yaml
@@ -21,7 +21,7 @@ jobs:
       - name: Check if image exists
         id: check_image
         run: |
-          if [[ "$(docker images -q opt-bench-rocm-ort:5.7 2> /dev/null)" == "" ]]; then
+          if [[ "$(docker images -q opt-bench-rocm-ort:latest 2> /dev/null)" == "" ]]; then
             echo "::set-output name=exists::false"
           else
             echo "::set-output name=exists::true"
@@ -33,8 +33,7 @@ jobs:
           --file docker/rocm-ort.dockerfile
           --build-arg USER_ID=$(id -u)
           --build-arg GROUP_ID=$(id -g)
-          --build-arg ROCM_VERSION=5.7
-          --tag opt-bench-rocm-ort:5.7
+          --tag opt-bench-rocm-ort:latest
           .
 
       - name: Run tests
@@ -49,5 +48,5 @@ jobs:
           --device /dev/dri/renderD128
           --device /dev/dri/renderD129
           --entrypoint /bin/bash
-          opt-bench-rocm-ort:5.7
+          opt-bench-rocm-ort:latest
           -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x"
diff --git a/.github/workflows/test_cli_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml
@@ -26,15 +26,20 @@ jobs:
           --tag opt-bench-tensorrt-llm:latest
           .
 
+      - name: Get GPUs with most free memory  # selects the (up to) two GPU indices reporting the most free memory
+        id: get_devices  # exposes the result as steps.get_devices.outputs.devices (e.g. "3,1")
+        run: |  # appends to $GITHUB_OUTPUT; the ::set-output workflow command is deprecated
+          echo "devices=$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | paste -sd, -)" >> "$GITHUB_OUTPUT"
+
       - name: Run tests
         run: docker run
           --rm
           --pid host
           --shm-size 64G
           --env USE_CUDA="1"
+          --gpus '"device=${{ steps.get_devices.outputs.devices }}"'
           --volume $(pwd):/workspace/optimum-benchmark
           --workdir /workspace/optimum-benchmark
-          --gpus '"device=0,1"'
           --entrypoint /bin/bash
           opt-bench-tensorrt-llm:latest
           -c "pip install -e .[testing] && pip uninstall -y nvidia-ml-py && pytest -k 'cli and tensorrt_llm' -x"
diff --git a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -23,9 +23,7 @@ jobs:
           --file docker/tensorrt.dockerfile
           --build-arg USER_ID=$(id -u)
           --build-arg GROUP_ID=$(id -g)
-          --build-arg TENSORRT_VERSION=22.12
-          --build-arg TORCH_CUDA=cu118
-          --tag opt-bench-tensorrt:22.12
+          --tag opt-bench-tensorrt:latest
           .
 
       - name: Run tests
@@ -38,5 +36,5 @@ jobs:
           --workdir /workspace/optimum-benchmark
           --gpus '"device=0,1"'
           --entrypoint /bin/bash
-          opt-bench-tensorrt:22.12
+          opt-bench-tensorrt:latest
           -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x"