[feature][refactor] Better Metrics and Trackers (#124)
IlyasMoutawwakil authored Feb 19, 2024
1 parent ccf225a commit 8029a71
Showing 93 changed files with 1,863 additions and 2,026 deletions.
15 changes: 9 additions & 6 deletions .github/workflows/test_api_cuda.yaml
@@ -18,11 +18,11 @@ jobs:
matrix:
image:
[
{ torch_cuda: cu121, cuda_version: 12.1.1 },
{ torch_cuda: cu118, cuda_version: 11.8.0 },
{ torch_cuda: cu118, torch_pre_release: 0, cuda_version: 11.8.0 },
{ torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
]

runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -37,17 +37,20 @@ jobs:
--tag opt-bench-cuda:${{ matrix.image.cuda_version }}
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-cuda:${{ matrix.image.cuda_version }}
-c "pip install -e .[testing,timm,diffusers] && pytest -k 'api and cuda' -x"
6 changes: 2 additions & 4 deletions .github/workflows/test_api_rocm.yaml
@@ -19,10 +19,10 @@ jobs:
image:
[
{ torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 },
]

runs-on: hf-amd-mi210-dev
runs-on: amd-gpu
steps:
- name: Checkout code
uses: actions/checkout@v3
@@ -41,11 +41,9 @@ jobs:
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--device /dev/kfd
10 changes: 7 additions & 3 deletions .github/workflows/test_cli_cuda_onnxruntime.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
build_image_and_run_cli_cuda_onnxruntime_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -28,16 +28,20 @@ jobs:
--tag opt-bench-cuda:11.8.0
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x"
15 changes: 9 additions & 6 deletions .github/workflows/test_cli_cuda_pytorch.yaml
@@ -18,11 +18,11 @@ jobs:
matrix:
image:
[
{ torch_cuda: cu121, cuda_version: 12.1.1 },
{ torch_cuda: cu118, cuda_version: 11.8.0 },
{ torch_cuda: cu118, torch_pre_release: 0, cuda_version: 11.8.0 },
{ torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 },
]

runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -37,17 +37,20 @@ jobs:
--tag opt-bench-cuda:${{ matrix.image.cuda_version }}
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-cuda:${{ matrix.image.cuda_version }}
-c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x"
10 changes: 7 additions & 3 deletions .github/workflows/test_cli_cuda_torch_ort.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
build_image_and_run_cli_cuda_torch_ort_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -28,16 +28,20 @@ jobs:
--tag opt-bench-cuda:11.8.0
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--entrypoint /bin/bash
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
opt-bench-cuda:11.8.0
-c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x"
10 changes: 4 additions & 6 deletions .github/workflows/test_cli_rocm_onnxruntime.yaml
@@ -13,15 +13,15 @@ concurrency:

jobs:
build_image_and_run_cli_rocm_onnxruntime_tests:
runs-on: hf-amd-mi210-dev
runs-on: amd-gpu
steps:
- name: Checkout
uses: actions/checkout@v3

- name: Check if image exists
id: check_image
run: |
if [[ "$(docker images -q opt-bench-rocm-ort:5.7 2> /dev/null)" == "" ]]; then
if [[ "$(docker images -q opt-bench-rocm-ort:latest 2> /dev/null)" == "" ]]; then
echo "::set-output name=exists::false"
else
echo "::set-output name=exists::true"
@@ -33,14 +33,12 @@ jobs:
--file docker/rocm-ort.dockerfile
--build-arg USER_ID=$(id -u)
--build-arg GROUP_ID=$(id -g)
--build-arg ROCM_VERSION=5.7
--tag opt-bench-rocm-ort:5.7
--tag opt-bench-rocm-ort:latest
.

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
Expand All @@ -50,5 +48,5 @@ jobs:
--device /dev/dri/renderD128
--device /dev/dri/renderD129
--entrypoint /bin/bash
opt-bench-rocm-ort:5.7
opt-bench-rocm-ort:latest
-c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x"
6 changes: 2 additions & 4 deletions .github/workflows/test_cli_rocm_pytorch.yaml
@@ -19,10 +19,10 @@ jobs:
image:
[
{ torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 },
{ torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 },
]

runs-on: hf-amd-mi210-dev
runs-on: [amd-gpu]
steps:
- name: Checkout code
uses: actions/checkout@v3
@@ -41,11 +41,9 @@ jobs:
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_ROCM="1"
--volume $HOME/.cache/huggingface:/home/user/.cache/huggingface
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--device /dev/kfd
14 changes: 8 additions & 6 deletions .github/workflows/test_cli_tensorrt_llm.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
pull_image_and_run_cli_tensorrt_llm_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -26,18 +26,20 @@ jobs:
--tag opt-bench-tensorrt-llm:latest
.

- name: Get GPUs with most free memory
id: get_devices
run: |
echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')"
- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--env USER_ID=$(id -u)
--env GROUP_ID=$(id -g)
--gpus '"device=${{ steps.get_devices.outputs.devices }}"'
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-tensorrt-llm:latest
-c "pip install -e .[testing] && pytest -k 'cli and tensorrt_llm' -x"
-c "pip install -e .[testing] && pip uninstall -y nvidia-ml-py && pytest -k 'cli and tensorrt_llm' -x"
9 changes: 3 additions & 6 deletions .github/workflows/test_cli_tensorrt_onnxruntime.yaml
@@ -13,7 +13,7 @@ concurrency:

jobs:
build_image_and_run_cli_tensorrt_onnxruntime_tests:
runs-on: hf-dgx-01
runs-on: nvidia-gpu
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -23,21 +23,18 @@ jobs:
--file docker/tensorrt.dockerfile
--build-arg USER_ID=$(id -u)
--build-arg GROUP_ID=$(id -g)
--build-arg TENSORRT_VERSION=22.12
--build-arg TORCH_CUDA=cu118
--tag opt-bench-tensorrt:22.12
--tag opt-bench-tensorrt:latest
.

- name: Run tests
run: docker run
--rm
--net host
--pid host
--shm-size 64G
--env USE_CUDA="1"
--volume $(pwd):/workspace/optimum-benchmark
--workdir /workspace/optimum-benchmark
--gpus '"device=0,1"'
--entrypoint /bin/bash
opt-bench-tensorrt:22.12
opt-bench-tensorrt:latest
-c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x"
1 change: 1 addition & 0 deletions .gitignore
@@ -171,3 +171,4 @@ actions-runner/
experiments/
examples/
.engine/
amdsmi