From 8029a711d880be393f91c43d85ee2ddab838a2d5 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 19 Feb 2024 08:25:33 +0100 Subject: [PATCH] [feature][refactor] Better Metrics and Trackers (#124) --- .github/workflows/test_api_cuda.yaml | 15 +- .github/workflows/test_api_rocm.yaml | 6 +- .../workflows/test_cli_cuda_onnxruntime.yaml | 10 +- .github/workflows/test_cli_cuda_pytorch.yaml | 15 +- .../workflows/test_cli_cuda_torch_ort.yaml | 10 +- .../workflows/test_cli_rocm_onnxruntime.yaml | 10 +- .github/workflows/test_cli_rocm_pytorch.yaml | 6 +- .github/workflows/test_cli_tensorrt_llm.yaml | 14 +- .../test_cli_tensorrt_onnxruntime.yaml | 9 +- .gitignore | 1 + Makefile | 61 ++- README.md | 115 +++--- docker/cpu.dockerfile | 1 - docker/cuda.dockerfile | 4 +- docker/rocm-ort.dockerfile | 5 +- docker/tensorrt.dockerfile | 2 +- examples/api_launch.py | 21 ++ examples/pytorch_bert.yaml | 6 +- optimum_benchmark/backends/base.py | 24 +- optimum_benchmark/backends/config.py | 54 +-- optimum_benchmark/backends/diffusers_utils.py | 2 +- .../backends/neural_compressor/backend.py | 32 +- .../backends/neural_compressor/config.py | 10 +- .../backends/onnxruntime/backend.py | 60 ++- .../backends/onnxruntime/config.py | 21 +- .../backends/onnxruntime/utils.py | 8 +- .../backends/openvino/backend.py | 34 +- optimum_benchmark/backends/openvino/config.py | 2 +- optimum_benchmark/backends/openvino/utils.py | 6 +- optimum_benchmark/backends/peft_utils.py | 11 +- optimum_benchmark/backends/pytorch/backend.py | 61 ++- optimum_benchmark/backends/pytorch/config.py | 15 +- .../backends/tensorrt_llm/backend.py | 10 +- .../backends/tensorrt_llm/config.py | 5 +- .../text_generation_inference/backend.py | 37 +- optimum_benchmark/backends/timm_utils.py | 2 +- .../backends/torch_ort/backend.py | 24 +- .../backends/torch_ort/config.py | 2 +- .../backends/transformers_utils.py | 20 +- optimum_benchmark/benchmarks/base.py | 2 +- optimum_benchmark/benchmarks/config.py | 5 +- .../benchmarks/inference/benchmark.py | 218 ++++++----- .../benchmarks/inference/callback.py | 25 -- .../benchmarks/inference/config.py | 19 +- .../benchmarks/inference/report.py | 353 ------------------ optimum_benchmark/benchmarks/report.py | 130 ++++++- .../benchmarks/training/benchmark.py | 90 +++-- .../benchmarks/training/callback.py | 43 --- .../benchmarks/training/config.py | 9 +- .../benchmarks/training/report.py | 169 --------- optimum_benchmark/benchmarks/utils.py | 1 - optimum_benchmark/cli.py | 47 +-- optimum_benchmark/env_utils.py | 175 --------- optimum_benchmark/experiment.py | 161 ++++---- .../generators/input_generator.py | 20 +- .../generators/task_generator.py | 90 +---- optimum_benchmark/import_utils.py | 57 ++- optimum_benchmark/launchers/base.py | 5 +- optimum_benchmark/launchers/config.py | 4 +- .../launchers/inline/launcher.py | 17 +- .../launchers/isolation_utils.py | 75 ++-- .../launchers/process/launcher.py | 65 ++-- .../launchers/torchrun/config.py | 2 +- .../launchers/torchrun/launcher.py | 85 ++--- optimum_benchmark/logging_utils.py | 27 +- optimum_benchmark/system_utils.py | 219 +++++++++++ optimum_benchmark/task_utils.py | 52 +-- optimum_benchmark/trackers/energy.py | 115 ++++-- optimum_benchmark/trackers/latency.py | 252 ++++++++++--- optimum_benchmark/trackers/memory.py | 338 +++++++++++------ pyproject.toml | 13 + setup.py | 27 +- tests/configs/_base_.yaml | 13 +- tests/configs/_bert_sweep_.yaml | 3 +- ..._lm_naive_mp_.yaml => _gpt_naive_mp_.yaml} | 6 +- 
.../{_lm_peft_.yaml => _gpt_peft_.yaml} | 0 .../{_lm_sweep_.yaml => _gpt_sweep_.yaml} | 1 - ...nference_neural_compressor_gpt_sweep.yaml} | 4 +- ... cpu_inference_onnxruntime_gpt_sweep.yaml} | 4 +- ... => cpu_inference_openvino_gpt_sweep.yaml} | 4 +- ...l => cpu_inference_pytorch_gpt_sweep.yaml} | 4 +- ...ml => cpu_training_pytorch_gpt_sweep.yaml} | 4 +- ...cuda_inference_onnxruntime_gpt_sweep.yaml} | 4 +- ... cuda_inference_pytorch_gpt_naive_mp.yaml} | 4 +- ... => cuda_inference_pytorch_gpt_sweep.yaml} | 4 +- ...> cuda_training_pytorch_gpt_naive_mp.yaml} | 4 +- ...ml => cuda_training_pytorch_gpt_peft.yaml} | 4 +- ...l => cuda_training_pytorch_gpt_sweep.yaml} | 4 +- ... => cuda_training_torch_ort_gpt_peft.yaml} | 4 +- ...=> cuda_training_torch_ort_gpt_sweep.yaml} | 4 +- ...rocm_inference_onnxruntime_gpt_sweep.yaml} | 4 +- tests/test_api.py | 138 +++---- tests/test_cli.py | 11 +- 93 files changed, 1863 insertions(+), 2026 deletions(-) create mode 100644 examples/api_launch.py delete mode 100644 optimum_benchmark/benchmarks/inference/callback.py delete mode 100644 optimum_benchmark/benchmarks/inference/report.py delete mode 100644 optimum_benchmark/benchmarks/training/callback.py delete mode 100644 optimum_benchmark/benchmarks/training/report.py delete mode 100644 optimum_benchmark/benchmarks/utils.py delete mode 100644 optimum_benchmark/env_utils.py create mode 100644 optimum_benchmark/system_utils.py rename tests/configs/{_lm_naive_mp_.yaml => _gpt_naive_mp_.yaml} (100%) rename tests/configs/{_lm_peft_.yaml => _gpt_peft_.yaml} (100%) rename tests/configs/{_lm_sweep_.yaml => _gpt_sweep_.yaml} (81%) rename tests/configs/{cpu_inference_neural_compressor_lm_sweep.yaml => cpu_inference_neural_compressor_gpt_sweep.yaml} (70%) rename tests/configs/{cpu_inference_onnxruntime_lm_sweep.yaml => cpu_inference_onnxruntime_gpt_sweep.yaml} (71%) rename tests/configs/{cpu_inference_openvino_lm_sweep.yaml => cpu_inference_openvino_gpt_sweep.yaml} (71%) rename tests/configs/{cpu_inference_pytorch_lm_sweep.yaml => cpu_inference_pytorch_gpt_sweep.yaml} (72%) rename tests/configs/{cpu_training_pytorch_lm_sweep.yaml => cpu_training_pytorch_gpt_sweep.yaml} (72%) rename tests/configs/{cuda_inference_onnxruntime_lm_sweep.yaml => cuda_inference_onnxruntime_gpt_sweep.yaml} (71%) rename tests/configs/{cuda_inference_pytorch_lm_naive_mp.yaml => cuda_inference_pytorch_gpt_naive_mp.yaml} (70%) rename tests/configs/{cuda_inference_pytorch_lm_sweep.yaml => cuda_inference_pytorch_gpt_sweep.yaml} (72%) rename tests/configs/{cuda_training_pytorch_lm_naive_mp.yaml => cuda_training_pytorch_gpt_naive_mp.yaml} (70%) rename tests/configs/{cuda_training_pytorch_lm_peft.yaml => cuda_training_pytorch_gpt_peft.yaml} (69%) rename tests/configs/{cuda_training_pytorch_lm_sweep.yaml => cuda_training_pytorch_gpt_sweep.yaml} (69%) rename tests/configs/{cuda_training_torch_ort_lm_peft.yaml => cuda_training_torch_ort_gpt_peft.yaml} (69%) rename tests/configs/{cuda_training_torch_ort_lm_sweep.yaml => cuda_training_torch_ort_gpt_sweep.yaml} (69%) rename tests/configs/{rocm_inference_onnxruntime_lm_sweep.yaml => rocm_inference_onnxruntime_gpt_sweep.yaml} (71%) diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml index fe08f29d..28d9b435 100644 --- a/.github/workflows/test_api_cuda.yaml +++ b/.github/workflows/test_api_cuda.yaml @@ -18,11 +18,11 @@ jobs: matrix: image: [ - { torch_cuda: cu121, cuda_version: 12.1.1 }, - { torch_cuda: cu118, cuda_version: 11.8.0 }, + { torch_cuda: cu118, torch_pre_release: 0, 
cuda_version: 11.8.0 }, + { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 }, ] - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -37,17 +37,20 @@ jobs: --tag opt-bench-cuda:${{ matrix.image.cuda_version }} . + - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} -c "pip install -e .[testing,timm,diffusers] && pytest -k 'api and cuda' -x" diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml index 31328eb3..7e2bf63a 100644 --- a/.github/workflows/test_api_rocm.yaml +++ b/.github/workflows/test_api_rocm.yaml @@ -19,10 +19,10 @@ jobs: image: [ { torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 }, - { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 }, + { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 }, ] - runs-on: hf-amd-mi210-dev + runs-on: amd-gpu steps: - name: Checkout code uses: actions/checkout@v3 @@ -41,11 +41,9 @@ jobs: - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_ROCM="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark --device /dev/kfd diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml index 0b03608e..adb31be3 100644 --- a/.github/workflows/test_cli_cuda_onnxruntime.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_cuda_onnxruntime_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -28,16 +28,20 @@ jobs: --tag opt-bench-cuda:11.8.0 . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" --entrypoint /bin/bash + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' opt-bench-cuda:11.8.0 -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and cuda and onnxruntime' -x" diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 1b3fd99f..204722db 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -18,11 +18,11 @@ jobs: matrix: image: [ - { torch_cuda: cu121, cuda_version: 12.1.1 }, - { torch_cuda: cu118, cuda_version: 11.8.0 }, + { torch_cuda: cu118, torch_pre_release: 0, cuda_version: 11.8.0 }, + { torch_cuda: cu121, torch_pre_release: 1, cuda_version: 12.1.1 }, ] - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -37,17 +37,20 @@ jobs: --tag opt-bench-cuda:${{ matrix.image.cuda_version }} . + - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x" diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml index 71bfd33e..680f3f0f 100644 --- a/.github/workflows/test_cli_cuda_torch_ort.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_cuda_torch_ort_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -28,16 +28,20 @@ jobs: --tag opt-bench-cuda:11.8.0 . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" --entrypoint /bin/bash + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' opt-bench-cuda:11.8.0 -c "pip install -e .[testing,torch-ort,peft] && python -m torch_ort.configure && pytest -k 'cli and cuda and torch_ort' -x" diff --git a/.github/workflows/test_cli_rocm_onnxruntime.yaml b/.github/workflows/test_cli_rocm_onnxruntime.yaml index fcd0f53d..8be58292 100644 --- a/.github/workflows/test_cli_rocm_onnxruntime.yaml +++ b/.github/workflows/test_cli_rocm_onnxruntime.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_rocm_onnxruntime_tests: - runs-on: hf-amd-mi210-dev + runs-on: amd-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -21,7 +21,7 @@ jobs: - name: Check if image exists id: check_image run: | - if [[ "$(docker images -q opt-bench-rocm-ort:5.7 2> /dev/null)" == "" ]]; then + if [[ "$(docker images -q opt-bench-rocm-ort:latest 2> /dev/null)" == "" ]]; then echo "::set-output name=exists::false" else echo "::set-output name=exists::true" @@ -33,14 +33,12 @@ jobs: --file docker/rocm-ort.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) - --build-arg ROCM_VERSION=5.7 - --tag opt-bench-rocm-ort:5.7 + --tag opt-bench-rocm-ort:latest . - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_ROCM="1" @@ -50,5 +48,5 @@ jobs: --device /dev/dri/renderD128 --device /dev/dri/renderD129 --entrypoint /bin/bash - opt-bench-rocm-ort:5.7 + opt-bench-rocm-ort:latest -c "pip install -e .[testing,timm,diffusers] && pytest -k 'cli and rocm and onnxruntime' -x" diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index 11c9e77a..c4ae7139 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -19,10 +19,10 @@ jobs: image: [ { torch_rocm: rocm5.6, torch_pre_release: 0, rocm_version: 5.6.1 }, - { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7 }, + { torch_rocm: rocm5.7, torch_pre_release: 1, rocm_version: 5.7.1 }, ] - runs-on: hf-amd-mi210-dev + runs-on: [amd-gpu] steps: - name: Checkout code uses: actions/checkout@v3 @@ -41,11 +41,9 @@ jobs: - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_ROCM="1" - --volume $HOME/.cache/huggingface:/home/user/.cache/huggingface --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark --device /dev/kfd diff --git a/.github/workflows/test_cli_tensorrt_llm.yaml b/.github/workflows/test_cli_tensorrt_llm.yaml index 0169fca5..40438055 100644 --- a/.github/workflows/test_cli_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_tensorrt_llm.yaml @@ -13,7 +13,7 @@ concurrency: jobs: pull_image_and_run_cli_tensorrt_llm_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -26,18 +26,20 @@ jobs: --tag opt-bench-tensorrt-llm:latest . 
+ - name: Get GPUs with most free memory + id: get_devices + run: | + echo "::set-output name=devices::$(nvidia-smi --query-gpu=memory.free,index --format=csv,noheader,nounits | sort -n -k1 | tail -n 2 | awk -F', ' '{print $2}' | xargs echo -n | sed 's/ /,/g' | awk '{print $0}')" + - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" - --env USER_ID=$(id -u) - --env GROUP_ID=$(id -g) + --gpus '"device=${{ steps.get_devices.outputs.devices }}"' --volume $(pwd):/workspace/optimum-benchmark --workdir /workspace/optimum-benchmark - --gpus '"device=0,1"' --entrypoint /bin/bash opt-bench-tensorrt-llm:latest - -c "pip install -e .[testing] && pytest -k 'cli and tensorrt_llm' -x" + -c "pip install -e .[testing] && pip uninstall -y nvidia-ml-py && pytest -k 'cli and tensorrt_llm' -x" diff --git a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml index 92f425e7..a98bfc15 100644 --- a/.github/workflows/test_cli_tensorrt_onnxruntime.yaml +++ b/.github/workflows/test_cli_tensorrt_onnxruntime.yaml @@ -13,7 +13,7 @@ concurrency: jobs: build_image_and_run_cli_tensorrt_onnxruntime_tests: - runs-on: hf-dgx-01 + runs-on: nvidia-gpu steps: - name: Checkout uses: actions/checkout@v3 @@ -23,15 +23,12 @@ jobs: --file docker/tensorrt.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) - --build-arg TENSORRT_VERSION=22.12 - --build-arg TORCH_CUDA=cu118 - --tag opt-bench-tensorrt:22.12 + --tag opt-bench-tensorrt:latest . - name: Run tests run: docker run --rm - --net host --pid host --shm-size 64G --env USE_CUDA="1" @@ -39,5 +36,5 @@ jobs: --workdir /workspace/optimum-benchmark --gpus '"device=0,1"' --entrypoint /bin/bash - opt-bench-tensorrt:22.12 + opt-bench-tensorrt:latest -c "pip install -e .[testing,onnxruntime-gpu,diffusers,timm] && pytest -k 'cli and tensorrt and onnxruntime' -x" diff --git a/.gitignore b/.gitignore index 12c19326..a8e86c83 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,4 @@ actions-runner/ experiments/ examples/ .engine/ +amdsmi \ No newline at end of file diff --git a/Makefile b/Makefile index 55e44e1e..0253c183 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,9 @@ # List of targets that are not associated with files -.PHONY: quality style install install_dev_cpu install_dev_gpu +.PHONY: quality style install \ + build_docker_cpu, build_docker_cuda, build_docker_rocm, \ + test_cli_cpu_pytorch, test_cli_rocm_pytorch, \ + test_cli_cpu_neural_compressor, test_cli_cpu_onnxruntime, test_cli_cpu_openvino, \ + test_api_cpu, test_api_cuda, test_api_rocm, test_api_misc quality: ruff check . @@ -13,13 +17,13 @@ install: pip install -e . build_docker_cpu: - docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) -t opt-bench-cpu:latest . + docker build -f docker/cpu.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-cpu:latest . build_docker_cuda: - docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_CUDA=cu118 --build-arg CUDA_VERSION=11.8.0 -t opt-bench-cuda:11.8.0 . + docker build -f docker/cuda.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-cuda:latest . build_docker_rocm: - docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(id -u) --build-arg GROUP_ID=$(id -g) --build-arg TORCH_ROCM=rocm5.6 --build-arg ROCM_VERSION=5.6.1 -t opt-bench-rocm:5.6.1 . 
+ docker build -f docker/rocm.dockerfile --build-arg USER_ID=$(shell id -u) --build-arg GROUP_ID=$(shell id -g) -t opt-bench-rocm:latest . test_cli_cpu_neural_compressor: docker run \ @@ -27,23 +31,23 @@ test_cli_cpu_neural_compressor: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,neural-compressor,diffusers,timm] && pytest tests/ -k 'cli and cpu and neural_compressor' -x" -test_cli_cpu_openvino: +test_cli_cpu_onnxruntime: docker run \ --rm \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers] && pytest tests/ -k 'cli and cpu and openvino' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" -test_cli_cpu_onnxruntime: +test_cli_cpu_openvino: docker run \ --rm \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,onnxruntime,diffusers,timm] && pytest tests/ -k 'cli and cpu and onnxruntime' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,openvino,diffusers,timm] && pytest tests/ -k 'cli and cpu and openvino' -x" test_cli_cpu_pytorch: docker run \ @@ -53,13 +57,34 @@ test_cli_cpu_pytorch: --workdir /workspace \ opt-bench-cpu:latest -c "pip install -e .[testing,diffusers,timm] && pytest tests/ -k 'cli and cpu and pytorch' -x" +test_cli_rocm_pytorch: + docker run \ + --rm \ + --device=/dev/kfd \ + --device /dev/dri/renderD128 \ + --device /dev/dri/renderD129 \ + --group-add video \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-rocm:latest -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest tests/ -k 'cli and cuda and pytorch' -x" + +test_cli_cuda_pytorch: + docker run \ + --rm \ + --gpus '"device=0,1"' \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-cuda:latest -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest tests/ -k 'cli and cuda and pytorch' -x" + test_api_cpu: docker run \ --rm \ --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cpu' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cpu' -x" test_api_cuda: docker run \ @@ -68,7 +93,19 @@ test_api_cuda: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cuda:11.8.0 -c "pip install -e .[testing,timm,diffusers,codecarbon] && pytest tests/ -k 'api and cuda' -x" + opt-bench-cuda:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x" + +test_api_rocm: + docker run \ + --rm \ + --device=/dev/kfd \ + --device /dev/dri/renderD128 \ + --device /dev/dri/renderD129 \ + --group-add video \ + --entrypoint /bin/bash \ + --volume $(PWD):/workspace \ + --workdir /workspace \ + opt-bench-rocm:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and cuda' -x" test_api_misc: docker run \ @@ -76,4 +113,4 @@ test_api_misc: --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers,codecarbon] 
&& pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" + opt-bench-cpu:latest -c "pip install -e .[testing,timm,diffusers] && pytest tests/ -k 'api and not (cpu or cuda or rocm or tensorrt)' -x" diff --git a/README.md b/README.md index e338b888..49889327 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,42 @@ -

-[Optimum-Benchmark Logo]
+[Optimum-Benchmark Logo]

All benchmarks are wrong, some will cost you less than the others.

Optimum-Benchmark 🏋️

-Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-).
+Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices-) utility for benchmarking [Transformers](https://github.com/huggingface/transformers), [Diffusers](https://github.com/huggingface/diffusers), [PEFT](https://github.com/huggingface/peft), [TIMM](https://github.com/huggingface/pytorch-image-models) and [Optimum](https://github.com/huggingface/optimum) flavors, along with all their supported [optimizations & quantization schemes](#backend-features-), for [inference & training](#benchmark-features-%EF%B8%8F), in [distributed & non-distributed settings](#backend-features-), in the most correct and scalable way possible (no need to even download model weights).
-## Motivation 🤔
+*News* 📰
+- PYPI release soon.
+- Added a simple Python API to run benchmarks with all isolation and tracking features supported by the CLI.
+*Motivations* 🤔
- HF hardware partners wanting to know how their hardware performs compared to another hardware on the same models.
- HF ecosystem users wanting to know how their chosen model performs in terms of latency, throughput, memory usage, energy consumption, etc compared to another model.
- Experimenting with hardware & backend specific optimizations & quantization schemes that can be applied to models and improve their computational/memory/energy efficiency.
+*Notes* 📝
+- If you were using `optimum-benchmark` before and want to keep using the old CLI only version, you can still do so by installing from this branch [`0.0.1`](https://github.com/huggingface/optimum-benchmark/tree/0.0.1).
+ ## Current status ๐Ÿ“ˆ ### API - [![CPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cpu.yaml) [![CUDA](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_cuda.yaml) [![ROCM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_rocm.yaml) +[![MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_api_misc.yaml) ### CLI - -[![CPU Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_pytorch.yaml) -[![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_onnxruntime.yaml) -[![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_neural_compressor.yaml) -[![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cpu_openvino.yaml) -[![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_pytorch.yaml) -[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_onnxruntime_inference.yaml) -[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cuda_torch_ort_training.yaml) -[![TensorRT OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_onnxruntime_inference.yaml) -[![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_tensorrt_llm.yaml) -[![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_pytorch.yaml) -[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_rocm_onnxruntime_inference.yaml) +[![CPU Pytorch 
Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml)
+[![CPU OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
+[![CPU Intel Neural Compressor Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
+[![CPU OpenVINO Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
+[![CUDA Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch.yaml)
+[![CUDA OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml)
+[![CUDA Torch-ORT Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort.yaml)
+[![TensorRT OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_onnxruntime.yaml)
+[![TensorRT-LLM Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_tensorrt_llm.yaml)
+[![ROCm Pytorch Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch.yaml)
+[![ROCm OnnxRuntime Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_onnxruntime.yaml)
+[![MISC Tests](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml)
## Quickstart 🚀
@@ -64,46 +69,36 @@ Depending on the backends you want to use, you might need to install some extra
### Running benchmarks from Python API 🧪
-You can run benchmarks from the Python API, using the `launch` function from the `optimum_benchmark.experiment` module. Here's an example of how to run a benchmark using the `pytorch` backend, `process` launcher and `inference` benchmark.
+You can run benchmarks from the Python API, using the `launch` function. Here's an example of how to run a benchmark using the `pytorch` backend, `torchrun` launcher and `inference` benchmark.
```python from optimum_benchmark.logging_utils import setup_logging from optimum_benchmark.experiment import launch, ExperimentConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig -from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.launchers.torchrun.config import TorchrunConfig from optimum_benchmark.benchmarks.inference.config import InferenceConfig - if __name__ == "__main__": setup_logging(level="INFO") - benchmark_config = InferenceConfig(latency=False, memory=True, energy=True) - launcher_config = ProcessConfig() - backend_config = PyTorchConfig( - device="cuda", - no_weights=True, - device_ids="0,1", - device_map="auto", - model="IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm", - ) + launcher_config = TorchrunConfig(nproc_per_node=2) + benchmark_config = InferenceConfig(latency=True, memory=True) + backend_config = PyTorchConfig(model="gpt2", device="cuda", device_ids="0,1", no_weights=True) experiment_config = ExperimentConfig( - experiment_name="python-api-launch-experiment", + experiment_name="api-launch", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config, ) benchmark_report = launch(experiment_config) - benchmark_report.log_all() - # or - print(benchmark_report.to_dict()) - # or - benchmark_report.push_to_hub("IlyasMoutawwakil/vicuna-7b-v1.5-awq-gemm") + experiment_config.push_to_hub("IlyasMoutawwakil/benchmarks") # pushes experiment_config.json to the hub + benchmark_report.push_to_hub("IlyasMoutawwakil/benchmarks") # pushes benchmark_report.json to the hub ``` -Yep, it's that simple! Check the supported backends, launchers and benchmarks in the [features](#features-) section. +Yep, it's that simple! Check the supported backends, launchers and benchmarks matrix in the [features](#features-) section. ### Running benchmarks from CLI ๐Ÿƒโ€โ™‚๏ธ -You can run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension. +You can also run a benchmark using the command line by specifying the configuration directory and the configuration name. Both arguments are mandatory for [`hydra`](https://hydra.cc/). `--config-dir` is the directory where the configuration files are stored and `--config-name` is the name of the configuration file without its `.yaml` extension. ```bash optimum-benchmark --config-dir examples/ --config-name pytorch_bert @@ -111,11 +106,11 @@ optimum-benchmark --config-dir examples/ --config-name pytorch_bert This will run the benchmark using the configuration in [`examples/pytorch_bert.yaml`](examples/pytorch_bert.yaml) and store the results in `runs/pytorch_bert`. -The result files are `benchmark_report.json`, the program's logs `experiment.log` and the configuration that's been used `experiment_config.yaml`, including backend, launcher, benchmark and environment configurations. +The result files are `benchmark_report.json`, the program's logs `cli.log` and the configuration that's been used `experiment_config.json`, including backend, launcher, benchmark and environment configurations. The directory for storing these results can be changed by setting `hydra.run.dir` (and/or `hydra.sweep.dir` in case of a multirun) in the command line or in the config file. 
-### Configuration overrides 🎛️
+#### Configuration overrides 🎛️
It's easy to override the default behavior of a benchmark from the command line.
@@ -123,40 +118,17 @@ It's easy to override the default behavior of a benchmark from the command line.
optimum-benchmark --config-dir examples/ --config-name pytorch_bert backend.model=gpt2 backend.device=cuda
```
-### Configuration multirun sweeps 🧹
+#### Configuration multirun sweeps 🧹
You can easily run configuration sweeps using the `-m` or `--multirun` option. By default, configurations will be executed serially but other kinds of executions are supported with hydra's launcher plugins : `=submitit`, `hydra/launcher=rays`, etc.
-Note that the hydra launcher `hydra/launcher` is different than our own `launcher`, specifically `hydra/launcher` can only be used in `--multirun` mode, and will only handle the inter-run behavior.
```bash
optimum-benchmark --config-dir examples --config-name pytorch_bert -m backend.device=cpu,cuda
```
-Also, for integer parameters like `batch_size`, one can specify a range of values to sweep over:
-
-```bash
-optimum-benchmark --config-dir examples --config-name pytorch_bert -m device=cpu,cuda benchmark.input_shapes.batch_size='range(1,10,step=2)'
-```
-
### Configurations structure 📁
-You can create custom configuration files following the [examples here](examples).
-You can also use `hydra`'s [composition](https://hydra.cc/docs/0.11/tutorial/composition/) with a base configuration ([`examples/pytorch_bert.yaml`](examples/pytorch_bert.yaml) for example) and override/define parameters.
-
-To create a configuration that uses a `wav2vec2` model and `onnxruntime` backend, it's as easy as:
-
-```yaml
-defaults:
-  - pytorch_bert
-  - _self_
-  - override backend: onnxruntime
-
-experiment_name: onnxruntime_wav2vec2
-model: bookbot/distil-wav2vec2-adult-child-cls-37m
-device: cpu
-```
-
-Other than the [examples](examples), you can also check [tests](tests/configs/).
+You can create custom configuration files following the [examples here]([examples](https://github.com/IlyasMoutawwakil/optimum-benchmark-examples)).
## Features 🎨
@@ -171,9 +143,9 @@ Everything else is optional or inferred at runtime, but can be configured to you
### Launchers 🚀
+- [x] Distributed inference/training (`launcher=torchrun`)
- [x] Process isolation between consecutive runs (`launcher=process`)
- [x] Assert GPU devices (NVIDIA & AMD) isolation (`launcher.device_isolation=true`)
-- [x] Distributed inference/training (`launcher=torchrun`, `launcher.n_proc_per_node=2`)
### Backends & Devices 📱
@@ -191,19 +163,18 @@ Everything else is optional or inferred at runtime, but can be configured to you
### Benchmarking 🏋️
- [x] Memory tracking (`benchmark.memory=true`)
-- [x] Latency and throughput tracking of forward pass (default)
+- [x] Energy and efficiency tracking (`benchmark.energy=true`)
+- [x] Latency and throughput tracking (`benchmark.latency=true`)
- [x] Warm up runs before inference (`benchmark.warmup_runs=20`)
- [x] Warm up steps during training (`benchmark.warmup_steps=20`)
-- [x] Energy and carbon emissions tracking (`benchmark.energy=true`)
- [x] Inputs shapes control (e.g. `benchmark.input_shapes.sequence_length=128`)
- [x] Dataset shapes control (e.g.
`benchmark.dataset_shapes.dataset_size=1000`) -- [x] Latancy and throughput tracking of generation pass (auto-enabled for generative models) -- [x] Prefill latency and Decoding throughput deduced from generation and forward pass (auto-enabled for generative models) -- [x] Forward and Generation pass control (e.g. for an LLM `benchmark.generate_kwargs.max_new_tokens=100`, for a diffusion model `benchmark.forward_kwargs.num_images_per_prompt=4`) +- [x] Prefill latency and Decoding throughput deduced from Generate and Forward pass (auto-enabled for text generation models) +- [x] Forward, Call and Generate pass kwargs control (e.g. for an LLM `benchmark.generate_kwargs.max_new_tokens=100`, for a diffusion model `benchmark.call_kwargs.num_images_per_prompt=4`) ### Backend features ๐Ÿงฐ -- [x] Random weights initialization (`backend.no_weights=true` for fast model instantiation without downloading weights) +- [x] "No weights" to benchmark models without downloading their weights (`backend.no_weights=true`) - [x] Onnxruntime Quantization and AutoQuantization (`backend.quantization=true` or `backend.auto_quantization=avx2`, etc) - [x] Onnxruntime Calibration for Static Quantization (`backend.quantization_config.is_static=true`, etc) - [x] Onnxruntime Optimization and AutoOptimization (`backend.optimization=true` or `backend.auto_optimization=O4`, etc) diff --git a/docker/cpu.dockerfile b/docker/cpu.dockerfile index 371a89c8..f15db72f 100644 --- a/docker/cpu.dockerfile +++ b/docker/cpu.dockerfile @@ -1,6 +1,5 @@ FROM ubuntu:latest - # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive diff --git a/docker/cuda.dockerfile b/docker/cuda.dockerfile index a2270ffa..664895d1 100644 --- a/docker/cuda.dockerfile +++ b/docker/cuda.dockerfile @@ -13,12 +13,12 @@ # limitations under the License. ARG CUDNN_VERSION=8 -ARG CUDA_VERSION=12.1.1 +ARG CUDA_VERSION=11.8.0 ARG UBUNTU_VERSION=22.04 FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION} -ARG TORCH_CUDA=cu121 +ARG TORCH_CUDA=cu118 ARG TORCH_PRE_RELEASE=0 # Ignore interactive questions during `docker build` diff --git a/docker/rocm-ort.dockerfile b/docker/rocm-ort.dockerfile index 1dafd137..5309962f 100644 --- a/docker/rocm-ort.dockerfile +++ b/docker/rocm-ort.dockerfile @@ -13,10 +13,11 @@ # limitations under the License. 
ARG ROCM_VERSION=5.7 -ARG UBUNTU_VERSION=22.04 ARG PYTHON_VERSION=3.10 +ARG UBUNTU_VERSION=22.04 +ARG PYTORCH_VERSION=2.0.1 -FROM rocm/pytorch:rocm${ROCM_VERSION}_ubuntu${UBUNTU_VERSION}_py${PYTHON_VERSION}_pytorch_2.0.1 +FROM rocm/pytorch:rocm${ROCM_VERSION}_ubuntu${UBUNTU_VERSION}_py${PYTHON_VERSION}_pytorch_${PYTORCH_VERSION} # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive diff --git a/docker/tensorrt.dockerfile b/docker/tensorrt.dockerfile index 1e2b8603..35c84a63 100644 --- a/docker/tensorrt.dockerfile +++ b/docker/tensorrt.dockerfile @@ -16,7 +16,7 @@ ARG TENSORRT_VERSION=23.09 FROM nvcr.io/nvidia/tensorrt:${TENSORRT_VERSION}-py3 -ARG TORCH_CUDA=cu121 +ARG TORCH_CUDA=cu118 # Ignore interactive questions during `docker build` ENV DEBIAN_FRONTEND noninteractive diff --git a/examples/api_launch.py b/examples/api_launch.py new file mode 100644 index 00000000..987ec8c9 --- /dev/null +++ b/examples/api_launch.py @@ -0,0 +1,21 @@ +from optimum_benchmark.backends.pytorch.config import PyTorchConfig +from optimum_benchmark.benchmarks.inference.config import InferenceConfig +from optimum_benchmark.experiment import ExperimentConfig, launch +from optimum_benchmark.launchers.torchrun.config import TorchrunConfig +from optimum_benchmark.logging_utils import setup_logging + + +if __name__ == "__main__": + setup_logging(level="INFO") + launcher_config = TorchrunConfig(nproc_per_node=2) + benchmark_config = InferenceConfig(latency=True, memory=True) + backend_config = PyTorchConfig(model="gpt2", device="cuda", device_ids="0,1", no_weights=True) + experiment_config = ExperimentConfig( + experiment_name="api-launch", + benchmark=benchmark_config, + launcher=launcher_config, + backend=backend_config, + ) + benchmark_report = launch(experiment_config) + experiment_config.push_to_hub("IlyasMoutawwakil/benchmarks") + benchmark_report.push_to_hub("IlyasMoutawwakil/benchmarks") diff --git a/examples/pytorch_bert.yaml b/examples/pytorch_bert.yaml index 5a36147c..e3b08e87 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/pytorch_bert.yaml @@ -9,8 +9,12 @@ defaults: experiment_name: pytorch_bert +benchmark: + latency: true + memory: true + backend: - device: cpu + device: cuda device_ids: 0 model: bert-base-uncased diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index cf0f5087..2be47a11 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -1,26 +1,25 @@ import gc import random from abc import ABC -from logging import getLogger from collections import OrderedDict -from typing import Optional, ClassVar, Generic, Dict, Any +from logging import getLogger +from typing import Any, ClassVar, Dict, Generic, Optional -from .config import BackendConfigT -from ..task_utils import get_automodel_class_for_task +import numpy as np +from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState +from ..task_utils import get_automodel_class_for_task +from .config import BackendConfigT from .diffusers_utils import extract_diffusers_shapes_from_config, get_diffusers_pretrained_config -from .timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config, get_timm_pre_processor +from .timm_utils import extract_timm_shapes_from_config, get_timm_pre_processor, get_timm_pretrained_config from .transformers_utils import ( + PretrainedProcessor, extract_transformers_shapes_from_artifacts, get_transformers_generation_config, - get_transformers_pretrained_config, 
get_transformers_pre_processor, - PretrainedProcessor, + get_transformers_pretrained_config, ) -import numpy as np -from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel, TrainerState - LOGGER = getLogger("backend") @@ -62,10 +61,7 @@ def __init__(self, config: BackendConfigT): self.model_type = self.pretrained_config.model_type self.automodel_class = get_automodel_class_for_task( - model_type=self.model_type, - library=self.config.library, - task=self.config.task, - framework="pt", + model_type=self.model_type, library=self.config.library, task=self.config.task, framework="pt" ) def seed(self) -> None: diff --git a/optimum_benchmark/backends/config.py b/optimum_benchmark/backends/config.py index a4919c15..e8c9c231 100644 --- a/optimum_benchmark/backends/config.py +++ b/optimum_benchmark/backends/config.py @@ -1,24 +1,17 @@ import os from abc import ABC -from logging import getLogger from dataclasses import dataclass, field -from typing import Optional, TypeVar, Dict, Any +from logging import getLogger +from typing import Any, Dict, Optional, TypeVar -from ..import_utils import is_psutil_available -from ..env_utils import get_cuda_device_ids, is_nvidia_system, is_rocm_system -from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path +from psutil import cpu_count -if is_psutil_available(): - from psutil import cpu_count +from ..system_utils import get_gpu_device_ids, is_nvidia_system, is_rocm_system +from ..task_utils import infer_library_from_model_name_or_path, infer_task_from_model_name_or_path LOGGER = getLogger("backend") -HUB_KWARGS = { - "revision": "main", - "force_download": False, - "local_files_only": False, - "trust_remote_code": False, -} +HUB_KWARGS = {"revision": "main", "force_download": False, "local_files_only": False, "trust_remote_code": False} @dataclass @@ -31,10 +24,10 @@ class BackendConfig(ABC): model: Optional[str] = None device: Optional[str] = None - # yes we use a string here instead of a list - # it's easier to pass in a yaml or from cli - # also it's consistent with CUDA_VISIBLE_DEVICES device_ids: Optional[str] = None + # yes we use a string here instead of a list + # because it's easier to pass in a yaml or from cli + # and it's consistent with GPU environment variables task: Optional[str] = None library: Optional[str] = None @@ -48,36 +41,49 @@ def __post_init__(self): if self.model is None: raise ValueError("`model` must be specified.") + if self.task is None: + self.task = infer_task_from_model_name_or_path(self.model) + if self.device is None: self.device = "cuda" if is_nvidia_system() or is_rocm_system() else "cpu" + LOGGER.warning(f"`device` is not specified, defaulting to {self.device} based on system configuration.") + + if self.device not in ["cuda", "cpu", "mps", "xla"]: + raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") if ":" in self.device: - # using device index + # support pytorch device index notation self.device = self.device.split(":")[0] self.device_ids = self.device.split(":")[1] if self.device == "cuda": if self.device_ids is None: - self.device_ids = get_cuda_device_ids() + self.device_ids = get_gpu_device_ids() os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = self.device_ids - # TODO: add rocm specific environment variables ? 
- if self.device not in ["cuda", "cpu", "mps", "xla"]: - raise ValueError(f"`device` must be either `cuda`, `cpu`, `mps` or `xla`, but got {self.device}") - - if self.task is None: - self.task = infer_task_from_model_name_or_path(self.model) + if is_rocm_system(): + # https://rocm.docs.amd.com/en/latest/conceptual/gpu-isolation.html + os.environ["GPU_DEVICE_ORDINAL"] = self.device_ids + os.environ["HIP_VISIBLE_DEVICES"] = self.device_ids + os.environ["ROCR_VISIBLE_DEVICES"] = self.device_ids if self.library is None: self.library = infer_library_from_model_name_or_path(self.model) + if self.library not in ["transformers", "diffusers", "timm"]: + raise ValueError(f"`library` must be either `transformers`, `diffusers` or `timm`, but got {self.library}") + if self.inter_op_num_threads is not None: + if not isinstance(self.inter_op_num_threads, int): + raise ValueError(f"`inter_op_num_threads` must be an integer, but got {self.inter_op_num_threads}") if self.inter_op_num_threads == -1: self.inter_op_num_threads = cpu_count() if self.intra_op_num_threads is not None: + if not isinstance(self.intra_op_num_threads, int): + raise ValueError(f"`intra_op_num_threads` must be an integer, but got {self.intra_op_num_threads}") if self.intra_op_num_threads == -1: self.intra_op_num_threads = cpu_count() diff --git a/optimum_benchmark/backends/diffusers_utils.py b/optimum_benchmark/backends/diffusers_utils.py index 705436d3..5b0f56ce 100644 --- a/optimum_benchmark/backends/diffusers_utils.py +++ b/optimum_benchmark/backends/diffusers_utils.py @@ -5,7 +5,7 @@ from ..import_utils import is_diffusers_available if is_diffusers_available(): - import diffusers + import diffusers # type: ignore def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]: diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py index dd2a7a82..cb70fdfc 100644 --- a/optimum_benchmark/backends/neural_compressor/backend.py +++ b/optimum_benchmark/backends/neural_compressor/backend.py @@ -1,22 +1,22 @@ -import os import gc -from typing import Any, Dict +import os from logging import getLogger from tempfile import TemporaryDirectory - -from ...generators.dataset_generator import DatasetGenerator -from ..transformers_utils import randomize_weights -from .utils import TASKS_TO_INCMODELS -from .config import INCConfig -from ..base import Backend +from typing import Any, Dict import torch from hydra.utils import get_class -from transformers.utils import ModelOutput +from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion +from optimum.intel.neural_compressor.quantization import INCQuantizer from transformers.modeling_utils import no_init_weights +from transformers.utils import ModelOutput from transformers.utils.logging import set_verbosity_error -from optimum.intel.neural_compressor.quantization import INCQuantizer -from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion, TuningCriterion + +from ...generators.dataset_generator import DatasetGenerator +from ..base import Backend +from ..transformers_utils import randomize_weights +from .config import INCConfig +from .utils import TASKS_TO_INCMODELS # disable transformers logging set_verbosity_error() @@ -128,15 +128,9 @@ def quantize_automodel(self) -> None: if self.config.calibration: LOGGER.info("\t+ Generating calibration dataset") - dataset_shapes = { - "dataset_size": 1, - "sequence_length": 1, - **self.model_shapes, - } + 
dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} calibration_dataset = DatasetGenerator( - task=self.config.task, - dataset_shapes=dataset_shapes, - model_shapes=self.model_shapes, + task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py index 22becfe6..09623e47 100644 --- a/optimum_benchmark/backends/neural_compressor/config.py +++ b/optimum_benchmark/backends/neural_compressor/config.py @@ -1,17 +1,13 @@ -from typing import Any, Dict, Optional from dataclasses import dataclass, field +from typing import Any, Dict, Optional from omegaconf import OmegaConf -from ..config import BackendConfig from ...import_utils import neural_compressor_version +from ..config import BackendConfig # https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490 -ACCURACY_CRITERION_CONFIG = { - "higher_is_better": True, - "criterion": "relative", - "tolerable_loss": 0.01, -} +ACCURACY_CRITERION_CONFIG = {"higher_is_better": True, "criterion": "relative", "tolerable_loss": 0.01} # https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593 TUNING_CRITERION_CONFIG = { diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 07d5d860..0d2fc857 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py @@ -1,40 +1,40 @@ import gc import os -from logging import getLogger from collections import OrderedDict +from logging import getLogger from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -from ..base import Backend -from .config import ORTConfig -from ...task_utils import TEXT_GENERATION_TASKS -from ...generators.dataset_generator import DatasetGenerator -from .utils import format_calibration_config, format_quantization_config, TASKS_TO_ORTMODELS, TASKS_TO_ORTSD - import torch from datasets import Dataset from hydra.utils import get_class from onnxruntime import SessionOptions -from safetensors.torch import save_file -from transformers import TrainerCallback -from transformers.modeling_utils import no_init_weights -from transformers.utils.logging import set_verbosity_error -from optimum.onnxruntime.configuration import ( - AutoOptimizationConfig, - AutoQuantizationConfig, - AutoCalibrationConfig, - OptimizationConfig, - QuantizationConfig, - CalibrationConfig, -) from optimum.onnxruntime import ( - ONNX_DECODER_WITH_PAST_NAME, ONNX_DECODER_NAME, - ORTTrainingArguments, + ONNX_DECODER_WITH_PAST_NAME, ORTOptimizer, ORTQuantizer, ORTTrainer, + ORTTrainingArguments, ) +from optimum.onnxruntime.configuration import ( + AutoCalibrationConfig, + AutoOptimizationConfig, + AutoQuantizationConfig, + CalibrationConfig, + OptimizationConfig, + QuantizationConfig, +) +from safetensors.torch import save_file +from transformers import TrainerCallback +from transformers.modeling_utils import no_init_weights +from transformers.utils.logging import set_verbosity_error + +from ...generators.dataset_generator import DatasetGenerator +from ...task_utils import TEXT_GENERATION_TASKS +from ..base import Backend +from .config import ORTConfig +from .utils import 
TASKS_TO_ORTMODELS, TASKS_TO_ORTSD, format_calibration_config, format_quantization_config # disable transformers logging set_verbosity_error() @@ -199,8 +199,7 @@ def optimize_onnx_files(self) -> None: ) elif self.config.optimization: optimization_config = OptimizationConfig( - optimize_for_gpu=(self.config.device == "cuda"), - **self.config.optimization_config, + optimize_for_gpu=(self.config.device == "cuda"), **self.config.optimization_config ) LOGGER.info("\t+ Creating optimizer") optimizer = ORTOptimizer.from_pretrained(self.config.model, file_names=self.onnx_files_names) @@ -243,15 +242,9 @@ def quantize_onnx_files(self) -> None: if self.is_calibrated: LOGGER.info("\t+ Generating calibration dataset") - dataset_shapes = { - "dataset_size": 1, - "sequence_length": 1, - **self.model_shapes, - } + dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} calibration_dataset = DatasetGenerator( - task=self.config.task, - dataset_shapes=dataset_shapes, - model_shapes=self.model_shapes, + task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(self.inputs_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) @@ -260,10 +253,7 @@ def quantize_onnx_files(self) -> None: if self.config.auto_calibration is not None: LOGGER.info("\t+ Processing calibration config") auto_calibration_method = getattr(AutoCalibrationConfig, self.config.auto_calibration) - calibration_config = auto_calibration_method( - calibration_dataset, - **self.config.auto_calibration_config, - ) + calibration_config = auto_calibration_method(calibration_dataset, **self.config.auto_calibration_config) elif self.config.calibration: LOGGER.info("\t+ Processing calibration config") calibration_config = format_calibration_config(self.config.calibration_config) diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index e0191b88..19ad747d 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -1,9 +1,9 @@ import os -from typing import Any, Dict, Optional from dataclasses import dataclass, field +from typing import Any, Dict, Optional -from ..config import BackendConfig from ...import_utils import onnxruntime_version +from ..config import BackendConfig from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES QUANTIZATION_CONFIG = { @@ -18,14 +18,11 @@ } AUTO_QUANTIZATION_CONFIG = { - "is_static": False, + "is_static": False # is_static is mandatory } -TRT_PROVIDER_OPTIONS = { - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "/tmp/trt_cache", -} +TRT_PROVIDER_OPTIONS = {"trt_engine_cache_enable": True, "trt_engine_cache_path": "/tmp/trt_cache"} IO_BINDING_LIBRARIES = ["transformers", "timm"] IO_BINDING_PROVIDERS = ["CPUExecutionProvider", "CUDAExecutionProvider"] @@ -103,10 +100,7 @@ def __post_init__(self): os.makedirs(self.provider_options["trt_engine_cache_path"], exist_ok=True) if self.quantization: - self.quantization_config = { - **QUANTIZATION_CONFIG, - **self.quantization_config, - } + self.quantization_config = {**QUANTIZATION_CONFIG, **self.quantization_config} # raise ValueError if the quantization is static but calibration is not enabled if self.quantization_config["is_static"] and self.auto_calibration is None and not self.calibration: raise ValueError( @@ -115,10 +109,7 @@ def __post_init__(self): ) if self.auto_quantization is 
not None: - self.auto_quantization_config = { - **AUTO_QUANTIZATION_CONFIG, - **self.auto_quantization_config, - } + self.auto_quantization_config = {**AUTO_QUANTIZATION_CONFIG, **self.auto_quantization_config} if self.auto_quantization_config["is_static"] and self.auto_calibration is None and not self.calibration: raise ValueError( "Quantization is static but calibration is not enabled. " diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py index 759962f1..86eeeed9 100644 --- a/optimum_benchmark/backends/onnxruntime/utils.py +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -1,13 +1,7 @@ from typing import Any, Dict +from onnxruntime.quantization import CalibrationMethod, QuantFormat, QuantizationMode, QuantType from optimum.pipelines import ORT_SUPPORTED_TASKS -from onnxruntime.quantization import ( - CalibrationMethod, - QuantizationMode, - QuantFormat, - QuantType, -) - TASKS_TO_ORTSD = { "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index 73cbd63d..e883c3ac 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -1,26 +1,26 @@ import gc -import os import inspect -from typing import Any, Dict -from logging import getLogger +import os from collections import OrderedDict +from logging import getLogger from tempfile import TemporaryDirectory - -from ..base import Backend -from .config import OVConfig -from .utils import TASKS_TO_OVMODEL -from ...task_utils import TEXT_GENERATION_TASKS -from ..transformers_utils import randomize_weights -from ...generators.dataset_generator import DatasetGenerator +from typing import Any, Dict import torch from hydra.utils import get_class from openvino.runtime import properties -from safetensors.torch import save_file +from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict from optimum.intel.openvino import OVQuantizer +from safetensors.torch import save_file from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error -from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict + +from ...generators.dataset_generator import DatasetGenerator +from ...task_utils import TEXT_GENERATION_TASKS +from ..base import Backend +from ..transformers_utils import randomize_weights +from .config import OVConfig +from .utils import TASKS_TO_OVMODEL # disable transformers logging set_verbosity_error() @@ -143,15 +143,9 @@ def quantize_automodel(self) -> None: if self.config.calibration: LOGGER.info("\t+ Generating calibration dataset") - dataset_shapes = { - "dataset_size": 1, - "sequence_length": 1, - **self.model_shapes, - } + dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} calibration_dataset = DatasetGenerator( - task=self.config.task, - dataset_shapes=dataset_shapes, - model_shapes=self.model_shapes, + task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names)) calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py index 6f4ba460..6b6797eb 100644 --- a/optimum_benchmark/backends/openvino/config.py +++ 
b/optimum_benchmark/backends/openvino/config.py @@ -1,8 +1,8 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional -from ..config import BackendConfig from ...import_utils import openvino_version +from ..config import BackendConfig @dataclass diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py index 8a39824d..b1005f38 100644 --- a/optimum_benchmark/backends/openvino/utils.py +++ b/optimum_benchmark/backends/openvino/utils.py @@ -1,8 +1,4 @@ from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS TASKS_TO_OVMODEL = {task: f"optimum.intel.openvino.{ovmodel}" for task, ovmodel in _HEAD_TO_AUTOMODELS.items()} -TASKS_TO_OVMODEL.update( - { - "feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction", - } -) +TASKS_TO_OVMODEL.update({"feature-extraction": "optimum.intel.openvino.OVModelForFeatureExtraction"}) diff --git a/optimum_benchmark/backends/peft_utils.py b/optimum_benchmark/backends/peft_utils.py index 1a367120..8ec7d1fa 100644 --- a/optimum_benchmark/backends/peft_utils.py +++ b/optimum_benchmark/backends/peft_utils.py @@ -4,23 +4,16 @@ if is_peft_available(): from peft import ( + AdaLoraConfig, IA3Config, LoraConfig, PeftConfig, - AdaLoraConfig, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, ) -PEFT_TASKS_TYPES = [ - "SEQ_CLS", - "SEQ_2_SEQ_LM", - "CAUSAL_LM", - "TOKEN_CLS", - "QUESTION_ANS", - "FEATURE_EXTRACTION", -] +PEFT_TASKS_TYPES = ["SEQ_CLS", "SEQ_2_SEQ_LM", "CAUSAL_LM", "TOKEN_CLS", "QUESTION_ANS", "FEATURE_EXTRACTION"] PEFT_CONFIG = { "base_model_name_or_path": None, diff --git a/optimum_benchmark/backends/pytorch/backend.py b/optimum_benchmark/backends/pytorch/backend.py index 268f4306..f7fdf7ab 100644 --- a/optimum_benchmark/backends/pytorch/backend.py +++ b/optimum_benchmark/backends/pytorch/backend.py @@ -1,29 +1,33 @@ import gc import os -from logging import getLogger from collections import OrderedDict +from logging import getLogger from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -from ..base import Backend -from .config import PyTorchConfig -from ..peft_utils import get_peft_config_class -from ..transformers_utils import randomize_weights -from ...import_utils import is_deepspeed_available, is_peft_available - +import datasets.utils.logging as datasets_logging import torch +import transformers.utils.logging as transformers_logging from datasets import Dataset from safetensors.torch import save_file -import datasets.utils.logging as datasets_logging +from transformers import Trainer, TrainerCallback, TrainerState, TrainingArguments from transformers.modeling_utils import no_init_weights -import transformers.utils.logging as transformers_logging -from transformers import TrainerCallback, TrainerState, Trainer, TrainingArguments + +from ...import_utils import is_deepspeed_available, is_peft_available, is_torch_distributed_available +from ..base import Backend +from ..peft_utils import get_peft_config_class +from ..transformers_utils import randomize_weights +from .config import PyTorchConfig if is_peft_available(): - from peft import get_peft_model + from peft import get_peft_model # type: ignore + +if is_torch_distributed_available(): + import torch.distributed if is_deepspeed_available(): - from deepspeed import init_inference + from deepspeed import init_inference # type: ignore + # disable other loggers datasets_logging.set_verbosity_error() @@ -94,14 +98,12 @@ def __init__(self, config: PyTorchConfig): 
LOGGER.info("\t+ Using torch.compile on unet forward pass") # TODO: should we compile vae and/or clip as well ? self.pretrained_model.unet.forward = torch.compile( - self.pretrained_model.unet.forward, - **self.config.torch_compile_config, + self.pretrained_model.unet.forward, **self.config.torch_compile_config ) else: LOGGER.info("\t+ Using torch.compile on forward pass") self.pretrained_model.forward = torch.compile( - self.pretrained_model.forward, - **self.config.torch_compile_config, + self.pretrained_model.forward, **self.config.torch_compile_config ) if self.config.peft_strategy is not None: @@ -176,9 +178,7 @@ def load_model_from_pretrained(self) -> None: LOGGER.info(f"\t+ Loading model directly on device: {self.config.device}") with torch.device(self.config.device): self.pretrained_model = self.automodel_class.from_pretrained( - pretrained_model_name_or_path=self.config.model, - **self.config.hub_kwargs, - **self.automodel_kwargs, + pretrained_model_name_or_path=self.config.model, **self.config.hub_kwargs, **self.automodel_kwargs ) def create_no_weights_model(self) -> None: @@ -233,30 +233,21 @@ def process_quantization_config(self) -> None: from transformers import GPTQConfig self.quantization_config = GPTQConfig( - **dict( - getattr(self.pretrained_config, "quantization_config", {}), - **self.config.quantization_config, - ) + **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) elif self.is_awq_quantized: LOGGER.info("\t+ Processing AWQ config") from transformers import AwqConfig self.quantization_config = AwqConfig( - **dict( - getattr(self.pretrained_config, "quantization_config", {}), - **self.config.quantization_config, - ) + **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) elif self.is_bnb_quantized: LOGGER.info("\t+ Processing BitsAndBytes config") from transformers import BitsAndBytesConfig self.quantization_config = BitsAndBytesConfig( - **dict( - getattr(self.pretrained_config, "quantization_config", {}), - **self.config.quantization_config, - ) + **dict(getattr(self.pretrained_config, "quantization_config", {}), **self.config.quantization_config) ) else: self.quantization_config = None @@ -290,8 +281,8 @@ def is_awq_quantized(self) -> bool: def is_exllamav2(self) -> bool: return ( self.is_gptq_quantized - and "exllama_config" in self.quantization_config - and self.quantization_config["exllama_config"].get("version", None) == 2 + and hasattr(self.quantization_config, "exllama_config") + and self.quantization_config.exllama_config.get("version", None) == 2 ) @property @@ -369,6 +360,10 @@ def seed(self): torch.cuda.manual_seed_all(self.config.seed) def clean(self) -> None: + if is_torch_distributed_available() and torch.distributed.is_initialized(): + LOGGER.info("\t+ Waiting for distributed processes to finish before cleaning backend") + torch.distributed.barrier() + super().clean() if hasattr(self, "tmpdir"): diff --git a/optimum_benchmark/backends/pytorch/config.py b/optimum_benchmark/backends/pytorch/config.py index d8089f60..7902719d 100644 --- a/optimum_benchmark/backends/pytorch/config.py +++ b/optimum_benchmark/backends/pytorch/config.py @@ -1,20 +1,16 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional -from ..config import BackendConfig -from ...env_utils import is_rocm_system from ...import_utils import torch_version +from ...system_utils import is_rocm_system +from ..config import BackendConfig from ..peft_utils import 
PEFT_CONFIGS, PEFT_TASKS_TYPES DEVICE_MAPS = ["auto", "sequential"] AMP_DTYPES = ["bfloat16", "float16"] TORCH_DTYPES = ["bfloat16", "float16", "float32", "auto"] -QUANTIZATION_CONFIGS = { - "bnb": {"llm_int8_threshold": 0.0}, - "gptq": {}, - "awq": {}, -} +QUANTIZATION_CONFIGS = {"bnb": {"llm_int8_threshold": 0.0}, "gptq": {}, "awq": {}} COMPILE_CONFIG = { "fullgraph": False, "dynamic": False, @@ -89,10 +85,7 @@ def __post_init__(self): if self.quantization_config: QUANTIZATION_CONFIG = QUANTIZATION_CONFIGS[self.quantization_scheme] - self.quantization_config = { - **QUANTIZATION_CONFIG, - **self.quantization_config, - } + self.quantization_config = {**QUANTIZATION_CONFIG, **self.quantization_config} if self.peft_strategy is not None: if self.peft_strategy not in PEFT_CONFIGS: diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index 7c86adeb..3beb1387 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,13 +1,13 @@ from logging import getLogger from typing import Any, Dict +from hydra.utils import get_class +from transformers.utils import ModelOutput + from ..base import Backend from .config import TRTLLMConfig from .utils import MODEL_TYPE_TO_TRTLLMMODEL -from hydra.utils import get_class -from transformers.utils import ModelOutput - LOGGER = getLogger("tensorrt-llm") @@ -47,9 +47,7 @@ def load_trtmodel_from_pretrained(self) -> None: def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: return self.pretrained_model.generate( - input_ids=inputs.get("input_ids", None), - attention_mask=inputs.get("attention_mask", None), - max_new_tokens=1, + input_ids=inputs.get("input_ids", None), attention_mask=inputs.get("attention_mask", None), max_new_tokens=1 ) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> ModelOutput: diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py index e676accb..d7f4b1cb 100644 --- a/optimum_benchmark/backends/tensorrt_llm/config.py +++ b/optimum_benchmark/backends/tensorrt_llm/config.py @@ -1,9 +1,8 @@ -from typing import Optional from dataclasses import dataclass +from typing import Optional -from ..config import BackendConfig from ...import_utils import tesnorrt_llm_version - +from ..config import BackendConfig SUPPORTED_DTYPES = ["float16", "bfloat16", "float32"] diff --git a/optimum_benchmark/backends/text_generation_inference/backend.py b/optimum_benchmark/backends/text_generation_inference/backend.py index 538de53c..c7ecd5ce 100644 --- a/optimum_benchmark/backends/text_generation_inference/backend.py +++ b/optimum_benchmark/backends/text_generation_inference/backend.py @@ -1,23 +1,24 @@ import gc import os import time +from concurrent.futures import ThreadPoolExecutor from logging import getLogger -from typing import Any, Dict, List from tempfile import TemporaryDirectory -from concurrent.futures import ThreadPoolExecutor - -from ..base import Backend -from .config import TGIConfig -from ...task_utils import TEXT_GENERATION_TASKS -from ..transformers_utils import randomize_weights +from typing import Any, Dict, List import torch -import docker -import docker.types -import docker.errors -from safetensors.torch import save_model from huggingface_hub import InferenceClient, snapshot_download from huggingface_hub.inference._text_generation import TextGenerationResponse +from safetensors.torch import save_model + +import 
docker +import docker.errors +import docker.types + +from ...task_utils import TEXT_GENERATION_TASKS +from ..base import Backend +from ..transformers_utils import randomize_weights +from .config import TGIConfig # bachend logger LOGGER = getLogger("text-generation-inference") @@ -59,12 +60,7 @@ def load_model_from_pretrained(self) -> None: model_cache_path = f"{self.config.volume}/{model_cache_folder}" snapshot_ref = ( - open( - f"{model_cache_path}/refs/{self.config.hub_kwargs.get('revision', 'main')}", - "r", - ) - .read() - .strip() + open(f"{model_cache_path}/refs/{self.config.hub_kwargs.get('revision', 'main')}", "r").read().strip() ) model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" @@ -133,12 +129,7 @@ def start_tgi_server(self) -> None: env["HUGGING_FACE_HUB_TOKEN"] = os.environ["HUGGING_FACE_HUB_TOKEN"] LOGGER.info("\t+ Building TGI command") - self.command = [ - "--model-id", - self.config.model, - "--revision", - self.config.hub_kwargs.get("revision", "main"), - ] + self.command = ["--model-id", self.config.model, "--revision", self.config.hub_kwargs.get("revision", "main")] if self.config.sharded is not None: self.command.extend(["--sharded", str(self.config.sharded).lower()]) diff --git a/optimum_benchmark/backends/timm_utils.py b/optimum_benchmark/backends/timm_utils.py index 9e2924b2..07105003 100644 --- a/optimum_benchmark/backends/timm_utils.py +++ b/optimum_benchmark/backends/timm_utils.py @@ -1,6 +1,6 @@ from typing import Any, Dict, Optional -from ..import_utils import is_timm_available, is_transformers_available, is_torch_available +from ..import_utils import is_timm_available, is_torch_available, is_transformers_available if is_torch_available(): import torch diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index a7515d2f..52bede74 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -4,18 +4,22 @@ from tempfile import TemporaryDirectory from typing import Any, Callable, Dict, List -from ..transformers_utils import randomize_weights -from ..peft_utils import get_peft_config_class -from .config import TorchORTConfig -from ..base import Backend - import torch from datasets import Dataset +from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments from safetensors.torch import save_file from transformers import TrainerCallback, TrainerState from transformers.modeling_utils import no_init_weights from transformers.utils.logging import set_verbosity_error -from optimum.onnxruntime import ORTTrainer, ORTTrainingArguments + +from ...import_utils import is_peft_available +from ..base import Backend +from ..peft_utils import get_peft_config_class +from ..transformers_utils import randomize_weights +from .config import TorchORTConfig + +if is_peft_available(): + from peft import get_peft_model # type: ignore # disable transformers logging set_verbosity_error() @@ -39,9 +43,7 @@ def __init__(self, config: TorchORTConfig): self.load_automodel_from_pretrained() if self.config.peft_strategy is not None: - LOGGER.info("\t+ Applying PEFT") - from peft import get_peft_model - + LOGGER.info("\t+ Using PEFT") peft_config_class = get_peft_config_class(self.config.peft_strategy) peft_config = peft_config_class(**self.config.peft_config) self.pretrained_model = get_peft_model(self.pretrained_model, peft_config=peft_config) @@ -87,9 +89,7 @@ def load_automodel_with_no_weights(self) -> None: def load_automodel_from_pretrained(self) -> None: 
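
For reference, a minimal sketch of the PEFT wiring used by the backends above (strategy name -> config class -> wrapped model). The "lora" strategy key, the gpt2 checkpoint and the LoRA kwargs are illustrative assumptions, not taken from this patch:

    from transformers import AutoModelForCausalLM
    from peft import get_peft_model

    from optimum_benchmark.backends.peft_utils import get_peft_config_class

    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # mirrors the backend __init__ above: resolve the strategy, build its config, wrap the model
    peft_config_class = get_peft_config_class("lora")       # assumed key in PEFT_CONFIGS
    peft_config = peft_config_class(task_type="CAUSAL_LM")  # stands in for config.peft_config kwargs
    model = get_peft_model(model, peft_config=peft_config)
    model.print_trainable_parameters()
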
self.pretrained_model = self.automodel_class.from_pretrained( - self.config.model, - **self.automodel_kwargs, - **self.config.hub_kwargs, + self.config.model, **self.automodel_kwargs, **self.config.hub_kwargs ).to(self.config.device) @property diff --git a/optimum_benchmark/backends/torch_ort/config.py b/optimum_benchmark/backends/torch_ort/config.py index ac2de2f7..8559022f 100644 --- a/optimum_benchmark/backends/torch_ort/config.py +++ b/optimum_benchmark/backends/torch_ort/config.py @@ -1,8 +1,8 @@ from dataclasses import dataclass, field from typing import Any, Dict, Optional -from ..config import BackendConfig from ...import_utils import torch_ort_version +from ..config import BackendConfig from ..peft_utils import PEFT_CONFIGS, PEFT_TASKS_TYPES diff --git a/optimum_benchmark/backends/transformers_utils.py b/optimum_benchmark/backends/transformers_utils.py index 1d7ad410..6835617a 100644 --- a/optimum_benchmark/backends/transformers_utils.py +++ b/optimum_benchmark/backends/transformers_utils.py @@ -1,29 +1,24 @@ import os from typing import Any, Dict, Optional, Union -from ..import_utils import is_transformers_available, is_torch_available +from ..import_utils import is_torch_available, is_transformers_available if is_torch_available(): import torch if is_transformers_available(): from transformers import ( + AutoConfig, + AutoProcessor, FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, GenerationConfig, + ImageProcessingMixin, PretrainedConfig, + PreTrainedTokenizer, ProcessorMixin, - AutoProcessor, - AutoConfig, ) - PretrainedProcessor = Union[ - FeatureExtractionMixin, - ImageProcessingMixin, - PreTrainedTokenizer, - ProcessorMixin, - ] + PretrainedProcessor = Union[FeatureExtractionMixin, ImageProcessingMixin, PreTrainedTokenizer, ProcessorMixin] def get_transformers_cache_dir() -> str: @@ -52,8 +47,7 @@ def get_transformers_pre_processor(model: str, **kwargs) -> Optional["Pretrained def extract_transformers_shapes_from_artifacts( - config: "PretrainedConfig", - processor: Optional["PretrainedProcessor"] = None, + config: "PretrainedConfig", processor: Optional["PretrainedProcessor"] = None ) -> Dict[str, Any]: artifacts_dict = {} diff --git a/optimum_benchmark/benchmarks/base.py b/optimum_benchmark/benchmarks/base.py index 84495a1a..a8c42806 100644 --- a/optimum_benchmark/benchmarks/base.py +++ b/optimum_benchmark/benchmarks/base.py @@ -3,8 +3,8 @@ from typing import ClassVar, Generic from ..backends.base import Backend -from .report import BenchmarkReport from .config import BenchmarkConfigT +from .report import BenchmarkReport LOGGER = getLogger("benchmark") diff --git a/optimum_benchmark/benchmarks/config.py b/optimum_benchmark/benchmarks/config.py index f3e96348..76d102af 100644 --- a/optimum_benchmark/benchmarks/config.py +++ b/optimum_benchmark/benchmarks/config.py @@ -1,8 +1,7 @@ from abc import ABC -from typing import TypeVar -from logging import getLogger from dataclasses import dataclass - +from logging import getLogger +from typing import TypeVar LOGGER = getLogger("benchmark") diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py index 9cc96ee1..07c4f9ee 100644 --- a/optimum_benchmark/benchmarks/inference/benchmark.py +++ b/optimum_benchmark/benchmarks/inference/benchmark.py @@ -1,26 +1,23 @@ +from dataclasses import dataclass from logging import getLogger -from typing import List, Tuple, Dict -from ..base import Benchmark -from .config import InferenceConfig -from 
...trackers.energy import EnergyTracker -from ...trackers.memory import MemoryTracker -from ...trackers.latency import LatencyTracker from ...backends.base import Backend, BackendConfigT from ...generators.input_generator import InputGenerator from ...import_utils import is_torch_distributed_available -from ...task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS -from .report import InferenceReport, TextGenerationReport, ImageDiffusionReport +from ...task_utils import IMAGE_DIFFUSION_TASKS, TEXT_GENERATION_TASKS +from ...trackers.energy import Efficiency, EnergyTracker +from ...trackers.latency import LatencyTracker, Throughput +from ...trackers.memory import MemoryTracker +from ..base import Benchmark +from ..report import BenchmarkMeasurements, BenchmarkReport +from .config import InferenceConfig if is_torch_distributed_available(): import torch.distributed LOGGER = getLogger("inference") -IMAGE_DIFFUSION_KWARGS = { - "num_inference_steps": 30, - "num_images_per_prompt": 1, -} +IMAGE_DIFFUSION_KWARGS = {"num_inference_steps": 30, "num_images_per_prompt": 1} TEXT_GENERATION_KWARGS = { "num_return_sequences": 1, @@ -33,6 +30,33 @@ "num_beams": 1, } +EFFICIENCY_UNIT = "samples/kWh" +THROUGHPUT_UNIT = "samples/s" + +PREFILL_THROUGHPUT_UNIT = "tokens/s" +DECODE_THROUGHPUT_UNIT = "tokens/s" +CALL_THROUGHPUT_UNIT = "images/s" + +PREFILL_EFFICIENCY_UNIT = "tokens/kWh" +DECODE_EFFICIENCY_UNIT = "tokens/kWh" +CALL_EFFICIENCY_UNIT = "images/kWh" + + +@dataclass +class InferenceReport(BenchmarkReport): + forward: BenchmarkMeasurements + + +@dataclass +class ImageDiffusionReport(BenchmarkReport): + call: BenchmarkMeasurements + + +@dataclass +class TextGenerationReport(BenchmarkReport): + prefill: BenchmarkMeasurements + decode: BenchmarkMeasurements + class InferenceBenchmark(Benchmark[InferenceConfig]): NAME = "inference" @@ -42,17 +66,18 @@ def __init__(self, config: InferenceConfig) -> None: def run(self, backend: Backend[BackendConfigT]) -> None: if is_torch_distributed_available() and torch.distributed.is_initialized(): + LOGGER.info("\t+ Distributing batch size across processes") if self.config.input_shapes["batch_size"] % torch.distributed.get_world_size() != 0: raise ValueError( "The batch size must be divisible by the number of processes in a distributed environment" ) self.config.input_shapes["batch_size"] //= torch.distributed.get_world_size() + if backend.config.device == "cuda" and backend.config.task in TEXT_GENERATION_TASKS: + TEXT_GENERATION_TASKS["synced_gpus"] = True LOGGER.info("\t+ Creating input generator") self.input_generator = InputGenerator( - task=backend.config.task, - model_shapes=backend.model_shapes, - input_shapes=self.config.input_shapes, + task=backend.config.task, model_shapes=backend.model_shapes, input_shapes=self.config.input_shapes ) if backend.config.task in TEXT_GENERATION_TASKS: @@ -64,12 +89,7 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Updating Text Generation kwargs with default values") self.config.generate_kwargs = {**TEXT_GENERATION_KWARGS, **self.config.generate_kwargs} LOGGER.info("\t+ Initializing Text Generation report") - self.report = TextGenerationReport( - batch_size=self.config.input_shapes["batch_size"], - sequence_length=self.config.input_shapes["sequence_length"], - num_new_tokens=self.config.generate_kwargs["max_new_tokens"], - num_return_sequences=self.config.generate_kwargs["num_return_sequences"], - ) + self.report = TextGenerationReport(prefill=BenchmarkMeasurements(), 
decode=BenchmarkMeasurements()) elif backend.config.task in IMAGE_DIFFUSION_TASKS: LOGGER.info("\t+ Generating and preparing Image Diffusion input") @@ -78,19 +98,14 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Updating Image Diffusion kwargs with default values") self.config.forward_kwargs = {**IMAGE_DIFFUSION_KWARGS, **self.config.forward_kwargs} LOGGER.info("\t+ Initializing Image Diffusion report") - self.report = ImageDiffusionReport( - batch_size=self.config.input_shapes["batch_size"], - num_images_per_prompts=self.config.forward_kwargs["num_images_per_prompt"], - ) + self.report = ImageDiffusionReport(call=BenchmarkMeasurements()) else: LOGGER.info("\t+ Generating and preparing Inference input") self.forward_inputs = self.input_generator(mode="forward") self.forward_inputs = backend.prepare_inputs(self.forward_inputs) LOGGER.info("\t+ Initializing Inference report") - self.report = InferenceReport( - batch_size=self.config.input_shapes["batch_size"], - ) + self.report = InferenceReport(forward=BenchmarkMeasurements()) LOGGER.info("\t+ Preparing backend for Inference") backend.prepare_for_inference( @@ -103,11 +118,9 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Warming up backend for Inference") for _ in range(self.config.warmup_runs): if backend.config.task in TEXT_GENERATION_TASKS: - generate_warmup_kwargs = {"max_new_tokens": 2, "min_new_tokens": 2} - _ = backend.generate(self.generate_input, generate_warmup_kwargs) + _ = backend.generate(self.generate_input, {"max_new_tokens": 2, "min_new_tokens": 2}) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - diffuse_warmup_kwargs = {"num_inference_steps": 2} - _ = backend.call(self.diffuse_input, diffuse_warmup_kwargs) + _ = backend.call(self.diffuse_input, {"num_inference_steps": 2}) else: _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) @@ -117,14 +130,11 @@ def run(self, backend: Backend[BackendConfigT]) -> None: backend=backend.config.name, device=backend.config.device, device_ids=backend.config.device_ids ) if backend.config.task in TEXT_GENERATION_TASKS: - forward_memories_dict, generate_memories_dict = self.run_text_generation_memory_tracking(backend) - self.report.populate_memory(forward_memories_dict, generate_memories_dict) + self.run_text_generation_memory_tracking(backend) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - call_memories_dict = self.run_image_diffusion_memory_tracking(backend) - self.report.populate_memory(call_memories_dict) + self.run_image_diffusion_memory_tracking(backend) else: - forward_memories_dict = self.run_inference_memory_tracking(backend) - self.report.populate_memory(forward_memories_dict) + self.run_inference_memory_tracking(backend) self.report.log_memory() @@ -132,146 +142,170 @@ def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Creating inference latency tracker") self.latency_tracker = LatencyTracker(backend=backend.config.name, device=backend.config.device) if backend.config.task in TEXT_GENERATION_TASKS: - forward_latencies_dict, generate_latencies_dict = self.run_text_generation_latency_tracking(backend) - self.report.populate_latency(forward_latencies_dict, generate_latencies_dict) + self.run_text_generation_latency_tracking(backend) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - call_latencies_dict = self.run_image_diffusion_latency_tracking(backend) - self.report.populate_latency(call_latencies_dict) + self.run_image_diffusion_latency_tracking(backend) else: - 
forward_latencies_dict = self.run_latency_inference_tracking(backend) - self.report.populate_latency(forward_latencies_dict) + self.run_latency_inference_tracking(backend) self.report.log_latency() + self.report.log_throughput() if self.config.energy: LOGGER.info("\t+ Creating inference energy tracker") self.energy_tracker = EnergyTracker(device=backend.config.device, device_ids=backend.config.device_ids) if backend.config.task in TEXT_GENERATION_TASKS: - forward_energies_dict, generate_energies_dict = self.run_text_generation_energy_tracking(backend) - self.report.populate_energy(forward_energies_dict, generate_energies_dict) + self.run_text_generation_energy_tracking(backend) elif backend.config.task in IMAGE_DIFFUSION_TASKS: - call_energies_dict = self.run_image_diffusion_energy_tracking(backend) - self.report.populate_energy(call_energies_dict) + self.run_image_diffusion_energy_tracking(backend) else: - forward_energies_dict = self.run_inference_energy_tracking(backend) - self.report.populate_energy(forward_energies_dict) + self.run_inference_energy_tracking(backend) self.report.log_energy() + self.report.log_efficiency() + + self.report.log() ## Memory tracking - def run_text_generation_memory_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + def run_text_generation_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_memories_dict = self.memory_tracker.get_memories_dict() + self.report.prefill.memory = self.memory_tracker.get_max_memory() self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.generate(self.generate_input, self.config.generate_kwargs) - generate_memories_dict = self.memory_tracker.get_memories_dict() + self.report.decode.memory = self.memory_tracker.get_max_memory() - return forward_memories_dict, generate_memories_dict - - def run_image_diffusion_memory_tracking(self, backend: Backend) -> Dict[str, float]: + def run_image_diffusion_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - call_memories_dict = self.memory_tracker.get_memories_dict() - - return call_memories_dict + self.report.call.memory = self.memory_tracker.get_max_memory() - def run_inference_memory_tracking(self, backend: Backend) -> Dict[str, float]: + def run_inference_memory_tracking(self, backend: Backend): LOGGER.info("\t+ Running memory tracking") self.memory_tracker.reset() with self.memory_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_memories_dict = self.memory_tracker.get_memories_dict() - - return forward_memories_dict + self.report.forward.memory = self.memory_tracker.get_max_memory() ## Latency tracking - def run_text_generation_latency_tracking(self, backend: Backend) -> Tuple[List[float], List[float]]: + def run_text_generation_latency_tracking(self, backend: Backend): LOGGER.info("\t+ Running latency tracking") self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_latencies_list = self.latency_tracker.get_latencies_list() + 
self.report.prefill.latency = self.latency_tracker.get_latency() + self.report.prefill.throughput = self.latency_tracker.get_throughput( + volume=self.prefill_volume, unit=PREFILL_THROUGHPUT_UNIT + ) self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.generate(self.generate_input, self.config.generate_kwargs) - generate_latencies_list = self.latency_tracker.get_latencies_list() - - return forward_latencies_list, generate_latencies_list + self.report.decode.latency = self.latency_tracker.get_latency() - self.report.prefill.latency.mean + self.report.decode.throughput = Throughput.from_latency( + self.report.decode.latency, self.decode_volume, unit=DECODE_THROUGHPUT_UNIT + ) - def run_image_diffusion_latency_tracking(self, backend: Backend) -> List[float]: + def run_image_diffusion_latency_tracking(self, backend: Backend): LOGGER.info("\t+ Running latency tracking") self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - call_latencies_list = self.latency_tracker.get_latencies_list() - - return call_latencies_list + self.report.call.latency = self.latency_tracker.get_latency() + self.report.call.throughput = Throughput.from_latency( + self.report.call.latency, self.call_volume, unit=CALL_THROUGHPUT_UNIT + ) - def run_latency_inference_tracking(self, backend: Backend) -> List[float]: + def run_latency_inference_tracking(self, backend: Backend): LOGGER.info("\t+ Running latency tracking") self.latency_tracker.reset() - while self.latency_tracker.get_total_latency() < self.config.duration: + while self.latency_tracker.get_elapsed_time() < self.config.duration: with self.latency_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_latencies_list = self.latency_tracker.get_latencies_list() - - return forward_latencies_list + self.report.forward.latency = self.latency_tracker.get_latency() + self.report.forward.throughput = Throughput.from_latency( + self.report.forward.latency, self.forward_volume, unit=THROUGHPUT_UNIT + ) ## Energy tracking - def run_text_generation_energy_tracking(self, backend: Backend) -> Tuple[Dict[str, float], Dict[str, float]]: + def run_text_generation_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_energies_dict = self.energy_tracker.get_energies_dict() + self.report.prefill.energy = self.energy_tracker.get_energy() + self.report.prefill.efficiency = Efficiency.from_energy( + self.report.prefill.energy, self.prefill_volume, unit=PREFILL_EFFICIENCY_UNIT + ) self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.generate(self.generate_input, self.config.generate_kwargs) - generate_energies_dict = self.energy_tracker.get_energies_dict() - - return forward_energies_dict, generate_energies_dict + self.report.decode.energy = self.energy_tracker.get_energy() - self.report.prefill.energy + self.report.decode.efficiency = Efficiency.from_energy( + self.report.decode.energy, self.decode_volume, unit=DECODE_EFFICIENCY_UNIT + ) - def 
run_image_diffusion_energy_tracking(self, backend: Backend) -> Dict[str, float]: + def run_image_diffusion_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.call(self.diffuse_input, self.config.forward_kwargs) - call_energies_dict = self.energy_tracker.get_energies_dict() - - return call_energies_dict + self.report.call.energy = self.energy_tracker.get_energy() + self.report.call.efficiency = Efficiency.from_energy( + self.report.call.energy, self.call_volume, unit=CALL_EFFICIENCY_UNIT + ) - def run_inference_energy_tracking(self, backend: Backend) -> Dict[str, float]: + def run_inference_energy_tracking(self, backend: Backend): LOGGER.info("\t+ Running energy tracking") self.energy_tracker.reset() with self.energy_tracker.track(): _ = backend.forward(self.forward_inputs, self.config.forward_kwargs) - forward_energies_dict = self.energy_tracker.get_energies_dict() + self.report.forward.energy = self.energy_tracker.get_energy() + self.report.forward.efficiency = Efficiency.from_energy( + self.report.forward.energy, self.forward_volume, unit=EFFICIENCY_UNIT + ) + + @property + def forward_volume(self) -> int: # in samples + return self.config.input_shapes["batch_size"] + + @property + def prefill_volume(self) -> int: # in tokens + return self.config.input_shapes["batch_size"] * self.config.input_shapes["sequence_length"] - return forward_energies_dict + @property + def call_volume(self) -> int: # in images + return self.config.input_shapes["batch_size"] * self.config.forward_kwargs["num_images_per_prompt"] + + @property + def decode_volume(self) -> int: # in tokens + return ( + self.config.input_shapes["batch_size"] + * self.config.generate_kwargs["num_return_sequences"] + * self.config.generate_kwargs["max_new_tokens"] + ) def get_report(self) -> InferenceReport: return self.report diff --git a/optimum_benchmark/benchmarks/inference/callback.py b/optimum_benchmark/benchmarks/inference/callback.py deleted file mode 100644 index 4871691d..00000000 --- a/optimum_benchmark/benchmarks/inference/callback.py +++ /dev/null @@ -1,25 +0,0 @@ -import time - -from ...import_utils import is_torch_available - -from transformers import LogitsProcessor - -if is_torch_available(): - import torch - - -# TODO: uses this class for more fine-grained latency measurements in text generation -class MeasurementProcessor(LogitsProcessor): - def __init__(self, device: str, backend: str): - self.device = device - self.backend = backend - - self.latencies = [] - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): - """ - Callback to track the time it takes to generate one batch of tokens. 
- """ - self.latencies.append(time.perf_counter_ns()) - - return scores diff --git a/optimum_benchmark/benchmarks/inference/config.py b/optimum_benchmark/benchmarks/inference/config.py index d5c4a0bb..7b6cfd3f 100644 --- a/optimum_benchmark/benchmarks/inference/config.py +++ b/optimum_benchmark/benchmarks/inference/config.py @@ -1,17 +1,13 @@ +from dataclasses import dataclass, field from logging import getLogger from typing import Any, Dict, Optional -from dataclasses import dataclass, field -from ...env_utils import is_rocm_system +from ...system_utils import is_rocm_system from ..config import BenchmarkConfig LOGGER = getLogger("inference") -INPUT_SHAPES = { - "batch_size": 2, - "sequence_length": 16, - "num_choices": 2, -} +INPUT_SHAPES = {"batch_size": 2, "num_choices": 2, "sequence_length": 16} @dataclass @@ -40,16 +36,13 @@ class InferenceConfig(BenchmarkConfig): # methods kwargs forward_kwargs: Dict[str, Any] = field( - default_factory=dict, - metadata={"help": "Keyword arguments to pass to the forward method of the model."}, + default_factory=dict, metadata={"help": "Keyword arguments to pass to the forward method of the model."} ) generate_kwargs: Dict[str, Any] = field( - default_factory=dict, - metadata={"help": "Keyword arguments to pass to the generate method of the model."}, + default_factory=dict, metadata={"help": "Keyword arguments to pass to the generate method of the model."} ) call_kwargs: Dict[str, Any] = field( - default_factory=dict, - metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."}, + default_factory=dict, metadata={"help": "Keyword arguments to pass to the __call__ method of the pipeline."} ) def __post_init__(self): diff --git a/optimum_benchmark/benchmarks/inference/report.py b/optimum_benchmark/benchmarks/inference/report.py deleted file mode 100644 index 9cd43cfc..00000000 --- a/optimum_benchmark/benchmarks/inference/report.py +++ /dev/null @@ -1,353 +0,0 @@ -from dataclasses import dataclass, field -from statistics import mean, stdev -from typing import Any, Dict, List -from logging import getLogger - -from ..report import BenchmarkReport - -LOGGER = getLogger("report") - - -@dataclass -class InferenceReport(BenchmarkReport): - # Config - batch_size: int - # Metrics - forward: Dict[str, Any] = field(default_factory=dict) - - # POPULATING - def populate_latency(self, forward_latencies_list: List[float]): - ## Latency - self.forward["latency"] = { - "list[s]": forward_latencies_list, - "mean(s)": compute_mean(forward_latencies_list), - "stdev(s)": compute_stdev(forward_latencies_list), - } - ## Throughput - forward_throughputs_list = [self.batch_size / latency for latency in forward_latencies_list] - self.forward["throughput"] = { - "list[samples/s]": forward_throughputs_list, - "mean(samples/s)": compute_mean(forward_throughputs_list), - "stdev(samples/s)": compute_stdev(forward_throughputs_list), - } - - def populate_memory(self, forward_memories_dict: Dict[str, Any]): - self.forward["memory"] = forward_memories_dict - - def populate_energy(self, forward_energies_dict: Dict[str, Any]): - self.forward["energy"] = forward_energies_dict - - # LOGGING - def log_latency(self): - for key, value in self.forward["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ forward.latency.{key}: {value:f} (s)") - for key, value in self.forward["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ forward.throughput.{key}: {value:f} (samples/s)") - - def log_memory(self): - for key, value in 
self.forward["memory"].items(): - LOGGER.info(f"\t+ forward.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.forward["energy"].items(): - LOGGER.info(f"\t+ forward.energy.{key}: {value:f} (kWh)") - - def log_all(self) -> None: - if "latency" in self.forward: - self.log_latency() - if "memory" in self.forward: - self.log_memory() - if "energy" in self.forward: - self.log_energy() - - # add operator to aggregate multiple reports - def __add__(self, other: "InferenceReport") -> "InferenceReport": - agg_report = InferenceReport(batch_size=self.batch_size + other.batch_size) - if "latency" in self.forward and "latency" in other.forward: - agg_forward_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.forward["latency"]["list[s]"], other.forward["latency"]["list[s]"]) - ] - agg_report.populate_latency(agg_forward_latencies_list) - - if "memory" in self.forward and "memory" in other.forward: - agg_forward_memories_dict = {} - for key in self.forward["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_forward_memories_dict[key] = max(self.forward["memory"][key], other.forward["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_forward_memories_dict[key] = self.forward["memory"][key] + other.forward["memory"][key] - - agg_report.populate_memory(agg_forward_memories_dict) - - if "energy" in self.forward and "energy" in other.forward: - agg_forward_energies_dict = {} - for key in self.forward["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_forward_energies_dict[key] = self.forward["energy"][key] + other.forward["energy"][key] - - agg_report.populate_energy(agg_forward_energies_dict) - - return agg_report - - -@dataclass -class ImageDiffusionReport(BenchmarkReport): - # Config - batch_size: int - num_images_per_prompts: int - # Metrics - call: Dict[str, Any] = field(default_factory=dict) - - # POPULATING - def populate_latency(self, call_latencies_list: List[float]): - ## Latency - self.call["latency"] = { - "list[s]": call_latencies_list, - "mean(s)": compute_mean(call_latencies_list), - "stdev(s)": compute_stdev(call_latencies_list), - } - ## Throughput - call_throughputs_list = [ - self.batch_size * self.num_images_per_prompts / latency for latency in call_latencies_list - ] - self.call["throughput"] = { - "list[images/s]": call_throughputs_list, - "mean[images/s]": compute_mean(call_throughputs_list), - "stdev[images/s]": compute_stdev(call_throughputs_list), - } - - def populate_memory(self, call_memories_dict: Dict[str, Any]): - self.call["memory"] = call_memories_dict - - def populate_energy(self, call_energies_dict: Dict[str, Any]): - self.call["energy"] = call_energies_dict - - # LOGGING - def log_latency(self): - for key, value in self.call["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ call.latency.{key}: {value:f} (s)") - for key, value in self.call["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ call.throughput.{key}: {value:f} (images/s)") - - def log_memory(self): - for key, value in self.call["memory"].items(): - LOGGER.info(f"\t+ call.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.call["energy"].items(): - LOGGER.info(f"\t+ call.energy.{key}: {value:f} (kWh)") - - def log_all(self) -> None: - if "latency" in self.call: - self.log_latency() - if "memory" in self.call: - self.log_memory() - if "energy" in self.call: 
- self.log_energy() - - # add operator to aggregate multiple reports - def __add__(self, other: "ImageDiffusionReport") -> "ImageDiffusionReport": - assert self.num_images_per_prompts == other.num_images_per_prompts, "num_images_per_prompts must be the same" - - agg_report = ImageDiffusionReport( - batch_size=self.batch_size + other.batch_size, - num_images_per_prompts=self.num_images_per_prompts, - ) - if "latency" in self.call and "latency" in other.call: - agg_call_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.call["latency"]["list[s]"], other.call["latency"]["list[s]"]) - ] - agg_report.populate_latency(agg_call_latencies_list) - - if "memory" in self.call and "memory" in other.call: - agg_call_memories_dict = {} - for key in self.call["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_call_memories_dict[key] = max(self.call["memory"][key], other.call["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_call_memories_dict[key] = self.call["memory"][key] + other.call["memory"][key] - - agg_report.populate_memory(agg_call_memories_dict) - - if "energy" in self.call and "energy" in other.call: - agg_call_energies_dict = {} - for key in self.call["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_call_energies_dict[key] = self.call["energy"][key] + other.call["energy"][key] - - agg_report.populate_energy(agg_call_energies_dict) - - return agg_report - - -@dataclass -class TextGenerationReport(BenchmarkReport): - # Config - batch_size: int - sequence_length: int - num_new_tokens: int - num_return_sequences: int - # Prefill Metrics - prefill: Dict[str, Any] = field(default_factory=dict) - # Decode Metrics - decode: Dict[str, Any] = field(default_factory=dict) - - def populate_latency(self, forward_latencies_list: List[float], generate_latencies_list: List[float]): - ## Latency - self.prefill["latency"] = { - "list[s]": forward_latencies_list, - "mean(s)": compute_mean(forward_latencies_list), - "stdev(s)": compute_stdev(forward_latencies_list), - } - ## Throughput - prefill_throughputs_list = [ - self.batch_size * self.sequence_length / latency for latency in forward_latencies_list - ] - self.prefill["throughput"] = { - "list[tokens/s]": prefill_throughputs_list, - "mean[tokens/s]": compute_mean(prefill_throughputs_list), - "stdev[tokens/s]": compute_stdev(prefill_throughputs_list), - } - ## Latency - decode_latencies_list = [ - generate_latency - self.prefill["latency"]["mean(s)"] for generate_latency in generate_latencies_list - ] - self.decode["latency"] = { - "list[s]": decode_latencies_list, - "mean(s)": compute_mean(decode_latencies_list), - "stdev(s)": compute_stdev(decode_latencies_list), - } - ## Throughput - decode_throughputs_list = [ - self.batch_size * self.num_new_tokens * self.num_return_sequences / latency - for latency in decode_latencies_list - ] - self.decode["throughput"] = { - "list[tokens/s]": decode_throughputs_list, - "mean[tokens/s]": compute_mean(decode_throughputs_list), - "stdev[tokens/s]": compute_stdev(decode_throughputs_list), - } - - def populate_memory(self, forward_memories_dict: Dict[str, Any], generate_memories_dict: Dict[str, Any]): - self.prefill["memory"] = forward_memories_dict - self.decode["memory"] = generate_memories_dict - - def populate_energy(self, forward_energies_dict: Dict[str, Any], generate_energies_dict: Dict[str, Any]): - self.prefill["energy"] = forward_energies_dict - 
self.decode["energy"] = generate_energies_dict - - # LOGGING - def log_latency(self): - for key, value in self.prefill["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ prefill.latency.{key}: {value:f} (s)") - for key, value in self.prefill["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ prefill.throughput.{key}: {value:f} (tokens/s)") - for key, value in self.decode["latency"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ decode.latency.{key}: {value:f} (s)") - for key, value in self.decode["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ decode.throughput.{key}: {value:f} (tokens/s)") - - def log_memory(self): - for key, value in self.prefill["memory"].items(): - LOGGER.info(f"\t+ prefill.memory.{key}: {value:f} (MB)") - for key, value in self.decode["memory"].items(): - LOGGER.info(f"\t+ decode.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.prefill["energy"].items(): - LOGGER.info(f"\t+ prefill.energy.{key}: {value:f} (kWh)") - for key, value in self.decode["energy"].items(): - LOGGER.info(f"\t+ decode.energy.{key}: {value:f} (kWh)") - - def log_all(self) -> None: - if "latency" in self.prefill: - self.log_latency() - if "memory" in self.prefill: - self.log_memory() - if "energy" in self.prefill: - self.log_energy() - - # add operator to aggregate multiple reports - def __add__(self, other: "TextGenerationReport") -> "TextGenerationReport": - agg_report = TextGenerationReport( - batch_size=self.batch_size + other.batch_size, - sequence_length=self.sequence_length, - num_new_tokens=self.num_new_tokens, - num_return_sequences=self.num_return_sequences, - ) - if "latency" in self.prefill and "latency" in other.prefill: - agg_forward_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.prefill["latency"]["list[s]"], other.prefill["latency"]["list[s]"]) - ] - agg_generate_latencies_list = [ - (lat_1 + lat_2) / 2 - for lat_1, lat_2 in zip(self.decode["latency"]["list[s]"], other.decode["latency"]["list[s]"]) - ] - agg_report.populate_latency(agg_forward_latencies_list, agg_generate_latencies_list) - - if "memory" in self.prefill and "memory" in other.prefill: - agg_forward_memories_dict = {} - for key in self.prefill["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_forward_memories_dict[key] = max(self.prefill["memory"][key], other.prefill["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_forward_memories_dict[key] = self.prefill["memory"][key] + other.prefill["memory"][key] - - agg_generate_memories_dict = {} - for key in self.decode["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_generate_memories_dict[key] = max(self.decode["memory"][key], other.decode["memory"][key]) - else: - # ram and pytorch measures are process-specific - agg_generate_memories_dict[key] = self.decode["memory"][key] + other.decode["memory"][key] - - agg_report.populate_memory(agg_forward_memories_dict, agg_generate_memories_dict) - - if "energy" in self.prefill and "energy" in other.prefill: - agg_forward_energies_dict = {} - for key in self.prefill["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_forward_energies_dict[key] = self.prefill["energy"][key] + other.prefill["energy"][key] - - agg_generate_energies_dict = {} - for key in self.decode["energy"]: - # theoretically, the energies measured by codecarbon 
are process-specific (it's not clear from the code) - agg_generate_energies_dict[key] = self.decode["energy"][key] + other.decode["energy"][key] - - agg_report.populate_energy(agg_forward_energies_dict, agg_generate_energies_dict) - - return agg_report - - -def compute_mean(values: List[float]) -> float: - return mean(values) if len(values) > 0 else 0.0 - - -def compute_stdev(values: List[float]) -> float: - return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/report.py b/optimum_benchmark/benchmarks/report.py index 69491d65..02dbc541 100644 --- a/optimum_benchmark/benchmarks/report.py +++ b/optimum_benchmark/benchmarks/report.py @@ -1,11 +1,61 @@ -from dataclasses import dataclass, asdict -from typing import Union, Optional -from json import dump import os +from dataclasses import asdict, dataclass +from json import dump +from logging import getLogger +from typing import Any, Dict, List, Optional, Union -from transformers.configuration_utils import PushToHubMixin -from flatten_dict import flatten import pandas as pd +from flatten_dict import flatten +from transformers.configuration_utils import PushToHubMixin + +from ..trackers.energy import Efficiency, Energy +from ..trackers.latency import Latency, Throughput +from ..trackers.memory import Memory + +LOGGER = getLogger("report") + +REPORT_FILE_NAME = "benchmark_report.json" + + +@dataclass +class BenchmarkMeasurements: + memory: Optional[Memory] = None + latency: Optional[Latency] = None + throughput: Optional[Throughput] = None + energy: Optional[Energy] = None + efficiency: Optional[Efficiency] = None + + @staticmethod + def aggregate(benchmark_measurements: List["BenchmarkMeasurements"]) -> "BenchmarkMeasurements": + memory = ( + Memory.aggregate([m.memory for m in benchmark_measurements]) + if benchmark_measurements[0].memory is not None + else None + ) + latency = ( + Latency.aggregate([m.latency for m in benchmark_measurements]) + if benchmark_measurements[0].latency is not None + else None + ) + throughput = ( + Throughput.aggregate([m.throughput for m in benchmark_measurements if m.throughput is not None]) + if benchmark_measurements[0].throughput is not None + else None + ) + energy = ( + Energy.aggregate([m.energy for m in benchmark_measurements if m.energy is not None]) + if benchmark_measurements[0].energy is not None + else None + ) + efficiency = ( + Efficiency.aggregate([m.efficiency for m in benchmark_measurements if m.efficiency is not None]) + if benchmark_measurements[0].efficiency is not None + else None + ) + + return BenchmarkMeasurements( + memory=memory, latency=latency, throughput=throughput, energy=energy, efficiency=efficiency + ) @dataclass @@ -22,7 +72,7 @@ def save_pretrained( if use_auth_token is not None: kwargs["token"] = use_auth_token - config_file_name = config_file_name if config_file_name is not None else "benchmark_report.json" + config_file_name = config_file_name if config_file_name is not None else REPORT_FILE_NAME if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") @@ -36,21 +86,17 @@ def save_pretrained( files_timestamps = self._get_files_timestamps(save_directory) output_config_file = os.path.join(save_directory, config_file_name) - self.to_json(output_config_file) + self.to_json(output_config_file, flat=False) if push_to_hub: self._upload_modified_files( - save_directory, - repo_id, - files_timestamps, - commit_message=commit_message, - token=kwargs.get("token"), + save_directory, 
repo_id, files_timestamps, commit_message=commit_message, token=kwargs.get("token") ) - def to_dict(self) -> dict: + def to_dict(self) -> Dict[str, Any]: return asdict(self) - def to_flat_dict(self) -> dict: + def to_flat_dict(self) -> Dict[str, Any]: report_dict = self.to_dict() return flatten(report_dict, reducer="dot") @@ -64,10 +110,60 @@ def to_json(self, path: str, flat: bool = False) -> None: def to_dataframe(self) -> pd.DataFrame: flat_report_dict = self.to_flat_dict() - return pd.DataFrame(flat_report_dict, index=[0]) + return pd.DataFrame.from_dict(flat_report_dict, orient="index") def to_csv(self, path: str) -> None: self.to_dataframe().to_csv(path, index=False) - def log_all(self) -> None: - raise NotImplementedError("`log_all` method must be implemented in the child class") + def log_memory(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.memory is not None: + benchmark_measurements.memory.log(prefix=target) + + def log_latency(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.latency is not None: + benchmark_measurements.latency.log(prefix=target) + + def log_throughput(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.throughput is not None: + benchmark_measurements.throughput.log(prefix=target) + + def log_energy(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.energy is not None: + benchmark_measurements.energy.log(prefix=target) + + def log_efficiency(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.efficiency is not None: + benchmark_measurements.efficiency.log(prefix=target) + + def log(self): + for target in self.to_dict().keys(): + benchmark_measurements: BenchmarkMeasurements = getattr(self, target) + if benchmark_measurements.memory is not None: + benchmark_measurements.memory.log(prefix=target) + if benchmark_measurements.latency is not None: + benchmark_measurements.latency.log(prefix=target) + if benchmark_measurements.throughput is not None: + benchmark_measurements.throughput.log(prefix=target) + if benchmark_measurements.energy is not None: + benchmark_measurements.energy.log(prefix=target) + if benchmark_measurements.efficiency is not None: + benchmark_measurements.efficiency.log(prefix=target) + + @classmethod + def aggregate(cls, reports: List["BenchmarkReport"]) -> "BenchmarkReport": + aggregated_measurements = {} + for target in reports[0].to_dict().keys(): + benchmark_measurements = [getattr(report, target) for report in reports] + aggregated_measurements[target] = BenchmarkMeasurements.aggregate(benchmark_measurements) + + return cls(**aggregated_measurements) diff --git a/optimum_benchmark/benchmarks/training/benchmark.py b/optimum_benchmark/benchmarks/training/benchmark.py index 90c231d0..950cb0f7 100644 --- a/optimum_benchmark/benchmarks/training/benchmark.py +++ b/optimum_benchmark/benchmarks/training/benchmark.py @@ -1,19 +1,30 @@ -from logging import getLogger from contextlib import ExitStack +from dataclasses import dataclass +from logging import getLogger + +from transformers import default_data_collator -from ..base import Benchmark -from .config import TrainingConfig 
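
A hedged sketch of the new report API exercised above: one BenchmarkMeasurements per target, dot-flattening, and cross-process aggregation. The subclass name is illustrative (it mirrors InferenceReport), and the empty measurements stand in for values the trackers would normally populate:

    from dataclasses import dataclass

    from optimum_benchmark.benchmarks.report import BenchmarkMeasurements, BenchmarkReport


    @dataclass
    class ForwardOnlyReport(BenchmarkReport):  # same shape as InferenceReport above
        forward: BenchmarkMeasurements


    rank_0 = ForwardOnlyReport(forward=BenchmarkMeasurements())
    rank_1 = ForwardOnlyReport(forward=BenchmarkMeasurements())

    merged = ForwardOnlyReport.aggregate([rank_0, rank_1])  # one report per process -> one merged report
    print(merged.to_flat_dict())                            # {"forward.memory": None, "forward.latency": None, ...}
    merged.to_json("benchmark_report.json", flat=False)
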
-from .report import TrainingReport -from ...trackers.memory import MemoryTracker -from ...trackers.energy import EnergyTracker -from .callback import LatencyTrainerCallback from ...backends.base import Backend, BackendConfigT from ...generators.dataset_generator import DatasetGenerator - -from transformers import default_data_collator +from ...trackers.energy import Efficiency, EnergyTracker +from ...trackers.latency import LatencyTrainerCallback, Throughput +from ...trackers.memory import MemoryTracker +from ..base import Benchmark +from ..report import BenchmarkMeasurements, BenchmarkReport +from .config import TrainingConfig LOGGER = getLogger("training") +TRAIN_THROUGHPUT_UNIT = "samples/s" +TRAIN_EFFICIENCY_UNIT = "samples/kWh" + + +@dataclass +class TrainingReport(BenchmarkReport): + overall: BenchmarkMeasurements = BenchmarkMeasurements() + warmup: BenchmarkMeasurements = BenchmarkMeasurements() + train: BenchmarkMeasurements = BenchmarkMeasurements() + class TrainingBenchmark(Benchmark[TrainingConfig]): NAME = "training" @@ -24,21 +35,14 @@ def __init__(self, config: TrainingConfig) -> None: def run(self, backend: Backend[BackendConfigT]) -> None: LOGGER.info("\t+ Creating dataset generator") dataset_generator = DatasetGenerator( - task=backend.config.task, - model_shapes=backend.model_shapes, - dataset_shapes=self.config.dataset_shapes, + task=backend.config.task, model_shapes=backend.model_shapes, dataset_shapes=self.config.dataset_shapes ) LOGGER.info("\t+ Generating training dataset") training_dataset = dataset_generator() LOGGER.info("\t+ Initializing training report") - self.report = TrainingReport( - max_steps=self.config.max_steps, - warmup_steps=self.config.warmup_steps, - per_process_batch_size=self.config.training_arguments["per_device_train_batch_size"], - gradient_accumulation_steps=self.config.training_arguments["gradient_accumulation_steps"], - ) + self.report = TrainingReport() training_callbackes = [] if self.config.latency: @@ -70,17 +74,51 @@ def run(self, backend: Backend[BackendConfigT]) -> None: training_arguments=self.config.training_arguments, ) - if self.config.latency: - self.report.populate_latency(overall_latencies_list=latency_callback.get_latencies_list()) - self.report.log_latency() - if self.config.memory: - self.report.populate_memory(overall_memories_dict=memory_tracker.get_memories_dict()) - self.report.log_memory() + self.report.overall.memory = memory_tracker.get_max_memory() + self.report.warmup.memory = memory_tracker.get_max_memory() + self.report.train.memory = memory_tracker.get_max_memory() + + if self.config.latency: + self.report.overall.latency = latency_callback.get_latency() + self.report.overall.throughput = Throughput.from_latency( + self.report.overall.latency, volume=self.overall_volume, unit=TRAIN_THROUGHPUT_UNIT + ) + self.report.warmup.latency = self.report.overall.latency[: self.config.warmup_steps] + self.report.warmup.throughput = Throughput.from_latency( + self.report.warmup.latency, volume=self.warmup_volume, unit=TRAIN_THROUGHPUT_UNIT + ) + self.report.train.latency = self.report.overall.latency[self.config.warmup_steps :] + self.report.train.throughput = Throughput.from_latency( + self.report.train.latency, volume=self.train_volume, unit=TRAIN_THROUGHPUT_UNIT + ) if self.config.energy: - self.report.populate_energy(overall_energies_dict=energy_tracker.get_energies_dict()) - self.report.log_energy() + # can only get overall energy consumption + self.report.overall.energy = energy_tracker.get_energy() + 
self.report.overall.efficiency = Efficiency.from_energy( + self.report.overall.energy, volume=self.overall_volume, unit=TRAIN_EFFICIENCY_UNIT + ) + + @property + def overall_volume(self) -> int: + return ( + self.config.max_steps + * self.config.training_arguments["per_device_train_batch_size"] + * self.config.training_arguments["gradient_accumulation_steps"] + ) + + @property + def warmup_volume(self) -> int: + return ( + self.config.warmup_steps + * self.config.training_arguments["per_device_train_batch_size"] + * self.config.training_arguments["gradient_accumulation_steps"] + ) + + @property + def train_volume(self) -> int: + return self.overall_volume - self.warmup_volume def get_report(self) -> TrainingReport: return self.report diff --git a/optimum_benchmark/benchmarks/training/callback.py b/optimum_benchmark/benchmarks/training/callback.py deleted file mode 100644 index 88026d79..00000000 --- a/optimum_benchmark/benchmarks/training/callback.py +++ /dev/null @@ -1,43 +0,0 @@ -import time -from typing import List - -import torch -from transformers import TrainerCallback - - -class LatencyTrainerCallback(TrainerCallback): - def __init__(self, device: str, backend: str) -> None: - self.device = device - self.backend = backend - self.all_latencies_list = [] - - def on_step_begin(self, *args, **kwargs): - # one record per step - if self.device == "cuda" and self.backend == "pytorch": - self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) - self.all_latencies_list[-1].record() - else: - self.all_latencies_list.append(time.perf_counter_ns()) - - def on_train_end(self, *args, **kwargs): - # one last record to measure the time of the last step - if self.device == "cuda" and self.backend == "pytorch": - self.all_latencies_list.append(torch.cuda.Event(enable_timing=True)) - self.all_latencies_list[-1].record() - else: - self.all_latencies_list.append(time.perf_counter_ns()) - - def get_latencies_list(self) -> List[float]: - if self.device == "cuda" and self.backend == "pytorch": - torch.cuda.synchronize() # synchronize the device to make sure all events have been recorded - latencies_list = [ - self.all_latencies_list[i - 1].elapsed_time(self.all_latencies_list[i]) * 1e-3 - for i in range(1, len(self.all_latencies_list)) - ] - else: - latencies_list = [ - (self.all_latencies_list[i] - self.all_latencies_list[i - 1]) * 1e-9 - for i in range(1, len(self.all_latencies_list)) - ] - - return latencies_list diff --git a/optimum_benchmark/benchmarks/training/config.py b/optimum_benchmark/benchmarks/training/config.py index e5d19581..6ea9d0b4 100644 --- a/optimum_benchmark/benchmarks/training/config.py +++ b/optimum_benchmark/benchmarks/training/config.py @@ -25,11 +25,7 @@ "ddp_find_unused_parameters": False, } -DATASET_SHAPES = { - "dataset_size": 500, - "sequence_length": 16, - "num_choices": 1, -} +DATASET_SHAPES = {"dataset_size": 500, "sequence_length": 16, "num_choices": 1} @dataclass @@ -63,7 +59,8 @@ def __post_init__(self): if self.max_steps != self.training_arguments["max_steps"]: LOGGER.warning( f"`benchmark.max_steps` ({self.max_steps}) and `benchmark.training_arguments.max_steps` " - f"({self.training_arguments['max_steps']}) are different. Using `benchmark.training_arguments.max_steps`." + f"({self.training_arguments['max_steps']}) are different. " + "Using `benchmark.training_arguments.max_steps`." 
) self.max_steps = self.training_arguments["max_steps"] diff --git a/optimum_benchmark/benchmarks/training/report.py b/optimum_benchmark/benchmarks/training/report.py deleted file mode 100644 index 9eeba211..00000000 --- a/optimum_benchmark/benchmarks/training/report.py +++ /dev/null @@ -1,169 +0,0 @@ -from dataclasses import dataclass, field -from statistics import mean, stdev -from typing import Any, Dict, List -from logging import getLogger - -from ..report import BenchmarkReport - -LOGGER = getLogger("report") - - -@dataclass -class TrainingReport(BenchmarkReport): - max_steps: int - warmup_steps: int - per_process_batch_size: int - gradient_accumulation_steps: int - - overall: Dict[str, Any] = field(default_factory=dict) - training: Dict[str, Any] = field(default_factory=dict) - warmup: Dict[str, Any] = field(default_factory=dict) - - world_size: int = 1 - - # POPULATING - def populate_latency(self, overall_latencies_list: List[float]) -> None: - assert ( - len(overall_latencies_list) == self.max_steps - ), f"Expected {self.max_steps} latencies, but got {len(overall_latencies_list)} latencies" - # Overall - ## Latency - self.overall["latency"] = { - "list[s/step]": overall_latencies_list, - "mean(s/step)": compute_mean(overall_latencies_list), - "stdev(s/step)": compute_stdev(overall_latencies_list), - } - ## Throughput - overall_throughputs_list = [ - self.world_size * self.per_process_batch_size * self.gradient_accumulation_steps / latency - for latency in overall_latencies_list - ] - self.overall["throughput"] = { - "list[samples/s]": overall_throughputs_list, - "mean(samples/s)": compute_mean(overall_throughputs_list), - "stdev(samples/s)": compute_stdev(overall_throughputs_list), - } - # Training - ## Latency - training_latencies_list = overall_latencies_list[self.warmup_steps :] - self.training["latency"] = { - "list[s/step]": training_latencies_list, - "mean(s/step)": compute_mean(training_latencies_list), - "stdev(s/step)": compute_stdev(training_latencies_list), - } - ## Throughput - training_throughputs_list = overall_throughputs_list[self.warmup_steps :] - self.training["throughput"] = { - "list[samples/s]": training_throughputs_list, - "mean(samples/s)": compute_mean(training_throughputs_list), - "stdev(samples/s)": compute_stdev(training_throughputs_list), - } - # Warmup - ## Latency - warmup_latencies_list = overall_latencies_list[: self.warmup_steps] - self.warmup["latency"] = { - "list[s/step]": warmup_latencies_list, - "mean(s/step)": compute_mean(warmup_latencies_list), - "stdev(s/step)": compute_stdev(warmup_latencies_list), - } - ## Throughput - warmup_throughputs_list = overall_throughputs_list[: self.warmup_steps] - self.warmup["throughput"] = { - "list[samples/s]": warmup_throughputs_list, - "mean(samples/s)": compute_mean(warmup_throughputs_list), - "stdev(samples/s)": compute_stdev(warmup_throughputs_list), - } - - def populate_memory(self, overall_memories_dict: Dict[str, float]) -> None: - self.warmup["memory"] = overall_memories_dict - self.overall["memory"] = overall_memories_dict - self.training["memory"] = overall_memories_dict - - def populate_energy(self, overall_energies_dict: Dict[str, float]) -> None: - self.overall["energy"] = overall_energies_dict - # can't get training only or warmup only energies - # self.warmup["energy"] = overall_energies_dict - # self.training["energy"] = overall_energies_dict - # TODO: use a callback for energy instead of a tracker - - # LOGGING - def log_latency(self): - for key, value in self.training["latency"].items(): - 
if "list" in key: - continue - LOGGER.info(f"\t+ training.latency.{key}: {value:f} (s)") - for key, value in self.training["throughput"].items(): - if "list" in key: - continue - LOGGER.info(f"\t+ training.throughput.{key}: {value:f} (samples/s)") - - def log_memory(self): - for key, value in self.training["memory"].items(): - LOGGER.info(f"\t+ training.memory.{key}: {value:f} (MB)") - - def log_energy(self): - for key, value in self.overall["energy"].items(): - LOGGER.info(f"\t+ overall.energy.{key}: {value:f} (kWh)") - - def log_all(self): - if "latency" in self.training: - self.log_latency() - if "memory" in self.training: - self.log_memory() - if "energy" in self.training: - self.log_energy() - - # LOGIC - def __add__(self, other: "TrainingReport") -> "TrainingReport": - assert self.max_steps == other.max_steps, "Both reports must have the same max_steps" - assert self.warmup_steps == other.warmup_steps, "Both reports must have the same warmup_steps" - assert ( - self.gradient_accumulation_steps == other.gradient_accumulation_steps - ), "Both reports must have the same gradient_accumulation_steps" - - agg_report = TrainingReport( - max_steps=self.max_steps, - warmup_steps=self.warmup_steps, - world_size=self.world_size + other.world_size, - per_process_batch_size=self.per_process_batch_size, - gradient_accumulation_steps=self.gradient_accumulation_steps, - ) - - if "latency" in self.overall: - agg_overall_latencies_list = [ - max(lat_1, lat_2) - for lat_1, lat_2 in zip( - self.overall["latency"]["list[s/step]"], other.overall["latency"]["list[s/step]"] - ) - ] - agg_report.populate_latency(agg_overall_latencies_list) - - if "memory" in self.overall: - agg_overall_memories_dict = {} - for key in self.overall["memory"]: - if "vram" in key: - # our vram measures are not process-specific - agg_overall_memories_dict[key] = max(self.overall["memory"][key], other.overall["memory"][key]) - else: - # ram and pytorch measures are process-specific (can be accumulated) - agg_overall_memories_dict[key] = self.overall["memory"][key] + other.overall["memory"][key] - - agg_report.populate_memory(agg_overall_memories_dict) - - if "energy" in self.overall: - agg_overall_energies_dict = {} - for key in self.overall["energy"]: - # theoretically, the energies measured by codecarbon are process-specific (it's not clear from the code) - agg_overall_energies_dict[key] = self.overall["energy"][key] + other.overall["energy"][key] - - agg_report.populate_energy(agg_overall_energies_dict) - - return agg_report - - -def compute_mean(values: List[float]) -> float: - return mean(values) if len(values) > 0 else 0.0 - - -def compute_stdev(values: List[float]) -> float: - return stdev(values) if len(values) > 1 else 0.0 diff --git a/optimum_benchmark/benchmarks/utils.py b/optimum_benchmark/benchmarks/utils.py deleted file mode 100644 index 8b137891..00000000 --- a/optimum_benchmark/benchmarks/utils.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index 4961c189..f91a3b2c 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -1,28 +1,25 @@ -import os import glob +import os from logging import getLogger import hydra -from omegaconf import DictConfig, OmegaConf from hydra.core.config_store import ConfigStore +from omegaconf import DictConfig, OmegaConf -from .launchers.inline.config import InlineConfig -from .launchers.process.config import ProcessConfig -from .launchers.torchrun.config import TorchrunConfig - +from .backends.neural_compressor.config 
import INCConfig +from .backends.onnxruntime.config import ORTConfig from .backends.openvino.config import OVConfig from .backends.pytorch.config import PyTorchConfig -from .backends.onnxruntime.config import ORTConfig -from .backends.torch_ort.config import TorchORTConfig from .backends.tensorrt_llm.config import TRTLLMConfig -from .backends.neural_compressor.config import INCConfig from .backends.text_generation_inference.config import TGIConfig - +from .backends.torch_ort.config import TorchORTConfig +from .benchmarks.inference.config import InferenceConfig from .benchmarks.report import BenchmarkReport -from .experiment import launch, ExperimentConfig from .benchmarks.training.config import TrainingConfig -from .benchmarks.inference.config import InferenceConfig - +from .experiment import ExperimentConfig, launch +from .launchers.inline.config import InlineConfig +from .launchers.process.config import ProcessConfig +from .launchers.torchrun.config import TorchrunConfig LOGGER = getLogger("cli") @@ -49,33 +46,17 @@ # optimum-benchmark @hydra.main(version_base=None) def benchmark_cli(experiment_config: DictConfig) -> None: - os.environ["BENCHMARK_CLI"] = "1" + os.environ["BENCHMARK_INTERFACE"] = "CLI" - if glob.glob("*.csv") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1": + if glob.glob("benchmark_report.json") and os.environ.get("OVERRIDE_BENCHMARKS", "0") != "1": LOGGER.warning( - "Skipping benchmark because results already exist. " - "Set OVERRIDE_BENCHMARKS=1 to override benchmark results." + "Benchmark report already exists. If you want to override it, set the environment variable OVERRIDE_BENCHMARKS=1" ) return - # fix backend until deprecated model and device are removed - if experiment_config.task is not None: - LOGGER.warning("`task` is deprecated in experiment. Use `backend.task` instead.") - experiment_config.backend.task = experiment_config.task - if experiment_config.model is not None: - LOGGER.warning("`model` is deprecated in experiment. Use `backend.model` instead.") - experiment_config.backend.model = experiment_config.model - if experiment_config.device is not None: - LOGGER.warning("`device` is deprecated in experiment. Use `backend.device` instead.") - experiment_config.backend.device = experiment_config.device - if experiment_config.library is not None: - LOGGER.warning("`library` is deprecated in experiment. 
Use `backend.library` instead.") - experiment_config.backend.library = experiment_config.library - # Instantiate the experiment configuration and trigger its __post_init__ experiment_config: ExperimentConfig = OmegaConf.to_object(experiment_config) - OmegaConf.save(experiment_config, "experiment_config.yaml", resolve=True) + experiment_config.to_json("experiment_config.json") benchmark_report: BenchmarkReport = launch(experiment_config=experiment_config) - benchmark_report.to_json("benchmark_report.json") diff --git a/optimum_benchmark/env_utils.py b/optimum_benchmark/env_utils.py deleted file mode 100644 index ed4b710b..00000000 --- a/optimum_benchmark/env_utils.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import re -import platform -import subprocess -import importlib.util -from typing import Optional, List - -from .import_utils import is_py3nvml_available, is_pyrsmi_available - -import psutil - - -def is_nvidia_system(): - try: - subprocess.check_output("nvidia-smi") - return True - except Exception: - return False - - -def is_rocm_system(): - try: - subprocess.check_output("rocm-smi") - return True - except Exception: - return False - - -def bytes_to_mega_bytes(bytes: int) -> int: - # MB, not MiB - # Reference: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units - return int(bytes * 1e-6) - - -def get_cpu() -> Optional[str]: - if platform.system() == "Windows": - return platform.processor() - - elif platform.system() == "Darwin": - command = "sysctl -n machdep.cpu.brand_string" - return str(subprocess.check_output(command, shell=True).decode().strip()) - - elif platform.system() == "Linux": - command = "cat /proc/cpuinfo" - all_info = subprocess.check_output(command, shell=True).decode().strip() - for line in all_info.split("\n"): - if "model name" in line: - return re.sub(".*model name.*:", "", line, 1) - return "Could not find device name" - - else: - raise ValueError(f"Unknown system '{platform.system()}'") - - -def get_cpu_ram_mb(): - return bytes_to_mega_bytes(psutil.virtual_memory().total) - - -def get_gpus(): - if is_nvidia_system(): - if not is_py3nvml_available(): - raise ValueError( - "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) - import py3nvml.py3nvml as nvml - - gpus = [] - nvml.nvmlInit() - device_count = nvml.nvmlDeviceGetCount() - for i in range(device_count): - handle = nvml.nvmlDeviceGetHandleByIndex(i) - gpus.append(nvml.nvmlDeviceGetName(handle)) - nvml.nvmlShutdown() - elif is_rocm_system(): - if not is_pyrsmi_available(): - raise ValueError( - "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " - "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." - ) - from pyrsmi import rocml - - rocml.smi_initialize() - - device_count = rocml.smi_get_device_count() - - gpus = [rocml.smi_get_device_name(index) for index in range(device_count)] - rocml.smi_shutdown() - else: - gpus = [] - - return gpus - - -def get_gpu_vram_mb() -> List[int]: - if is_nvidia_system(): - if not is_py3nvml_available(): - raise ValueError( - "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." 
- ) - import py3nvml.py3nvml as nvml - - nvml.nvmlInit() - device_count = nvml.nvmlDeviceGetCount() - vrams = [nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count)] - nvml.nvmlShutdown() - elif is_rocm_system(): - if not is_pyrsmi_available(): - raise ValueError( - "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " - "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." - ) - - from pyrsmi import rocml - - rocml.smi_initialize() - device_count = rocml.smi_get_device_count() - vrams = [rocml.smi_get_device_memory_total(index) for index in range(device_count)] - rocml.smi_shutdown() - else: - vrams = [] - - return sum(vrams) - - -def get_cuda_device_ids() -> str: - if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: - device_ids = os.environ["CUDA_VISIBLE_DEVICES"] - else: - if is_nvidia_system(): - if not is_py3nvml_available(): - raise ValueError( - "The library py3nvml is required to collect information on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) - import py3nvml.py3nvml as nvml - - nvml.nvmlInit() - device_ids = list(range(nvml.nvmlDeviceGetCount())) - nvml.nvmlShutdown() - elif is_rocm_system(): - if not is_pyrsmi_available(): - raise ValueError( - "The library pyrsmi is required to collect information on ROCm-powered GPUs, but is not installed. " - "Please install it following the instructions https://github.com/RadeonOpenCompute/pyrsmi." - ) - - from pyrsmi import rocml - - rocml.smi_initialize() - device_ids = list(range(rocml.smi_get_device_count())) - rocml.smi_shutdown() - else: - raise ValueError("No NVIDIA or ROCm GPUs found.") - - return ",".join(str(i) for i in device_ids) - - -def get_git_revision_hash(package_name: str) -> Optional[str]: - """ - Returns the git commit SHA of a package installed from a git repository. 
- """ - - try: - path = importlib.util.find_spec(package_name).origin - except Exception: - return None - - try: - git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() - except Exception: - return None - - return git_hash diff --git a/optimum_benchmark/experiment.py b/optimum_benchmark/experiment.py index c9b6d733..c9a556cc 100644 --- a/optimum_benchmark/experiment.py +++ b/optimum_benchmark/experiment.py @@ -1,48 +1,38 @@ import os -import platform +from dataclasses import asdict, dataclass, field from logging import getLogger from tempfile import TemporaryDirectory -from dataclasses import dataclass, field -from typing import Any, Dict, Type, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Dict, Optional, Type, Union -from hydra.utils import get_class - -from .benchmarks.report import BenchmarkReport +from .backends.config import BackendConfig from .benchmarks.config import BenchmarkConfig +from .benchmarks.report import BenchmarkReport +from .import_utils import get_hf_libs_info from .launchers.config import LauncherConfig -from .backends.config import BackendConfig -from .import_utils import ( - transformers_version, - accelerate_version, - diffusers_version, - optimum_version, - timm_version, - peft_version, -) -from .env_utils import ( - get_git_revision_hash, - is_nvidia_system, - is_rocm_system, - get_gpu_vram_mb, - get_cpu_ram_mb, - get_gpus, - get_cpu, -) +from .system_utils import get_system_info if TYPE_CHECKING: # avoid importing any torch to be able to set # the CUDA_VISIBLE_DEVICES environment variable # in BackendConfig __post_init__ + from .backends.base import Backend from .benchmarks.base import Benchmark from .launchers.base import Launcher - from .backends.base import Backend +from json import dump + +import pandas as pd +from flatten_dict import flatten +from hydra.utils import get_class +from transformers.configuration_utils import PushToHubMixin LOGGER = getLogger("experiment") +EXPERIMENT_FILE_NAME = "experiment_config.json" + @dataclass -class ExperimentConfig: +class ExperimentConfig(PushToHubMixin): # BACKEND CONFIGURATION backend: Any # https://github.com/facebookresearch/hydra/issues/1722#issuecomment-883568386 # LAUNCHER CONFIGURATION @@ -59,39 +49,62 @@ class ExperimentConfig: library: Optional[str] = None # deprecated # ENVIRONMENT CONFIGURATION - environment: Dict = field( - default_factory=lambda: { - "cpu": get_cpu(), - "cpu_count": os.cpu_count(), - "cpu_ram_mb": get_cpu_ram_mb(), - "system": platform.system(), - "python_version": platform.python_version(), - # libraries - "transformers_version": transformers_version(), - "transformers_commit": get_git_revision_hash("transformers"), - "accelerate_version": accelerate_version(), - "accelerate_commit": get_git_revision_hash("accelerate"), - "diffusers_version": diffusers_version(), - "diffusers_commit": get_git_revision_hash("diffusers"), - "optimum_version": optimum_version(), - "optimum_commit": get_git_revision_hash("optimum"), - "timm_version": timm_version(), - "timm_commit": get_git_revision_hash("timm"), - "peft_version": peft_version(), - "peft_commit": get_git_revision_hash("peft"), - } - ) - - def __post_init__(self): - # adding GPU information to the environment - if is_nvidia_system() or is_rocm_system(): - available_gpus = get_gpus() - if len(available_gpus) > 0: - self.environment["gpu"] = available_gpus[0] - self.environment["gpu_count"] = len(available_gpus) - self.environment["gpu_vram_mb"] = get_gpu_vram_mb() - else: - 
LOGGER.warning("Detected NVIDIA or ROCm system, but no GPUs found.") + environment: Dict = field(default_factory=lambda: {**get_system_info(), **get_hf_libs_info()}) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + def to_flat_dict(self) -> Dict[str, Any]: + report_dict = self.to_dict() + return flatten(report_dict, reducer="dot") + + def to_json(self, path: str, flat: bool = False) -> None: + if flat: + with open(path, "w") as f: + dump(self.to_flat_dict(), f, indent=4) + else: + with open(path, "w") as f: + dump(self.to_dict(), f, indent=4) + + def to_dataframe(self) -> pd.DataFrame: + flat_report_dict = self.to_flat_dict() + return pd.DataFrame.from_dict(flat_report_dict, orient="index") + + def to_csv(self, path: str) -> None: + self.to_dataframe().to_csv(path, index=False) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + config_file_name: Optional[Union[str, os.PathLike]] = None, + push_to_hub: bool = False, + **kwargs, + ): + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + kwargs["token"] = use_auth_token + + config_file_name = config_file_name if config_file_name is not None else EXPERIMENT_FILE_NAME + + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + output_config_file = os.path.join(save_directory, config_file_name) + self.to_json(output_config_file, flat=False) + + if push_to_hub: + self._upload_modified_files( + save_directory, repo_id, files_timestamps, commit_message=commit_message, token=kwargs.get("token") + ) def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> BenchmarkReport: @@ -131,11 +144,27 @@ def run(benchmark_config: BenchmarkConfig, backend_config: BackendConfig) -> Ben def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: - if os.environ.get("BENCHMARK_CLI", "0") == "0": + # fix backend until deprecated model and device are removed + if experiment_config.task is not None: + LOGGER.warning("`task` is deprecated in experiment config. Use `backend.task` instead.") + experiment_config.backend.task = experiment_config.task + if experiment_config.model is not None: + LOGGER.warning("`model` is deprecated in experiment config. Use `backend.model` instead.") + experiment_config.backend.model = experiment_config.model + if experiment_config.device is not None: + LOGGER.warning("`device` is deprecated in experiment config. Use `backend.device` instead.") + experiment_config.backend.device = experiment_config.device + if experiment_config.library is not None: + LOGGER.warning("`library` is deprecated in experiment config. 
Use `backend.library` instead.") + experiment_config.backend.library = experiment_config.library + + original_dir = os.getcwd() + tmpdir = TemporaryDirectory() + + if os.environ.get("BENCHMARK_INTERFACE", "API") == "API": + # to not pollute the user's environment LOGGER.info("Launching experiment in a temporary directory.") - tmep_dir = TemporaryDirectory() - original_dir = os.getcwd() - os.chdir(tmep_dir.name) + os.chdir(tmpdir.name) launcher_config: LauncherConfig = experiment_config.launcher @@ -145,6 +174,7 @@ def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: launcher: Launcher = launcher_factory(launcher_config) except Exception as e: LOGGER.error(f"Error during launcher allocation: {e}") + tmpdir.cleanup() raise e backend_config: BackendConfig = experiment_config.backend @@ -154,10 +184,11 @@ def launch(experiment_config: ExperimentConfig) -> BenchmarkReport: output = launcher.launch(run, benchmark_config, backend_config) except Exception as e: LOGGER.error(f"Error during experiment launching: {e}") + tmpdir.cleanup() raise e - if os.environ.get("BENCHMARK_CLI", "0") == "0": + if os.environ.get("BENCHMARK_INTERFACE", "API") == "API": os.chdir(original_dir) - tmep_dir.cleanup() + tmpdir.cleanup() return output diff --git a/optimum_benchmark/generators/input_generator.py b/optimum_benchmark/generators/input_generator.py index 13f1d9aa..0dfc3050 100644 --- a/optimum_benchmark/generators/input_generator.py +++ b/optimum_benchmark/generators/input_generator.py @@ -28,27 +28,17 @@ def __call__(self, mode: str) -> Dict[str, Any]: if mode == "generate": if "pixel_values" in task_input: # image input - task_input = { - "inputs": task_input["pixel_values"], - } + task_input = {"inputs": task_input["pixel_values"]} elif "input_values" in task_input: # speech input - task_input = { - "inputs": task_input["input_values"], - } + task_input = {"inputs": task_input["input_values"]} elif "input_features" in task_input: # waveform input - task_input = { - "inputs": task_input["input_features"], - } + task_input = {"inputs": task_input["input_features"]} elif "input_ids" in task_input: # text input - task_input = { - "inputs": task_input["input_ids"], - } + task_input = {"inputs": task_input["input_ids"]} elif mode == "call": - task_input = { - "prompt": task_input["prompt"], - } + task_input = {"prompt": task_input["prompt"]} return task_input diff --git a/optimum_benchmark/generators/task_generator.py b/optimum_benchmark/generators/task_generator.py index 1f3e9b23..683d8963 100644 --- a/optimum_benchmark/generators/task_generator.py +++ b/optimum_benchmark/generators/task_generator.py @@ -43,40 +43,28 @@ def input_ids(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["vocab_size"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def attention_mask(self): return self.generate_random_integers( min_value=1, # avoid sparse attention max_value=2, - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def token_type_ids(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["type_vocab_size"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def position_ids(self): return self.generate_ranges( start=0, stop=self.shapes["sequence_length"], - 
shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def requires_token_type_ids(self): @@ -91,44 +79,28 @@ def pixel_values(self): return self.generate_random_floats( min_value=0, max_value=1, - shape=( - self.shapes["batch_size"], - self.shapes["num_channels"], - self.shapes["height"], - self.shapes["width"], - ), + shape=(self.shapes["batch_size"], self.shapes["num_channels"], self.shapes["height"], self.shapes["width"]), ) class AudioGenerator(TaskGenerator): def input_values(self): return self.generate_random_floats( - min_value=-1, - max_value=1, - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + min_value=-1, max_value=1, shape=(self.shapes["batch_size"], self.shapes["sequence_length"]) ) def input_features(self): return self.generate_random_floats( min_value=-1, max_value=1, - shape=( - self.shapes["batch_size"], - self.shapes["feature_size"], - self.shapes["nb_max_frames"], - ), + shape=(self.shapes["batch_size"], self.shapes["feature_size"], self.shapes["nb_max_frames"]), ) class TextClassificationGenerator(TextGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -154,10 +126,7 @@ def labels(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["num_labels"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def __call__(self): @@ -199,16 +168,12 @@ def __call__(self): class QuestionAnsweringGenerator(TextGenerator): def start_positions(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["sequence_length"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],) ) def end_positions(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["sequence_length"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["sequence_length"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -247,9 +212,7 @@ def __call__(self): class MultipleChoiceGenerator(TextGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_choices"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_choices"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -283,9 +246,7 @@ def __call__(self): class ImageClassificationGenerator(ImageGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -303,15 +264,9 @@ def labels(self): return [ { "class_labels": self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["num_queries"],), - ), - "boxes": self.generate_random_floats( - min_value=-1, - max_value=1, - shape=(self.shapes["num_queries"], 4), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["num_queries"],) ), + "boxes": self.generate_random_floats(min_value=-1, max_value=1, shape=(self.shapes["num_queries"], 4)), } for _ in 
range(self.shapes["batch_size"]) ] @@ -331,11 +286,7 @@ def labels(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["num_labels"], - shape=( - self.shapes["batch_size"], - self.shapes["height"], - self.shapes["width"], - ), + shape=(self.shapes["batch_size"], self.shapes["height"], self.shapes["width"]), ) def __call__(self): @@ -351,9 +302,7 @@ def __call__(self): class AudioClassificationGenerator(AudioGenerator): def labels(self): return self.generate_random_integers( - min_value=0, - max_value=self.shapes["num_labels"], - shape=(self.shapes["batch_size"],), + min_value=0, max_value=self.shapes["num_labels"], shape=(self.shapes["batch_size"],) ) def __call__(self): @@ -371,10 +320,7 @@ def labels(self): return self.generate_random_integers( min_value=0, max_value=self.shapes["vocab_size"], - shape=( - self.shapes["batch_size"], - self.shapes["sequence_length"], - ), + shape=(self.shapes["batch_size"], self.shapes["sequence_length"]), ) def __call__(self): diff --git a/optimum_benchmark/import_utils.py b/optimum_benchmark/import_utils.py index f19fbda3..f247eaf3 100644 --- a/optimum_benchmark/import_utils.py +++ b/optimum_benchmark/import_utils.py @@ -1,6 +1,7 @@ import importlib.metadata import importlib.util - +import subprocess +from typing import Optional _transformers_available = importlib.util.find_spec("transformers") is not None _accelerate_available = importlib.util.find_spec("accelerate") is not None @@ -10,12 +11,11 @@ _onnx_available = importlib.util.find_spec("onnx") is not None _tensorrt_available = importlib.util.find_spec("tensorrt") is not None _peft_available = importlib.util.find_spec("peft") is not None -_py3nvml_available = importlib.util.find_spec("py3nvml") is not None +_pynvml_available = importlib.util.find_spec("pynvml") is not None _torch_distributed_available = importlib.util.find_spec("torch.distributed") is not None _onnxruntime_available = importlib.util.find_spec("onnxruntime") is not None _openvino_available = importlib.util.find_spec("openvino") is not None _neural_compressor_available = importlib.util.find_spec("neural_compressor") is not None -_pyrsmi_available = importlib.util.find_spec("pyrsmi") is not None _codecarbon_available = importlib.util.find_spec("codecarbon") is not None _amdsmi_available = importlib.util.find_spec("amdsmi") is not None _tensorflow_available = importlib.util.find_spec("tensorflow") is not None @@ -25,6 +25,7 @@ _deepspeed_available = importlib.util.find_spec("deepspeed") is not None _tensorrt_llm_available = importlib.util.find_spec("tensorrt_llm") is not None _psutil_available = importlib.util.find_spec("psutil") is not None +_optimum_benchmark_available = importlib.util.find_spec("optimum_benchmark") is not None def is_psutil_available(): @@ -83,12 +84,8 @@ def is_onnxruntime_available(): return _onnxruntime_available -def is_py3nvml_available(): - return _py3nvml_available - - -def is_pyrsmi_available(): - return _pyrsmi_available +def is_pynvml_available(): + return _pynvml_available def is_amdsmi_available(): @@ -178,3 +175,45 @@ def peft_version(): def tesnorrt_llm_version(): if _tensorrt_llm_available: return importlib.metadata.version("tensorrt_llm") + + +def optimum_benchmark_version(): + if _optimum_benchmark_available: + return importlib.metadata.version("optimum_benchmark") + + +def get_git_revision_hash(package_name: str) -> Optional[str]: + """ + Returns the git commit SHA of a package installed from a git repository. 
+ """ + + try: + path = importlib.util.find_spec(package_name).origin + except Exception: + return None + + try: + git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=path).decode().strip() + except Exception: + return None + + return git_hash + + +def get_hf_libs_info(): + return { + "optimum_benchmark_version": optimum_benchmark_version(), + "optimum_benchmark_commit": get_git_revision_hash("optimum_benchmark"), + "transformers_version": transformers_version(), + "transformers_commit": get_git_revision_hash("transformers"), + "accelerate_version": accelerate_version(), + "accelerate_commit": get_git_revision_hash("accelerate"), + "diffusers_version": diffusers_version(), + "diffusers_commit": get_git_revision_hash("diffusers"), + "optimum_version": optimum_version(), + "optimum_commit": get_git_revision_hash("optimum"), + "timm_version": timm_version(), + "timm_commit": get_git_revision_hash("timm"), + "peft_version": peft_version(), + "peft_commit": get_git_revision_hash("peft"), + } diff --git a/optimum_benchmark/launchers/base.py b/optimum_benchmark/launchers/base.py index 91b50da0..4d5323f4 100644 --- a/optimum_benchmark/launchers/base.py +++ b/optimum_benchmark/launchers/base.py @@ -1,7 +1,8 @@ from abc import ABC from logging import getLogger -from typing import Callable, ClassVar, Generic, Dict, Any +from typing import Callable, ClassVar, Generic +from ..benchmarks.report import BenchmarkReport from .config import LauncherConfigT LOGGER = getLogger("launcher") @@ -16,5 +17,5 @@ def __init__(self, config: LauncherConfigT): LOGGER.info(f"ูŽูŽAllocating {self.NAME} launcher") self.config = config - def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: + def launch(self, worker: Callable, *worker_args) -> BenchmarkReport: raise NotImplementedError("Launcher must implement launch method") diff --git a/optimum_benchmark/launchers/config.py b/optimum_benchmark/launchers/config.py index 2d04caa4..938c3c97 100644 --- a/optimum_benchmark/launchers/config.py +++ b/optimum_benchmark/launchers/config.py @@ -1,7 +1,7 @@ from abc import ABC -from typing import TypeVar -from logging import getLogger from dataclasses import dataclass +from logging import getLogger +from typing import TypeVar LOGGER = getLogger("launcher") diff --git a/optimum_benchmark/launchers/inline/launcher.py b/optimum_benchmark/launchers/inline/launcher.py index e5702ba1..64a8002c 100644 --- a/optimum_benchmark/launchers/inline/launcher.py +++ b/optimum_benchmark/launchers/inline/launcher.py @@ -1,10 +1,10 @@ -import os from logging import getLogger -from typing import Callable, Dict, Any +from typing import Callable +from ...benchmarks.report import BenchmarkReport from ..base import Launcher -from .config import InlineConfig from ..isolation_utils import device_isolation +from .config import InlineConfig LOGGER = getLogger("inline") @@ -15,12 +15,9 @@ class InlineLauncher(Launcher[InlineConfig]): def __init__(self, config: InlineConfig): super().__init__(config) - def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: - with device_isolation( - benchmark_pid=os.getpid(), - enabled=self.config.device_isolation, - ): - LOGGER.info("\t+ Launching inline experiment (no process isolation)") - report: Dict[str, Any] = worker(*worker_args) + def launch(self, worker: Callable, *worker_args) -> BenchmarkReport: + with device_isolation(enabled=self.config.device_isolation): + LOGGER.info("\t+ Launching inline worker (no process isolation)") + report = worker(*worker_args) return report 
diff --git a/optimum_benchmark/launchers/isolation_utils.py b/optimum_benchmark/launchers/isolation_utils.py index 52006bcc..f8a0074c 100644 --- a/optimum_benchmark/launchers/isolation_utils.py +++ b/optimum_benchmark/launchers/isolation_utils.py @@ -1,61 +1,64 @@ import os -import time import signal -from typing import Dict, Set +import time +from contextlib import contextmanager from logging import getLogger from multiprocessing import Process -from contextlib import contextmanager +from typing import Dict, Set +from ..import_utils import is_amdsmi_available, is_psutil_available, is_pynvml_available from ..logging_utils import setup_logging -from ..env_utils import is_nvidia_system, is_rocm_system -from ..import_utils import is_amdsmi_available, is_py3nvml_available, torch_version, is_psutil_available +from ..system_utils import get_rocm_version, is_nvidia_system, is_rocm_system if is_psutil_available(): import psutil -if is_py3nvml_available(): - import py3nvml.py3nvml as nvml +if is_pynvml_available(): + import pynvml if is_amdsmi_available(): - import amdsmi # type: ignore + import amdsmi LOGGER = getLogger("isolation") def get_nvidia_devices_pids() -> Dict[int, list]: + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to get the pids running on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + devices_pids: Dict[int, list] = {} devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] - if not is_py3nvml_available(): - raise ValueError("get_nvidia_device_pids requires py3nvml. Please install it with `pip install py3nvml`.") - - nvml.nvmlInit() + pynvml.nvmlInit() for device_id in devices_ids: - device_handle = nvml.nvmlDeviceGetHandleByIndex(device_id) - device_processes = nvml.nvmlDeviceGetComputeRunningProcesses(device_handle) + device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle) for device_process in device_processes: if device_id not in devices_pids: devices_pids[device_id] = [] devices_pids[device_id].append(device_process.pid) - nvml.nvmlShutdown() + pynvml.nvmlShutdown() return devices_pids def get_amd_devices_pids() -> Dict[int, list]: - devices_pids: Dict[int, list] = {} - rocm_version = torch_version().split("rocm")[-1] - devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] - if not is_amdsmi_available(): raise ValueError( - "get_amd_devices_pids requires amdsmi. " - "Please follow the instructions at https://github.com/RadeonOpenCompute/amdsmi/tree/master" + "The library amdsmi is required get the pids running on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." 
) + devices_pids: Dict[int, list] = {} + rocm_version = get_rocm_version() + devices_ids = [int(device_id) for device_id in os.environ["CUDA_VISIBLE_DEVICES"].split(",")] + amdsmi.amdsmi_init() if rocm_version >= "5.7": @@ -115,7 +118,6 @@ def get_amd_devices_pids() -> Dict[int, list]: def get_pids_running_on_system_device() -> Set[int]: """Returns the set of pids running on the system device(s).""" - if is_nvidia_system(): devices_pids = get_nvidia_devices_pids() elif is_rocm_system(): @@ -128,29 +130,28 @@ def get_pids_running_on_system_device() -> Set[int]: return all_devices_pids -def assert_system_devices_isolation(benchmark_pid: int) -> None: +def assert_system_devices_isolation(main_pid: int) -> None: setup_logging("ERROR") - isolation_pid = os.getpid() - while psutil.pid_exists(benchmark_pid): + while psutil.pid_exists(main_pid): child_processes = set() non_permitted_pids = set() all_devices_pids = get_pids_running_on_system_device() for pid in list(all_devices_pids): - if pid == benchmark_pid or pid == isolation_pid: + if pid == main_pid or pid == isolation_pid: continue try: info = psutil.Process(pid) parent_pid = info.ppid() except Exception as e: - LOGGER.error(f"Failed to get info for process {pid} with error {e}") + LOGGER.error(f"Failed to get parent pid for process {pid} with error {e}") parent_pid = None - if parent_pid == benchmark_pid or parent_pid == isolation_pid: + if parent_pid == main_pid or parent_pid == isolation_pid: child_processes.add(pid) else: non_permitted_pids.add(pid) @@ -159,29 +160,25 @@ def assert_system_devices_isolation(benchmark_pid: int) -> None: LOGGER.error(f"Found non-permitted process(es) running on system device(s): {non_permitted_pids}") for pid in child_processes: try: - LOGGER.error(f"Terminating child process {pid}") - os.kill(pid, signal.SIGTERM) + LOGGER.error(f"Interrupting child process {pid} of main process {main_pid}") + os.kill(pid, signal.SIGINT) except Exception as e: LOGGER.error(f"Failed to terminate child process {pid} with error {e}") - LOGGER.error(f"Terminating benchmark process {benchmark_pid}") - os.kill(benchmark_pid, signal.SIGTERM) - break + LOGGER.error(f"Interrupting main process {main_pid}...") + os.kill(main_pid, signal.SIGINT) + exit(1) time.sleep(1) @contextmanager -def device_isolation(benchmark_pid: int, enabled: bool): +def device_isolation(enabled: bool): if not enabled: yield return - isolation_process = Process( - target=assert_system_devices_isolation, - kwargs={"benchmark_pid": benchmark_pid}, - daemon=True, - ) + isolation_process = Process(target=assert_system_devices_isolation, kwargs={"main_pid": os.getpid()}, daemon=True) isolation_process.start() LOGGER.info(f"\t+ Launched device(s) isolation process {isolation_process.pid}.") diff --git a/optimum_benchmark/launchers/process/launcher.py b/optimum_benchmark/launchers/process/launcher.py index b2619d2f..c08061a5 100644 --- a/optimum_benchmark/launchers/process/launcher.py +++ b/optimum_benchmark/launchers/process/launcher.py @@ -1,13 +1,13 @@ -import os -import multiprocessing as mp from logging import getLogger -from typing import Callable, Dict, Any -from multiprocessing import Process, Queue +from typing import Callable -from ..isolation_utils import device_isolation +import torch.multiprocessing as mp + +from ...benchmarks.report import BenchmarkReport from ...logging_utils import setup_logging -from .config import ProcessConfig from ..base import Launcher +from ..isolation_utils import device_isolation +from .config import ProcessConfig LOGGER = 
getLogger("process") @@ -22,35 +22,44 @@ def __init__(self, config: ProcessConfig): LOGGER.info(f"\t+ Setting multiprocessing start method to {self.config.start_method}.") mp.set_start_method(self.config.start_method, force=True) - def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: - # worker process can't be daemon since it might spawn its own processes - queue = Queue() - current_log_level = getLogger().getEffectiveLevel() - worker_process = Process( - daemon=False, - target=target, - args=(worker, queue, current_log_level, *worker_args), - ) - worker_process.start() - LOGGER.info(f"\t+ Launched worker process with PID {worker_process.pid}.") + def launch(self, worker: Callable, *worker_args) -> BenchmarkReport: + log_level = getLogger().getEffectiveLevel() + + ctx = mp.get_context(self.config.start_method) + queue = ctx.Queue() + lock = ctx.Lock() - with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()): - worker_process.join() + with device_isolation(enabled=self.config.device_isolation): + process_context = mp.start_processes( + entrypoint, + args=(worker, queue, lock, log_level, *worker_args), + start_method=self.config.start_method, + daemon=False, + join=False, + nprocs=1, + ) + LOGGER.info(f"\t+ Launched worker process(es) with PID(s): {process_context.pids()}") + while not process_context.join(): + pass - if worker_process.exitcode != 0: - LOGGER.error(f"\t+ Worker process exited with code {worker_process.exitcode}, forwarding...") - exit(worker_process.exitcode) + # restore the original logging configuration + setup_logging(log_level) - report = queue.get() + report: BenchmarkReport = queue.get() return report -def target(fn, q, log_level, *args): - """This a pickalable function that correctly sets up the logging configuration for the worker process.""" +def entrypoint(i, worker, queue, lock, log_level, *worker_args): + """ + This a pickalable function that correctly sets up the logging configuration for the worker process, + and puts the output of the worker function into a lock-protected queue. 
+ """ - setup_logging(log_level) + setup_logging(log_level, prefix=f"PROC-{i}") - out = fn(*args) + worker_output = worker(*worker_args) - q.put(out) + lock.acquire() + queue.put(worker_output) + lock.release() diff --git a/optimum_benchmark/launchers/torchrun/config.py b/optimum_benchmark/launchers/torchrun/config.py index 2d87ff03..c1fbfc38 100644 --- a/optimum_benchmark/launchers/torchrun/config.py +++ b/optimum_benchmark/launchers/torchrun/config.py @@ -1,7 +1,7 @@ import uuid +from dataclasses import dataclass, field from logging import getLogger from typing import Any, Dict, Optional -from dataclasses import dataclass, field from ..config import LauncherConfig diff --git a/optimum_benchmark/launchers/torchrun/launcher.py b/optimum_benchmark/launchers/torchrun/launcher.py index f327e85c..d5351a34 100644 --- a/optimum_benchmark/launchers/torchrun/launcher.py +++ b/optimum_benchmark/launchers/torchrun/launcher.py @@ -1,23 +1,17 @@ -import os -import multiprocessing as mp from logging import getLogger -from multiprocessing import Queue -from typing import Callable, Dict, Any +from typing import Any, Callable, Dict, List -from ..base import Launcher -from .config import TorchrunConfig +import torch.distributed +import torch.multiprocessing as mp +from torch.distributed.elastic.multiprocessing import Std +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.launcher.api import LaunchConfig, launch_agent + +from ...benchmarks.report import BenchmarkReport from ...logging_utils import setup_logging +from ..base import Launcher from ..isolation_utils import device_isolation -from ...benchmarks.report import BenchmarkReport -from ...import_utils import is_torch_distributed_available - -if is_torch_distributed_available(): - import torch.distributed - from torch.distributed import FileStore - from torch.distributed.elastic.multiprocessing import Std - from torch.distributed.elastic.multiprocessing.errors import record - from torch.distributed.launcher.api import LaunchConfig, launch_agent - +from .config import TorchrunConfig LOGGER = getLogger("torchrun") @@ -33,6 +27,7 @@ def __init__(self, config: TorchrunConfig): mp.set_start_method(self.config.start_method, force=True) def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: + log_level = getLogger().getEffectiveLevel() launch_config = LaunchConfig( min_nodes=self.config.min_nodes, max_nodes=self.config.max_nodes, @@ -51,55 +46,51 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]: local_addr=self.config.local_addr, log_dir=self.config.log_dir, ) - queue = Queue() - current_log_level = getLogger().getEffectiveLevel() - with device_isolation(enabled=self.config.device_isolation, benchmark_pid=os.getpid()): + ctx = mp.get_context(self.config.start_method) + queue = ctx.Queue() + lock = ctx.Lock() + + with device_isolation(enabled=self.config.device_isolation): LOGGER.info(f"\t+ Launching torchrun agent with {self.config.nproc_per_node} workers processes") launch_agent( - config=launch_config, - entrypoint=entrypoint, - args=(worker, queue, current_log_level, *worker_args), + entrypoint=entrypoint, args=(worker, queue, lock, log_level, *worker_args), config=launch_config ) - outputs = [] + # restore the original logging configuration + setup_logging(log_level) + reports: List[BenchmarkReport] = [] while not queue.empty(): - outputs.append(queue.get()) + reports.append(queue.get()) - if len(outputs) == 1: - report: BenchmarkReport = outputs[0] + if len(reports) > 1: + 
LOGGER.info(f"\t+ Merging benchmark reports from {len(reports)} workers") + report = reports[0].aggregate(reports) + elif len(reports) == 1: + report = reports[0] else: - LOGGER.info(f"\t+ Merging benchmark reports from {len(outputs)} workers") - report: BenchmarkReport = sum(outputs[1:], outputs[0]) - report.log_all() + raise ValueError("No benchmark report was returned by the workers") + + report.log() return report @record -def entrypoint(fn, q, log_level, *args): +def entrypoint(worker, queue, lock, log_level, *worker_args): """ This a pickalable function that correctly sets up the logging configuration """ - if not torch.distributed.is_initialized(): - # initialize the process group if not already initialized - backend = "nccl" if torch.cuda.is_available() else "gloo" - torch.distributed.init_process_group(backend=backend) - rank = torch.distributed.get_rank() - - if torch.cuda.is_available(): - torch.cuda.set_device(rank) + torch.distributed.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo") - if rank == 0: - setup_logging(level=log_level, prefix="RANK-0") - else: - setup_logging(level="ERROR") + rank = torch.distributed.get_rank() + torch.cuda.set_device(rank) if torch.cuda.is_available() else None + setup_logging(level=log_level, prefix=f"RANK-{rank}") if rank == 0 else None - # TODO: use a tcp store instead - store = FileStore("torchrun.filestore") - store.set(f"rank_{rank}", str(os.getpid())) + output = worker(*worker_args) - output = fn(*args) - q.put(output) + lock.acquire() + queue.put(output) + lock.release() diff --git a/optimum_benchmark/logging_utils.py b/optimum_benchmark/logging_utils.py index 72f76889..c4c5ab6a 100644 --- a/optimum_benchmark/logging_utils.py +++ b/optimum_benchmark/logging_utils.py @@ -1,9 +1,9 @@ -import os import logging import logging.config +import os from logging import Logger +from subprocess import PIPE, STDOUT, Popen from typing import Optional -from subprocess import Popen, PIPE, STDOUT from omegaconf import OmegaConf @@ -14,34 +14,19 @@ "colorlog": { "()": "colorlog.ColoredFormatter", "format": "[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - %(message)s", - "log_colors": { - "DEBUG": "purple", - "INFO": "green", - "WARNING": "yellow", - "CRITICAL": "red", - "ERROR": "red", - }, - }, - }, - "handlers": { - "console": { - "formatter": "colorlog", - "stream": "ext://sys.stdout", - "class": "logging.StreamHandler", + "log_colors": {"DEBUG": "purple", "INFO": "green", "WARNING": "yellow", "CRITICAL": "red", "ERROR": "red"}, }, }, + "handlers": {"console": {"formatter": "colorlog", "stream": "ext://sys.stdout", "class": "logging.StreamHandler"}}, "root": {"level": "INFO", "handlers": ["console"]}, "disable_existing_loggers": False, } def setup_logging(level: str = "INFO", prefix: Optional[str] = None): - if os.environ.get("BENCHMARK_CLI", "0") == "1": + if os.environ.get("BENCHMARK_INTERFACE", "API") == "CLI": hydra_config = OmegaConf.load(".hydra/hydra.yaml") - job_logging = OmegaConf.to_container( - hydra_config.hydra.job_logging, - resolve=True, - ) + job_logging = OmegaConf.to_container(hydra_config.hydra.job_logging, resolve=True) else: job_logging = API_JOB_LOGGING.copy() diff --git a/optimum_benchmark/system_utils.py b/optimum_benchmark/system_utils.py new file mode 100644 index 00000000..52d59383 --- /dev/null +++ b/optimum_benchmark/system_utils.py @@ -0,0 +1,219 @@ +import os +import platform +import re +import subprocess +from typing import List, Optional + +import 
psutil + +from .import_utils import is_amdsmi_available, is_pynvml_available + + +## CPU related stuff +def get_cpu() -> Optional[str]: + if platform.system() == "Windows": + return platform.processor() + + elif platform.system() == "Darwin": + command = "sysctl -n machdep.cpu.brand_string" + return str(subprocess.check_output(command, shell=True).decode().strip()) + + elif platform.system() == "Linux": + command = "cat /proc/cpuinfo" + all_info = subprocess.check_output(command, shell=True).decode().strip() + for line in all_info.split("\n"): + if "model name" in line: + return re.sub(".*model name.*:", "", line, 1) + return "Could not find device name" + + else: + raise ValueError(f"Unknown system '{platform.system()}'") + + +def get_cpu_ram_mb(): + return psutil.virtual_memory().total / 1e6 + + +## GPU related stuff +try: + subprocess.check_output("nvidia-smi") + _nvidia_system = True +except Exception: + _nvidia_system = False + +try: + subprocess.check_output("rocm-smi") + _rocm_system = True +except Exception: + _rocm_system = False + + +def is_nvidia_system(): + return _nvidia_system + + +def is_rocm_system(): + return _rocm_system + + +if is_nvidia_system() and is_pynvml_available(): + import pynvml + +if is_rocm_system() and is_amdsmi_available(): + import amdsmi + + +def get_rocm_version(): + for folder in os.listdir("/opt/"): + if "rocm" in folder and "rocm" != folder: + return folder.split("-")[-1] + raise ValueError("Could not find ROCm version.") + + +def get_gpus(): + if is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + + gpus = [] + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + for i in range(device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpus.append(pynvml.nvmlDeviceGetName(handle)) + pynvml.nvmlShutdown() + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." + ) + + gpus = [] + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + if rocm_version >= "5.7": + devices_handles = amdsmi.amdsmi_get_processor_handles() + for device_handle in devices_handles: + gpus.append(amdsmi.amdsmi_get_gpu_vendor_name(device_handle)) + else: + devices_handles = amdsmi.amdsmi_get_device_handles() + for device_handle in devices_handles: + gpus.append(amdsmi.amdsmi_dev_get_vendor_name(device_handle)) + amdsmi.amdsmi_shut_down() + else: + raise ValueError("No NVIDIA or ROCm GPUs found.") + + return gpus + + +def get_gpu_vram_mb() -> List[int]: + if is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + + pynvml.nvmlInit() + device_count = pynvml.nvmlDeviceGetCount() + vrams = [ + pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(i)).total for i in range(device_count) + ] + pynvml.nvmlShutdown() + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. 
" + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." + ) + + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + + if rocm_version >= "5.7": + device_handles = amdsmi.amdsmi_get_processor_handles() + vrams = [ + amdsmi.amdsmi_get_gpu_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM) + for device_handle in device_handles + ] + else: + device_handles = amdsmi.amdsmi_get_device_handles() + vrams = [ + amdsmi.amdsmi_dev_get_memory_total(device_handle, mem_type=amdsmi.AmdSmiMemoryType.VRAM) + for device_handle in device_handles + ] + + amdsmi.amdsmi_shut_down() + + else: + raise ValueError("No NVIDIA or ROCm GPUs found.") + + return sum(vrams) + + +def get_gpu_device_ids() -> str: + if os.environ.get("CUDA_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"] + elif os.environ.get("GPU_DEVICE_ORDINAL", None) is not None: + device_ids = os.environ["GPU_DEVICE_ORDINAL"] + elif os.environ.get("HIP_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["HIP_VISIBLE_DEVICES"] + elif os.environ.get("ROCR_VISIBLE_DEVICES", None) is not None: + device_ids = os.environ["ROCR_VISIBLE_DEVICES"] + elif is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + + pynvml.nvmlInit() + device_ids = list(range(pynvml.nvmlDeviceGetCount())) + device_ids = ",".join(str(i) for i in device_ids) + pynvml.nvmlShutdown() + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." 
+ ) + + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + + if rocm_version >= "5.7": + device_ids = list(range(len(amdsmi.amdsmi_get_processor_handles()))) + else: + device_ids = list(range(len(amdsmi.amdsmi_get_device_handles()))) + + device_ids = ",".join(str(i) for i in device_ids) + amdsmi.amdsmi_shut_down() + else: + raise ValueError("Couldn't infer GPU device ids.") + + return device_ids + + +## System related stuff +def get_system_info() -> dict: + system_dict = { + "cpu": get_cpu(), + "cpu_count": os.cpu_count(), + "cpu_ram_mb": get_cpu_ram_mb(), + "system": platform.system(), + "machine": platform.machine(), + "platform": platform.platform(), + "processor": platform.processor(), + "python_version": platform.python_version(), + } + + if is_nvidia_system() or is_rocm_system(): + system_dict["gpu"] = get_gpus() + system_dict["gpu_count"] = len(get_gpus()) + system_dict["gpu_vram_mb"] = get_gpu_vram_mb() + + return system_dict diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py index e35baae3..bd7d7999 100644 --- a/optimum_benchmark/task_utils.py +++ b/optimum_benchmark/task_utils.py @@ -1,5 +1,5 @@ -import os import importlib +import os from typing import Optional import huggingface_hub @@ -16,10 +16,7 @@ "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", "image-classification": "AutoModelForImageClassification", - "image-segmentation": ( - "AutoModelForImageSegmentation", - "AutoModelForSemanticSegmentation", - ), + "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), "image-to-image": "AutoModelForImageToImage", "image-to-text": "AutoModelForVision2Seq", "mask-generation": "AutoModel", @@ -64,12 +61,8 @@ "stable-diffusion": "StableDiffusionPipeline", "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", } -_TIMM_TASKS_TO_MODEL_LOADERS = { - "image-classification": "create_model", -} -_LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP = { - "transformers": _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS, -} +_TIMM_TASKS_TO_MODEL_LOADERS = {"image-classification": "create_model"} +_LIBRARY_TO_TF_TASKS_TO_MODEL_LOADER_MAP = {"transformers": _TRANSFORMERS_TASKS_TO_TF_MODEL_LOADERS} _LIBRARY_TO_TASKS_TO_MODEL_LOADER_MAP = { "transformers": _TRANSFORMERS_TASKS_TO_MODEL_LOADERS, "diffusers": _DIFFUSERS_TASKS_TO_MODEL_LOADERS, @@ -96,35 +89,15 @@ "zero-shot-classification": "text-classification", } _CUSTOM_CLASSES = { - ("pt", "pix2struct", "image-to-text"): ( - "transformers", - "Pix2StructForConditionalGeneration", - ), - ("pt", "pix2struct", "visual-question-answering"): ( - "transformers", - "Pix2StructForConditionalGeneration", - ), - ("pt", "visual-bert", "question-answering"): ( - "transformers", - "VisualBertForQuestionAnswering", - ), - ("pt", "vision-encoder-decoder", "document-question-answering"): ( - "transformers", - "VisionEncoderDecoderModel", - ), + ("pt", "pix2struct", "image-to-text"): ("transformers", "Pix2StructForConditionalGeneration"), + ("pt", "pix2struct", "visual-question-answering"): ("transformers", "Pix2StructForConditionalGeneration"), + ("pt", "visual-bert", "question-answering"): ("transformers", "VisualBertForQuestionAnswering"), + ("pt", "vision-encoder-decoder", "document-question-answering"): ("transformers", "VisionEncoderDecoderModel"), } -IMAGE_DIFFUSION_TASKS = [ - "stable-diffusion", - "stable-diffusion-xl", -] +IMAGE_DIFFUSION_TASKS = ["stable-diffusion", "stable-diffusion-xl"] -TEXT_GENERATION_TASKS = [ - "image-to-text", - "text-generation", - 
"text2text-generation", - "automatic-speech-recognition", -] +TEXT_GENERATION_TASKS = ["image-to-text", "text-generation", "text2text-generation", "automatic-speech-recognition"] def map_from_synonym(task: str) -> str: @@ -166,10 +139,7 @@ def infer_task_from_model_name_or_path(model_name_or_path: str, revision: Option else: pipeline_tag = getattr(model_info, "pipeline_tag", None) # conversational is not a supported task per se, just an alias that may map to text-generaton or text2text-generation - if pipeline_tag is not None and pipeline_tag not in [ - "conversational", - "object-detection", - ]: + if pipeline_tag is not None and pipeline_tag not in ["conversational", "object-detection"]: inferred_task_name = map_from_synonym(model_info.pipeline_tag) else: transformers_info = model_info.transformersInfo diff --git a/optimum_benchmark/trackers/energy.py b/optimum_benchmark/trackers/energy.py index 7d3bb7ad..d5335b5d 100644 --- a/optimum_benchmark/trackers/energy.py +++ b/optimum_benchmark/trackers/energy.py @@ -1,35 +1,99 @@ import os -from logging import getLogger from contextlib import contextmanager -from typing import Optional, Dict +from dataclasses import dataclass +from logging import getLogger +from typing import List, Literal, Optional -from ..env_utils import get_cuda_device_ids -from ..import_utils import is_codecarbon_available +from ..import_utils import is_codecarbon_available, is_torch_distributed_available +from ..system_utils import get_gpu_device_ids -if is_codecarbon_available(): - from codecarbon import EmissionsTracker, OfflineEmissionsTracker +if is_torch_distributed_available(): + import torch.distributed +if is_codecarbon_available(): + from codecarbon import ( + EmissionsTracker, # type: ignore + OfflineEmissionsTracker, + ) LOGGER = getLogger("energy") +ENERGY_UNIT = "kWh" +Energy_Unit_Literal = Literal["kWh"] +Efficiency_Unit_Literal = Literal["samples/kWh", "tokens/kWh", "images/kWh"] + + +@dataclass +class Energy: + unit: Energy_Unit_Literal + + cpu: float + ram: float + gpu: float + total: float + + @staticmethod + def aggregate(energies: List["Energy"]) -> "Energy": + if len(energies) == 0 or all(energy is None for energy in energies): + return None + elif any(energy is None for energy in energies): + raise ValueError("Some energy measurements are missing") + + cpu = sum(energy.cpu for energy in energies) + gpu = sum(energy.gpu for energy in energies) + ram = sum(energy.ram for energy in energies) + total = sum(energy.total for energy in energies) + + return Energy(cpu=cpu, gpu=gpu, ram=ram, total=total, unit=ENERGY_UNIT) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} CPU energy: {self.cpu:f} ({self.unit})") + LOGGER.info(f"\t\t+ {prefix} GPU energy: {self.gpu:f} ({self.unit})") + LOGGER.info(f"\t\t+ {prefix} RAM energy: {self.ram:f} ({self.unit})") + LOGGER.info(f"\t\t+ {prefix} total energy: {self.total:f} ({self.unit})") + + +@dataclass +class Efficiency: + unit: Efficiency_Unit_Literal + + value: float + + @staticmethod + def aggregate(efficiencies: List["Efficiency"]) -> "Efficiency": + if len(efficiencies) == 0: + raise ValueError("No efficiency measurements to aggregate") + elif any(efficiency is None for efficiency in efficiencies): + raise ValueError("Some efficiency measurements are None") + + unit = efficiencies[0].unit + value = sum(efficiency.value for efficiency in efficiencies) / len(efficiencies) + + return Efficiency(value=value, unit=unit) + + @staticmethod + def from_energy(energy: "Energy", volume: int, unit: 
str) -> "Efficiency": + return Efficiency(value=volume / energy.total if energy.total > 0 else 0, unit=unit) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} efficiency: {self.value:f} ({self.unit})") + class EnergyTracker: def __init__(self, device: str, device_ids: Optional[str] = None): self.device = device - - self.cpu_energy: float = 0 - self.gpu_energy: float = 0 - self.ram_energy: float = 0 - self.total_energy: float = 0 + self.device_ids = device_ids + self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized() if self.device == "cuda": - if device_ids is None: + if self.device_ids is None: LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. Using all available CUDA devices.") - self.device_ids = list(map(int, get_cuda_device_ids().split(","))) - else: - self.device_ids = list(map(int, device_ids.split(","))) - else: - self.device_ids = [] + self.device_ids = get_gpu_device_ids() + + self.device_ids = list(map(int, self.device_ids.split(","))) + LOGGER.info(f"\t+ Tracking GPU energy on devices {self.device_ids}") + + self.reset() def reset(self): self.cpu_energy = 0 @@ -72,10 +136,16 @@ def track(self, interval=1, file_prefix="method"): country_iso_code=os.environ.get("COUNTRY_ISO_CODE", "FRA"), ) + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + self.emission_tracker.start() yield self.emission_tracker.stop() + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh self.ram_energy = self.emission_tracker._total_ram_energy.kWh @@ -84,10 +154,7 @@ def track(self, interval=1, file_prefix="method"): def get_elapsed_time(self) -> float: return self.emission_tracker._last_measured_time - self.emission_tracker._start_time - def get_energies_dict(self) -> Dict[str, float]: - return { - "cpu_energy(kHh)": self.cpu_energy, - "gpu_energy(kHh)": self.gpu_energy, - "ram_energy(kHh)": self.ram_energy, - "total(kHh)": self.total_energy, - } + def get_energy(self) -> Energy: + return Energy( + unit=ENERGY_UNIT, cpu=self.cpu_energy, gpu=self.gpu_energy, ram=self.ram_energy, total=self.total_energy + ) diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 369c2b70..e076875f 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,91 +1,241 @@ +import time from contextlib import contextmanager +from dataclasses import dataclass from logging import getLogger -from typing import List -import time +from typing import List, Literal, Union -from ..import_utils import is_torch_distributed_available, is_torch_available - -if is_torch_available(): - import torch +from ..import_utils import is_torch_distributed_available if is_torch_distributed_available(): import torch.distributed +import torch +from transformers import LogitsProcessor, TrainerCallback + LOGGER = getLogger("latency") +LATENCY_UNIT = "s" +Latency_Unit_Literal = Literal["s"] +Throughput_Unit_Literal = Literal["samples/s", "tokens/s", "images/s", "steps/s"] + + +@dataclass +class Latency: + unit: Latency_Unit_Literal + + mean: float + stdev: float + values: List[float] + + def __getitem__(self, index: int) -> float: + if isinstance(index, slice): + return Latency.from_values(values=self.values[index], unit=self.unit) 
+ else: + return Latency.from_values(values=[self.values[index]], unit=self.unit) + + def __sub__(self, scalar: float) -> "Latency": + if not isinstance(scalar, (int, float)): + raise ValueError(f"Cannot subtract non-scalar value from latency: {scalar}") + + latencies = [lat - scalar for lat in self.values] + return Latency.from_values(values=latencies, unit=self.unit) + + @staticmethod + def aggregate(latencies: List["Latency"]) -> "Latency": + if len(latencies) == 0 or all(latency is None for latency in latencies): + return None + elif any(latency is None for latency in latencies): + raise ValueError("Some latency measurements are missing") + + unit = latencies[0].unit + values = sum((lat.values for lat in latencies), []) + return Latency.from_values(values=values, unit=unit) + + @staticmethod + def from_values(values: List[float], unit: str) -> "Latency": + mean = sum(values) / len(values) if len(values) > 0 else 0 + stdev = (sum((val - mean) ** 2 for val in values) / len(values)) ** 0.5 if len(values) > 1 else 0 + return Latency(mean=mean, stdev=stdev, values=values, unit=unit) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} latency: {self.mean:f} ยฑ 2 x {self.stdev:f} ({self.unit})") + + +@dataclass +class Throughput: + unit: Throughput_Unit_Literal + + value: float + + @staticmethod + def aggregate(throughputs: List["Throughput"]) -> "Throughput": + if len(throughputs) == 0: + raise ValueError("No throughput measurements to aggregate") + elif any(throughput is None for throughput in throughputs): + raise ValueError("Some throughput measurements are missing") + + unit = throughputs[0].unit + value = sum(throughput.value for throughput in throughputs) + + return Throughput(value=value, unit=unit) + + @staticmethod + def from_latency(latency: Latency, volume: int, unit: str) -> "Throughput": + value = volume / latency.mean if latency.mean > 0 else 0 + return Throughput(value=value, unit=unit) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} throughput: {self.value:f} {self.unit}") + class LatencyTracker: def __init__(self, device: str, backend: str): self.device = device self.backend = backend + self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized() - self.latencies: List[float] = [] - - # this is not in track, because this tracker is used repeatedly - if is_torch_distributed_available() and torch.distributed.is_initialized(): - LOGGER.info("\t+ Tracking Pytorch Distributed latency") - elif self.device == "cuda" and self.backend == "pytorch": + if self.backend == "pytorch" and self.device == "cuda": LOGGER.info("\t+ Tracking Pytorch CUDA latency") else: LOGGER.info("\t+ Tracking CPU latency") + self.reset() + def reset(self): - self.latencies = [] + self.start_events: List[Union[float, torch.cuda.Event]] = [] + self.end_events: List[Union[float, torch.cuda.Event]] = [] + self.start_time: float = time.perf_counter() @contextmanager def track(self): - if is_torch_distributed_available() and torch.distributed.is_initialized(): - yield from self._pytorch_distributed_latency() - elif self.backend == "pytorch" and self.device == "cuda": + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + + if self.backend == "pytorch" and self.device == "cuda": yield from self._pytorch_cuda_latency() else: yield from self._cpu_latency() - def _pytorch_distributed_latency(self): - torch.distributed.barrier() # synchronize before workload - start 
= time.perf_counter_ns() + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + + def _pytorch_cuda_latency(self): + start = torch.cuda.Event(enable_timing=True) + start.record() + self.start_events.append(start) + yield - torch.distributed.barrier() # synchronize after workload - end = time.perf_counter_ns() - latency = (end - start) / 1e9 - self.latencies.append(latency) + end = torch.cuda.Event(enable_timing=True) + end.record() + self.end_events.append(end) - LOGGER.debug(f"\t+ Tracked Pytorch distributed latency: {latency:.2e}s") + def _cpu_latency(self): + start = time.perf_counter() + self.start_events.append(start) - def _pytorch_cuda_latency(self): - # Note: torch.cuda.Event is not used here, - # there's actually no specific need to use cuda events if you're synchronizing - # it's rather a feature that can be used to measure kernel latency without synchronizing, - # allowing us to measure the time it takes to perform an operation without necessarily stalling the GPU. - # An interesting use case is with cuda graphs where synchronization makes us shoot the optimization in the foot. - # details: https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/ - torch.cuda.synchronize() # synchronize before workload - start = time.perf_counter_ns() yield - torch.cuda.synchronize() # synchronize after workload - end = time.perf_counter_ns() - latency = (end - start) / 1e9 - self.latencies.append(latency) + end = time.perf_counter() + self.end_events.append(end) - LOGGER.debug(f"\t+ Tracked Pytorch CUDA latency: {latency:.2e}s") + def get_elapsed_time(self) -> float: + # we measured in cpu to not synchronize all events + return time.perf_counter() - self.start_time - def _cpu_latency(self): - start = time.perf_counter_ns() - yield - end = time.perf_counter_ns() + def get_latency(self) -> Latency: + if self.backend == "pytorch" and self.device == "cuda": + # synchronize the last event to make sure it has been recorded + self.start_events[-1].synchronize() + self.end_events[-1].synchronize() + + latencies_list = [ + self.start_events[i].elapsed_time(self.end_events[i]) / 1e3 for i in range(len(self.start_events)) + ] + else: + latencies_list = [(self.end_events[i] - self.start_events[i]) for i in range(len(self.start_events))] + + return Latency.from_values(latencies_list, unit=LATENCY_UNIT) + + def get_throughput(self, volume: int, unit: str) -> Throughput: + return Throughput.from_latency(self.get_latency(), volume, unit) + + +class LatencyTrainerCallback(TrainerCallback): + def __init__(self, device: str, backend: str) -> None: + self.device = device + self.backend = backend + + self.reset() + + def reset(self): + self.events: List[Union[float, torch.cuda.Event]] = [] - latency = (end - start) / 1e9 - self.latencies.append(latency) + def on_step_begin(self, *args, **kwargs): + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events.append(event) + else: + self.events.append(time.perf_counter()) + + def on_train_end(self, *args, **kwargs): + # one last record to measure the time of the last step + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events.append(event) + else: + self.events.append(time.perf_counter()) - LOGGER.debug(f"\t+ Tracked CPU latency: {latency:.2e}s") + def get_latency(self) -> Latency: + if self.device == "cuda" and 
self.backend == "pytorch": + # synchronize the device to make sure all events have been recorded + torch.cuda.synchronize() + latencies_list = [self.events[i - 1].elapsed_time(self.events[i]) / 1e3 for i in range(1, len(self.events))] + else: + latencies_list = [(self.events[i] - self.events[i - 1]) for i in range(1, len(self.events))] - def get_total_count(self): - return len(self.latencies) + return Latency.from_values(latencies_list, unit=LATENCY_UNIT) + + def get_throughput(self, volume: int, unit: str) -> Throughput: + return Throughput.from_latency(self.get_latency(), volume, unit) + + +class LatencyLogitsProcessor(LogitsProcessor): + def __init__(self, device: str, backend: str): + self.device = device + self.backend = backend + + self.reset() + + def reset(self): + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events = [event] + else: + self.events = [time.perf_counter()] + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor): + if self.device == "cuda" and self.backend == "pytorch": + event = torch.cuda.Event(enable_timing=True) + event.record() + self.events.append(event) + else: + self.events.append(time.perf_counter()) + + return scores + + def get_latency(self) -> Latency: + if self.device == "cuda" and self.backend == "pytorch": + # synchronize the device to make sure all events have been recorded + torch.cuda.synchronize() + latencies_list = [self.events[i - 1].elapsed_time(self.events[i]) / 1e3 for i in range(1, len(self.events))] + else: + latencies_list = [(self.events[i] - self.events[i - 1]) for i in range(1, len(self.events))] - def get_total_latency(self): - return sum(self.latencies) + return Latency.from_values(latencies_list, unit=LATENCY_UNIT) - def get_latencies_list(self) -> List[float]: - return self.latencies + def get_throughput(self, volume: int, unit: str) -> Throughput: + return Throughput.from_latency(self.get_latency(), volume, unit) diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 816f1d5a..017c21fe 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -1,86 +1,110 @@ import os -from logging import getLogger from contextlib import contextmanager -from typing import List, Optional, Dict +from dataclasses import dataclass +from logging import getLogger from multiprocessing import Pipe, Process from multiprocessing.connection import Connection +from typing import List, Literal, Optional -from ..env_utils import bytes_to_mega_bytes, get_cuda_device_ids, is_nvidia_system, is_rocm_system -from ..import_utils import is_py3nvml_available, is_pyrsmi_available, is_torch_available +from ..import_utils import is_amdsmi_available, is_pynvml_available, is_torch_available, is_torch_distributed_available +from ..system_utils import get_gpu_device_ids, get_rocm_version, is_nvidia_system, is_rocm_system -if is_nvidia_system(): - if is_py3nvml_available(): - import py3nvml.py3nvml as nvml - else: - raise ValueError( - "The library py3nvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " - "Please install it through `pip install py3nvml`." - ) +if is_torch_distributed_available(): + import torch.distributed -if is_rocm_system(): - if is_pyrsmi_available(): - from pyrsmi import rocml - else: - raise ValueError( - "The library pyrsmi is required to run memory benchmark on AMD GPUs, but is not installed. 
" - "Please install it through `pip install pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git." - ) +if is_nvidia_system() and is_pynvml_available(): + import pynvml + +if is_rocm_system() and is_amdsmi_available(): + import amdsmi # type: ignore if is_torch_available(): import torch import psutil - LOGGER = getLogger("memory") +MEMORY_UNIT = "MB" +Memory_Unit_Literal = Literal["MB"] -class MemoryTracker: - """ - Memory tracker to measure max memory usage of CPU or GPU devices. - Args: - device (str): Device to track memory usage. Can be either "cuda" or any other device. - backend (str): Backend to track memory usage. Can be either "pytorch" or any other backend. - device_ids (List[int], optional): List of device IDs to track memory usage. Defaults to None. - """ +@dataclass +class Memory: + unit: Memory_Unit_Literal + + max_ram: float + max_vram: Optional[float] = None + max_reserved: Optional[float] = None + max_allocated: Optional[float] = None + + @staticmethod + def aggregate(memories: List["Memory"]) -> "Memory": + if len(memories) == 0: + raise ValueError("No memory measurements to aggregate") + elif any(memory is None for memory in memories): + raise ValueError("Some memory measurements are missing") + unit = memories[0].unit + max_ram = sum(memory.max_ram for memory in memories) + max_vram = sum(memory.max_vram for memory in memories) if memories[0].max_vram is not None else None + max_reserved = sum(memory.max_reserved for memory in memories) if memories[0].max_reserved is not None else None + max_allocated = ( + sum(memory.max_allocated for memory in memories) if memories[0].max_allocated is not None else None + ) + return Memory( + unit=unit, max_ram=max_ram, max_vram=max_vram, max_reserved=max_reserved, max_allocated=max_allocated + ) + + def log(self, prefix: str = "forward"): + LOGGER.info(f"\t\t+ {prefix} max RAM memory: {self.max_ram:f} ({self.unit})") + if self.max_vram is not None: + LOGGER.info(f"\t\t+ {prefix} max VRAM memory: {self.max_vram:f} ({self.unit})") + if self.max_reserved is not None: + LOGGER.info(f"\t\t+ {prefix} max reserved memory: {self.max_reserved:f} ({self.unit})") + if self.max_allocated is not None: + LOGGER.info(f"\t\t+ {prefix} max allocated memory: {self.max_allocated:f} ({self.unit})") + + +class MemoryTracker: def __init__(self, device: str, backend: str, device_ids: Optional[str] = None): self.device = device self.backend = backend + self.device_ids = device_ids + self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized() - self.max_memory_used = 0 - self.max_memory_reserved = 0 - self.max_memory_allocated = 0 + LOGGER.info("\t+ Tracking RAM memory") if self.device == "cuda": - if device_ids is None: + if self.device_ids is None: LOGGER.warning("\t+ `device=cuda` but `device_ids` not provided. 
Using all available CUDA devices.") - self.device_ids = list(map(int, get_cuda_device_ids().split(","))) - else: - self.device_ids = list(map(int, device_ids.split(","))) + self.device_ids = get_gpu_device_ids() + self.device_ids = list(map(int, self.device_ids.split(","))) LOGGER.info(f"\t+ Tracking VRAM memory of CUDA devices: {self.device_ids}") if self.backend == "pytorch": - self.pytorch_device_ids = list(range(torch.cuda.device_count())) - LOGGER.info(f"\t+ Tracking Pytorch memory of Pytorch CUDA devices: {self.pytorch_device_ids}") - - if len(self.device_ids) != len(self.pytorch_device_ids): + num_pytorch_devices = torch.cuda.device_count() + if len(self.device_ids) != num_pytorch_devices: raise ValueError( "The number of CUDA devices and Pytorch CUDA devices must be the same. " - f"Got {len(self.device_ids)} and {len(self.pytorch_device_ids)} respectively." + f"Got {len(self.device_ids)} and {num_pytorch_devices} respectively." ) - else: - LOGGER.info("\t+ Tracking RAM memory") + LOGGER.info(f"\t+ Tracking Allocated/Reserved memory of {num_pytorch_devices} Pytorch CUDA devices") + + self.reset() def reset(self): - self.max_memory_used = 0 - self.max_memory_reserved = 0 - self.max_memory_allocated = 0 + self.max_ram_memory = 0 + self.max_vram_memory = 0 + self.max_reserved_memory = 0 + self.max_allocated_memory = 0 @contextmanager def track(self): + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + if self.device == "cuda" and self.backend == "pytorch": yield from self._cuda_pytorch_memory() elif self.device == "cuda": @@ -88,122 +112,202 @@ def track(self): else: yield from self._cpu_memory() + if self.distributed: + torch.distributed.barrier(device_ids=[torch.cuda.current_device()] if self.device == "cuda" else None) + def _cuda_pytorch_memory(self): torch.cuda.empty_cache() - for pytorch_device_index in self.pytorch_device_ids: + + for device in range(torch.cuda.device_count()): try: - torch.cuda.reset_peak_memory_stats(device=pytorch_device_index) + torch.cuda.reset_peak_memory_stats(device=device) except Exception as e: - LOGGER.warning(f"\t+ Could not reset max memory stats for device {pytorch_device_index}: {e}") + LOGGER.warning(f"\t\t+ Could not reset max memory stats for device {device}: {e}") yield from self._cuda_memory() - for pytorch_device_index in self.pytorch_device_ids: - self.max_memory_reserved += torch.cuda.max_memory_reserved(device=pytorch_device_index) - self.max_memory_allocated += torch.cuda.max_memory_allocated(device=pytorch_device_index) + self.max_allocated_memory = sum( + torch.cuda.max_memory_allocated(device=device) / 1e6 for device in range(torch.cuda.device_count()) + ) + self.max_reserved_memory = sum( + torch.cuda.max_memory_reserved(device=device) / 1e6 for device in range(torch.cuda.device_count()) + ) - LOGGER.debug(f"\t+ Pytorch max memory reserved: {self.get_max_memory_reserved_mb()} MB") - LOGGER.debug(f"\t+ Pytorch max memory allocated: {self.get_max_memory_allocated_mb()} MB") + torch.cuda.empty_cache() - def _cuda_memory(self, interval: float = 0.001): + def _cuda_memory(self): child_connection, parent_connection = Pipe() memory_process = Process( - target=monitor_gpu_max_vram_memory, - args=(self.device_ids, child_connection, interval), - daemon=True, + target=monitor_gpu_vram_memory, args=(os.getpid(), self.device_ids, child_connection), daemon=True ) memory_process.start() parent_connection.recv() # wait for memory process to be ready - yield + yield from 
self._cpu_memory() parent_connection.send(True) - self.max_memory_used = parent_connection.recv() - LOGGER.debug(f"\t+ Max memory (VRAM) used: {self.get_max_memory_used_mb()} MB") + self.max_vram_memory = parent_connection.recv() - def _cpu_memory(self, interval: float = 0.001): + def _cpu_memory(self): child_connection, parent_connection = Pipe() - memory_process = Process( - target=monitor_cpu_max_ram_memory, - args=(os.getpid(), child_connection, interval), - daemon=True, - ) + memory_process = Process(target=monitor_cpu_ram_memory, args=(os.getpid(), child_connection), daemon=True) memory_process.start() parent_connection.recv() # wait for memory process to be ready yield parent_connection.send(True) - self.max_memory_used = parent_connection.recv() - LOGGER.debug(f"\t+ Max memory (RAM) used: {self.get_max_memory_used_mb()} MB") - - def get_max_memory_used_mb(self) -> int: - return bytes_to_mega_bytes(self.max_memory_used) - - def get_max_memory_allocated_mb(self) -> int: - return bytes_to_mega_bytes(self.max_memory_allocated) - - def get_max_memory_reserved_mb(self) -> int: - return bytes_to_mega_bytes(self.max_memory_reserved) + self.max_ram_memory = parent_connection.recv() - def get_memories_dict(self) -> Dict[str, int]: + def get_max_memory(self): if self.device == "cuda" and self.backend == "pytorch": - return { - "max_vram_used(MB)": self.get_max_memory_used_mb(), - "max_memory_reserved(MB)": self.get_max_memory_reserved_mb(), - "max_memory_allocated(MB)": self.get_max_memory_allocated_mb(), - } + return Memory( + unit=MEMORY_UNIT, + max_ram=self.max_ram_memory, + max_vram=self.max_vram_memory, + max_reserved=self.max_reserved_memory, + max_allocated=self.max_allocated_memory, + ) elif self.device == "cuda": - return {"max_vram_used(MB)": self.get_max_memory_used_mb()} + return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory, max_vram=self.max_vram_memory) else: - return {"max_ram_used(MB)": self.get_max_memory_used_mb()} + return Memory(unit=MEMORY_UNIT, max_ram=self.max_ram_memory) -def monitor_cpu_max_ram_memory(process_id: int, connection: Connection, interval: float): +def monitor_cpu_ram_memory(process_id: int, connection: Connection, interval: float = 0.001): + stop = False + max_memory = 0 process = psutil.Process(process_id) - max_memory_usage = 0 connection.send(0) - stop = False while not stop: meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info" - current_memory_usage = getattr(process, meminfo_attr)()[0] - max_memory_usage = max(max_memory_usage, current_memory_usage) + current_used_memory = getattr(process, meminfo_attr)()[0] + max_memory = max(max_memory, current_used_memory) stop = connection.poll(interval) - connection.send(max_memory_usage) + connection.send(max_memory / 1e6) # convert to MB connection.close() -def monitor_gpu_max_vram_memory(device_ids: List[int], connection: Connection, interval: float): - if is_nvidia_system() and is_py3nvml_available(): - nvml.nvmlInit() - handles = [nvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids] - max_memory_usage = 0 - connection.send(0) - stop = False - - while not stop: - current_memory_usage = sum(nvml.nvmlDeviceGetMemoryInfo(handle).used for handle in handles) - max_memory_usage = max(max_memory_usage, current_memory_usage) - stop = connection.poll(interval) +def monitor_gpu_vram_memory(process_id: int, device_ids: List[int], connection: Connection, interval: float = 0.01): + stop = False + max_memory = 0 + connection.send(0) - 
connection.send(max_memory_usage) - nvml.nvmlShutdown() - connection.close() - elif is_rocm_system() and is_pyrsmi_available(): - rocml.smi_initialize() - max_memory_usage = 0 - connection.send(0) - stop = False + if is_nvidia_system(): + if not is_pynvml_available(): + raise ValueError( + "The library pynvml is required to run memory benchmark on NVIDIA GPUs, but is not installed. " + "Please install the official and NVIDIA maintained PyNVML library through `pip install nvidia-ml-py`." + ) + pynvml.nvmlInit() + devices_handles = [pynvml.nvmlDeviceGetHandleByIndex(device_id) for device_id in device_ids] while not stop: - current_memory_usage = sum(rocml.smi_get_device_memory_used(device_id) for device_id in device_ids) - max_memory_usage = max(max_memory_usage, current_memory_usage) + current_used_memory = 0 + for device_id, device_handle in zip(device_ids, devices_handles): + try: + device_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}") + continue + for device_process in device_processes: + if device_process.pid == process_id: + current_used_memory += device_process.usedGpuMemory + else: + try: + cpu_process = psutil.Process(device_process.pid) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process info for process {device_process.pid}: {e}") + continue + if cpu_process.parent() is not None and cpu_process.parent().pid == process_id: + current_used_memory += device_process.usedGpuMemory + + max_memory = max(max_memory, current_used_memory) stop = connection.poll(interval) - connection.send(max_memory_usage) - rocml.smi_shutdown() - connection.close() + pynvml.nvmlShutdown() + + elif is_rocm_system(): + if not is_amdsmi_available(): + raise ValueError( + "The library amdsmi is required to run memory benchmark on AMD GPUs, but is not installed. " + "Please install the official and AMD maintained amdsmi library from https://github.com/ROCm/amdsmi." 
+ ) + amdsmi.amdsmi_init() + rocm_version = get_rocm_version() + + if rocm_version >= "5.7": + devices_handles = amdsmi.amdsmi_get_processor_handles() + while not stop: + current_used_memory = 0 + for device_id in device_ids: + device_handle = devices_handles[device_id] + try: + processes_handles = amdsmi.amdsmi_get_gpu_process_list(device_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}") + continue + for process_handle in processes_handles: + try: + gpu_process_info = amdsmi.amdsmi_get_gpu_process_info(device_handle, process_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}") + continue + # only memory usage of the monitored process and its children is tracked + if gpu_process_info["pid"] == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + else: + try: + cpu_process_info = psutil.Process(gpu_process_info["pid"]) + except Exception as e: + LOGGER.warning( + f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}" + ) + continue + if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + + max_memory = max(max_memory, current_used_memory) + stop = connection.poll(interval) + else: + devices_handles = amdsmi.amdsmi_get_device_handles() + while not stop: + current_used_memory = 0 + for device_id in device_ids: + device_handle = devices_handles[device_id] + try: + processes_handles = amdsmi.amdsmi_get_process_list(device_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process list for device {device_id}: {e}") + continue + for process_handle in processes_handles: + try: + gpu_process_info = amdsmi.amdsmi_get_process_info(device_handle, process_handle) + except Exception as e: + LOGGER.warning(f"\t\t+ Could not get process info for process {process_handle}: {e}") + continue + # only memory usage of the monitored process and its children is tracked + if gpu_process_info["pid"] == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + else: + try: + cpu_process_info = psutil.Process(gpu_process_info["pid"]) + except Exception as e: + LOGGER.warning( + f"\t\t+ Could not get process info for process {gpu_process_info['pid']}: {e}" + ) + continue + if cpu_process_info.parent() is not None and cpu_process_info.ppid() == process_id: + current_used_memory += gpu_process_info["memory_usage"]["vram_mem"] + + max_memory = max(max_memory, current_used_memory) + stop = connection.poll(interval) + + amdsmi.amdsmi_shut_down() else: raise ValueError("Only NVIDIA and AMD ROCm GPUs are supported for CUDA memory tracking.") + + connection.send(max_memory / 1e6) # convert to MB + connection.close() diff --git a/pyproject.toml b/pyproject.toml index e9ce4301..58e5b284 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,18 @@ +# [tool.isort] +# profile = "ruff" +# lines_after_imports = 2 +# known_first_party = "optimum_benchmark" + [tool.ruff] line-length = 120 +ignore = ["C901", "E501", "E741", "W605"] +select = ["C", "E", "F", "I", "W", "I001"] + +[tool.ruff.format] +line-ending = "auto" +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false [tool.pytest.ini_options] log_cli = true diff --git a/setup.py b/setup.py index 40504fd3..f993adc4 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import subprocess + from setuptools import 
find_packages, setup MIN_OPTIMUM_VERSION = "1.16.0" @@ -12,13 +13,10 @@ "hydra_colorlog", "hydra-core", "omegaconf", - # Other + # CPU Memory "psutil", - "pandas", # Reporting - "rich", - "tabulate", - "matplotlib", + "pandas", "flatten_dict", ] @@ -28,20 +26,21 @@ USE_ROCM = os.environ.get("USE_ROCM", None) == "1" if USE_CUDA: - INSTALL_REQUIRES.append("py3nvml") + INSTALL_REQUIRES.append("nvidia-ml-py") else: try: subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL) - INSTALL_REQUIRES.append("py3nvml") + INSTALL_REQUIRES.append("nvidia-ml-py") except FileNotFoundError: pass +# we keep this as a check that amdsmi is installed since it's not available on pypi if USE_ROCM: - INSTALL_REQUIRES.append("pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git") + INSTALL_REQUIRES.append("amdsmi") else: try: - subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL) - INSTALL_REQUIRES.append("pyrsmi@git+https://github.com/RadeonOpenCompute/pyrsmi.git") + subprocess.run(["rocm-smi"], stdout=subprocess.DEVNULL) + INSTALL_REQUIRES.append("amdsmi") except FileNotFoundError: pass @@ -54,11 +53,7 @@ "onnxruntime": [f"optimum[onnxruntime]>={MIN_OPTIMUM_VERSION}"], "neural-compressor": [f"optimum[neural-compressor]>={MIN_OPTIMUM_VERSION}"], "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={MIN_OPTIMUM_VERSION}"], - "torch-ort": [ - f"optimum>={MIN_OPTIMUM_VERSION}", - "onnxruntime-training", - "torch-ort", - ], + "torch-ort": [f"optimum>={MIN_OPTIMUM_VERSION}", "onnxruntime-training", "torch-ort"], # docker-based backends "text-generation-inference": ["docker"], # specific settings @@ -75,6 +70,6 @@ install_requires=INSTALL_REQUIRES, extras_require=EXTRAS_REQUIRE, packages=find_packages(), - version="0.0.2", + version="0.1.0", entry_points={"console_scripts": ["optimum-benchmark=optimum_benchmark.cli:benchmark_cli"]}, ) diff --git a/tests/configs/_base_.yaml b/tests/configs/_base_.yaml index d983b841..27acb325 100644 --- a/tests/configs/_base_.yaml +++ b/tests/configs/_base_.yaml @@ -2,8 +2,8 @@ defaults: - launcher: process # isolated process launcher - experiment # inheriting experiment schema - _self_ # for hydra 1.1 compatibility - - override hydra/hydra_logging: colorlog # colorful logging - - override hydra/job_logging: colorlog # colorful logging + - override hydra/hydra_logging: colorlog + - override hydra/job_logging: colorlog - override hydra/launcher: joblib # for parallelization experiment_name: ${device}_${benchmark.name}_${backend.name}_${task} @@ -20,13 +20,12 @@ hydra: # change working directory to the run directory chdir: true env_set: - # set environment variable OVERRIDE_BENCHMARKS to 1 - # to not skip benchmarks that have been run before + # to not skip benchmarks if results already exist OVERRIDE_BENCHMARKS: 1 # we are using joblib launcher to parallelize testing since - # we're having ccorrect benchmarks is not important while testing + # having correct benchmark values is not important while testing # to force sequential execution, uncomment the following three lines # launcher: - # n_jobs: 1 # for debugging - # batch_size: 1 # for debugging + # n_jobs: -1 # 1 for debugging + # batch_size: auto # 1 for debugging diff --git a/tests/configs/_bert_sweep_.yaml b/tests/configs/_bert_sweep_.yaml index e6a6c4fc..c4986d0d 100644 --- a/tests/configs/_bert_sweep_.yaml +++ b/tests/configs/_bert_sweep_.yaml @@ -1,6 +1,5 @@ hydra: sweeper: params: - backend.no_weights: false,true + backend.model: hf-internal-testing/tiny-random-bert backend.task: 
fill-mask,text-classification,token-classification,question-answering - backend.model: hf-internal-testing/tiny-random-bert,hf-internal-testing/tiny-random-roberta diff --git a/tests/configs/_lm_naive_mp_.yaml b/tests/configs/_gpt_naive_mp_.yaml similarity index 100% rename from tests/configs/_lm_naive_mp_.yaml rename to tests/configs/_gpt_naive_mp_.yaml index 2ac16fb8..cf2adfd3 100644 --- a/tests/configs/_lm_naive_mp_.yaml +++ b/tests/configs/_gpt_naive_mp_.yaml @@ -1,6 +1,6 @@ backend: - device_ids: 0,1 - device_map: auto + model: gpt2 task: text-generation library: transformers - model: gpt2 + device_ids: 0,1 + device_map: auto diff --git a/tests/configs/_lm_peft_.yaml b/tests/configs/_gpt_peft_.yaml similarity index 100% rename from tests/configs/_lm_peft_.yaml rename to tests/configs/_gpt_peft_.yaml diff --git a/tests/configs/_lm_sweep_.yaml b/tests/configs/_gpt_sweep_.yaml similarity index 81% rename from tests/configs/_lm_sweep_.yaml rename to tests/configs/_gpt_sweep_.yaml index 763d7120..1ff5e2c7 100644 --- a/tests/configs/_lm_sweep_.yaml +++ b/tests/configs/_gpt_sweep_.yaml @@ -2,5 +2,4 @@ hydra: sweeper: params: backend.task: text-generation - backend.no_weights: false,true backend.model: hf-internal-testing/tiny-random-gpt2,IlyasMoutawwakil/tiny-random-llama diff --git a/tests/configs/cpu_inference_neural_compressor_lm_sweep.yaml b/tests/configs/cpu_inference_neural_compressor_gpt_sweep.yaml similarity index 70% rename from tests/configs/cpu_inference_neural_compressor_lm_sweep.yaml rename to tests/configs/cpu_inference_neural_compressor_gpt_sweep.yaml index 21fb30d9..bf2f9d15 100644 --- a/tests/configs/cpu_inference_neural_compressor_lm_sweep.yaml +++ b/tests/configs/cpu_inference_neural_compressor_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_neural_compressor_lm_sweep +experiment_name: cpu_inference_neural_compressor_gpt_sweep diff --git a/tests/configs/cpu_inference_onnxruntime_lm_sweep.yaml b/tests/configs/cpu_inference_onnxruntime_gpt_sweep.yaml similarity index 71% rename from tests/configs/cpu_inference_onnxruntime_lm_sweep.yaml rename to tests/configs/cpu_inference_onnxruntime_gpt_sweep.yaml index 6e3c214c..a958bb55 100644 --- a/tests/configs/cpu_inference_onnxruntime_lm_sweep.yaml +++ b/tests/configs/cpu_inference_onnxruntime_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_onnxruntime_lm_sweep +experiment_name: cpu_inference_onnxruntime_gpt_sweep diff --git a/tests/configs/cpu_inference_openvino_lm_sweep.yaml b/tests/configs/cpu_inference_openvino_gpt_sweep.yaml similarity index 71% rename from tests/configs/cpu_inference_openvino_lm_sweep.yaml rename to tests/configs/cpu_inference_openvino_gpt_sweep.yaml index 8389d7b3..486f9e8f 100644 --- a/tests/configs/cpu_inference_openvino_lm_sweep.yaml +++ b/tests/configs/cpu_inference_openvino_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one 
overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_openvino_lm_sweep +experiment_name: cpu_inference_openvino_gpt_sweep diff --git a/tests/configs/cpu_inference_pytorch_lm_sweep.yaml b/tests/configs/cpu_inference_pytorch_gpt_sweep.yaml similarity index 72% rename from tests/configs/cpu_inference_pytorch_lm_sweep.yaml rename to tests/configs/cpu_inference_pytorch_gpt_sweep.yaml index c30d7b60..b4720e88 100644 --- a/tests/configs/cpu_inference_pytorch_lm_sweep.yaml +++ b/tests/configs/cpu_inference_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_pytorch_lm_sweep +experiment_name: cpu_inference_pytorch_gpt_sweep diff --git a/tests/configs/cpu_training_pytorch_lm_sweep.yaml b/tests/configs/cpu_training_pytorch_gpt_sweep.yaml similarity index 72% rename from tests/configs/cpu_training_pytorch_lm_sweep.yaml rename to tests/configs/cpu_training_pytorch_gpt_sweep.yaml index 8b3fbb83..5f8987b6 100644 --- a/tests/configs/cpu_training_pytorch_lm_sweep.yaml +++ b/tests/configs/cpu_training_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cpu_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_training_pytorch_lm_sweep +experiment_name: cpu_training_pytorch_gpt_sweep diff --git a/tests/configs/cuda_inference_onnxruntime_lm_sweep.yaml b/tests/configs/cuda_inference_onnxruntime_gpt_sweep.yaml similarity index 71% rename from tests/configs/cuda_inference_onnxruntime_lm_sweep.yaml rename to tests/configs/cuda_inference_onnxruntime_gpt_sweep.yaml index e220b955..f9b38910 100644 --- a/tests/configs/cuda_inference_onnxruntime_lm_sweep.yaml +++ b/tests/configs/cuda_inference_onnxruntime_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_inference_onnxruntime_lm_sweep +experiment_name: cuda_inference_onnxruntime_gpt_sweep diff --git a/tests/configs/cuda_inference_pytorch_lm_naive_mp.yaml b/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml similarity index 70% rename from tests/configs/cuda_inference_pytorch_lm_naive_mp.yaml rename to tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml index a274429f..6e19ba18 100644 --- a/tests/configs/cuda_inference_pytorch_lm_naive_mp.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_naive_mp.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_naive_mp_ # inherits from lm 
naive mp config + - _gpt_naive_mp_ # inherits from lm naive mp config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_inference_pytorch_lm_naive_mp +experiment_name: cuda_inference_pytorch_gpt_naive_mp diff --git a/tests/configs/cuda_inference_pytorch_lm_sweep.yaml b/tests/configs/cuda_inference_pytorch_gpt_sweep.yaml similarity index 72% rename from tests/configs/cuda_inference_pytorch_lm_sweep.yaml rename to tests/configs/cuda_inference_pytorch_gpt_sweep.yaml index 23b7ace2..8b033a67 100644 --- a/tests/configs/cuda_inference_pytorch_lm_sweep.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt_sweep config + - _gpt_sweep_ # inherits from gpt_sweep config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cpu_inference_pytorch_lm_sweep +experiment_name: cpu_inference_pytorch_gpt_sweep diff --git a/tests/configs/cuda_training_pytorch_lm_naive_mp.yaml b/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml similarity index 70% rename from tests/configs/cuda_training_pytorch_lm_naive_mp.yaml rename to tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml index 714f8692..ab6d4bc2 100644 --- a/tests/configs/cuda_training_pytorch_lm_naive_mp.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_naive_mp.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_naive_mp_ # inherits from lm naive mp config + - _gpt_naive_mp_ # inherits from lm naive mp config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_pytorch_lm_naive_mp +experiment_name: cuda_training_pytorch_gpt_naive_mp diff --git a/tests/configs/cuda_training_pytorch_lm_peft.yaml b/tests/configs/cuda_training_pytorch_gpt_peft.yaml similarity index 69% rename from tests/configs/cuda_training_pytorch_lm_peft.yaml rename to tests/configs/cuda_training_pytorch_gpt_peft.yaml index be198ecc..1ee6f473 100644 --- a/tests/configs/cuda_training_pytorch_lm_peft.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_peft.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_peft_ # inherits from language modeling peft config + - _gpt_peft_ # inherits from language modeling peft config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_pytorch_lm_peft +experiment_name: cuda_training_pytorch_gpt_peft diff --git a/tests/configs/cuda_training_pytorch_lm_sweep.yaml b/tests/configs/cuda_training_pytorch_gpt_sweep.yaml similarity index 69% rename from tests/configs/cuda_training_pytorch_lm_sweep.yaml rename to tests/configs/cuda_training_pytorch_gpt_sweep.yaml index 17fefe51..004f1f82 100644 --- a/tests/configs/cuda_training_pytorch_lm_sweep.yaml +++ b/tests/configs/cuda_training_pytorch_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_sweep_ # inherits from language modeling sweep config + - _gpt_sweep_ # inherits from language modeling sweep config - _cuda_ # inherits from cpu config - 
_self_ # hydra 1.1 compatibility -experiment_name: cuda_training_pytorch_lm_sweep +experiment_name: cuda_training_pytorch_gpt_sweep diff --git a/tests/configs/cuda_training_torch_ort_lm_peft.yaml b/tests/configs/cuda_training_torch_ort_gpt_peft.yaml similarity index 69% rename from tests/configs/cuda_training_torch_ort_lm_peft.yaml rename to tests/configs/cuda_training_torch_ort_gpt_peft.yaml index 98e347a4..665dec16 100644 --- a/tests/configs/cuda_training_torch_ort_lm_peft.yaml +++ b/tests/configs/cuda_training_torch_ort_gpt_peft.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from training config - - _lm_peft_ # inherits from language modeling peft config + - _gpt_peft_ # inherits from language modeling peft config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_torch_ort_lm_peft +experiment_name: cuda_training_torch_ort_gpt_peft diff --git a/tests/configs/cuda_training_torch_ort_lm_sweep.yaml b/tests/configs/cuda_training_torch_ort_gpt_sweep.yaml similarity index 69% rename from tests/configs/cuda_training_torch_ort_lm_sweep.yaml rename to tests/configs/cuda_training_torch_ort_gpt_sweep.yaml index 25d4d054..ff8f505f 100644 --- a/tests/configs/cuda_training_torch_ort_lm_sweep.yaml +++ b/tests/configs/cuda_training_torch_ort_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _training_ # inherits from inference config - - _lm_sweep_ # inherits from language modeling sweep config + - _gpt_sweep_ # inherits from language modeling sweep config - _cuda_ # inherits from cpu config - _self_ # hydra 1.1 compatibility -experiment_name: cuda_training_torch_ort_lm_sweep +experiment_name: cuda_training_torch_ort_gpt_sweep diff --git a/tests/configs/rocm_inference_onnxruntime_lm_sweep.yaml b/tests/configs/rocm_inference_onnxruntime_gpt_sweep.yaml similarity index 71% rename from tests/configs/rocm_inference_onnxruntime_lm_sweep.yaml rename to tests/configs/rocm_inference_onnxruntime_gpt_sweep.yaml index d6630ff1..f53b6612 100644 --- a/tests/configs/rocm_inference_onnxruntime_lm_sweep.yaml +++ b/tests/configs/rocm_inference_onnxruntime_gpt_sweep.yaml @@ -3,8 +3,8 @@ defaults: # order of inheritance, last one overrides previous ones - _base_ # inherits from base config - _inference_ # inherits from inference config - - _lm_sweep_ # inherits from gpt sweep config + - _gpt_sweep_ # inherits from gpt sweep config - _rocm_ # inherits from rocm config - _self_ # hydra 1.1 compatibility -experiment_name: rocm_inference_onnxruntime_lm_sweep +experiment_name: rocm_inference_onnxruntime_gpt_sweep diff --git a/tests/test_api.py b/tests/test_api.py index 0bf6ced9..30815d82 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,36 +1,28 @@ -from logging import getLogger +import gc import time +from tempfile import TemporaryDirectory -import torch import pytest +import torch -from optimum_benchmark.trackers.memory import MemoryTracker -from optimum_benchmark.trackers.latency import LatencyTracker -from optimum_benchmark.experiment import ExperimentConfig, launch -from optimum_benchmark.launchers.inline.config import InlineConfig from optimum_benchmark.backends.pytorch.config import PyTorchConfig -from optimum_benchmark.launchers.process.config import ProcessConfig -from optimum_benchmark.launchers.torchrun.config import TorchrunConfig -from 
optimum_benchmark.benchmarks.inference.config import INPUT_SHAPES -from optimum_benchmark.benchmarks.training.config import DATASET_SHAPES -from optimum_benchmark.generators.input_generator import InputGenerator -from optimum_benchmark.benchmarks.training.config import TrainingConfig -from optimum_benchmark.benchmarks.inference.config import InferenceConfig -from optimum_benchmark.generators.dataset_generator import DatasetGenerator -from optimum_benchmark.task_utils import TEXT_GENERATION_TASKS, IMAGE_DIFFUSION_TASKS from optimum_benchmark.backends.timm_utils import extract_timm_shapes_from_config, get_timm_pretrained_config from optimum_benchmark.backends.transformers_utils import ( extract_transformers_shapes_from_artifacts, get_transformers_pretrained_config, ) +from optimum_benchmark.benchmarks.inference.config import INPUT_SHAPES, InferenceConfig +from optimum_benchmark.benchmarks.training.config import DATASET_SHAPES +from optimum_benchmark.experiment import ExperimentConfig, launch +from optimum_benchmark.generators.dataset_generator import DatasetGenerator +from optimum_benchmark.generators.input_generator import InputGenerator +from optimum_benchmark.launchers.inline.config import InlineConfig +from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.launchers.torchrun.config import TorchrunConfig +from optimum_benchmark.task_utils import IMAGE_DIFFUSION_TASKS, TEXT_GENERATION_TASKS +from optimum_benchmark.trackers.latency import LatencyTracker +from optimum_benchmark.trackers.memory import MemoryTracker - -LOGGER = getLogger("test-api") - -DEVICES_BACKENDS = [ - ("cpu", "none"), - ("cuda", "pytorch"), -] LIBRARIES_TASKS_MODELS = [ ("transformers", "fill-mask", "bert-base-uncased"), ("timm", "image-classification", "timm/resnet50.a1_in1k"), @@ -43,18 +35,17 @@ ("transformers", "image-classification", "google/vit-base-patch16-224"), ("transformers", "semantic-segmentation", "google/vit-base-patch16-224"), ] -BENCHMARK_CONFIGS = [ - InferenceConfig(latency=True, memory=True), - TrainingConfig(latency=True, memory=True), -] LAUNCHER_CONFIGS = [ - TorchrunConfig(nproc_per_node=2, device_isolation=False), - ProcessConfig(device_isolation=False), InlineConfig(device_isolation=False), + ProcessConfig(device_isolation=False), + TorchrunConfig(device_isolation=False, nproc_per_node=2), ] +BACKENDS = ["pytorch", "none"] +DEVICES = ["cpu", "cuda"] -@pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("backend", BACKENDS) def test_api_latency_tracker(device, backend): expected_latency = 1 tracker = LatencyTracker(device=device, backend=backend) @@ -63,40 +54,55 @@ def test_api_latency_tracker(device, backend): with tracker.track(): time.sleep(1) - latencies_list = tracker.get_latencies_list() + latency = tracker.get_latency() + latency.log() - assert len(latencies_list) == 2 - assert latencies_list[0] > expected_latency * 0.9 - assert latencies_list[0] < expected_latency * 1.1 + assert latency.mean < expected_latency * 1.1 + assert latency.mean > expected_latency * 0.9 -@pytest.mark.parametrize("device,backend", DEVICES_BACKENDS) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("backend", BACKENDS) def test_api_memory_tracker(device, backend): tracker = MemoryTracker(device=device, backend=backend) + tracker.reset() with tracker.track(): + time.sleep(1) pass # the process consumes memory that we can't control - if backend == "pytorch": - 
initial_process_memory = tracker.get_max_memory_allocated_mb() - else: - initial_process_memory = tracker.get_max_memory_used_mb() + initial_memory = tracker.get_max_memory() + initial_memory.log() + tracker.reset() with tracker.track(): - array = torch.ones((10000, 10000), dtype=torch.float64, device=device) - expected_memory = array.nbytes / 1e6 # around 800 MB - - if backend == "pytorch": - final_process_memory = tracker.get_max_memory_allocated_mb() + time.sleep(1) + array = torch.randn((10000, 10000), dtype=torch.float64, device=device) + expected_memory = array.nbytes / 1e6 + time.sleep(1) + + final_memory = tracker.get_max_memory() + final_memory.log() + + if device == "cuda": + if backend == "pytorch": + measured_memory = final_memory.max_allocated - initial_memory.max_allocated + else: + measured_memory = final_memory.max_vram - initial_memory.max_vram + if torch.version.hip is not None: + return # skip vram measurement for ROCm else: - final_process_memory = tracker.get_max_memory_used_mb() - - measured_memory = final_process_memory - initial_process_memory + measured_memory = final_memory.max_ram - initial_memory.max_ram assert measured_memory < expected_memory * 1.1 assert measured_memory > expected_memory * 0.9 + del array + if torch.cuda.is_available(): + torch.cuda.empty_cache() + gc.collect() + @pytest.mark.parametrize("library,task,model", LIBRARIES_TASKS_MODELS) def test_api_input_generator(library, task, model): @@ -109,11 +115,7 @@ def test_api_input_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") - generator = InputGenerator( - task=task, - input_shapes=INPUT_SHAPES, - model_shapes=model_shapes, - ) + generator = InputGenerator(task=task, input_shapes=INPUT_SHAPES, model_shapes=model_shapes) if task in TEXT_GENERATION_TASKS: _ = generator(mode="forward") @@ -135,23 +137,31 @@ def test_api_dataset_generator(library, task, model): else: raise ValueError(f"Unknown library {library}") - generator = DatasetGenerator( - task=task, - dataset_shapes=DATASET_SHAPES, - model_shapes=model_shapes, - ) + generator = DatasetGenerator(task=task, dataset_shapes=DATASET_SHAPES, model_shapes=model_shapes) _ = generator() -@pytest.mark.parametrize("benchmark_config", BENCHMARK_CONFIGS) @pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS) -def test_api_launch_cpu(benchmark_config, launcher_config): - backend_config = PyTorchConfig(model="bert-base-uncased", no_weights=True, device="cpu") +@pytest.mark.parametrize("device", DEVICES) +def test_api_launch(launcher_config, device): + benchmark_config = InferenceConfig(latency=True, memory=True) + device_ids = ",".join(str(i) for i in range(torch.cuda.device_count())) if device == "cuda" else None + backend_config = PyTorchConfig(model="bert-base-uncased", device_ids=device_ids, no_weights=True, device=device) experiment_config = ExperimentConfig( - experiment_name="", - benchmark=benchmark_config, - launcher=launcher_config, - backend=backend_config, + experiment_name="api-experiment", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config ) - _ = launch(experiment_config) + benchmark_report = launch(experiment_config) + + with TemporaryDirectory() as tempdir: + experiment_config.to_dict() + experiment_config.to_flat_dict() + experiment_config.to_dataframe() + experiment_config.to_csv(f"{tempdir}/experiment_config.csv") + experiment_config.to_json(f"{tempdir}/experiment_config.json") + + benchmark_report.to_dict() + benchmark_report.to_flat_dict() + 
benchmark_report.to_dataframe() + benchmark_report.to_csv(f"{tempdir}/benchmark_report.csv") + benchmark_report.to_json(f"{tempdir}/benchmark_report.json") diff --git a/tests/test_cli.py b/tests/test_cli.py index afae3609..739d0f89 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -11,20 +11,13 @@ TEST_CONFIG_NAMES = [ config.split(".")[0] for config in os.listdir(TEST_CONFIG_DIR) - if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) + if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) # or "ds_tp" in config) ] @pytest.mark.parametrize("config_name", TEST_CONFIG_NAMES) def test_cli_configs(config_name): - args = [ - "optimum-benchmark", - "--config-dir", - TEST_CONFIG_DIR, - "--config-name", - config_name, - "--multirun", - ] + args = ["optimum-benchmark", "--config-dir", TEST_CONFIG_DIR, "--config-name", config_name, "--multirun"] popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}"