diff --git a/.github/workflows/test_api_cpu.yaml b/.github/workflows/test_api_cpu.yaml index 126e500b..b48490a5 100644 --- a/.github/workflows/test_api_cpu.yaml +++ b/.github/workflows/test_api_cpu.yaml @@ -47,8 +47,21 @@ jobs: pip install -e .[testing,timm,diffusers,codecarbon] - name: Run tests + run: | + pytest tests/test_api.py -s -k "api and cpu" env: HF_TOKEN: ${{ secrets.HF_TOKEN }} PUSH_REPO_ID: optimum-benchmark/cpu - run: | - pytest tests/test_api.py -s -k "api and cpu" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: | + # pytest tests/test_examples.py -s -k "api and cpu" + # env: + # HF_TOKEN: ${{ secrets.HF_TOKEN }} + # PUSH_REPO_ID: optimum-benchmark/cpu diff --git a/.github/workflows/test_api_cuda.yaml b/.github/workflows/test_api_cuda.yaml index c8be0ece..d45afa40 100644 --- a/.github/workflows/test_api_cuda.yaml +++ b/.github/workflows/test_api_cuda.yaml @@ -45,8 +45,21 @@ jobs: pip install -e .[testing,timm,diffusers,codecarbon] - name: Run tests + run: | + pytest tests/test_api.py -x -s -k "api and cuda" env: HF_TOKEN: ${{ secrets.HF_TOKEN }} PUSH_REPO_ID: optimum-benchmark/cuda + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples run: | - pytest tests/test_api.py -x -s -k "api and cuda" + pip install -e .[testing,torchao,autoawq,auto-gptq] + pytest tests/test_examples.py -x -s -k "api and cuda and pytorch" + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + PUSH_REPO_ID: optimum-benchmark/cuda diff --git a/.github/workflows/test_api_misc.yaml b/.github/workflows/test_api_misc.yaml index 2da1e7ec..36c26215 100644 --- a/.github/workflows/test_api_misc.yaml +++ b/.github/workflows/test_api_misc.yaml @@ -58,8 +58,8 @@ jobs: UV_SYSTEM_PYTHON: 1 - name: Run tests + run: | + pytest tests/test_api.py -s -k "api and not (cpu or cuda or rocm or mps)" env: HF_TOKEN: ${{ secrets.HF_TOKEN }} PUSH_REPO_ID: optimum-benchmark/misc-${{ matrix.os }}-${{ matrix.python }} - run: | - pytest tests/test_api.py -s -k "api and not (cpu or cuda or rocm or mps)" diff --git a/.github/workflows/test_cli_cpu_ipex.yaml b/.github/workflows/test_cli_cpu_ipex.yaml index d6b94d3e..5bf0be92 100644 --- a/.github/workflows/test_cli_cpu_ipex.yaml +++ b/.github/workflows/test_cli_cpu_ipex.yaml @@ -36,16 +36,17 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install requirements run: | - pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install -e .[testing,ipex,diffusers,timm] - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and ipex" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and ipex" diff --git a/.github/workflows/test_cli_cpu_llama_cpp.yaml b/.github/workflows/test_cli_cpu_llama_cpp.yaml index 05d43683..145c0f83 100644 --- a/.github/workflows/test_cli_cpu_llama_cpp.yaml +++ b/.github/workflows/test_cli_cpu_llama_cpp.yaml @@ -48,4 +48,12 @@ jobs: pip install -e .[testing,llama-cpp] - name: Run tests - run: 
pytest tests/test_cli.py -s -k "llama_cpp" + run: pytest tests/test_cli.py -s -k "cli and cpu and llama_cpp" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and llama_cpp" diff --git a/.github/workflows/test_cli_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml deleted file mode 100644 index 435f4216..00000000 --- a/.github/workflows/test_cli_cpu_neural_compressor.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: CLI CPU Intel Neural Compressor Tests - -on: - workflow_dispatch: - push: - branches: - - main - pull_request: - branches: - - main - types: - - opened - - reopened - - synchronize - - labeled - - unlabeled - -concurrency: - cancel-in-progress: true - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - -jobs: - run_cli_cpu_neural_compressor_tests: - if: ${{ - (github.event_name == 'push') || - (github.event_name == 'workflow_dispatch') || - contains( github.event.pull_request.labels.*.name, 'cli') || - contains( github.event.pull_request.labels.*.name, 'cpu') || - contains( github.event.pull_request.labels.*.name, 'neural_compressor') || - contains( github.event.pull_request.labels.*.name, 'cli_cpu_neural_compressor') - }} - - runs-on: ubuntu-latest - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Install requirements - run: | - pip install --upgrade pip - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install -e .[testing,neural-compressor,diffusers,timm] - - - name: Run tests - run: pytest tests/test_cli.py -s -k "cli and cpu and neural_compressor" diff --git a/.github/workflows/test_cli_cpu_onnxruntime.yaml b/.github/workflows/test_cli_cpu_onnxruntime.yaml index 21e65235..ef8482b7 100644 --- a/.github/workflows/test_cli_cpu_onnxruntime.yaml +++ b/.github/workflows/test_cli_cpu_onnxruntime.yaml @@ -49,3 +49,11 @@ jobs: - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and onnxruntime" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and onnxruntime" diff --git a/.github/workflows/test_cli_cpu_openvino.yaml b/.github/workflows/test_cli_cpu_openvino.yaml index 4612370c..2ef0312e 100644 --- a/.github/workflows/test_cli_cpu_openvino.yaml +++ b/.github/workflows/test_cli_cpu_openvino.yaml @@ -36,16 +36,18 @@ jobs: - name: Checkout uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install requirements run: | - pip install --upgrade pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install -e .[testing,openvino,diffusers,timm] - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and openvino" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -s -k "cli and cpu and openvino" diff --git a/.github/workflows/test_cli_cpu_py_txi.yaml 
b/.github/workflows/test_cli_cpu_py_txi.yaml index d07f6170..7b1946e7 100644 --- a/.github/workflows/test_cli_cpu_py_txi.yaml +++ b/.github/workflows/test_cli_cpu_py_txi.yaml @@ -45,7 +45,16 @@ jobs: run: | pip install --upgrade pip pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install -e .[testing,py-txi] + pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and py_txi" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: pytest tests/test_examples.py -s -k "cli and cpu and (tgi or tei)" diff --git a/.github/workflows/test_cli_cpu_pytorch.yaml b/.github/workflows/test_cli_cpu_pytorch.yaml index fef2a772..dab603c7 100644 --- a/.github/workflows/test_cli_cpu_pytorch.yaml +++ b/.github/workflows/test_cli_cpu_pytorch.yaml @@ -49,3 +49,12 @@ jobs: - name: Run tests run: pytest tests/test_cli.py -s -k "cli and cpu and pytorch" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: pytest tests/test_examples.py -s -k "cli and cpu and pytorch" diff --git a/.github/workflows/test_cli_cuda_onnxruntime.yaml b/.github/workflows/test_cli_cuda_onnxruntime.yaml index 0584665c..1351e1b0 100644 --- a/.github/workflows/test_cli_cuda_onnxruntime.yaml +++ b/.github/workflows/test_cli_cuda_onnxruntime.yaml @@ -48,3 +48,12 @@ jobs: - name: Run tests run: | pytest tests/test_cli.py -x -s -k "cli and cuda and onnxruntime" + + # no examples for now + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: pytest tests/test_examples.py -x -s -k "cli and cuda and onnxruntime" diff --git a/.github/workflows/test_cli_cuda_py_txi.yaml b/.github/workflows/test_cli_cuda_py_txi.yaml index 7339b98e..5c090b28 100644 --- a/.github/workflows/test_cli_cuda_py_txi.yaml +++ b/.github/workflows/test_cli_cuda_py_txi.yaml @@ -45,7 +45,15 @@ jobs: - name: Install requirements run: | pip install --upgrade pip - pip install -e .[testing,py-txi] + pip install -e .[testing,py-txi] git+https://github.com/IlyasMoutawwakil/py-txi.git - name: Run tests run: pytest tests/test_cli.py -x -s -k "cli and cuda and py_txi" + + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -x -s -k "cli and cuda and (tgi or tei)" diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 0bc5dfaf..2aa54d5d 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -50,6 +50,14 @@ jobs: run: | pytest tests/test_cli.py -x -s -k "cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed)" + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: pytest tests/test_examples.py -x -s -k "cli and cuda and 
pytorch" + run_cli_cuda_pytorch_multi_gpu_tests: if: ${{ (github.event_name == 'push') || diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml index acb04fe2..c75aac92 100644 --- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml @@ -50,6 +50,16 @@ jobs: run: | pytest tests/test_cli.py -x -s -k "cli and cuda and tensorrt_llm and not (tp or pp)" + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: | + huggingface-cli delete-cache + pytest tests/test_examples.py -x -s -k "cli and cuda and trt" + cli_cuda_tensorrt_llm_multi_gpu_tests: if: ${{ (github.event_name == 'push') || diff --git a/.github/workflows/test_cli_cuda_torch_ort.yaml b/.github/workflows/test_cli_cuda_torch_ort.yaml index ee886e8c..7dccafb8 100644 --- a/.github/workflows/test_cli_cuda_torch_ort.yaml +++ b/.github/workflows/test_cli_cuda_torch_ort.yaml @@ -44,13 +44,21 @@ jobs: - name: Install dependencies run: | - pip install -e .[testing,torch-ort,peft] - pip install optimum@git+https://github.com/huggingface/optimum.git + pip install -e .[testing,torch-ort,peft] optimum@git+https://github.com/huggingface/optimum.git@fxi-ort-trainer - name: Run tests run: | pytest tests/test_cli.py -x -s -k "cli and cuda and torch_ort and not (dp or ddp or device_map) and not (peft)" + # - if: ${{ + # (github.event_name == 'push') || + # (github.event_name == 'workflow_dispatch') || + # contains( github.event.pull_request.labels.*.name, 'examples') + # }} + # name: Run examples + # run: | + # pytest tests/test_examples.py -x -s -k "cli and cuda and torch_ort" + run_cli_cuda_torch_ort_multi_gpu_tests: if: ${{ (github.event_name == 'push') || @@ -75,8 +83,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[testing,torch-ort,peft] - pip install optimum@git+https://github.com/huggingface/optimum.git + pip install -e .[testing,torch-ort,peft] optimum@git+https://github.com/huggingface/optimum.git@fxi-ort-trainer - name: Run tests run: | diff --git a/.github/workflows/test_cli_cuda_vllm.yaml b/.github/workflows/test_cli_cuda_vllm.yaml index 732513d2..6072dd8c 100644 --- a/.github/workflows/test_cli_cuda_vllm.yaml +++ b/.github/workflows/test_cli_cuda_vllm.yaml @@ -50,6 +50,15 @@ jobs: run: | FORCE_SEQUENTIAL=1 pytest tests/test_cli.py -x -s -k "cli and cuda and vllm and not (tp or pp)" + - if: ${{ + (github.event_name == 'push') || + (github.event_name == 'workflow_dispatch') || + contains( github.event.pull_request.labels.*.name, 'examples') + }} + name: Run examples + run: | + pytest tests/test_examples.py -x -s -k "cli and cuda and vllm" + run_cli_cuda_vllm_multi_gpu_tests: if: ${{ (github.event_name == 'push') || diff --git a/.github/workflows/test_cli_energy_star.yaml b/.github/workflows/test_energy_star.yaml similarity index 84% rename from .github/workflows/test_cli_energy_star.yaml rename to .github/workflows/test_energy_star.yaml index 24c487f6..db9a22cd 100644 --- a/.github/workflows/test_cli_energy_star.yaml +++ b/.github/workflows/test_energy_star.yaml @@ -20,13 +20,11 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} jobs: - run_cli_energy_star_tests: + run_energy_star: if: ${{ (github.event_name == 'push') || (github.event_name == 'workflow_dispatch') || - contains( github.event.pull_request.labels.*.name, 'cli') || - 
contains( github.event.pull_request.labels.*.name, 'energy_star') || - contains( github.event.pull_request.labels.*.name, 'cli_energy_star') + contains( github.event.pull_request.labels.*.name, 'energy_star') }} runs-on: diff --git a/examples/energy_star/_base_.yaml b/energy_star/_base_.yaml similarity index 100% rename from examples/energy_star/_base_.yaml rename to energy_star/_base_.yaml diff --git a/examples/energy_star/automatic_speech_recognition.yaml b/energy_star/automatic_speech_recognition.yaml similarity index 100% rename from examples/energy_star/automatic_speech_recognition.yaml rename to energy_star/automatic_speech_recognition.yaml diff --git a/examples/energy_star/image_classification.yaml b/energy_star/image_classification.yaml similarity index 100% rename from examples/energy_star/image_classification.yaml rename to energy_star/image_classification.yaml diff --git a/examples/energy_star/image_to_text.yaml b/energy_star/image_to_text.yaml similarity index 100% rename from examples/energy_star/image_to_text.yaml rename to energy_star/image_to_text.yaml diff --git a/examples/energy_star/object_detection.yaml b/energy_star/object_detection.yaml similarity index 100% rename from examples/energy_star/object_detection.yaml rename to energy_star/object_detection.yaml diff --git a/examples/energy_star/question_answering.yaml b/energy_star/question_answering.yaml similarity index 100% rename from examples/energy_star/question_answering.yaml rename to energy_star/question_answering.yaml diff --git a/examples/energy_star/sentence_similarity.yaml b/energy_star/sentence_similarity.yaml similarity index 100% rename from examples/energy_star/sentence_similarity.yaml rename to energy_star/sentence_similarity.yaml diff --git a/examples/energy_star/summarization.yaml b/energy_star/summarization.yaml similarity index 100% rename from examples/energy_star/summarization.yaml rename to energy_star/summarization.yaml diff --git a/examples/energy_star/t5_question_answering.yaml b/energy_star/t5_question_answering.yaml similarity index 100% rename from examples/energy_star/t5_question_answering.yaml rename to energy_star/t5_question_answering.yaml diff --git a/examples/energy_star/t5_summarization.yaml b/energy_star/t5_summarization.yaml similarity index 100% rename from examples/energy_star/t5_summarization.yaml rename to energy_star/t5_summarization.yaml diff --git a/examples/energy_star/t5_text_classification.yaml b/energy_star/t5_text_classification.yaml similarity index 100% rename from examples/energy_star/t5_text_classification.yaml rename to energy_star/t5_text_classification.yaml diff --git a/examples/energy_star/t5_text_generation.yaml b/energy_star/t5_text_generation.yaml similarity index 100% rename from examples/energy_star/t5_text_generation.yaml rename to energy_star/t5_text_generation.yaml diff --git a/examples/energy_star/text_classification.yaml b/energy_star/text_classification.yaml similarity index 100% rename from examples/energy_star/text_classification.yaml rename to energy_star/text_classification.yaml diff --git a/examples/energy_star/text_generation.yaml b/energy_star/text_generation.yaml similarity index 100% rename from examples/energy_star/text_generation.yaml rename to energy_star/text_generation.yaml diff --git a/examples/energy_star/text_to_image.yaml b/energy_star/text_to_image.yaml similarity index 100% rename from examples/energy_star/text_to_image.yaml rename to energy_star/text_to_image.yaml diff --git a/examples/ipex_bert.yaml b/examples/cpu_ipex_bert.yaml 
similarity index 59% rename from examples/ipex_bert.yaml rename to examples/cpu_ipex_bert.yaml index e549da0a..0e7ed37b 100644 --- a/examples/ipex_bert.yaml +++ b/examples/cpu_ipex_bert.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: ipex_bert +name: cpu_ipex_bert launcher: numactl: true @@ -14,16 +14,17 @@ launcher: cpunodebind: 0 membind: 0 +backend: + device: cpu + export: true + no_weights: false # because on multi-node machines, initializing weights could harm performance + torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + model: google-bert/bert-base-uncased + scenario: - latency: true memory: true + latency: true + input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: cpu - no_weights: false - export: true - torch_dtype: bfloat16 - model: bert-base-uncased diff --git a/examples/ipex_llama.yaml b/examples/cpu_ipex_llama.yaml similarity index 66% rename from examples/ipex_llama.yaml rename to examples/cpu_ipex_llama.yaml index b564316b..898ed0df 100644 --- a/examples/ipex_llama.yaml +++ b/examples/cpu_ipex_llama.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: ipex_llama +name: cpu_ipex_llama launcher: numactl: true @@ -14,24 +14,21 @@ launcher: cpunodebind: 0 membind: 0 +backend: + device: cpu + export: true + no_weights: false # because on multi-node machines, initializing weights could harm performance + torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 + scenario: - latency: true memory: true + latency: true - warmup_runs: 10 - iterations: 10 - duration: 10 - input_shapes: batch_size: 1 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: max_new_tokens: 32 min_new_tokens: 32 - -backend: - device: cpu - export: true - no_weights: false - torch_dtype: bfloat16 - model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 diff --git a/examples/llama_cpp_embedding.yaml b/examples/cpu_llama_cpp_embedding.yaml similarity index 57% rename from examples/llama_cpp_embedding.yaml rename to examples/cpu_llama_cpp_embedding.yaml index bdd86cce..666277c6 100644 --- a/examples/llama_cpp_embedding.yaml +++ b/examples/cpu_llama_cpp_embedding.yaml @@ -1,26 +1,24 @@ defaults: - benchmark - scenario: inference - - launcher: inline - backend: llama_cpp + - launcher: process - _base_ - _self_ -name: llama_cpp_llama +name: cpu_llama_cpp_embedding backend: - device: mps - model: nomic-ai/nomic-embed-text-v1.5-GGUF + device: cpu task: feature-extraction + model: nomic-ai/nomic-embed-text-v1.5-GGUF filename: nomic-embed-text-v1.5.Q4_0.gguf scenario: input_shapes: batch_size: 1 - sequence_length: 256 - vocab_size: 30000 - type_vocab_size: 1 - max_position_embeddings: 512 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/llama_cpp_text_generation.yaml b/examples/cpu_llama_cpp_text_generation.yaml similarity index 61% rename from examples/llama_cpp_text_generation.yaml rename to examples/cpu_llama_cpp_text_generation.yaml index 96def950..2cd55514 100644 --- a/examples/llama_cpp_text_generation.yaml +++ b/examples/cpu_llama_cpp_text_generation.yaml @@ -1,25 +1,23 @@ defaults: - benchmark - scenario: inference - - launcher: inline - backend: llama_cpp + - launcher: process - _base_ - _self_ -name: llama_cpp_llama +name: cpu_llama_cpp_text_generation backend: - device: mps - model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF + device: cpu task: text-generation + model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF filename:
tinyllama-1.1b-chat-v1.0.Q4_0.gguf - scenario: + memory: true + latency: true + input_shapes: batch_size: 1 - sequence_length: 256 - vocab_size: 32000 - generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + sequence_length: 128 diff --git a/examples/onnxruntime_static_quant_vit.yaml b/examples/cpu_onnxruntime_static_quant_vit.yaml similarity index 70% rename from examples/onnxruntime_static_quant_vit.yaml rename to examples/cpu_onnxruntime_static_quant_vit.yaml index 3d298473..97591bcd 100644 --- a/examples/onnxruntime_static_quant_vit.yaml +++ b/examples/cpu_onnxruntime_static_quant_vit.yaml @@ -6,10 +6,11 @@ defaults: - _base_ - _self_ -name: onnxruntime_static_quant_vit +name: cpu_onnxruntime_static_quant_vit backend: device: cpu + export: true no_weights: true model: google/vit-base-patch16-224 quantization: true @@ -17,3 +18,9 @@ backend: is_static: true per_channel: false calibration: true + +scenario: + memory: true + latency: true + input_shapes: + batch_size: 2 diff --git a/examples/onnxruntime_timm.yaml b/examples/cpu_onnxruntime_timm.yaml similarity index 82% rename from examples/onnxruntime_timm.yaml rename to examples/cpu_onnxruntime_timm.yaml index 165fc28a..963f44f0 100644 --- a/examples/onnxruntime_timm.yaml +++ b/examples/cpu_onnxruntime_timm.yaml @@ -10,7 +10,8 @@ name: onnxruntime_timm backend: device: cpu - model: timm/mobilenetv3_large_100.ra_in1k + export: true + model: timm/tiny_vit_21m_224.in1k scenario: memory: true diff --git a/examples/numactl_bert.yaml b/examples/cpu_openvino_8bit_bert.yaml similarity index 57% rename from examples/numactl_bert.yaml rename to examples/cpu_openvino_8bit_bert.yaml index 7add65e7..73ef474d 100644 --- a/examples/numactl_bert.yaml +++ b/examples/cpu_openvino_8bit_bert.yaml @@ -1,27 +1,24 @@ defaults: - benchmark - scenario: inference + - backend: openvino - launcher: process - - backend: pytorch - _base_ - _self_ -name: pytorch_bert +name: openvino_static_quant -launcher: - numactl: true - numactl_kwargs: - cpunodebind: 0 - membind: 0 +backend: + device: cpu + reshape: true + no_weights: true + load_in_8bit: false # enable 8bit on compatible Intel CPU machines + model: google-bert/bert-base-uncased scenario: - latency: true memory: true + latency: true + input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: cpu - no_weights: true - model: bert-base-uncased diff --git a/examples/openvino_diffusion.yaml b/examples/cpu_openvino_diffusion.yaml similarity index 78% rename from examples/openvino_diffusion.yaml rename to examples/cpu_openvino_diffusion.yaml index f0501101..30d21935 100644 --- a/examples/openvino_diffusion.yaml +++ b/examples/cpu_openvino_diffusion.yaml @@ -10,10 +10,9 @@ name: openvino_diffusion backend: device: cpu - model: stabilityai/stable-diffusion-2-1 - reshape: true export: true - half: true + model: stabilityai/stable-diffusion-2-1 + half: false # enable half-precision on compatible Intel CPU machines scenario: input_shapes: diff --git a/examples/pytorch_bert.py b/examples/cuda_pytorch_bert.py similarity index 59% rename from examples/pytorch_bert.py rename to examples/cuda_pytorch_bert.py index 09f62b8d..2a7ddf89 100644 --- a/examples/pytorch_bert.py +++ b/examples/cuda_pytorch_bert.py @@ -1,22 +1,20 @@ import os -from huggingface_hub import whoami - from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig from optimum_benchmark.logging_utils import setup_logging -try: - USERNAME = whoami()["name"] -except Exception as e: - print(f"Failed to 
get username from Hugging Face Hub: {e}") - USERNAME = None +BENCHMARK_NAME = "cuda_pytorch_bert" +MODEL = "google-bert/bert-base-uncased" +PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", None) -BENCHMARK_NAME = "pytorch_bert" +if __name__ == "__main__": + level = os.environ.get("LOG_LEVEL", "INFO") + to_file = os.environ.get("LOG_TO_FILE", "0") == "1" + setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS") -def run_benchmark(): launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") - backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model="bert-base-uncased") + backend_config = PyTorchConfig(device="cuda", device_ids="0", no_weights=True, model=MODEL) scenario_config = InferenceConfig(memory=True, latency=True, input_shapes={"batch_size": 1, "sequence_length": 128}) benchmark_config = BenchmarkConfig( name=BENCHMARK_NAME, @@ -27,19 +25,9 @@ def run_benchmark(): log_report=True, ) benchmark_report = Benchmark.launch(benchmark_config) - - return benchmark_config, benchmark_report - - -if __name__ == "__main__": - level = os.environ.get("LOG_LEVEL", "INFO") - to_file = os.environ.get("LOG_TO_FILE", "0") == "1" - setup_logging(level=level, to_file=to_file, prefix="MAIN-PROCESS") - - benchmark_config, benchmark_report = run_benchmark() benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - if USERNAME is not None: - benchmark_config.push_to_hub(repo_id=f"{USERNAME}/benchmarks", subfolder=BENCHMARK_NAME) - benchmark_report.push_to_hub(repo_id=f"{USERNAME}/benchmarks", subfolder=BENCHMARK_NAME) - benchmark.push_to_hub(repo_id=f"{USERNAME}/benchmarks", subfolder=BENCHMARK_NAME) + if PUSH_REPO_ID is not None: + benchmark_config.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME) + benchmark_report.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME) + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME) diff --git a/examples/pytorch_bert.yaml b/examples/cuda_pytorch_bert.yaml similarity index 90% rename from examples/pytorch_bert.yaml rename to examples/cuda_pytorch_bert.yaml index 8bb702ca..8ab9b5cb 100644 --- a/examples/pytorch_bert.yaml +++ b/examples/cuda_pytorch_bert.yaml @@ -12,15 +12,16 @@ launcher: device_isolation: true device_isolation_action: warn +backend: + device: cuda + device_ids: 0 + no_weights: true + model: google-bert/bert-base-uncased + scenario: - latency: true memory: true + latency: true + input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: cuda - device_ids: 0 - no_weights: true - model: bert-base-uncased diff --git a/examples/trt_llama.yaml b/examples/cuda_pytorch_llama.yaml similarity index 70% rename from examples/trt_llama.yaml rename to examples/cuda_pytorch_llama.yaml index 30cb600a..1f85bd10 100644 --- a/examples/trt_llama.yaml +++ b/examples/cuda_pytorch_llama.yaml @@ -1,12 +1,12 @@ defaults: - benchmark - - backend: tensorrt-llm - scenario: inference - launcher: process + - backend: pytorch - _base_ - _self_ -name: trt_llama +name: cuda_pytorch_llama launcher: device_isolation: true @@ -16,12 +16,14 @@ backend: device: cuda device_ids: 0 no_weights: true + torch_dtype: float16 model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: input_shapes: batch_size: 4 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_llama.py b/examples/cuda_pytorch_llama_quants.py similarity index 81% rename from 
examples/pytorch_llama.py rename to examples/cuda_pytorch_llama_quants.py index fe732bfa..01d492cb 100644 --- a/examples/pytorch_llama.py +++ b/examples/cuda_pytorch_llama_quants.py @@ -1,17 +1,11 @@ import os -from huggingface_hub import whoami - from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig from optimum_benchmark.logging_utils import setup_logging -try: - USERNAME = whoami()["name"] -except Exception as e: - print(f"Failed to get username from Hugging Face Hub: {e}") - USERNAME = None - -BENCHMARK_NAME = "pytorch-llama" +BENCHMARK_NAME = "cuda_pytorch_llama" +MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +PUSH_REPO_ID = os.environ.get("PUSH_REPO_ID", None) WEIGHTS_CONFIGS = { "float16": { @@ -40,10 +34,10 @@ def run_benchmark(weight_config: str): launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="warn") backend_config = PyTorchConfig( + model=MODEL, device="cuda", device_ids="0", no_weights=True, - model="gpt2", **WEIGHTS_CONFIGS[weight_config], ) scenario_config = InferenceConfig( @@ -52,7 +46,7 @@ def run_benchmark(weight_config: str): duration=10, iterations=10, warmup_runs=10, - input_shapes={"batch_size": 1, "sequence_length": 128}, + input_shapes={"batch_size": 1, "sequence_length": 64}, generate_kwargs={"max_new_tokens": 32, "min_new_tokens": 32}, ) benchmark_config = BenchmarkConfig( @@ -77,7 +71,5 @@ def run_benchmark(weight_config: str): benchmark_config, benchmark_report = run_benchmark(weight_config) benchmark = Benchmark(config=benchmark_config, report=benchmark_report) - if USERNAME is not None: - benchmark.push_to_hub( - repo_id=f"{USERNAME}/benchmarks", filename=f"{weight_config}.json", subfolder=BENCHMARK_NAME - ) + if PUSH_REPO_ID is not None: + benchmark.push_to_hub(repo_id=PUSH_REPO_ID, subfolder=BENCHMARK_NAME, filename=f"{weight_config}.json") diff --git a/examples/pytorch_vlm.yaml b/examples/cuda_pytorch_vlm.yaml similarity index 92% rename from examples/pytorch_vlm.yaml rename to examples/cuda_pytorch_vlm.yaml index a39f8c8a..8f1e0f3c 100644 --- a/examples/pytorch_vlm.yaml +++ b/examples/cuda_pytorch_vlm.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: pytorch_vlm +name: cuda_pytorch_vlm launcher: device_isolation: true @@ -30,7 +30,7 @@ scenario: input_shapes: # text batch_size: 1 - sequence_length: 256 + sequence_length: 64 # image num_images: 2 num_channels: 3 diff --git a/examples/tgi_llama.yaml b/examples/cuda_tgi_llama.yaml similarity index 63% rename from examples/tgi_llama.yaml rename to examples/cuda_tgi_llama.yaml index 399667fb..297403c8 100644 --- a/examples/tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: tgi_llama +name: cuda_tgi_llama launcher: device_isolation: true @@ -14,14 +14,15 @@ launcher: backend: device: cuda - device_ids: 4 - # no_weights: true + device_ids: 0 + cuda_graphs: 0 # remove for better perf but bigger memory footprint model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: input_shapes: batch_size: 4 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_llama.yaml b/examples/cuda_trt_llama.yaml similarity index 56% rename from examples/pytorch_llama.yaml rename to examples/cuda_trt_llama.yaml index becd1f2e..c483fc2f 100644 --- a/examples/pytorch_llama.yaml +++ b/examples/cuda_trt_llama.yaml @@ -1,33 +1,30 @@ defaults: - benchmark + - backend: tensorrt-llm - 
scenario: inference - launcher: process - - backend: pytorch - _base_ - _self_ -name: pytorch_llama +name: cuda_trt_llama launcher: device_isolation: true device_isolation_action: warn backend: - model: gpt2 device: cuda - torch_dtype: float16 + device_ids: 0 + max_batch_size: 4 + max_new_tokens: 32 + max_prompt_length: 64 + model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: - memory: true - latency: true - - warmup_runs: 10 - iterations: 10 - duration: 10 - input_shapes: - batch_size: 1 - sequence_length: 256 + batch_size: 4 + sequence_length: 64 + generate_kwargs: max_new_tokens: 32 min_new_tokens: 32 diff --git a/examples/vllm_llama.yaml b/examples/cuda_vllm_llama.yaml similarity index 62% rename from examples/vllm_llama.yaml rename to examples/cuda_vllm_llama.yaml index 8bbb4025..5ec4b5a8 100644 --- a/examples/vllm_llama.yaml +++ b/examples/cuda_vllm_llama.yaml @@ -6,7 +6,7 @@ defaults: - _base_ - _self_ -name: vllm_llama +name: cuda_vllm_llama launcher: device_isolation: true @@ -15,16 +15,16 @@ launcher: backend: device: cuda device_ids: 0 - no_weights: false - serving_mode: offline + serving_mode: online # server-like model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 engine_args: - enforce_eager: true + enforce_eager: true # remove for better perf but bigger memory footprint scenario: input_shapes: batch_size: 4 - sequence_length: 256 + sequence_length: 64 + generate_kwargs: - max_new_tokens: 100 - min_new_tokens: 100 + max_new_tokens: 32 + min_new_tokens: 32 diff --git a/examples/pytorch_bert_mps.yaml b/examples/mps_pytorch_bert.yaml similarity index 67% rename from examples/pytorch_bert_mps.yaml rename to examples/mps_pytorch_bert.yaml index 4d4dc6e3..27368eb1 100644 --- a/examples/pytorch_bert_mps.yaml +++ b/examples/mps_pytorch_bert.yaml @@ -1,15 +1,12 @@ defaults: - benchmark - scenario: inference - - launcher: process # launcher: inline works, + - launcher: inline # mps fails with python multi-processing for some reason - backend: pytorch - _base_ - _self_ -name: pytorch_bert - -# launcher: -# start_method: spawn +name: mps_pytorch_bert scenario: latency: true @@ -19,8 +16,6 @@ scenario: sequence_length: 128 backend: - device: cpu + device: mps no_weights: true model: bert-base-uncased - - diff --git a/examples/neural_compressor_ptq_bert.yaml b/examples/neural_compressor_ptq_bert.yaml deleted file mode 100644 index cbc32590..00000000 --- a/examples/neural_compressor_ptq_bert.yaml +++ /dev/null @@ -1,20 +0,0 @@ -defaults: - - benchmark - - backend: neural-compressor - - scenario: inference - - launcher: process - - _base_ - - _self_ - -name: neural_compressor_ptq_bert - -backend: - device: cpu - no_weights: true - model: bert-base-uncased - ptq_quantization: true - calibration: true - -scenario: - input_shapes: - batch_size: 1 diff --git a/examples/openvino_static_quant_bert.yaml b/examples/openvino_static_quant_bert.yaml deleted file mode 100644 index caa4363a..00000000 --- a/examples/openvino_static_quant_bert.yaml +++ /dev/null @@ -1,21 +0,0 @@ -defaults: - - benchmark - - scenario: inference - - backend: openvino - - launcher: process - - _base_ - - _self_ - -name: openvino_static_quant_bert - -backend: - device: cpu - no_weights: true - model: bert-base-uncased - quantization: true - calibration: true - reshape: true - -scenario: - input_shapes: - batch_size: 1 diff --git a/examples/tei_bge.yaml b/examples/tei_bge.yaml deleted file mode 100644 index dbbab7d5..00000000 --- a/examples/tei_bge.yaml +++ /dev/null @@ -1,21 +0,0 @@ -defaults: - - benchmark - - scenario: inference - - 
launcher: inline - - backend: py-txi - - _self_ - -name: tei_bert - -launcher: - device_isolation: true - device_isolation_action: warn - -backend: - device: cpu - model: BAAI/bge-base-en-v1.5 - -scenario: - input_shapes: - batch_size: 64 - sequence_length: 128 diff --git a/optimum_benchmark/__init__.py b/optimum_benchmark/__init__.py index 313fb22a..7be4c4c3 100644 --- a/optimum_benchmark/__init__.py +++ b/optimum_benchmark/__init__.py @@ -1,9 +1,7 @@ from .backends import ( BackendConfig, - INCConfig, IPEXConfig, LlamaCppConfig, - LLMSwarmConfig, ORTConfig, OVConfig, PyTorchConfig, @@ -26,10 +24,8 @@ "EnergyStarConfig", "InferenceConfig", "IPEXConfig", - "INCConfig", "InlineConfig", "LauncherConfig", - "LLMSwarmConfig", "ORTConfig", "OVConfig", "ProcessConfig", diff --git a/optimum_benchmark/backends/__init__.py b/optimum_benchmark/backends/__init__.py index ec146f0b..2019270a 100644 --- a/optimum_benchmark/backends/__init__.py +++ b/optimum_benchmark/backends/__init__.py @@ -1,8 +1,6 @@ from .config import BackendConfig from .ipex.config import IPEXConfig from .llama_cpp.config import LlamaCppConfig -from .llm_swarm.config import LLMSwarmConfig -from .neural_compressor.config import INCConfig from .onnxruntime.config import ORTConfig from .openvino.config import OVConfig from .py_txi.config import PyTXIConfig @@ -18,9 +16,7 @@ "OVConfig", "TorchORTConfig", "TRTLLMConfig", - "INCConfig", "PyTXIConfig", - "LLMSwarmConfig", "BackendConfig", "VLLMConfig", "LlamaCppConfig", diff --git a/optimum_benchmark/backends/llm_swarm/__init__.py b/optimum_benchmark/backends/llm_swarm/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/optimum_benchmark/backends/llm_swarm/backend.py b/optimum_benchmark/backends/llm_swarm/backend.py deleted file mode 100644 index 8139e4ea..00000000 --- a/optimum_benchmark/backends/llm_swarm/backend.py +++ /dev/null @@ -1,85 +0,0 @@ -import asyncio -from typing import Any, Dict, List - -import torch -from huggingface_hub import AsyncInferenceClient -from llm_swarm import LLMSwarm -from llm_swarm import LLMSwarmConfig as LLMSwarmCfg - -from ...task_utils import TEXT_GENERATION_TASKS -from ..base import Backend -from .config import LLMSwarmConfig - - -class LLMSwarmBackend(Backend[LLMSwarmConfig]): - NAME: str = "llm-swarm" - - def __init__(self, config: LLMSwarmConfig) -> None: - super().__init__(config) - - if self.config.task not in TEXT_GENERATION_TASKS: - raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}") - - def load(self) -> None: - self.logger.info("\t+ Downloading pretrained model") - self.download_pretrained_model() - self.logger.info("\t+ Preparing generation config") - self.prepare_generation_config() - self.logger.info("\t+ Loading pretrained model") - self.load_model_from_pretrained() - - def load_model_from_pretrained(self) -> None: - self.llm_swarm_config = LLMSwarmCfg( - gpus=self.config.gpus, - model=self.config.model, - instances=self.config.instances, - inference_engine=self.config.inference_engine, - slurm_template_path=self.config.slurm_template_path, - load_balancer_template_path=self.config.load_balancer_template_path, - per_instance_max_parallel_requests=self.config.per_instance_max_parallel_requests, - revision=self.config.model_kwargs.get("revision", "main"), - debug_endpoint=self.config.debug_endpoint, - ) - self.llm_swarm = LLMSwarm(self.llm_swarm_config).__enter__() - self.client = AsyncInferenceClient(self.llm_swarm.endpoint) - - def download_pretrained_model(self) -> None: - with 
torch.device("meta"): - self.auto_model_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def prepare_generation_config(self) -> None: - self.generation_config.eos_token_id = -100 - self.generation_config.pad_token_id = -100 - - model_cache_folder = f"models/{self.config.model}".replace("/", "--") - model_cache_path = f"{self.config.volume}/{model_cache_folder}" - snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" - snapshot_ref = open(snapshot_file, "r").read().strip() - model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving new pretrained generation config") - self.generation_config.save_pretrained(save_directory=model_snapshot_path) - - def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - if "inputs" in inputs: - inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["inputs"].tolist())} - elif "input_ids" in inputs: - inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())} - else: - raise ValueError("inputs must contain either input_ids or inputs") - - return inputs - - async def single_client_call(self, prompt: str, kwargs: Dict[str, Any]) -> str: - return await self.client.text_generation(prompt, max_new_tokens=kwargs.get("max_new_tokens", 1)) - - async def batch_client_call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return await asyncio.gather(*(self.single_client_call(p, kwargs) for p in inputs["prompt"])) - - def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return asyncio.run(self.batch_client_call(inputs, kwargs)) - - def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return asyncio.run(self.batch_client_call(inputs, kwargs)) - - def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]: - return asyncio.run(self.batch_client_call(inputs, kwargs)) diff --git a/optimum_benchmark/backends/llm_swarm/config.py b/optimum_benchmark/backends/llm_swarm/config.py deleted file mode 100644 index 745cdd3f..00000000 --- a/optimum_benchmark/backends/llm_swarm/config.py +++ /dev/null @@ -1,31 +0,0 @@ -from dataclasses import dataclass -from typing import Optional - -from ...import_utils import llm_swarm_version -from ..config import BackendConfig - - -@dataclass -class LLMSwarmConfig(BackendConfig): - name: str = "llm-swarm" - version: Optional[str] = llm_swarm_version() - _target_: str = "optimum_benchmark.backends.llm_swarm.backend.LLMSwarmBackend" - - # optimum benchmark specific - no_weights: bool = False - - # llm-swarm specific - gpus: int = 8 - instances: int = 1 - inference_engine: str = "tgi" - volume: str = "/fsx/ilyas/.cache" - per_instance_max_parallel_requests: int = 500 - slurm_template_path: str = "/fsx/ilyas/swarm-templates/tgi_h100.template.slurm" - load_balancer_template_path: str = "/fsx/ilyas/swarm-templates/nginx.template.conf" - debug_endpoint: Optional[str] = None - - def __post_init__(self): - super().__post_init__() - - # so that downloaded artifacts are stored in the same place - self.hub_kwargs["cache_dir"] = self.volume diff --git a/optimum_benchmark/backends/neural_compressor/__init__.py b/optimum_benchmark/backends/neural_compressor/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py deleted file mode 100644 index c180a5ba..00000000 --- 
a/optimum_benchmark/backends/neural_compressor/backend.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -from collections import OrderedDict -from tempfile import TemporaryDirectory -from typing import Any, Dict - -import torch -from hydra.utils import get_class -from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion -from optimum.intel.neural_compressor.quantization import INCQuantizer - -from ...generators.dataset_generator import DatasetGenerator -from ..base import Backend -from ..transformers_utils import fast_weights_init -from .config import INCConfig -from .utils import TASKS_TO_INCMODELS - - -class INCBackend(Backend[INCConfig]): - NAME: str = "neural-compressor" - - def __init__(self, config: INCConfig): - super().__init__(config) - - if self.config.task in TASKS_TO_INCMODELS: - self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task]) - self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}") - else: - raise NotImplementedError(f"INCBackend does not support task {self.config.task}") - - def load(self) -> None: - self.logger.info("\t+ Creating backend temporary directory") - self.tmpdir = TemporaryDirectory() - - if self.config.ptq_quantization: - if self.config.no_weights: - self.logger.info("\t+ Creating no weights AutoModel") - self.create_no_weights_model() - self.logger.info("\t+ Loading no weights AutoModel") - self.load_automodel_with_no_weights() - else: - self.logger.info("\t+ Loading pretrained AutoModel") - self.load_automodel_from_pretrained() - self.logger.info("\t+ Applying post-training quantization") - self.quantize_automodel() - self.logger.info("\t+ Loading quantized INCModel") - original_model, self.config.model = self.config.model, self.quantized_model - self.load_incmodel_from_pretrained() - self.config.model = original_model - elif self.config.no_weights: - self.logger.info("\t+ Creating no weights INCModel") - self.create_no_weights_model() - self.logger.info("\t+ Loading no weights INCModel") - self.load_incmodel_with_no_weights() - else: - self.logger.info("\t+ Loading pretrained INCModel") - self.load_incmodel_from_pretrained() - - self.tmpdir.cleanup() - - def load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self.load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model - - def load_incmodel_from_pretrained(self) -> None: - self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.model_kwargs) - - def load_incmodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self.load_incmodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.model.tie_weights() - - self.config.model = original_model - - def create_no_weights_model(self) -> None: - self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") - self.logger.info("\t+ Creating no weights model directory") - os.makedirs(self.no_weights_model, exist_ok=True) - self.logger.info("\t+ Creating no weights model state dict") - state_dict = torch.nn.Linear(1, 1).state_dict() - self.logger.info("\t+ 
Saving no weights model pytorch_model.bin") - torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin")) - self.logger.info("\t+ Saving no weights model pretrained config") - self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - - def quantize_automodel(self) -> None: - self.quantized_model = f"{self.tmpdir.name}/quantized_model" - self.logger.info("\t+ Processing quantization config") - ptq_quantization_config = self.config.ptq_quantization_config.copy() - ptq_quantization_config["accuracy_criterion"] = AccuracyCriterion( - **ptq_quantization_config["accuracy_criterion"] - ) - ptq_quantization_config["tuning_criterion"] = TuningCriterion(**ptq_quantization_config["tuning_criterion"]) - ptq_quantization_config = PostTrainingQuantConfig(**ptq_quantization_config) - self.logger.info("\t+ Creating quantizer") - quantizer = INCQuantizer.from_pretrained( - model=self.pretrained_model, - task=self.config.task, - seed=self.config.seed, - # TODO: add support for these - calibration_fn=None, - eval_fn=None, - ) - - if self.config.calibration: - self.logger.info("\t+ Generating calibration dataset") - dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} - calibration_dataset = DatasetGenerator( - task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes - )() - columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns)) - calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) - else: - calibration_dataset = None - - self.logger.info("\t+ Quantizing model") - quantizer.quantize( - save_directory=self.quantized_model, - calibration_dataset=calibration_dataset, - quantization_config=ptq_quantization_config, - # TODO: add support for these - remove_unused_columns=True, - data_collator=None, - file_name=None, - batch_size=1, - ) - - @torch.inference_mode() - def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: - return self.pretrained_model(**input, **kwargs) - - @torch.inference_mode() - def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: - return self.pretrained_model.generate(**inputs, **kwargs) - - @torch.inference_mode() - def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: - return self.pretrained_model.generate(**input, **kwargs) diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py deleted file mode 100644 index 8aea5964..00000000 --- a/optimum_benchmark/backends/neural_compressor/config.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataclasses import dataclass, field -from typing import Any, Dict, Optional - -from omegaconf import OmegaConf - -from ...import_utils import neural_compressor_version -from ..config import BackendConfig - -# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490 -ACCURACY_CRITERION_CONFIG = {"higher_is_better": True, "criterion": "relative", "tolerable_loss": 0.01} - -# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593 -TUNING_CRITERION_CONFIG = { - "strategy": "basic", - "strategy_kwargs": None, - "timeout": 0, - "max_trials": 100, - "objective": "performance", -} - -# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L1242 -PTQ_QUANTIZATION_CONFIG = { - "device": "cpu", - "backend": "default", - "domain": "auto", - "recipes": {}, - 
"quant_format": "default", - "inputs": [], - "outputs": [], - "approach": "static", - "calibration_sampling_size": [100], - "op_type_dict": None, - "op_name_dict": None, - "reduce_range": None, - "example_inputs": None, - "excluded_precisions": [], - "quant_level": "auto", - "accuracy_criterion": ACCURACY_CRITERION_CONFIG, - "tuning_criterion": TUNING_CRITERION_CONFIG, -} - - -@dataclass -class INCConfig(BackendConfig): - name: str = "neural-compressor" - version: Optional[str] = neural_compressor_version() - _target_: str = "optimum_benchmark.backends.neural_compressor.backend.INCBackend" - - # load options - no_weights: bool = False - - # post-training quantization options - ptq_quantization: bool = False - ptq_quantization_config: Dict[str, Any] = field(default_factory=dict) - - # calibration options - calibration: bool = False - calibration_config: Dict[str, Any] = field(default_factory=dict) - - def __post_init__(self): - super().__post_init__() - - if self.device != "cpu": - raise ValueError(f"INCBackend only supports CPU devices, got {self.device}") - - if self.ptq_quantization: - self.ptq_quantization_config = OmegaConf.to_object( - OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config) - ) - if self.ptq_quantization_config["approach"] == "static" and not self.calibration: - raise ValueError("Calibration must be enabled when using static quantization.") diff --git a/optimum_benchmark/backends/neural_compressor/utils.py b/optimum_benchmark/backends/neural_compressor/utils.py deleted file mode 100644 index beb99977..00000000 --- a/optimum_benchmark/backends/neural_compressor/utils.py +++ /dev/null @@ -1,5 +0,0 @@ -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS - -TASKS_TO_INCMODELS = { - task: f"optimum.intel.neural_compressor.{incmodel_name}" for task, incmodel_name in _HEAD_TO_AUTOMODELS.items() -} diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index e42161e6..73b75b75 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -50,7 +50,7 @@ class PyTXIConfig(BackendConfig): quantize: Optional[str] = None num_shard: Optional[int] = None speculate: Optional[int] = None - cuda_graphs: Optional[bool] = None + cuda_graphs: Optional[int] = None disable_custom_kernels: Optional[bool] = None trust_remote_code: Optional[bool] = None diff --git a/optimum_benchmark/cli.py b/optimum_benchmark/cli.py index 4b26266b..5af0723b 100644 --- a/optimum_benchmark/cli.py +++ b/optimum_benchmark/cli.py @@ -10,12 +10,10 @@ Benchmark, BenchmarkConfig, EnergyStarConfig, - INCConfig, InferenceConfig, InlineConfig, IPEXConfig, LlamaCppConfig, - LLMSwarmConfig, ORTConfig, OVConfig, ProcessConfig, @@ -43,9 +41,7 @@ cs.store(group="backend", name=ORTConfig.name, node=ORTConfig) cs.store(group="backend", name=TorchORTConfig.name, node=TorchORTConfig) cs.store(group="backend", name=TRTLLMConfig.name, node=TRTLLMConfig) -cs.store(group="backend", name=INCConfig.name, node=INCConfig) cs.store(group="backend", name=PyTXIConfig.name, node=PyTXIConfig) -cs.store(group="backend", name=LLMSwarmConfig.name, node=LLMSwarmConfig) cs.store(group="backend", name=VLLMConfig.name, node=VLLMConfig) cs.store(group="backend", name=LlamaCppConfig.name, node=LlamaCppConfig) # scenarios configurations diff --git a/setup.py b/setup.py index 03bbdf07..46a1ed60 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ "py-txi": ["py-txi"], "vllm": ["vllm"], # optional dependencies + "torchao": 
["torchao"], "autoawq": ["autoawq"], "auto-gptq": ["optimum", "auto-gptq"], "sentence-transformers": ["sentence-transformers"], diff --git a/tests/test_energy_star.py b/tests/test_energy_star.py index bbb83f55..f2520932 100644 --- a/tests/test_energy_star.py +++ b/tests/test_energy_star.py @@ -9,12 +9,16 @@ LOGGER = getLogger("test-cli") -TEST_CONFIG_DIR = Path(__file__).parent.parent / "examples/energy_star" +TEST_CONFIG_DIR = Path(__file__).parent.parent / "energy_star" + TEST_CONFIG_NAMES = [ config.split(".")[0] for config in os.listdir(TEST_CONFIG_DIR) if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) ] +TEST_SCRIPT_PATHS = [ + str(TEST_CONFIG_DIR / filename) for filename in os.listdir(TEST_CONFIG_DIR) if filename.endswith(".py") +] ROCR_VISIBLE_DEVICES = os.environ.get("ROCR_VISIBLE_DEVICES", None) CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) @@ -42,3 +46,11 @@ def test_cli_configs(config_name): popen = run_subprocess_and_log_stream_output(LOGGER, args) assert popen.returncode == 0, f"Failed to run {config_name}" + + +@pytest.mark.parametrize("script_path", TEST_SCRIPT_PATHS) +def test_api_scripts(script_path): + args = ["python", script_path] + + popen = run_subprocess_and_log_stream_output(LOGGER, args) + assert popen.returncode == 0, f"Failed to run {script_path}" diff --git a/tests/test_examples.py b/tests/test_examples.py new file mode 100644 index 00000000..13cf3cff --- /dev/null +++ b/tests/test_examples.py @@ -0,0 +1,46 @@ +import os +from logging import getLogger +from pathlib import Path + +import pytest + +from optimum_benchmark.logging_utils import run_subprocess_and_log_stream_output + +LOGGER = getLogger("test-examples") + + +TEST_CONFIG_DIR = Path(__file__).parent.parent / "examples" + +TEST_CONFIG_NAMES = [ + config.split(".")[0] + for config in os.listdir(TEST_CONFIG_DIR) + if config.endswith(".yaml") and not (config.startswith("_") or config.endswith("_")) +] + +TEST_SCRIPT_PATHS = [ + str(TEST_CONFIG_DIR / filename) for filename in os.listdir(TEST_CONFIG_DIR) if filename.endswith(".py") +] + +ROCR_VISIBLE_DEVICES = os.environ.get("ROCR_VISIBLE_DEVICES", None) +CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + +@pytest.mark.parametrize("config_name", TEST_CONFIG_NAMES) +def test_cli_configs(config_name): + args = ["optimum-benchmark", "--config-dir", TEST_CONFIG_DIR, "--config-name", config_name] + + if ROCR_VISIBLE_DEVICES is not None: + args += [f'backend.device_ids="{ROCR_VISIBLE_DEVICES}"'] + elif CUDA_VISIBLE_DEVICES is not None: + args += [f'backend.device_ids="{CUDA_VISIBLE_DEVICES}"'] + + popen = run_subprocess_and_log_stream_output(LOGGER, args) + assert popen.returncode == 0, f"Failed to run {config_name}" + + +@pytest.mark.parametrize("script_path", TEST_SCRIPT_PATHS) +def test_api_scripts(script_path): + args = ["python", script_path] + + popen = run_subprocess_and_log_stream_output(LOGGER, args) + assert popen.returncode == 0, f"Failed to run {script_path}"