From 1fa8c70f1c5a866f2fe0d5bc06c289a60d39b3ab Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 01:04:29 -0700 Subject: [PATCH 01/45] Remove existing GH workflows --- .github/dependabot.yml | 35 --- .github/lock.yml | 32 --- .github/workflows/freebsd.yml | 34 --- .github/workflows/i386.yml | 43 ---- .github/workflows/jvm_tests.yml | 100 -------- .github/workflows/main.yml | 193 --------------- .github/workflows/python_tests.yml | 348 ---------------------------- .github/workflows/python_wheels.yml | 55 ----- .github/workflows/r_nold.yml | 44 ---- .github/workflows/r_tests.yml | 150 ------------ .github/workflows/scorecards.yml | 54 ----- .github/workflows/update_rapids.yml | 44 ---- 12 files changed, 1132 deletions(-) delete mode 100644 .github/dependabot.yml delete mode 100644 .github/lock.yml delete mode 100644 .github/workflows/freebsd.yml delete mode 100644 .github/workflows/i386.yml delete mode 100644 .github/workflows/jvm_tests.yml delete mode 100644 .github/workflows/main.yml delete mode 100644 .github/workflows/python_tests.yml delete mode 100644 .github/workflows/python_wheels.yml delete mode 100644 .github/workflows/r_nold.yml delete mode 100644 .github/workflows/r_tests.yml delete mode 100644 .github/workflows/scorecards.yml delete mode 100644 .github/workflows/update_rapids.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 1a8098071ba3..000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,35 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. -# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "maven" - directory: "/jvm-packages" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-example" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "github-actions" - directory: / - schedule: - interval: "monthly" diff --git a/.github/lock.yml b/.github/lock.yml deleted file mode 100644 index f916abe5a367..000000000000 --- a/.github/lock.yml +++ /dev/null @@ -1,32 +0,0 @@ -# Configuration for lock-threads - https://github.com/dessant/lock-threads - -# Number of days of inactivity before a closed issue or pull request is locked -daysUntilLock: 90 - -# Issues and pull requests with these labels will not be locked. Set to `[]` to disable -exemptLabels: - - feature-request - -# Label to add before locking, such as `outdated`. Set to `false` to disable -lockLabel: false - -# Comment to post before locking. Set to `false` to disable -lockComment: false - -# Assign `resolved` as the reason for locking. 
Set to `false` to disable -setLockReason: true - -# Limit to only `issues` or `pulls` -# only: issues - -# Optionally, specify configuration settings just for `issues` or `pulls` -# issues: -# exemptLabels: -# - help-wanted -# lockLabel: outdated - -# pulls: -# daysUntilLock: 30 - -# Repository to extend settings from -# _extends: repo diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml deleted file mode 100644 index d3208a1294d1..000000000000 --- a/.github/workflows/freebsd.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: FreeBSD - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test: - runs-on: ubuntu-latest - timeout-minutes: 20 - name: A job to run test in FreeBSD - steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Test in FreeBSD - id: test - uses: vmactions/freebsd-vm@v1 - with: - usesh: true - prepare: | - pkg install -y cmake git ninja googletest - - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml deleted file mode 100644 index 72618dc697a6..000000000000 --- a/.github/workflows/i386.yml +++ /dev/null @@ -1,43 +0,0 @@ -name: XGBoost-i386-test - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - build-32bit: - name: Build 32-bit - runs-on: ubuntu-latest - services: - registry: - image: registry:2 - ports: - - 5000:5000 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.6.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . 
- file: tests/ci_build/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml deleted file mode 100644 index 1281c5d5fe56..000000000000 --- a/.github/workflows/jvm_tests.yml +++ /dev/null @@ -1,100 +0,0 @@ -name: XGBoost-JVM-Tests - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} - timeout-minutes: 30 - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [windows-latest, ubuntu-latest, macos-13] - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: actions/setup-java@6a0805fcefea3d4657a47ac4c165951e33482018 # v4.2.2 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml - use-mamba: true - - - name: Cache Maven packages - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 - - - name: Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows - - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') - - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - 
env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index e515f97296fa..000000000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,193 +0,0 @@ -# This is a basic workflow to help you get started with Actions - -name: XGBoost-CI - -# Controls when the action will run. Triggers the workflow on push or pull request -# events but only for the master branch -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [macos-12] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose - - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose - - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* - - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml deleted file mode 100644 index c8d2aba55507..000000000000 --- a/.github/workflows/python_tests.yml +++ /dev/null @@ -1,348 +0,0 @@ -name: XGBoost-Python-Tests - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' - - python-sdist-test: - # Use system toolchain instead of conda toolchain for macos and windows. 
- # MacOS has linker error if clang++ from conda-forge is used - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - matrix: - os: [macos-13, windows-latest] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. - python -c 'import xgboost' - - python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: macos-13} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . 
--wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - - - python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest] - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Set up Python 3.10 - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: "3.10" - - - name: Install ninja - run: | - sudo apt-get update && sudo apt-get install -y ninja-build - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja - ninja - - - name: Copy lib to system lib - run: | - cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" - - - name: Install XGBoost in Virtual Environment - run: | - cd python-package - pip install virtualenv - virtualenv venv - source venv/bin/activate && \ - pip install -v . 
--config-settings use_system_libxgboost=True && \ - python -c 'import xgboost' diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml deleted file mode 100644 index 235942713287..000000000000 --- a/.github/workflows/python_wheels.yml +++ /dev/null @@ -1,55 +0,0 @@ -name: XGBoost-Python-Wheels - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - python-wheels: - name: Build wheel for ${{ matrix.platform_id }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - include: - - os: macos-13 - platform_id: macosx_x86_64 - - os: macos-14 - platform_id: macosx_arm64 - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml deleted file mode 100644 index 4b506927e06c..000000000000 --- a/.github/workflows/r_nold.yml +++ /dev/null @@ -1,44 +0,0 @@ -# Run expensive R tests with the help of rhub. 
Only triggered by a pull request review -# See discussion at https://github.com/dmlc/xgboost/pull/6378 - -name: XGBoost-R-noLD - -on: - pull_request_review_comment: - types: [created] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - test-R-noLD: - if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) - timeout-minutes: 120 - runs-on: ubuntu-latest - container: - image: rhub/debian-gcc-devel-nold - steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . && \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml deleted file mode 100644 index 27ae4bee1166..000000000000 --- a/.github/workflows/r_tests.yml +++ /dev/null @@ -1,150 +0,0 @@ -name: XGBoost-R-Tests - -on: [push, pull_request] - -env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - fail-fast: false - matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - name: Install system dependencies - run: | - sudo apt update 
- sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler != 'none' - - - name: Test R - run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' - - test-R-on-Debian: - name: Test R package on Debian - runs-on: ubuntu-latest - container: - image: rhub/debian-gcc-release - - steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml deleted file mode 100644 index 1881c0eba274..000000000000 --- a/.github/workflows/scorecards.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: Scorecards supply-chain security -on: - # Only the default branch is supported. - branch_protection_rule: - schedule: - - cron: '17 2 * * 6' - push: - branches: [ "master" ] - -# Declare default permissions as read only. -permissions: read-all - -jobs: - analysis: - name: Scorecards analysis - runs-on: ubuntu-latest - permissions: - # Needed to upload the results to code-scanning dashboard. - security-events: write - # Used to receive a badge. 
- id-token: write - - steps: - - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - persist-credentials: false - - - name: "Run analysis" - uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 - with: - results_file: results.sarif - results_format: sarif - - # Publish the results for public repositories to enable scorecard badges. For more details, see - # https://github.com/ossf/scorecard-action#publishing-results. - # For private repositories, `publish_results` will automatically be set to `false`, regardless - # of the value entered here. - publish_results: true - - # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF - # format to the repository Actions tab. - - name: "Upload artifact" - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 - with: - name: SARIF file - path: results.sarif - retention-days: 5 - - # Upload the results to GitHub's code scanning dashboard. - - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@83a02f7883b12e0e4e1a146174f5e2292a01e601 # v2.16.4 - with: - sarif_file: results.sarif diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml deleted file mode 100644 index 9490926cfcaf..000000000000 --- a/.github/workflows/update_rapids.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: update-rapids - -on: - workflow_dispatch: - schedule: - - cron: "0 20 * * 1" # Run once weekly - -permissions: - pull-requests: write - contents: write - -defaults: - run: - shell: bash -l {0} - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # To use GitHub CLI - -jobs: - update-rapids: - name: Check latest RAPIDS - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v6 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" - From 78c4d8ca82136a428ed71e93b583658b7d0de68f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 01:10:31 -0700 Subject: [PATCH 02/45] First RunsOn example --- .github/workflows/main.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000000..ed02bfb2ad53 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,22 @@ +name: Nextgen XGBoost CI + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + build: + name: Build + runs-on: + - runs-on=${{ github.run_id }} + - runner=2cpu-linux-x64 + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: echo "Hello from x64!" 
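
The two runs-on labels in the job above follow the RunsOn dispatch convention: runs-on=${{ github.run_id }} ties the ephemeral VM to this particular workflow run, while the second label selects a runner profile. As a minimal sketch of how a job would target different hardware under the same scheme (the 2cpu-linux-arm64 profile name is assumed for illustration and does not come from this patch series), only the second label changes:

    jobs:
      build-arm64:
        runs-on:
          - runs-on=${{ github.run_id }}
          - runner=2cpu-linux-arm64  # assumed profile name; any label provided by RunsOn or .github/runs-on.yml works
        steps:
          - uses: actions/checkout@v4
            with:
              submodules: "true"
          - run: echo "Hello from arm64!"
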
From 92fd6a29b9b5419adb9a94ecbc9648d942caf42c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 11:29:00 -0700 Subject: [PATCH 03/45] Test custom runner config --- .github/runs-on.yml | 6 ++++++ .github/workflows/main.yml | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 .github/runs-on.yml diff --git a/.github/runs-on.yml b/.github/runs-on.yml new file mode 100644 index 000000000000..24a9caa0073b --- /dev/null +++ b/.github/runs-on.yml @@ -0,0 +1,6 @@ +runners: + linux-amd64-cpu: + cpu: 16 + hdd: 40 + family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] + image: ubuntu24-full-x64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ed02bfb2ad53..af7fc1fa6435 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,9 +14,15 @@ jobs: name: Build runs-on: - runs-on=${{ github.run_id }} - - runner=2cpu-linux-x64 + - runner=linux-amd64-cpu steps: - uses: actions/checkout@v4 with: submodules: "true" - - run: echo "Hello from x64!" + - run: | + sudo apt update && sudo apt install ninja-build + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON + ninja -v + ./testxgboost From d6761cb9623f84187dc7109bba5d8cbd7229d8eb Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 17 Oct 2024 19:17:45 -0700 Subject: [PATCH 04/45] Try out GPU support --- .github/runs-on.yml | 10 ++++++++++ .github/workflows/main.yml | 21 ++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 24a9caa0073b..fd9be8996a9e 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,6 +1,16 @@ +images: + dlami-amd64: + platform: "linux" + arch: "x64" + owner: "898082745236" # AWS + name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" + runners: linux-amd64-cpu: cpu: 16 hdd: 40 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: ubuntu24-full-x64 + linux-amd64-gpu: + family: ["g4dn.xlarge"] + image: dlami-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index af7fc1fa6435..64db9583b195 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,10 +19,29 @@ jobs: - uses: actions/checkout@v4 with: submodules: "true" - - run: | + - name: Build and run gtest + run: | sudo apt update && sudo apt install ninja-build mkdir build cd build cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ninja -v ./testxgboost + build-gpu: + name: Build GPU + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build and run gtest + run: | + nvidia-smi + sudo apt update && sudo apt install ninja-build + mkdir build + cd build + cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=75
+          ninja -v
+          ./testxgboost

From 3331cd6ae307b993456e0334893a80ba30a8cb9e Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Tue, 22 Oct 2024 12:52:40 -0700
Subject: [PATCH 05/45] Add Windows

---
 .github/runs-on.yml                  | 19 +++++-
 .github/workflows/main.yml           | 22 +++++++
 ops/packer/windows/bootstrap.ps1     | 73 ++++++++++++++++++++++
 ops/packer/windows/install_choco.ps1 | 14 +++++
 ops/packer/windows/setup_ssh.ps1     | 58 ++++++++++++++++++
 ops/packer/windows/sysprep.ps1       | 14 +++++
 ops/packer/windows/windows.pkr.hcl   | 90 ++++++++++++++++++++++++++++
 7 files changed, 289 insertions(+), 1 deletion(-)
 create mode 100644 ops/packer/windows/bootstrap.ps1
 create mode 100644 ops/packer/windows/install_choco.ps1
 create mode 100644 ops/packer/windows/setup_ssh.ps1
 create mode 100644 ops/packer/windows/sysprep.ps1
 create mode 100644 ops/packer/windows/windows.pkr.hcl

diff --git a/.github/runs-on.yml b/.github/runs-on.yml
index fd9be8996a9e..f8de09feb553 100644
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -4,13 +4,30 @@ images:
     arch: "x64"
     owner: "898082745236" # AWS
     name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"
+  windows-amd64:
+    platform: "windows"
+    arch: "x64"
+    owner: "492475357299" # XGBoost CI
+    name: "xgboost-ci-runs-on-windows-*"
 
 runners:
   linux-amd64-cpu:
     cpu: 16
-    hdd: 40
     family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"]
     image: ubuntu24-full-x64
   linux-amd64-gpu:
     family: ["g4dn.xlarge"]
     image: dlami-amd64
+  linux-amd64-mgpu:
+    family: ["g4dn.12xlarge"]
+    image: dlami-amd64
+  linux-arm64-cpu:
+    family: ["c6g", "c7g"]
+    image: ubuntu24-full-arm64
+  windows-gpu:
+    family: ["g4dn.2xlarge"]
+    image: windows-amd64
+  windows-cpu:
+    cpu: 16
+    family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"]
+    image: windows-amd64
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 64db9583b195..ead80a7dadac 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -45,3 +45,25 @@ jobs:
           cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=75
           ninja -v
           ./testxgboost
+  build-gpu-win64:
+    name: Build GPU (Windows)
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=windows-gpu
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Build and run gtest
+        shell: powershell
+        run: |
+          nvcc --version
+          git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
+          mkdir build
+          cd build
+          cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl"
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+          cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal"
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+          build/testxgboost.exe
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
diff --git a/ops/packer/windows/bootstrap.ps1 b/ops/packer/windows/bootstrap.ps1
new file mode 100644
index 000000000000..c67f3b73fb9a
--- /dev/null
+++ b/ops/packer/windows/bootstrap.ps1
@@ -0,0 +1,73 @@
+## Install packages from Chocolatey
+
+# jq & yq
+Write-Output "Installing jq and yq..."
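+# Pinning exact versions here and below keeps rebuilt AMIs reproducible.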
+choco install jq --version=1.7.1 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install yq --version=4.40.2 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# AWS CLI +Write-Output "Installing AWS CLI..." +choco install awscli --version=2.18.11 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Git +Write-Host '>>> Installing Git...' +choco install git --version=2.47.0 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# CMake +Write-Host '>>> Installing CMake 3.30.5...' +choco install cmake --version 3.30.5 --installargs "ADD_CMAKE_TO_PATH=System" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Notepad++ +Write-Host '>>> Installing Notepad++...' +choco install notepadplusplus +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Miniforge3 +Write-Host '>>> Installing Miniforge3...' +choco install miniforge3 --params="'/InstallationType:AllUsers /RegisterPython:1 /D:C:\tools\miniforge3'" +C:\tools\miniforge3\Scripts\conda.exe init --user --system +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +conda config --set auto_activate_base false + +# Java 11 +Write-Host '>>> Installing Java 11...' +choco install openjdk11 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Maven +Write-Host '>>> Installing Maven...' +choco install maven +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# GraphViz +Write-Host '>>> Installing GraphViz...' +choco install graphviz +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# Visual Studio 2022 Community +Write-Host '>>> Installing Visual Studio 2022 Community...' +choco install visualstudio2022community ` + --params "--wait --passive --norestart" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install visualstudio2022-workload-nativedesktop --params ` + "--wait --passive --norestart --includeOptional" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# CUDA 12.5 +Write-Host '>>> Installing CUDA 12.5...' +choco install cuda --version=12.5.1.555 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +# R 4.3 +Write-Host '>>> Installing R...' 
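+# r.project installs the R interpreter; rtools supplies the MinGW toolchain used to compile the R package on Windows.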
+choco install r.project --version=4.3.2 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +choco install rtools --version=4.3.5550 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/ops/packer/windows/install_choco.ps1 b/ops/packer/windows/install_choco.ps1 new file mode 100644 index 000000000000..131e8129feaa --- /dev/null +++ b/ops/packer/windows/install_choco.ps1 @@ -0,0 +1,14 @@ +## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/InstallChoco.ps1 +## Author: Christopher Horrell (https://github.com/chorrell) + +$ErrorActionPreference = "Stop" + +# Install Chocolatey +# See https://chocolatey.org/install#individual +Set-ExecutionPolicy Bypass -Scope Process -Force +[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072 +Invoke-Expression ((New-Object System.Net.WebClient).DownloadString("https://community.chocolatey.org/install.ps1")) + +# Globally Auto confirm every action +# See: https://docs.chocolatey.org/en-us/faqs#why-do-i-have-to-confirm-packages-now-is-there-a-way-to-remove-this +choco feature enable -n allowGlobalConfirmation diff --git a/ops/packer/windows/setup_ssh.ps1 b/ops/packer/windows/setup_ssh.ps1 new file mode 100644 index 000000000000..a7bdee898002 --- /dev/null +++ b/ops/packer/windows/setup_ssh.ps1 @@ -0,0 +1,58 @@ + +## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/SetupSsh.ps1 +## Author: Christopher Horrell (https://github.com/chorrell) + +# Don't display progress bars +# See: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_preference_variables?view=powershell-7.3#progresspreference +$ProgressPreference = "SilentlyContinue" +$ErrorActionPreference = "Stop" + +# Install OpenSSH using Add-WindowsCapability +# See: https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse?tabs=powershell#install-openssh-for-windows + +Write-Host "Installing and starting ssh-agent" +Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0 +Set-Service -Name ssh-agent -StartupType Automatic +Start-Service ssh-agent + +Write-Host "Installing and starting sshd" +Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0 +Set-Service -Name sshd -StartupType Automatic +Start-Service sshd + +# Confirm the Firewall rule is configured. It should be created automatically by setup. Run the following to verify +if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) { + Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..." + New-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -DisplayName "OpenSSH Server (sshd)" -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22 +} else { + Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' has been created and exists." 
+} + +# Set default shell to Powershell +New-ItemProperty -Path "HKLM:\SOFTWARE\OpenSSH" -Name DefaultShell -Value "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -PropertyType String -Force + +$keyDownloadScript = Join-Path $env:ProgramData "ssh\download-key.ps1" + +@' +# Download private key to $env:ProgramData\ssh\administrators_authorized_keys +$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys" + +$keyUrl = "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key" +Invoke-WebRequest $keyUrl -OutFile $openSSHAuthorizedKeys + +# Ensure ACL for administrators_authorized_keys is correct +# See https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_server_configuration#authorizedkeysfile +icacls.exe $openSSHAuthorizedKeys /inheritance:r /grant "Administrators:F" /grant "SYSTEM:F" +'@ | Out-File $keyDownloadScript + +# Create Task +$taskName = "DownloadKey" +$principal = New-ScheduledTaskPrincipal -UserID "NT AUTHORITY\SYSTEM" -LogonType ServiceAccount -RunLevel Highest +$action = New-ScheduledTaskAction -Execute "Powershell.exe" -Argument "-NoProfile -File ""$keyDownloadScript""" +$trigger = New-ScheduledTaskTrigger -AtStartup +Register-ScheduledTask -Action $action -Trigger $trigger -Principal $principal -TaskName $taskName -Description $taskName + +# Fetch key via $keyDownloadScript +& Powershell.exe -ExecutionPolicy Bypass -File $keyDownloadScript + + diff --git a/ops/packer/windows/sysprep.ps1 b/ops/packer/windows/sysprep.ps1 new file mode 100644 index 000000000000..a0470309f9da --- /dev/null +++ b/ops/packer/windows/sysprep.ps1 @@ -0,0 +1,14 @@ +## Adopted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/PrepareImage.ps1 +## Author: Christopher Horrell (https://github.com/chorrell) + +$ErrorActionPreference = "Stop" + +Write-Output "Cleaning up keys" +$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys" +Remove-Item -Recurse -Force -Path $openSSHAuthorizedKeys + +# Make sure task is enabled +Enable-ScheduledTask "DownloadKey" + +Write-Output "Running Sysprep" +& "$Env:Programfiles\Amazon\EC2Launch\ec2launch.exe" sysprep diff --git a/ops/packer/windows/windows.pkr.hcl b/ops/packer/windows/windows.pkr.hcl new file mode 100644 index 000000000000..4c14b7b75806 --- /dev/null +++ b/ops/packer/windows/windows.pkr.hcl @@ -0,0 +1,90 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = "~> 1" + } + windows-update = { + version = "0.15.0" + source = "github.com/rgl/windows-update" + } + } +} + +locals { + ami_name_prefix = "xgboost-ci" + image_name = "RunsOn worker with Windows Server 2022 + ssh + CUDA driver" + region = "us-west-2" + timestamp = regex_replace(timestamp(), "[- TZ:]", "") + volume_size = 120 +} + +data "amazon-ami" "aws-windows-x64" { + filters = { + name = "Windows_Server-2022-English-Full-Base-*" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["amazon"] +} + +source "amazon-ebs" "runs-on-windows" { + source_ami = "${data.amazon-ami.aws-windows-x64.id}" + ami_name = "${local.ami_name_prefix}-runs-on-windows-${local.timestamp}" + ami_description = "${local.image_name}" + ami_regions = ["${local.region}"] + ami_virtualization_type = "hvm" + associate_public_ip_address = true + communicator = "ssh" + instance_type = "g4dn.xlarge" + region = "${local.region}" + ssh_timeout = "10m" + ssh_username = 
"Administrator" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.ps1" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 2.5 hours until the AMI is ready + delay_seconds = 15 + max_attempts = 600 + } + fast_launch { + enable_fast_launch = true + target_resource_count = 10 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-windows"] + + provisioner "windows-update" {} + + provisioner "powershell" { + script = "install_choco.ps1" + } + + provisioner "windows-restart" { + max_retries = 3 + } + + provisioner "powershell" { + script = "bootstrap.ps1" + } + + provisioner "powershell" { # Sysprep should run the last + script = "sysprep.ps1" + } +} From 61f207dec6b5001127b4f5cd5e7ee0b2381bf980 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 23 Oct 2024 00:21:50 -0700 Subject: [PATCH 06/45] Fix Windows build --- .github/workflows/main.yml | 2 +- CMakeLists.txt | 8 +++++++- include/xgboost/collective/socket.h | 3 ++- include/xgboost/windefs.h | 7 +++++++ tests/cpp/common/test_device_vector.cu | 5 +++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ead80a7dadac..a523ec7a1ba3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,5 +65,5 @@ jobs: if ($LASTEXITCODE -ne 0) { throw "Last command failed" } cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - build/testxgboost.exe + & .\testxgboost.exe if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/CMakeLists.txt b/CMakeLists.txt index 22fe4a3eb977..9bfaedb1e16f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,7 +221,9 @@ if(USE_CUDA) find_package(CUDAToolkit REQUIRED) find_package(CCCL CONFIG) - if(NOT CCCL_FOUND) + if(CCCL_FOUND) + message(STATUS "Standalone CCCL found.") + else() message(STATUS "Standalone CCCL not found. 
Attempting to use CCCL from CUDA Toolkit...") find_package(CCCL CONFIG HINTS ${CUDAToolkit_LIBRARY_DIR}/cmake) @@ -238,6 +240,10 @@ if(USE_CUDA) target_link_libraries(CCCL::CCCL INTERFACE libcudacxx::libcudacxx CUB::CUB Thrust) endif() endif() + # Define guard macros to prevent windows.h from conflicting with winsock2.h + if(WIN32) + target_compile_definitions(CCCL::CCCL INTERFACE NOMINMAX WIN32_LEAN_AND_MEAN _WINSOCKAPI_) + endif() endif() if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index 4bc285a515c5..57882ee26844 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -99,6 +99,7 @@ inline auto ThrowAtError(StringView fn_name, std::int32_t errsv = LastError()) { using SocketT = SOCKET; #else using SocketT = int; +#define INVALID_SOCKET -1 #endif // defined(_WIN32) #if !defined(xgboost_CHECK_SYS_CALL) @@ -276,7 +277,7 @@ class TCPSocket { SockDomain domain_{SockDomain::kV4}; #endif - constexpr static HandleT InvalidSocket() { return -1; } + constexpr static HandleT InvalidSocket() { return INVALID_SOCKET; } explicit TCPSocket(HandleT newfd) : handle_{newfd} {} diff --git a/include/xgboost/windefs.h b/include/xgboost/windefs.h index 99bf11d09b17..b0e012994e4a 100644 --- a/include/xgboost/windefs.h +++ b/include/xgboost/windefs.h @@ -20,7 +20,14 @@ #endif // !defined(NOMINMAX) // A macro used inside `windows.h` to avoid conflicts with `winsock2.h` +#if !defined(WIN32_LEAN_AND_MEAN) #define WIN32_LEAN_AND_MEAN +#endif // !defined(WIN32_LEAN_AND_MEAN) + +// Stop windows.h from including winsock.h +#if !defined(_WINSOCKAPI_) +#define _WINSOCKAPI_ +#endif // !defined(_WINSOCKAPI_) #if !defined(xgboost_IS_MINGW) diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index 9dff9c691c15..d7a03e41a64b 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -10,6 +10,7 @@ #include "../../../src/common/device_helpers.cuh" // for CachingThrustPolicy, PinnedMemory #include "../../../src/common/device_vector.cuh" #include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore +#include "xgboost/windefs.h" // for xgboost_IS_WIN namespace dh { TEST(DeviceUVector, Basic) { @@ -109,10 +110,14 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." 
<< minor;
   PinnedMemory pinned;
+#if defined(xgboost_IS_WIN)
+  ASSERT_FALSE(pinned.IsVm());
+#else  // defined(xgboost_IS_WIN)
   if (major >= 12 && minor >= 5) {
     ASSERT_TRUE(pinned.IsVm());
   } else {
     ASSERT_FALSE(pinned.IsVm());
   }
+#endif  // defined(xgboost_IS_WIN)
 }
 }  // namespace dh

From 6b78a12245450eabbe226d3a9df4a986373e258e Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Wed, 23 Oct 2024 16:06:30 -0700
Subject: [PATCH 07/45] Custom image for Linux

---
 .github/runs-on.yml            | 12 +++---
 .github/workflows/main.yml     | 11 +++++-
 ops/packer/linux/bootstrap.sh  | 42 +++++++++++++++++++++
 ops/packer/linux/linux.pkr.hcl | 68 ++++++++++++++++++++++++++++++++++
 ops/packer/linux/setup_ssh.sh  |  2 +
 5 files changed, 129 insertions(+), 6 deletions(-)
 create mode 100644 ops/packer/linux/bootstrap.sh
 create mode 100644 ops/packer/linux/linux.pkr.hcl
 create mode 100644 ops/packer/linux/setup_ssh.sh

diff --git a/.github/runs-on.yml b/.github/runs-on.yml
index f8de09feb553..3fa13de66a25 100644
--- a/.github/runs-on.yml
+++ b/.github/runs-on.yml
@@ -1,9 +1,11 @@
+# Custom images with CUDA toolkit installed
+# See ops/packer for instructions for building the images
 images:
-  dlami-amd64:
+  linux-amd64:
     platform: "linux"
     arch: "x64"
-    owner: "898082745236" # AWS
-    name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*"
+    owner: "492475357299" # XGBoost CI
+    name: "xgboost-ci-runs-on-linux-*"
   windows-amd64:
     platform: "windows"
     arch: "x64"
@@ -17,10 +19,10 @@ runners:
     image: ubuntu24-full-x64
   linux-amd64-gpu:
     family: ["g4dn.xlarge"]
-    image: dlami-amd64
+    image: linux-amd64
   linux-amd64-mgpu:
     family: ["g4dn.12xlarge"]
-    image: dlami-amd64
+    image: linux-amd64
   linux-arm64-cpu:
     family: ["c6g", "c7g"]
     image: ubuntu24-full-arm64
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a523ec7a1ba3..7fb80e21299a 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -38,11 +38,19 @@ jobs:
         with:
           submodules: "true"
       - name: Build and run gtest
         run: |
+          cat >> $HOME/.bashrc <<- EOM
+          export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
+          export LD_LIBRARY_PATH=/usr/local/cuda/lib64\
+          ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+          EOM
+          source $HOME/.bashrc
           nvidia-smi
+          nvcc --version
           sudo apt update && sudo apt install ninja-build
+          git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
           mkdir build
           cd build
-          cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.4/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=75
+          cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$PWD/../cccl"
           ninja -v
           ./testxgboost
diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh
new file mode 100644
index 000000000000..9cf0edfe7fe8
--- /dev/null
+++ b/ops/packer/linux/bootstrap.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -euo pipefail
+
+## Install basic tools
+echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
+sudo apt-get update
+sudo apt-get install -y cmake git build-essential wget ca-certificates curl
+
+## Install CUDA 12.5 + driver
+echo "Installing CUDA and driver..."
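+# The pin file gives NVIDIA's repository priority over Ubuntu's stock CUDA packages.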
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin +sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 +wget -nv https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb +sudo dpkg -i cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb +sudo cp /var/cuda-repo-ubuntu2404-12-5-local/cuda-*-keyring.gpg /usr/share/keyrings/ +sudo apt-get update +sudo apt-get install -y cuda-toolkit-12-5 nvidia-driver-555-open cuda-drivers-555 + +## Install Docker +# Add Docker's official GPG key: +sudo install -m 0755 -d /etc/apt/keyrings +sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +sudo chmod a+r /etc/apt/keyrings/docker.asc +# Add the repository to Apt sources: +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin +# Allow users to use Docker without sudo +sudo usermod -aG docker ubuntu + +## Install NVIDIA Container Toolkit +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker diff --git a/ops/packer/linux/linux.pkr.hcl b/ops/packer/linux/linux.pkr.hcl new file mode 100644 index 000000000000..1dc11f9bac03 --- /dev/null +++ b/ops/packer/linux/linux.pkr.hcl @@ -0,0 +1,68 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = "~> 1" + } + } +} + +locals { + ami_name_prefix = "xgboost-ci" + image_name = "RunsOn worker with Ubuntu 24.04 + CUDA driver" + region = "us-west-2" + timestamp = regex_replace(timestamp(), "[- TZ:]", "") + volume_size = 40 +} + +data "amazon-ami" "aws-ubuntu-x64" { + filters = { + name = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*" + root-device-type = "ebs" + virtualization-type = "hvm" + } + most_recent = true + owners = ["amazon"] +} + +source "amazon-ebs" "runs-on-linux" { + source_ami = "${data.amazon-ami.aws-ubuntu-x64.id}" + ami_name = "${local.ami_name_prefix}-runs-on-linux-${local.timestamp}" + ami_description = "${local.image_name}" + ami_regions = ["${local.region}"] + ami_virtualization_type = "hvm" + associate_public_ip_address = true + communicator = "ssh" + instance_type = "g4dn.xlarge" + region = "${local.region}" + ssh_timeout = "10m" + ssh_username = "ubuntu" + ssh_file_transfer_method = "sftp" + user_data_file = "setup_ssh.sh" + launch_block_device_mappings { + device_name = "/dev/sda1" + volume_size = "${local.volume_size}" + volume_type = "gp3" + delete_on_termination = true + } + aws_polling { # Wait up to 1 hour until the AMI is ready + delay_seconds = 15 + max_attempts = 240 + } + snapshot_tags = { + Name = "${local.image_name}" + BuildTime = 
"${local.timestamp}" + } + tags = { + Name = "${local.image_name}" + BuildTime = "${local.timestamp}" + } +} + +build { + sources = ["source.amazon-ebs.runs-on-linux"] + + provisioner "shell" { + script = "bootstrap.sh" + } +} diff --git a/ops/packer/linux/setup_ssh.sh b/ops/packer/linux/setup_ssh.sh new file mode 100644 index 000000000000..501b4da455f5 --- /dev/null +++ b/ops/packer/linux/setup_ssh.sh @@ -0,0 +1,2 @@ +#!/bin/bash +systemctl start ssh From 000be18bfe35a0acdf071d6f09fc9dd413bd353d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 23 Oct 2024 21:22:29 -0700 Subject: [PATCH 08/45] New Docker wrapper with S3 caching --- .github/runs-on.yml | 2 +- .github/workflows/main.yml | 99 ++++++---- .../docker}/Dockerfile.aarch64 | 0 .../docker}/Dockerfile.clang_tidy | 2 +- {tests/ci_build => ops/docker}/Dockerfile.cpu | 0 {tests/ci_build => ops/docker}/Dockerfile.gpu | 10 +- .../Dockerfile.gpu_build_r_rockylinux8 | 2 +- .../docker}/Dockerfile.gpu_build_rockylinux8 | 2 +- .../ci_build => ops/docker}/Dockerfile.i386 | 0 {tests/ci_build => ops/docker}/Dockerfile.jvm | 0 .../docker}/Dockerfile.jvm_cross | 16 +- .../docker}/Dockerfile.jvm_gpu_build | 2 +- .../Dockerfile.jvm_manylinux2014_aarch64 | 0 .../Dockerfile.jvm_manylinux2014_x86_64 | 0 .../docker}/Dockerfile.manylinux2014_aarch64 | 0 .../docker}/Dockerfile.manylinux2014_x86_64 | 0 .../docker}/Dockerfile.manylinux_2_28_x86_64 | 0 ops/docker/entrypoint.sh | 43 +++++ ops/docker_build.py | 134 +++++++++++++ ops/docker_run.py | 181 ++++++++++++++++++ ops/packer/linux/bootstrap.sh | 7 +- tests/ci_build/Dockerfile.gpu_dev_ver | 54 ------ 22 files changed, 450 insertions(+), 104 deletions(-) rename {tests/ci_build => ops/docker}/Dockerfile.aarch64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.clang_tidy (98%) rename {tests/ci_build => ops/docker}/Dockerfile.cpu (100%) rename {tests/ci_build => ops/docker}/Dockerfile.gpu (81%) rename {tests/ci_build => ops/docker}/Dockerfile.gpu_build_r_rockylinux8 (98%) rename {tests/ci_build => ops/docker}/Dockerfile.gpu_build_rockylinux8 (99%) rename {tests/ci_build => ops/docker}/Dockerfile.i386 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm (100%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_cross (74%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_gpu_build (98%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_manylinux2014_aarch64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.jvm_manylinux2014_x86_64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.manylinux2014_aarch64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.manylinux2014_x86_64 (100%) rename {tests/ci_build => ops/docker}/Dockerfile.manylinux_2_28_x86_64 (100%) create mode 100755 ops/docker/entrypoint.sh create mode 100644 ops/docker_build.py create mode 100644 ops/docker_run.py delete mode 100644 tests/ci_build/Dockerfile.gpu_dev_ver diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 3fa13de66a25..6ae28d1e9c6b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -16,7 +16,7 @@ runners: linux-amd64-cpu: cpu: 16 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] - image: ubuntu24-full-x64 + image: linux-amd64 linux-amd64-gpu: family: ["g4dn.xlarge"] image: linux-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7fb80e21299a..8af6571a99bb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -10,50 +10,85 @@ concurrency: cancel-in-progress: true jobs: - build: - name: Build + build-gpu: + name: Build GPU 
runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - uses: actions/checkout@v4 with: submodules: "true" - - name: Build and run gtest + - name: Build container run: | - sudo apt update && sudo apt install ninja-build - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON - ninja -v - ./testxgboost - build-gpu: - name: Build GPU + python3 ops/docker_build.py \ + --container-def gpu_build_rockylinux8 \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + --build-arg CUDA_VERSION_ARG=12.5.1 \ + --build-arg NCCL_VERSION_ARG=2.22.3-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 \ + --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + - name: Build gtest + run: | + git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + -DGPU_COMPUTE_VER=75 + - name: Stash testxgboost + run: | + aws s3 cp ./build/testxgboost \ + s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost + test-gpu: + name: Test GPU + needs: build-gpu runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-gpu steps: - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build and run gtest - run: | - cat >> $HOME/.bashrc <<- EOM - export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} - export LD_LIBRARY_PATH=/usr/local/cuda/lib64\ - ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} - EOM - source $HOME/.bashrc - nvidia-smi - nvcc --version - sudo apt update && sudo apt install ninja-build - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet - mkdir build - cd build - cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DUSE_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$PWD/../cccl" - ninja -v - ./testxgboost - build-gpu-win64: + # Restart Docker daemon so that it recognized the ephemeral disks + - run: sudo systemctl restart docker + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Unstash testxgboost + run: | + aws s3 cp \ + s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost \ + ./testxgboost + chmod +x testxgboost + - name: Build container + run: | + python3 ops/docker_build.py \ + --container-def gpu \ + --container-id xgb-ci.gpu \ + --build-arg CUDA_VERSION_ARG=12.5.1 \ + --build-arg NCCL_VERSION_ARG=2.22.3-1 \ + --build-arg RAPIDS_VERSION_ARG=24.10 \ + --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + - name: Run gtest + run: | + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu \ + --use-gpus \ + -- ./testxgboost + + build-test-gpu-win64: name: Build GPU (Windows) runs-on: - runs-on=${{ github.run_id }} diff --git a/tests/ci_build/Dockerfile.aarch64 b/ops/docker/Dockerfile.aarch64 similarity index 100% rename from tests/ci_build/Dockerfile.aarch64 rename to ops/docker/Dockerfile.aarch64 diff --git a/tests/ci_build/Dockerfile.clang_tidy b/ops/docker/Dockerfile.clang_tidy similarity index 98% rename from tests/ci_build/Dockerfile.clang_tidy rename to ops/docker/Dockerfile.clang_tidy index 2e7751a20185..c9528015c17e 100644 --- a/tests/ci_build/Dockerfile.clang_tidy +++ b/ops/docker/Dockerfile.clang_tidy @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu22.04 ARG CUDA_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.cpu b/ops/docker/Dockerfile.cpu similarity index 100% rename from tests/ci_build/Dockerfile.cpu rename to ops/docker/Dockerfile.cpu diff --git a/tests/ci_build/Dockerfile.gpu b/ops/docker/Dockerfile.gpu similarity index 81% rename from tests/ci_build/Dockerfile.gpu rename to ops/docker/Dockerfile.gpu index 884fc924cba8..461f1d99dd54 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/ops/docker/Dockerfile.gpu @@ -1,8 +1,10 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 ARG CUDA_VERSION_ARG ARG RAPIDS_VERSION_ARG + # Should be first 4 digits (e.g. 
24.06) ARG NCCL_VERSION_ARG +ARG RAPIDSAI_CONDA_CHANNEL_ARG="rapidsai" # Environment ENV DEBIAN_FRONTEND=noninteractive @@ -23,11 +25,11 @@ ENV PATH=/opt/miniforge/bin:$PATH RUN \ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ - python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cuda-version=$CUDA_SHORT_VER \ + mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ + python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ "nccl>=${NCCL_SHORT_VER}" \ dask \ - dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ + "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ "pyspark>=3.4.0" cloudpickle cuda-python && \ diff --git a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/Dockerfile.gpu_build_r_rockylinux8 similarity index 98% rename from tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 rename to ops/docker/Dockerfile.gpu_build_r_rockylinux8 index 159e5d776c16..7c1d4e8ef642 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_rockylinux8 +++ b/ops/docker/Dockerfile.gpu_build_r_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG R_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 b/ops/docker/Dockerfile.gpu_build_rockylinux8 similarity index 99% rename from tests/ci_build/Dockerfile.gpu_build_rockylinux8 rename to ops/docker/Dockerfile.gpu_build_rockylinux8 index 8869fb468e12..d021190b6744 100644 --- a/tests/ci_build/Dockerfile.gpu_build_rockylinux8 +++ b/ops/docker/Dockerfile.gpu_build_rockylinux8 @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.i386 b/ops/docker/Dockerfile.i386 similarity index 100% rename from tests/ci_build/Dockerfile.i386 rename to ops/docker/Dockerfile.i386 diff --git a/tests/ci_build/Dockerfile.jvm b/ops/docker/Dockerfile.jvm similarity index 100% rename from tests/ci_build/Dockerfile.jvm rename to ops/docker/Dockerfile.jvm diff --git a/tests/ci_build/Dockerfile.jvm_cross b/ops/docker/Dockerfile.jvm_cross similarity index 74% rename from tests/ci_build/Dockerfile.jvm_cross rename to ops/docker/Dockerfile.jvm_cross index 2f2b5b77ede8..3ebdb3c6686d 100644 --- a/tests/ci_build/Dockerfile.jvm_cross +++ b/ops/docker/Dockerfile.jvm_cross @@ -1,6 +1,6 @@ FROM ubuntu:22.04 -ARG JDK_VERSION=8 -ARG SPARK_VERSION=3.5.1 +ARG JDK_VERSION_ARG=8 +ARG SPARK_VERSION_ARG=3.5.1 # Environment ENV DEBIAN_FRONTEND=noninteractive @@ -11,7 +11,7 @@ RUN \ apt-get install -y software-properties-common && \ add-apt-repository ppa:openjdk-r/ppa && \ apt-get update && \ - apt-get install -y tar unzip wget openjdk-$JDK_VERSION-jdk libgomp1 && \ + apt-get install -y tar unzip wget openjdk-$JDK_VERSION_ARG-jdk libgomp1 && \ # Python wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ bash conda.sh -b -p /opt/miniforge && \ @@ -22,12 +22,12 @@ RUN \ ln -s /opt/apache-maven-3.9.7/ /opt/maven && \ # 
Spark with scala 2.12 mkdir -p /opt/spark-scala-2.12 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz && \ + tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ # Spark with scala 2.13 mkdir -p /opt/spark-scala-2.13 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz && \ - tar xvf spark-$SPARK_VERSION-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 + wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz && \ + tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 ENV PATH=/opt/miniforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH @@ -43,7 +43,7 @@ RUN set -ex; \ gosu nobody true # Set default JDK version -RUN update-java-alternatives -v -s java-1.$JDK_VERSION.0-openjdk-amd64 +RUN update-java-alternatives -v -s java-1.$JDK_VERSION_ARG.0-openjdk-amd64 # Default entry-point to use if running locally # It will preserve attributes of created files diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/ops/docker/Dockerfile.jvm_gpu_build similarity index 98% rename from tests/ci_build/Dockerfile.jvm_gpu_build rename to ops/docker/Dockerfile.jvm_gpu_build index edb5918b8bbc..7f0168df467f 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/ops/docker/Dockerfile.jvm_gpu_build @@ -1,4 +1,4 @@ -ARG CUDA_VERSION_ARG +ARG CUDA_VERSION_ARG=notset FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-rockylinux8 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 b/ops/docker/Dockerfile.jvm_manylinux2014_aarch64 similarity index 100% rename from tests/ci_build/Dockerfile.jvm_manylinux2014_aarch64 rename to ops/docker/Dockerfile.jvm_manylinux2014_aarch64 diff --git a/tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 b/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 similarity index 100% rename from tests/ci_build/Dockerfile.jvm_manylinux2014_x86_64 rename to ops/docker/Dockerfile.jvm_manylinux2014_x86_64 diff --git a/tests/ci_build/Dockerfile.manylinux2014_aarch64 b/ops/docker/Dockerfile.manylinux2014_aarch64 similarity index 100% rename from tests/ci_build/Dockerfile.manylinux2014_aarch64 rename to ops/docker/Dockerfile.manylinux2014_aarch64 diff --git a/tests/ci_build/Dockerfile.manylinux2014_x86_64 b/ops/docker/Dockerfile.manylinux2014_x86_64 similarity index 100% rename from tests/ci_build/Dockerfile.manylinux2014_x86_64 rename to ops/docker/Dockerfile.manylinux2014_x86_64 diff --git a/tests/ci_build/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/Dockerfile.manylinux_2_28_x86_64 similarity index 100% rename from tests/ci_build/Dockerfile.manylinux_2_28_x86_64 rename to ops/docker/Dockerfile.manylinux_2_28_x86_64 diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh new file mode 100755 index 000000000000..a0c5f56bb52d --- /dev/null +++ b/ops/docker/entrypoint.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# This script is a wrapper creating the same user inside container as the one +# running the ci_build.sh outside the container. 
It also sets the home directory
+# for the user inside container to match the same absolute path as the workspace
+# outside of container. Do not run this manually. It does not make sense. It is
+# intended to be called by ci_build.sh only.
+
+set -e
+
+COMMAND=("$@")
+
+if ! touch /this_is_writable_file_system; then
+  echo "You can't write to your filesystem!"
+  echo "If you are in Docker you should check you do not have too many images" \
+    "with too many files in them. Docker has some issue with it."
+  exit 1
+else
+  rm /this_is_writable_file_system
+fi
+
+if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then
+  groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
+  useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
+    "${CI_BUILD_USER}" || true
+  export HOME="/home/${CI_BUILD_USER}"
+  shopt -s dotglob
+  cp -r /root/* "$HOME/"
+  chown -R "${CI_BUILD_UID}:${CI_BUILD_GID}" "$HOME"
+
+  # Allows project-specific customization
+  if [[ -e "/workspace/.pre_entry.sh" ]]; then
+    gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" /workspace/.pre_entry.sh
+  fi
+
+  # Enable passwordless sudo capabilities for the user
+  chown root:"${CI_BUILD_GID}" "$(which gosu)"
+  chmod +s "$(which gosu)"; sync
+
+  exec gosu "${CI_BUILD_UID}:${CI_BUILD_GID}" "${COMMAND[@]}"
+else
+  exec "${COMMAND[@]}"
+fi
diff --git a/ops/docker_build.py b/ops/docker_build.py
new file mode 100644
index 000000000000..a7276cd65b76
--- /dev/null
+++ b/ops/docker_build.py
@@ -0,0 +1,134 @@
+"""
+Wrapper script to build a Docker container with layer caching
+"""
+
+import argparse
+import itertools
+import pathlib
+import subprocess
+import sys
+from typing import Optional
+
+from docker_run import SCRIPT_DIR, fancy_print_cli_args
+
+
+def parse_build_args(raw_build_args: list[str]) -> dict[str, str]:
+    parsed_build_args = dict()
+    for arg in raw_build_args:
+        try:
+            key, value = arg.split("=", maxsplit=1)
+        except ValueError as e:
+            raise ValueError(
+                f"Build argument must be of form KEY=VALUE. Got: {arg}"
+            ) from e
+        parsed_build_args[key] = value
+    return parsed_build_args
+
+
+def docker_build(
+    container_id: str,
+    *,
+    build_args: dict[str, str],
+    dockerfile_path: pathlib.Path,
+    docker_context_path: pathlib.Path,
+    cache_from: Optional[str],
+    cache_to: Optional[str],
+) -> None:
+    ## Set up command-line arguments to be passed to `docker build`
+    # Build args
+    docker_build_cli_args = list(
+        itertools.chain.from_iterable(
+            [["--build-arg", f"{k}={v}"] for k, v in build_args.items()]
+        )
+    )
+    # When building an image using a non-default driver, we need to specify
+    # `--load` to load it to the image store.
+ # See https://docs.docker.com/build/builders/drivers/ + docker_build_cli_args.append("--load") + # Layer caching + if cache_from: + docker_build_cli_args.extend(["--cache-from", cache_from]) + if cache_to: + docker_build_cli_args.extend(["--cache-to", cache_to]) + # Remaining CLI args + docker_build_cli_args.extend( + [ + "--progress=plain", + "--ulimit", + "nofile=1024000:1024000", + "-t", + container_id, + "-f", + str(dockerfile_path), + str(docker_context_path), + ] + ) + cli_args = ["docker", "buildx", "build"] + docker_build_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + # Dockerfile to be used in docker build + dockerfile_path = SCRIPT_DIR / "docker" / f"Dockerfile.{args.container_def}" + docker_context_path = SCRIPT_DIR / "docker" + + build_args = parse_build_args(args.build_arg) + + docker_build( + args.container_id, + build_args=build_args, + dockerfile_path=dockerfile_path, + docker_context_path=docker_context_path, + cache_from=args.cache_from, + cache_to=args.cache_to, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Build a Docker container") + parser.add_argument( + "--container-def", + type=str, + required=True, + help=( + "String uniquely identifying the container definition. The container " + "definition will be fetched from docker/Dockerfile.CONTAINER_DEF." + ), + ) + parser.add_argument( + "--container-id", + type=str, + required=True, + help="String ID to assign to the newly built container", + ) + parser.add_argument( + "--build-arg", + type=str, + default=[], + action="append", + help=( + "Build-time variable(s) to be passed to `docker build`. Each variable " + "should be specified as a key-value pair in the form KEY=VALUE. " + "The variables should match the ARG instructions in the Dockerfile. " + "When passing multiple variables, specify --build-arg multiple times. 
" + "Example: --build-arg CUDA_VERSION_ARG=12.5 --build-arg RAPIDS_VERSION_ARG=24.10'" + ), + ) + parser.add_argument( + "--cache-from", + type=str, + help="Use an external cache source for the Docker build", + ) + parser.add_argument( + "--cache-to", + type=str, + help="Export layers from the container to an external cache destination", + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/docker_run.py b/ops/docker_run.py new file mode 100644 index 000000000000..4fc6356a90a1 --- /dev/null +++ b/ops/docker_run.py @@ -0,0 +1,181 @@ +""" +Wrapper script to run a command inside a Docker container +""" + +import argparse +import grp +import itertools +import os +import pathlib +import pwd +import subprocess +import sys +import textwrap + +SCRIPT_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = SCRIPT_DIR.parent +LINEWIDTH = 88 +TEXT_WRAPPER = textwrap.TextWrapper( + width=LINEWIDTH, + initial_indent="", + subsequent_indent=" ", + break_long_words=False, + break_on_hyphens=False, +) + + +def parse_run_args(raw_run_args: str) -> list[str]: + return [x for x in raw_run_args.split(" ") if x] + + +def compute_container_id(container_name: str, build_args: list[dict[str, str]]) -> str: + container_id = f"xgb-ci.{container_name}" + # For some build arguments, append special suffixies + for arg_name, suffix in [ + ("CUDA_VERSION_ARG", "cuda"), + ("RAPIDS_VERSION_ARG", "rapids"), + ("JDK_VERSION_ARG", "jdk"), + ]: + if arg_name in build_args: + container_id += f"_{suffix}{build_args[arg_name]}" + return container_id + + +def get_user_ids() -> dict[str, str]: + uid = os.getuid() + gid = os.getgid() + return { + "CI_BUILD_UID": str(uid), + "CI_BUILD_USER": pwd.getpwuid(uid).pw_name, + "CI_BUILD_GID": str(gid), + "CI_BUILD_GROUP": grp.getgrgid(gid).gr_name, + } + + +def fancy_print_cli_args(cli_args: list[str]) -> None: + print( + "=" * LINEWIDTH + + "\n" + + " \\\n".join(TEXT_WRAPPER.wrap(" ".join(cli_args))) + + "\n" + + "=" * LINEWIDTH + + "\n", + flush=True, + ) + + +def docker_run( + container_id: str, + command_args: list[str], + *, + use_gpus: bool, + workdir: pathlib.Path, + user_ids: dict[str, str], + extra_args: list[str], +) -> None: + # Command-line arguments to be passed to `docker run` + docker_run_cli_args = ["--rm", "--pid=host"] + + if use_gpus: + docker_run_cli_args.extend(["--gpus", "all"]) + + docker_run_cli_args.extend(["-v", f"{workdir}:/workspace", "-w", "/workspace"]) + docker_run_cli_args.extend( + itertools.chain.from_iterable([["-e", f"{k}={v}"] for k, v in user_ids.items()]) + ) + docker_run_cli_args.extend(extra_args) + docker_run_cli_args.append(container_id) + docker_run_cli_args.extend(command_args) + + cli_args = ["docker", "run"] + docker_run_cli_args + fancy_print_cli_args(cli_args) + subprocess.run(cli_args, check=True, encoding="utf-8") + + +def main(args: argparse.Namespace) -> None: + run_args = parse_run_args(args.run_args) + user_ids = get_user_ids() + + if args.use_gpus: + print("Using NVIDIA GPUs for `docker run`") + if args.interactive: + print("Using interactive mode for `docker run`") + run_args.append("-it") + + docker_run( + args.container_id, + args.command_args, + use_gpus=args.use_gpus, + workdir=args.workdir, + user_ids=user_ids, + extra_args=run_args, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + usage=( + f"{sys.argv[0]} --container-id CONTAINER_ID [--use-gpus] [--interactive] " + "[--workdir WORKDIR] 
[--run-args RUN_ARGS] -- COMMAND_ARG " + "[COMMAND_ARG ...]" + ), + description="Run tasks inside a Docker container", + ) + parser.add_argument( + "--container-id", + type=str, + required=True, + help="String ID of the container to run.", + ) + parser.add_argument( + "--use-gpus", + action="store_true", + help=( + "Grant the container access to NVIDIA GPUs; requires the NVIDIA " + "Container Toolkit." + ), + ) + parser.add_argument( + "--interactive", + action="store_true", + help=( + "Run the container in the interactive mode; requires an interactive shell " + "(TTY). With this flag, you can use Ctrl-C to interrupt an long-running " + "command." + ), + ) + parser.add_argument( + "--workdir", + type=lambda p: pathlib.Path(p).expanduser().resolve(), + default=PROJECT_ROOT_DIR, + help="Path to working directory; if unset, use the project's root", + ) + parser.add_argument( + "--run-args", + type=str, + default="", + help=( + "Argument(s) to be passed to `docker run`. When passing multiple " + "arguments, use single quotes to wrap them. Example: " + "--run-args '--cap-add SYS_PTRACE --shm-size=4g'" + ), + ) + parser.add_argument( + "command_args", + metavar="COMMAND_ARG", + type=str, + nargs="+", + help=( + "Argument(s) for the command to execute. NOTE. Make sure to specify " + "double-dash (--) to clearly distinguish between the command and the " + "preceding parameters. Example: --run-args '--cap-add SYS_PTRACE " + "--shm-size=4g' -- ./myprog" + ), + ) + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + parsed_args = parser.parse_args() + main(parsed_args) diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh index 9cf0edfe7fe8..fac5c20f7146 100644 --- a/ops/packer/linux/bootstrap.sh +++ b/ops/packer/linux/bootstrap.sh @@ -4,7 +4,7 @@ set -euo pipefail ## Install basic tools echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections sudo apt-get update -sudo apt-get install -y cmake git build-essential wget ca-certificates curl +sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip ## Install CUDA 12.5 + driver echo "Installilng CUDA and driver..." @@ -40,3 +40,8 @@ sudo apt-get update sudo apt-get install -y nvidia-container-toolkit sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker + +## Install AWS CLI v2 +wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip +unzip awscliv2.zip +sudo ./aws/install diff --git a/tests/ci_build/Dockerfile.gpu_dev_ver b/tests/ci_build/Dockerfile.gpu_dev_ver deleted file mode 100644 index d23c5e83c2c7..000000000000 --- a/tests/ci_build/Dockerfile.gpu_dev_ver +++ /dev/null @@ -1,54 +0,0 @@ -# Container to test XGBoost against dev versions of dependencies - -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-runtime-ubuntu22.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG - # Should be first 4 digits of the dev version (e.g. 
24.06) -ARG NCCL_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND=noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential openjdk-8-jdk-headless && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge - -ENV PATH=/opt/miniforge/bin:$PATH - -# Create new Conda environment with dev versions of cuDF, Dask, and cuPy -RUN \ - export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \ - export CUDA_SHORT_VER=$(echo "$CUDA_VERSION_ARG" | grep -o -E '[0-9]+\.[0-9]') && \ - mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ - python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ - "nccl>=${NCCL_SHORT_VER}" \ - dask \ - "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ - numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ - python-kubernetes urllib3 graphviz hypothesis loky \ - "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n gpu_test pip install buildkite-test-collector - -ENV GOSU_VERSION=1.10 -ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] From e1e3b41ed2df12a2d0e431d28df82da1b1c16916 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 24 Oct 2024 15:20:02 -0700 Subject: [PATCH 09/45] Create utility script to build CI container --- .github/workflows/main.yml | 143 +++++++++++++++++-------------- ops/docker_build.py | 2 +- ops/docker_build.sh | 141 ++++++++++++++++++++++++++++++ ops/matrix/ci_container.yml | 18 ++++ ops/matrix/docker_cache_ecr.yml | 4 + ops/matrix/extract_build_args.jq | 6 ++ ops/matrix/extract_build_args.sh | 21 +++++ ops/packer/linux/bootstrap.sh | 5 ++ 8 files changed, 275 insertions(+), 65 deletions(-) create mode 100755 ops/docker_build.sh create mode 100644 ops/matrix/ci_container.yml create mode 100644 ops/matrix/docker_cache_ecr.yml create mode 100644 ops/matrix/extract_build_args.jq create mode 100755 ops/matrix/extract_build_args.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8af6571a99bb..18f997e5c52e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -10,47 +10,66 @@ concurrency: cancel-in-progress: true jobs: + build-containers: + name: Build CI containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + strategy: + matrix: + container_id: + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu + steps: + # Restart Docker daemon so that it recognized the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + BRANCH_NAME: ${{ 
github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 build-gpu: name: Build GPU + needs: build-containers runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu steps: - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build container - run: | - python3 ops/docker_build.py \ - --container-def gpu_build_rockylinux8 \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - --build-arg CUDA_VERSION_ARG=12.5.1 \ - --build-arg NCCL_VERSION_ARG=2.22.3-1 \ - --build-arg RAPIDS_VERSION_ARG=24.10 \ - --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max - - name: Build gtest - run: | - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet - python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - -- tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - -DGPU_COMPUTE_VER=75 - - name: Stash testxgboost - run: | - aws s3 cp ./build/testxgboost \ - s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost + # Restart Docker daemon so that it recognized the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + - name: Build gtest + run: | + git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- tests/ci_build/build_via_cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + -DGPU_COMPUTE_VER=75 + - name: Stash testxgboost + run: | + aws s3 cp ./build/testxgboost \ + s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost test-gpu: name: Test GPU needs: build-gpu @@ -60,8 +79,6 @@ jobs: steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - uses: actions/checkout@v4 with: submodules: "true" @@ -71,21 +88,19 @@ jobs: s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost \ ./testxgboost chmod +x testxgboost - - name: Build container - run: | - python3 ops/docker_build.py \ - --container-def gpu \ - --container-id xgb-ci.gpu \ - --build-arg CUDA_VERSION_ARG=12.5.1 \ - --build-arg NCCL_VERSION_ARG=2.22.3-1 \ - --build-arg RAPIDS_VERSION_ARG=24.10 \ - --cache-from type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ 
github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-            --cache-to type=s3,blobs_prefix=cache/${{ github.repository }}/,manifests_prefix=cache/${{ github.repository }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.gpu
+          BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
+          USE_DOCKER_CACHE: 1
       - name: Run gtest
         run: |
+          nvidia-smi
           python3 ops/docker_run.py \
             --container-id xgb-ci.gpu \
             --use-gpus \
+            --run-args='--privileged' \
             -- ./testxgboost
+
   build-test-gpu-win64:
     name: Build GPU (Windows)
     runs-on:
       - runs-on=${{ github.run_id }}
       - runner=windows-gpu
     steps:
-    - uses: actions/checkout@v4
-      with:
-        submodules: "true"
-    - name: Build and run gtest
-      shell: powershell
-      run: |
-        nvidia-smi
-        nvcc --version
-        git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
-        mkdir build
-        cd build
-        cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl"
-        if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-        cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal"
-        if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
-        & .\testxgboost.exe
-        if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Build and run gtest
+        shell: powershell
+        run: |
+          nvidia-smi
+          nvcc --version
+          git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
+          mkdir build
+          cd build
+          cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl"
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+          cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal"
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
+          & .\testxgboost.exe
+          if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
diff --git a/ops/docker_build.py b/ops/docker_build.py
index a7276cd65b76..dd2871c3a6ed 100644
--- a/ops/docker_build.py
+++ b/ops/docker_build.py
@@ -63,7 +63,7 @@ def docker_build(
             str(docker_context_path),
         ]
     )
-    cli_args = ["docker", "buildx", "build"] + docker_build_cli_args
+    cli_args = ["docker", "build"] + docker_build_cli_args
     fancy_print_cli_args(cli_args)
     subprocess.run(cli_args, check=True, encoding="utf-8")
 
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
new file mode 100755
index 000000000000..c8c0680aea05
--- /dev/null
+++ b/ops/docker_build.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry).
+## This script provides a convenient wrapper for ops/docker_build.py.
+## Build-time variables (--build-arg) and container definition are fetched from
+## ops/matrix/ci_container.yml.
+##
+## Note. This script takes in all inputs via environment variables.
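+##
+## Example invocation (hypothetical values; CONTAINER_ID must have an entry in
+## ops/matrix/ci_container.yml):
+##   CONTAINER_ID=xgb-ci.gpu BRANCH_NAME=master USE_DOCKER_CACHE=1 bash ops/docker_build.sh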
+ +INPUT_DOC=$( +cat <<-EOF +Inputs + - CONTAINER_ID: String ID uniquely identifying the container (Required) + - BRANCH_NAME: Name of the current git branch or pull request (Required) + - USE_DOCKER_CACHE: If set to 1, enable caching +EOF +) + +ECR_LIFECYCLE_RULE=$( +cat <<-EOF +{ + "rules": [ + { + "rulePriority": 1, + "selection": { + "tagStatus": "any", + "countType": "sinceImagePushed", + "countUnit": "days", + "countNumber": 30 + }, + "action": { + "type": "expire" + } + } + ] +} +EOF +) + +set -euo pipefail + +for arg in "CONTAINER_ID" "BRANCH_NAME" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +# Fetch CONTAINER_DEF and BUILD_ARGS +source <(ops/matrix/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 + +if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false +then + USE_DOCKER_CACHE=0 +fi + +if [[ ${USE_DOCKER_CACHE} -eq 0 ]] +then + echo "USE_DOCKER_CACHE not set; caching disabled" +else + DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/matrix/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/matrix/docker_cache_ecr.yml) + DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" + echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" + # Login for Docker registry + echo "aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} |" \ + "docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO}" + aws ecr get-login-password --region ${DOCKER_CACHE_ECR_REGION} \ + | docker login --username AWS --password-stdin ${DOCKER_CACHE_REPO} +fi + +# Pull pre-built container from the cache +# First try locating one for the particular branch or pull request +CACHE_FROM_CMD="" +IS_CACHED=0 +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allowed in Docker tag + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + echo "Found a cached container for the branch ${BRANCH_NAME}: ${DOCKER_URL}" + IS_CACHED=1 + else + # If there's no pre-built container from the cache, + # use the pre-built container from the master branch. + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:master" + echo "Could not find a cached container for the branch ${BRANCH_NAME}." \ + "Using a cached container from the master branch: ${DOCKER_URL}" + echo "docker pull --quiet ${DOCKER_URL}" + if time docker pull --quiet "${DOCKER_URL}" + then + IS_CACHED=1 + else + echo "Could not find a cached container for the master branch either." 
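+      # No cache hit anywhere; the build below starts from scratch. Because the
+      # build passes `--cache-to type=inline`, cache metadata is embedded in the
+      # image that gets pushed, so later builds can still reuse its layers.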
+ IS_CACHED=0 + fi + fi + if [[ $IS_CACHED -eq 1 ]] + then + CACHE_FROM_CMD="--cache-from type=registry,ref=${DOCKER_URL}" + fi +fi + +# Run Docker build +set -x +python3 ops/docker_build.py \ + --container-def ${CONTAINER_DEF} \ + --container-id ${CONTAINER_ID} \ + ${BUILD_ARGS} \ + --cache-to type=inline \ + ${CACHE_FROM_CMD} +set +x + +# Now cache the new container +if [[ ${USE_DOCKER_CACHE} -eq 1 ]] +then + DOCKER_URL="${DOCKER_CACHE_REPO}/${CONTAINER_ID}:${DOCKER_TAG}" + echo "docker tag ${CONTAINER_ID} ${DOCKER_URL}" + docker tag "${CONTAINER_ID}" "${DOCKER_URL}" + + # Attempt to create Docker repository; it will fail if the repository already exists + echo "aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION}" + if aws ecr create-repository --repository-name ${CONTAINER_ID} --region ${DOCKER_CACHE_ECR_REGION} + then + # Repository was created. Now set expiration policy + echo "aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID}" \ + "--region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin" + echo "${ECR_LIFECYCLE_RULE}" | aws ecr put-lifecycle-policy --repository-name ${CONTAINER_ID} \ + --region ${DOCKER_CACHE_ECR_REGION} --lifecycle-policy-text file:///dev/stdin + fi + + echo "docker push --quiet ${DOCKER_URL}" + if ! time docker push --quiet "${DOCKER_URL}" + then + echo "ERROR: could not update Docker cache ${DOCKER_URL}" + exit 1 + fi +fi diff --git a/ops/matrix/ci_container.yml b/ops/matrix/ci_container.yml new file mode 100644 index 000000000000..e01431b463a5 --- /dev/null +++ b/ops/matrix/ci_container.yml @@ -0,0 +1,18 @@ +## List of CI containers with definitions and build arguments + +# Each container will be built using the definition from +# ops/docker/Dockerfile.CONTAINER_DEF + +xgb-ci.gpu_build_rockylinux8: + container_def: gpu_build_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.5.1" + NCCL_VERSION_ARG: "2.22.3-1" + RAPIDS_VERSION_ARG: "24.10" + +xgb-ci.gpu: + container_def: gpu + build_args: + CUDA_VERSION_ARG: "12.5.1" + NCCL_VERSION_ARG: "2.22.3-1" + RAPIDS_VERSION_ARG: "24.10" diff --git a/ops/matrix/docker_cache_ecr.yml b/ops/matrix/docker_cache_ecr.yml new file mode 100644 index 000000000000..e20f35fc8020 --- /dev/null +++ b/ops/matrix/docker_cache_ecr.yml @@ -0,0 +1,4 @@ +## Constants for AWS ECR (Elastic Container Registry), used for the Docker cache + +DOCKER_CACHE_ECR_ID: "492475357299" +DOCKER_CACHE_ECR_REGION: "us-west-2" diff --git a/ops/matrix/extract_build_args.jq b/ops/matrix/extract_build_args.jq new file mode 100644 index 000000000000..0453e2a7c081 --- /dev/null +++ b/ops/matrix/extract_build_args.jq @@ -0,0 +1,6 @@ +def compute_build_args($input; $container_id): + $input | + .[$container_id].build_args | + to_entries | + map("--build-arg " + .key + "=" + .value) | + join(" "); diff --git a/ops/matrix/extract_build_args.sh b/ops/matrix/extract_build_args.sh new file mode 100755 index 000000000000..ec4621bc42b2 --- /dev/null +++ b/ops/matrix/extract_build_args.sh @@ -0,0 +1,21 @@ +#!/bin/bash +## Extract container definition and build args from ops/matrix/ci_container.yml, +## given the container ID. 
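+##
+## Example (hypothetical session, using the xgb-ci.gpu entry from ci_container.yml):
+##   $ ops/matrix/extract_build_args.sh xgb-ci.gpu
+##   CONTAINER_DEF='gpu' BUILD_ARGS='--build-arg CUDA_VERSION_ARG=12.5.1 --build-arg NCCL_VERSION_ARG=2.22.3-1 --build-arg RAPIDS_VERSION_ARG=24.10'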
+ +if [ "$#" -ne 1 ]; then + echo "Usage: $0 [container_id]" + exit 1 +fi + +CONTAINER_ID="$1" +CONTAINER_DEF=$( + yq -o json ops/matrix/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def' +) +BUILD_ARGS=$( + yq -o json ops/matrix/ci_container.yml | + jq -r --arg container_id "${CONTAINER_ID}" \ + 'include "ops/matrix/extract_build_args"; + compute_build_args(.; $container_id)' +) +echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'" diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh index fac5c20f7146..9dbda19c3baa 100644 --- a/ops/packer/linux/bootstrap.sh +++ b/ops/packer/linux/bootstrap.sh @@ -45,3 +45,8 @@ sudo systemctl restart docker wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip unzip awscliv2.zip sudo ./aws/install + +## Install jq and yq +sudo apt update && sudo apt install jq +wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \ + tar xz && sudo mv ./yq_linux_amd64 /usr/bin/yq From 5373276a364037d32a7eb1f7df98d3473431bb29 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 25 Oct 2024 16:32:48 -0700 Subject: [PATCH 10/45] Migrate some tasks from BuildKite --- .github/runs-on.yml | 5 +- .github/workflows/main.yml | 219 +++++++++++++++--- .../Dockerfile.jvm_manylinux2014_aarch64 | 17 -- .../Dockerfile.jvm_manylinux2014_x86_64 | 17 -- ops/docker/Dockerfile.manylinux2014_aarch64 | 2 + ops/docker/Dockerfile.manylinux2014_x86_64 | 2 + .../docker}/conda_env/aarch64_test.yml | 0 .../docker}/conda_env/cpp_test.yml | 0 .../docker}/conda_env/jvm_tests.yml | 0 .../docker}/conda_env/linux_cpu_test.yml | 0 .../docker}/conda_env/linux_sycl_test.yml | 0 .../docker}/conda_env/macos_cpu_test.yml | 0 .../docker}/conda_env/python_lint.yml | 0 .../docker}/conda_env/sdist_test.yml | 0 .../docker}/conda_env/win64_cpu_test.yml | 0 .../docker}/conda_env/win64_test.yml | 0 ops/docker_run.py | 2 +- {tests/ci_build => ops}/format_wheel_meta.py | 3 +- ops/matrix/ci_container.yml | 20 ++ ops/matrix/extract_build_args.jq | 4 +- {tests/ci_build => ops}/rename_whl.py | 0 ops/stash_artifacts.sh | 66 ++++++ .../buildkite => ops/task}/build-cpu-arm64.sh | 44 ++-- ops/task/build-cpu.sh | 43 ++++ ops/task/build-cuda-with-rmm.sh | 65 ++++++ {tests/buildkite => ops/task}/build-cuda.sh | 64 ++--- .../task/build-jvm-manylinux2014.sh | 18 +- .../task}/build-manylinux2014.sh | 46 ++-- .../task/build-via-cmake.sh | 17 +- ops/task/enforce-ci.sh | 42 ++++ .../task/patches}/cpu_only_pypkg.patch | 0 .../task/patches}/manylinux2014_warning.patch | 0 .../task/patches}/remove_nccl_dep.patch | 0 ops/task/run-clang-tidy.sh | 11 + tests/buildkite/build-cpu.sh | 34 --- tests/buildkite/build-cuda-with-rmm.sh | 65 ------ .../build-jvm-linux-x86_64-manylinux2014.sh | 29 --- tests/buildkite/conftest.sh | 64 ----- tests/buildkite/run-clang-tidy.sh | 11 - 39 files changed, 556 insertions(+), 354 deletions(-) delete mode 100644 ops/docker/Dockerfile.jvm_manylinux2014_aarch64 delete mode 100644 ops/docker/Dockerfile.jvm_manylinux2014_x86_64 rename {tests/ci_build => ops/docker}/conda_env/aarch64_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/cpp_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/jvm_tests.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/linux_cpu_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/linux_sycl_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/macos_cpu_test.yml (100%) rename 
{tests/ci_build => ops/docker}/conda_env/python_lint.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/sdist_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/win64_cpu_test.yml (100%) rename {tests/ci_build => ops/docker}/conda_env/win64_test.yml (100%) rename {tests/ci_build => ops}/format_wheel_meta.py (96%) rename {tests/ci_build => ops}/rename_whl.py (100%) create mode 100755 ops/stash_artifacts.sh rename {tests/buildkite => ops/task}/build-cpu-arm64.sh (54%) create mode 100755 ops/task/build-cpu.sh create mode 100755 ops/task/build-cuda-with-rmm.sh rename {tests/buildkite => ops/task}/build-cuda.sh (50%) rename tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh => ops/task/build-jvm-manylinux2014.sh (69%) rename {tests/buildkite => ops/task}/build-manylinux2014.sh (60%) rename tests/ci_build/build_via_cmake.sh => ops/task/build-via-cmake.sh (70%) create mode 100755 ops/task/enforce-ci.sh rename {tests/buildkite => ops/task/patches}/cpu_only_pypkg.patch (100%) rename {tests/buildkite => ops/task/patches}/manylinux2014_warning.patch (100%) rename {tests/buildkite => ops/task/patches}/remove_nccl_dep.patch (100%) create mode 100755 ops/task/run-clang-tidy.sh delete mode 100755 tests/buildkite/build-cpu.sh delete mode 100755 tests/buildkite/build-cuda-with-rmm.sh delete mode 100644 tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh delete mode 100755 tests/buildkite/conftest.sh delete mode 100755 tests/buildkite/run-clang-tidy.sh diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 6ae28d1e9c6b..720ba76bb836 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -14,7 +14,7 @@ images: runners: linux-amd64-cpu: - cpu: 16 + cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: linux-amd64 linux-amd64-gpu: @@ -24,12 +24,13 @@ runners: family: ["g4dn.12xlarge"] image: linux-amd64 linux-arm64-cpu: + cpu: 32 family: ["c6g", "c7g"] image: ubuntu24-full-arm64 windows-gpu: family: ["g4dn.2xlarge"] image: windows-amd64 windows-cpu: - cpu: 16 + cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: windows-amd64 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 18f997e5c52e..84967f0684a2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,17 +9,31 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} + jobs: build-containers: name: Build CI containers runs-on: - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu + - runner=${{ matrix.runner }} strategy: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 - xgb-ci.gpu + - xgb-ci.cpu + - xgb-ci.clang_tidy + - xgb-ci.manylinux_2_28_x86_64 + - xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker @@ -30,16 +44,80 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: ${{ matrix.container_id }} - BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - build-gpu: - name: Build GPU + + clang-tidy: + name: Run clang-tidy needs: build-containers runs-on: - runs-on=${{ github.run_id }} - 
runner=linux-amd64-cpu steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.clang_tidy + - run: bash ops/task/run-clang-tidy.sh + + build-cpu: + name: Build CPU + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.cpu + - run: bash ops/task/build-cpu.sh + - name: Stash CLI executable + run: bash ops/stash_artifacts.sh ./xgboost + env: + COMMAND: upload + S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} + PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu + + build-cpu-arm64: + name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.aarch64 + - run: bash ops/task/build-cpu-arm64.sh + - name: Stash files + run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} + PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 + + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -48,31 +126,100 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 - BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - - name: Build gtest - run: | - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet - python3 ops/docker_run.py \ - --container-id xgb-ci.gpu_build_rockylinux8 \ - -- tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - -DGPU_COMPUTE_VER=75 - - name: Stash testxgboost - run: | - aws s3 cp ./build/testxgboost \ - s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/task/build-cuda.sh + - name: Stash files + run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl + env: + COMMAND: upload + S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} + PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda + + build-cuda-with-rmm: + name: Build CUDA with RMM + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - 
run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh
+ env:
+ CONTAINER_ID: xgb-ci.gpu_build_rockylinux8
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh
+ env:
+ CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
+ - run: bash ops/task/build-cuda-with-rmm.sh
+ - name: Stash files
+ run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl
+ env:
+ COMMAND: upload
+ S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+ PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm
+
+ build-jvm-manylinux2014:
+ name: Build libxgboost4j.so targeting glibc 2.17
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=${{ matrix.runner }}
+ strategy:
+ matrix:
+ include:
+ - arch: aarch64
+ runner: linux-arm64-cpu
+ - arch: x86_64
+ runner: linux-amd64-cpu
+
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh
+ env:
+ CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
+ - run: bash ops/task/build-jvm-manylinux2014.sh ${{ matrix.arch }}
+
+ build-manylinux2014:
+ name: Build manylinux2014_${{ matrix.arch }} wheel
+ needs: build-containers
+ runs-on:
+ - runs-on=${{ github.run_id }}
+ - runner=${{ matrix.runner }}
+ strategy:
+ matrix:
+ include:
+ - arch: aarch64
+ runner: linux-arm64-cpu
+ - arch: x86_64
+ runner: linux-amd64-cpu
+ steps:
+ # Restart Docker daemon so that it recognizes the ephemeral disks
+ - run: sudo systemctl restart docker
+ - uses: actions/checkout@v4
+ with:
+ submodules: "true"
+ - name: Fetch container from cache
+ run: bash ops/docker_build.sh
+ env:
+ CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
+ - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }}
+
 test-gpu: name: Test GPU
- needs: build-gpu
+ needs: build-cuda
 runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-gpu
@@ -82,18 +229,18 @@ jobs:
 - uses: actions/checkout@v4
 with: submodules: "true"
- - name: Unstash testxgboost
+ - name: Unstash gtest executable
 run: |
- aws s3 cp \
- s3://${{ env.RUNS_ON_S3_BUCKET_CACHE }}/cache/${{ github.repository }}/stash/${{ github.run_id }}/testxgboost \
- ./testxgboost
- chmod +x testxgboost
+ bash ops/stash_artifacts.sh ./testxgboost
+ chmod +x ./testxgboost
+ env:
+ COMMAND: download
+ S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+ PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda
 - name: Fetch container from cache
 run: bash ops/docker_build.sh
 env: CONTAINER_ID: xgb-ci.gpu
- BRANCH_NAME: ${{ github.event.pull_request.number || github.ref_name }}
- USE_DOCKER_CACHE: 1
 - name: Run gtest
 run: |
 nvidia-smi
diff --git a/ops/docker/Dockerfile.jvm_manylinux2014_aarch64 b/ops/docker/Dockerfile.jvm_manylinux2014_aarch64
deleted file mode 100644
index 52baff43bb6f..000000000000
--- a/ops/docker/Dockerfile.jvm_manylinux2014_aarch64
+++ /dev/null
@@ -1,17 +0,0 @@
-FROM quay.io/pypa/manylinux2014_aarch64
-
-RUN yum update -y && yum install -y java-1.8.0-openjdk-devel
-
-# Install lightweight sudo (not bound to TTY)
-ENV GOSU_VERSION=1.10
-RUN set -ex; \
- curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-arm64" && \
- chmod +x /usr/local/bin/gosu && \
- gosu nobody true
-
-# Default entry-point to use if running locally
-# It will preserve attributes of created
files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 b/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 deleted file mode 100644 index 578b85618776..000000000000 --- a/ops/docker/Dockerfile.jvm_manylinux2014_x86_64 +++ /dev/null @@ -1,17 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64 - -RUN yum update -y && yum install -y java-1.8.0-openjdk-devel ninja-build - -# Install lightweight sudo (not bound to TTY) -ENV GOSU_VERSION=1.10 -RUN set -ex; \ - curl -o /usr/local/bin/gosu -L "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/Dockerfile.manylinux2014_aarch64 b/ops/docker/Dockerfile.manylinux2014_aarch64 index 9627e15c64a0..52baff43bb6f 100644 --- a/ops/docker/Dockerfile.manylinux2014_aarch64 +++ b/ops/docker/Dockerfile.manylinux2014_aarch64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_aarch64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ diff --git a/ops/docker/Dockerfile.manylinux2014_x86_64 b/ops/docker/Dockerfile.manylinux2014_x86_64 index 11beb116ee43..fdfcbd277360 100644 --- a/ops/docker/Dockerfile.manylinux2014_x86_64 +++ b/ops/docker/Dockerfile.manylinux2014_x86_64 @@ -1,5 +1,7 @@ FROM quay.io/pypa/manylinux2014_x86_64 +RUN yum update -y && yum install -y java-1.8.0-openjdk-devel + # Install lightweight sudo (not bound to TTY) ENV GOSU_VERSION=1.10 RUN set -ex; \ diff --git a/tests/ci_build/conda_env/aarch64_test.yml b/ops/docker/conda_env/aarch64_test.yml similarity index 100% rename from tests/ci_build/conda_env/aarch64_test.yml rename to ops/docker/conda_env/aarch64_test.yml diff --git a/tests/ci_build/conda_env/cpp_test.yml b/ops/docker/conda_env/cpp_test.yml similarity index 100% rename from tests/ci_build/conda_env/cpp_test.yml rename to ops/docker/conda_env/cpp_test.yml diff --git a/tests/ci_build/conda_env/jvm_tests.yml b/ops/docker/conda_env/jvm_tests.yml similarity index 100% rename from tests/ci_build/conda_env/jvm_tests.yml rename to ops/docker/conda_env/jvm_tests.yml diff --git a/tests/ci_build/conda_env/linux_cpu_test.yml b/ops/docker/conda_env/linux_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/linux_cpu_test.yml rename to ops/docker/conda_env/linux_cpu_test.yml diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/ops/docker/conda_env/linux_sycl_test.yml similarity index 100% rename from tests/ci_build/conda_env/linux_sycl_test.yml rename to ops/docker/conda_env/linux_sycl_test.yml diff --git a/tests/ci_build/conda_env/macos_cpu_test.yml b/ops/docker/conda_env/macos_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/macos_cpu_test.yml rename to ops/docker/conda_env/macos_cpu_test.yml diff --git a/tests/ci_build/conda_env/python_lint.yml b/ops/docker/conda_env/python_lint.yml similarity index 100% rename from tests/ci_build/conda_env/python_lint.yml rename to ops/docker/conda_env/python_lint.yml diff --git a/tests/ci_build/conda_env/sdist_test.yml b/ops/docker/conda_env/sdist_test.yml similarity index 100% rename from tests/ci_build/conda_env/sdist_test.yml rename to ops/docker/conda_env/sdist_test.yml diff --git 
a/tests/ci_build/conda_env/win64_cpu_test.yml b/ops/docker/conda_env/win64_cpu_test.yml similarity index 100% rename from tests/ci_build/conda_env/win64_cpu_test.yml rename to ops/docker/conda_env/win64_cpu_test.yml diff --git a/tests/ci_build/conda_env/win64_test.yml b/ops/docker/conda_env/win64_test.yml similarity index 100% rename from tests/ci_build/conda_env/win64_test.yml rename to ops/docker/conda_env/win64_test.yml diff --git a/ops/docker_run.py b/ops/docker_run.py index 4fc6356a90a1..161c81b477b0 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -25,7 +25,7 @@ def parse_run_args(raw_run_args: str) -> list[str]: - return [x for x in raw_run_args.split(" ") if x] + return [x for x in raw_run_args.split() if x] def compute_container_id(container_name: str, build_args: list[dict[str, str]]) -> str: diff --git a/tests/ci_build/format_wheel_meta.py b/ops/format_wheel_meta.py similarity index 96% rename from tests/ci_build/format_wheel_meta.py rename to ops/format_wheel_meta.py index 9e7bad907687..570f7854cf62 100644 --- a/tests/ci_build/format_wheel_meta.py +++ b/ops/format_wheel_meta.py @@ -2,6 +2,7 @@ Script to generate meta.json to store metadata for a nightly build of XGBoost Python package. """ + import json import pathlib from argparse import ArgumentParser @@ -13,7 +14,7 @@ def main(args): raise ValueError(f"Wheel cannot be found at path {wheel_path}") if not wheel_path.is_file(): raise ValueError(f"Path {wheel_path} is not a valid file") - wheel_dir, wheel_name = wheel_path.parent, wheel_path.name + wheel_name = wheel_path.name meta_path = pathlib.Path(args.meta_path) if not meta_path.exists(): diff --git a/ops/matrix/ci_container.yml b/ops/matrix/ci_container.yml index e01431b463a5..d57d63d99e5c 100644 --- a/ops/matrix/ci_container.yml +++ b/ops/matrix/ci_container.yml @@ -16,3 +16,23 @@ xgb-ci.gpu: CUDA_VERSION_ARG: "12.5.1" NCCL_VERSION_ARG: "2.22.3-1" RAPIDS_VERSION_ARG: "24.10" + +xgb-ci.cpu: + container_def: cpu + +xgb-ci.clang_tidy: + container_def: clang_tidy + build_args: + CUDA_VERSION_ARG: "12.5.1" + +xgb-ci.aarch64: + container_def: aarch64 + +xgb-ci.manylinux_2_28_x86_64: + container_def: manylinux_2_28_x86_64 + +xgb-ci.manylinux2014_x86_64: + container_def: manylinux2014_x86_64 + +xgb-ci.manylinux2014_aarch64: + container_def: manylinux2014_aarch64 diff --git a/ops/matrix/extract_build_args.jq b/ops/matrix/extract_build_args.jq index 0453e2a7c081..682b62cb63cb 100644 --- a/ops/matrix/extract_build_args.jq +++ b/ops/matrix/extract_build_args.jq @@ -1,6 +1,8 @@ def compute_build_args($input; $container_id): $input | - .[$container_id].build_args | + .[$container_id] | + select(.build_args != null) | + .build_args | to_entries | map("--build-arg " + .key + "=" + .value) | join(" "); diff --git a/tests/ci_build/rename_whl.py b/ops/rename_whl.py similarity index 100% rename from tests/ci_build/rename_whl.py rename to ops/rename_whl.py diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh new file mode 100755 index 000000000000..f091af3cf50b --- /dev/null +++ b/ops/stash_artifacts.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +## Stash an artifact in an S3 bucket for later use +## +## Note. This script takes in all inputs via environment variables +## except the path to the artifact(s). + +set -euo pipefail + +ENV_VAR_DOC=$( +cat <<-EOF +Inputs + - COMMAND: Whether to upload or download the artifact. Either "upload" or + "download" + - S3_BUCKET: Name of the S3 bucket to store the artifact + - PREFIX: Where the artifact would be stored. 
The artifact will be stored + in s3://{S3_BUCKET}/{prefix}/. +EOF +) + +if [ "$#" -lt 1 ]; then + echo "Usage: $0 [artifact] [artifact ...]" + exit 1 +fi + +for arg in "COMMAND" "S3_BUCKET" "PREFIX" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}" + exit 1 + fi +done + +compute_s3_url() { # args: artifact + S3_URL="s3://${S3_BUCKET}/${PREFIX}/"$(basename "$1") +} + +aws_s3_cp() { # args: src, dest + set -x + aws s3 cp --no-progress "$1" "$2" + set +x + return 0 +} + +if [[ "$COMMAND" == "upload" ]] +then + echo "Uploading artifacts with prefix $PREFIX..." + for artifact in "$@" + do + compute_s3_url "${artifact}" + aws_s3_cp "${artifact}" "${S3_URL}" + done +elif [[ "$COMMAND" == "download" ]] +then + echo "Downloading artifacts with prefix $PREFIX..." + for artifact in "$@" + do + compute_s3_url "${artifact}" + aws_s3_cp "${S3_URL}" "${artifact}" + done +else + echo "Unrecognized command: $COMMAND" + exit 2 +fi + diff --git a/tests/buildkite/build-cpu-arm64.sh b/ops/task/build-cpu-arm64.sh similarity index 54% rename from tests/buildkite/build-cpu-arm64.sh rename to ops/task/build-cpu-arm64.sh index 8b3847ed58b9..4a8c96e0e941 100755 --- a/tests/buildkite/build-cpu-arm64.sh +++ b/ops/task/build-cpu-arm64.sh @@ -1,47 +1,55 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail WHEEL_TAG=manylinux_2_28_aarch64 echo "--- Build CPU code targeting ARM64" -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh aarch64" +source ops/task/enforce-ci.sh echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \ - -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOL=ON +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- ops/task/build-via-cmake.sh \ + --conda-env=aarch64_test \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOL=ON + echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c "cd build && ctest --extra-verbose" echo "--- Build binary wheel" -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/rename_whl.py \ --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/rename_whl.py \ --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ + # Make sure that libgomp.so is vendored in the wheel -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id xgb-ci.aarch64 \ + -- bash -c \ "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" echo "--- Upload Python wheel" -buildkite-agent artifact upload "python-package/dist/*.whl" if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress fi - -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost diff --git a/ops/task/build-cpu.sh b/ops/task/build-cpu.sh new file mode 100755 index 000000000000..7f8c69cd43bf --- /dev/null +++ b/ops/task/build-cpu.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -euox pipefail + +source ops/task/enforce-ci.sh + +echo "--- Build CPU code" + +# This step is not necessary, but here we include it, to ensure that +# DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use +# the configured header build/dmlc/build_config.h instead of +# include/dmlc/build_config_default.h. 
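+# Illustrative, assuming a GCC-style toolchain: without the flag, any
+# translation unit including dmlc/build_config.h falls back to the default
+# header, so removing it below turns the misconfiguration into a hard
+# compile error like "fatal error: dmlc/build_config_default.h: No such
+# file or directory".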
+rm -fv dmlc-core/include/dmlc/build_config_default.h + +# Sanitizer tests +echo "--- Run Google Test with sanitizer enabled" +# Work around https://github.com/google/sanitizers/issues/1614 +sudo sysctl vm.mmap_rnd_bits=28 +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- ops/task/build-via-cmake.sh \ + -DUSE_SANITIZER=ON \ + -DENABLED_SANITIZERS="address;leak;undefined" \ + -DCMAKE_BUILD_TYPE=Debug \ + -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + --run-args '-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer + -e ASAN_OPTIONS=symbolize=1 + -e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log + --cap-add SYS_PTRACE' \ + -- bash -c \ + "cd build && ./testxgboost --gtest_filter=-*DeathTest*" + +echo "--- Run Google Test" +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- ops/task/build-via-cmake.sh \ + -DCMAKE_PREFIX_PATH=/opt/grpc \ + -DPLUGIN_FEDERATED=ON +python3 ops/docker_run.py \ + --container-id xgb-ci.cpu \ + -- bash -c "cd build && ctest --extra-verbose" diff --git a/ops/task/build-cuda-with-rmm.sh b/ops/task/build-cuda-with-rmm.sh new file mode 100755 index 000000000000..901e66a8f649 --- /dev/null +++ b/ops/task/build-cuda-with-rmm.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -euo pipefail + +WHEEL_TAG=manylinux_2_28_x86_64 + +source ops/task/enforce-ci.sh + +echo "--- Build with CUDA with RMM" + +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then + arch_flag="-DGPU_COMPUTE_VER=75" +else + arch_flag="" +fi + +echo "--- Build libxgboost from the source" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/task/build-via-cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + +echo "--- Build binary wheel" +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" +python3 ops/rename_whl.py \ + --wheel-path python-package/dist/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" +python3 ops/docker_run.py \ + --container-id xgb-ci.$WHEEL_TAG \ + -- auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/rename_whl.py \ + --wheel-path wheelhouse/*.whl \ + --commit-hash ${GITHUB_SHA} \ + --platform-tag ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +python3 ops/docker_run.py \ + --container-id xgb-ci.$WHEEL_TAG \ + -- bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" + +echo "--- Upload Python wheel" +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ + --acl public-read --no-progress +fi diff --git a/tests/buildkite/build-cuda.sh b/ops/task/build-cuda.sh similarity index 50% rename from tests/buildkite/build-cuda.sh rename to ops/task/build-cuda.sh index 03d2cc8a6a24..c98c041d8187 100755 --- a/tests/buildkite/build-cuda.sh +++ b/ops/task/build-cuda.sh @@ -4,9 +4,9 @@ set -euo pipefail WHEEL_TAG=manylinux_2_28_x86_64 -source tests/buildkite/conftest.sh +source ops/task/enforce-ci.sh -echo "--- Build with CUDA ${CUDA_VERSION}" +echo "--- Build with CUDA" if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] then @@ -15,58 +15,62 @@ else arch_flag="" fi -command_wrapper="tests/ci_build/ci_build.sh gpu_build_rockylinux8 --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- ops/task/build-via-cmake.sh \ + -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ + -DUSE_CUDA=ON \ + -DUSE_OPENMP=ON \ + -DHIDE_CXX_SYMBOLS=ON \ + -DPLUGIN_FEDERATED=ON \ + -DUSE_NCCL=ON \ + -DUSE_NCCL_LIB_PATH=ON \ + -DNCCL_INCLUDE_DIR=/usr/include \ + -DUSE_DLOPEN_NCCL=ON \ + ${arch_flag} + echo "--- Build binary wheel" -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/rename_whl.py \ --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ +python3 ops/rename_whl.py \ --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} mv -v wheelhouse/*.whl python-package/dist/ # Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" +python3 ops/docker_run.py \ + --container-id xgb-ci.manylinux_2_28_x86_64 \ + -- bash -c "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress # Generate the meta info which includes xgboost version and the commit info - $command_wrapper python tests/ci_build/format_wheel_meta.py \ + python3 ops/docker_run.py \ + --container-id xgb-ci.gpu_build_rockylinux8 \ + -- python ops/format_wheel_meta.py \ --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} \ --meta-path python-package/dist/ aws s3 cp python-package/dist/meta.json s3://xgboost-nightly-builds/${BRANCH_NAME}/ \ --acl public-read --no-progress fi echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh b/ops/task/build-jvm-manylinux2014.sh similarity index 69% rename from tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh rename to ops/task/build-jvm-manylinux2014.sh index e7fec780b956..88bdb256821f 100644 --- a/tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh +++ b/ops/task/build-jvm-manylinux2014.sh @@ -2,24 +2,32 @@ set -euo pipefail -source tests/buildkite/conftest.sh +source ops/task/enforce-ci.sh -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_aarch64" +if [ $# -ne 1 ]; then + echo "Usage: $0 {x86_64,aarch64}" + exit 1 +fi + +arch=$1 + +image="xgb-ci.manylinux2014_${arch}" # Build XGBoost4J binary echo "--- Build libxgboost4j.so (targeting glibc 2.17)" set -x mkdir build -$command_wrapper bash -c \ +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ "cd build && cmake .. 
-DJVM_BINDINGS=ON -DUSE_OPENMP=ON && make -j$(nproc)" ldd lib/libxgboost4j.so objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu echo "--- Upload libxgboost4j.so" pushd lib -libname=libxgboost4j_linux_arm64_${BUILDKITE_COMMIT}.so +libname=libxgboost4j_linux_${arch}_${GITHUB_SHA}.so mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp ${libname} \ diff --git a/tests/buildkite/build-manylinux2014.sh b/ops/task/build-manylinux2014.sh similarity index 60% rename from tests/buildkite/build-manylinux2014.sh rename to ops/task/build-manylinux2014.sh index 426d32b5c361..7b71b51a0587 100755 --- a/tests/buildkite/build-manylinux2014.sh +++ b/ops/task/build-manylinux2014.sh @@ -2,6 +2,8 @@ set -euo pipefail +source ops/task/enforce-ci.sh + if [ $# -ne 1 ]; then echo "Usage: $0 {x86_64,aarch64}" exit 1 @@ -9,24 +11,28 @@ fi arch=$1 -source tests/buildkite/conftest.sh - WHEEL_TAG="manylinux2014_${arch}" -command_wrapper="tests/ci_build/ci_build.sh ${WHEEL_TAG}" +image="xgb-ci.$WHEEL_TAG" + python_bin="/opt/python/cp310-cp310/bin/python" echo "--- Build binary wheel for ${WHEEL_TAG}" # Patch to add warning about manylinux2014 variant -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/manylinux2014_warning.patch -$command_wrapper bash -c \ +patch -p0 < ops/task/patches/remove_nccl_dep.patch +patch -p0 < ops/task/patches/manylinux2014_warning.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ "cd python-package && ${python_bin} -m pip wheel --no-deps -v . --wheel-dir dist/" -git checkout python-package/pyproject.toml python-package/xgboost/core.py # discard the patch +git checkout python-package/pyproject.toml python-package/xgboost/core.py + # discard the patch -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl +python3 ops/rename_whl.py \ --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} rm -rf python-package/dist/ mkdir python-package/dist/ @@ -34,25 +40,25 @@ mv -v wheelhouse/*.whl python-package/dist/ echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" # Patch to rename pkg to xgboost-cpu -patch -p0 < tests/buildkite/remove_nccl_dep.patch -patch -p0 < tests/buildkite/cpu_only_pypkg.patch -$command_wrapper bash -c \ +patch -p0 < ops/task/patches/remove_nccl_dep.patch +patch -p0 < ops/task/patches/cpu_only_pypkg.patch +python3 ops/docker_run.py \ + --container-id ${image} \ + -- bash -c \ "cd python-package && ${python_bin} -m pip wheel --no-deps -v . 
--wheel-dir dist/" git checkout python-package/pyproject.toml # discard the patch -$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -$command_wrapper ${python_bin} tests/ci_build/rename_whl.py \ +python3 ops/docker_run.py \ + --container-id ${image} \ + -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl +python3 ops/rename_whl.py \ --wheel-path wheelhouse/xgboost_cpu-*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ + --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} rm -v python-package/dist/xgboost_cpu-*.whl mv -v wheelhouse/xgboost_cpu-*.whl python-package/dist/ echo "--- Upload Python wheel" -for wheel in python-package/dist/*.whl -do - buildkite-agent artifact upload "${wheel}" -done if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then for wheel in python-package/dist/*.whl diff --git a/tests/ci_build/build_via_cmake.sh b/ops/task/build-via-cmake.sh similarity index 70% rename from tests/ci_build/build_via_cmake.sh rename to ops/task/build-via-cmake.sh index 3238c41e1bcb..857ebbbec0c2 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/ops/task/build-via-cmake.sh @@ -1,5 +1,6 @@ -#!/usr/bin/env bash -set -e +#!/bin/bash + +set -euo pipefail if [[ "$1" == --conda-env=* ]] then @@ -26,7 +27,17 @@ mkdir build cd build # Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until # https://github.com/dmlc/xgboost/issues/10400 is fixed -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON +set -x +cmake .. ${cmake_args} \ + -DGOOGLE_TEST=ON \ + -DUSE_DMLC_GTEST=ON \ + -DENABLE_ALL_WARNINGS=ON \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF \ + -GNinja \ + ${cmake_prefix_flag} \ + -DHIDE_CXX_SYMBOLS=ON \ + -DBUILD_DEPRECATED_CLI=ON ninja clean time ninja -v cd .. +set +x diff --git a/ops/task/enforce-ci.sh b/ops/task/enforce-ci.sh new file mode 100755 index 000000000000..1e50dc045cb1 --- /dev/null +++ b/ops/task/enforce-ci.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +## Ensure that a script is running inside the CI. +## Usage: source ops/task/enforce-ci.sh + +set -euo pipefail + +set -x + +if [[ -z ${GITHUB_ACTION:-} ]] +then + echo "$0 is not meant to run locally; it should run inside GitHub Actions." + echo "Please inspect the content of $0 and locate the desired command manually." + exit 1 +fi + +if [[ -z ${BRANCH_NAME:-} ]] +then + echo "Make sure to define environment variable BRANCH_NAME." 
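+  # Illustrative: these workflows export BRANCH_NAME from the workflow env,
+  # e.g. "PR-1234" for pull requests or the ref name otherwise (see the
+  # env: block in .github/workflows/main.yml).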
+ exit 2 +fi + +if [[ -n ${GITHUB_BASE_REF:-} ]] +then + is_pull_request=1 +else + is_pull_request=0 +fi + +if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] +then + is_release_branch=1 +else + is_release_branch=0 +fi + +if [[ -n ${DISABLE_RELEASE:-} ]] +then + is_release_branch=0 +fi + +set +x diff --git a/tests/buildkite/cpu_only_pypkg.patch b/ops/task/patches/cpu_only_pypkg.patch similarity index 100% rename from tests/buildkite/cpu_only_pypkg.patch rename to ops/task/patches/cpu_only_pypkg.patch diff --git a/tests/buildkite/manylinux2014_warning.patch b/ops/task/patches/manylinux2014_warning.patch similarity index 100% rename from tests/buildkite/manylinux2014_warning.patch rename to ops/task/patches/manylinux2014_warning.patch diff --git a/tests/buildkite/remove_nccl_dep.patch b/ops/task/patches/remove_nccl_dep.patch similarity index 100% rename from tests/buildkite/remove_nccl_dep.patch rename to ops/task/patches/remove_nccl_dep.patch diff --git a/ops/task/run-clang-tidy.sh b/ops/task/run-clang-tidy.sh new file mode 100755 index 000000000000..da12a8808a2a --- /dev/null +++ b/ops/task/run-clang-tidy.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -euox pipefail + +echo "--- Run clang-tidy" + +source ops/task/enforce-ci.sh + +python3 ops/docker_run.py \ + --container-id xgb-ci.clang_tidy \ + -- python3 tests/ci_build/tidy.py --cuda-archs 75 diff --git a/tests/buildkite/build-cpu.sh b/tests/buildkite/build-cpu.sh deleted file mode 100755 index 11679d644de1..000000000000 --- a/tests/buildkite/build-cpu.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Build CPU code" - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh cpu" - -$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h - # This step is not necessary, but here we include it, to ensure that - # DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use - # the configured header build/dmlc/build_config.h instead of - # include/dmlc/build_config_default.h. 
-echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ - -DPLUGIN_FEDERATED=ON -echo "--- Run Google Test" -$command_wrapper bash -c "cd build && ctest --extra-verbose" -echo "--- Stash XGBoost CLI executable" -buildkite-agent artifact upload ./xgboost - -# Sanitizer test -echo "--- Run Google Test with sanitizer enabled" -$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \ - -DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \ - -DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/ -CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "` - `"-e ASAN_OPTIONS=symbolize=1 "` - `"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "` - `"--cap-add SYS_PTRACE" \ - $command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "` - `"--extra-verbose" diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh deleted file mode 100755 index f1d3f1b1c91a..000000000000 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -WHEEL_TAG=manylinux_2_28_x86_64 - -source tests/buildkite/conftest.sh - -echo "--- Build with CUDA ${CUDA_VERSION} with RMM" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -command_wrapper="tests/ci_build/ci_build.sh gpu_build_rockylinux8 --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - -echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh \ - -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ - -DUSE_CUDA=ON \ - -DUSE_OPENMP=ON \ - -DHIDE_CXX_SYMBOLS=ON \ - -DPLUGIN_FEDERATED=ON \ - -DPLUGIN_RMM=ON \ - -DUSE_NCCL=ON \ - -DUSE_NCCL_LIB_PATH=ON \ - -DNCCL_INCLUDE_DIR=/usr/include \ - -DUSE_DLOPEN_NCCL=ON \ - ${arch_flag} -echo "--- Build binary wheel" -$command_wrapper bash -c \ - "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path python-package/dist/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} - -echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard" -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 auditwheel repair \ - --plat ${WHEEL_TAG} python-package/dist/*.whl -$command_wrapper python tests/ci_build/rename_whl.py \ - --wheel-path wheelhouse/*.whl \ - --commit-hash ${BUILDKITE_COMMIT} \ - --platform-tag ${WHEEL_TAG} -mv -v wheelhouse/*.whl python-package/dist/ -# Make sure that libgomp.so is vendored in the wheel -tests/ci_build/ci_build.sh manylinux_2_28_x86_64 bash -c \ - "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" - -echo "--- Upload Python wheel" -buildkite-agent artifact upload python-package/dist/*.whl -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ - --acl public-read --no-progress -fi - -echo "-- Stash C++ test executable (testxgboost)" -buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh b/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh deleted file mode 100644 index 46a819a016d3..000000000000 --- a/tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -command_wrapper="tests/ci_build/ci_build.sh jvm_manylinux2014_x86_64" - -# Build XGBoost4J binary -echo "--- Build libxgboost4j.so (targeting glibc 2.17)" -set -x -mkdir build -$command_wrapper bash -c \ - "cd build && cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON && ninja -v" -ldd lib/libxgboost4j.so -objdump -T lib/libxgboost4j.so | grep GLIBC_ | sed 's/.*GLIBC_\([.0-9]*\).*/\1/g' | sort -Vu - -echo "--- Upload libxgboost4j.so" -pushd lib -libname=libxgboost4j_linux_x86_64_${BUILDKITE_COMMIT}.so -mv -v libxgboost4j.so ${libname} -buildkite-agent artifact upload ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - aws s3 cp ${libname} \ - s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ - --acl public-read --no-progress -fi -popd diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh deleted file mode 100755 index 185b4a356d7e..000000000000 --- a/tests/buildkite/conftest.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -function get_aws_secret { - if [[ $# -ne 1 ]] - then - echo "Usage: get_aws_secret [Name of secret]" - return 1 - fi - aws secretsmanager get-secret-value --secret-id $1 --output text --region us-west-2 --query SecretString -} - -function set_buildkite_env_vars_in_container { - # Pass all Buildkite-specific env vars to Docker containers. - # This is to be used with tests/ci_build/ci_build.sh - export CI_DOCKER_EXTRA_PARAMS_INIT="${CI_DOCKER_EXTRA_PARAMS_INIT:-} "` - `"--env BUILDKITE_ANALYTICS_TOKEN --env BUILDKITE_BUILD_ID --env BUILDKITE_BUILD_NUMBER "` - `"--env BUILDKITE_JOB_ID --env BUILDKITE_BRANCH --env BUILDKITE_COMMIT "` - `"--env BUILDKITE_MESSAGE --env BUILDKITE_BUILD_URL" -} - -set -x - -CUDA_VERSION=12.4.1 -NCCL_VERSION=2.22.3-1 -RAPIDS_VERSION=24.08 -DEV_RAPIDS_VERSION=24.10 -SPARK_VERSION=3.5.1 -JDK_VERSION=8 -R_VERSION=4.3.2 - -if [[ -z ${BUILDKITE:-} ]] -then - echo "$0 is not meant to run locally; it should run inside BuildKite." 
- echo "Please inspect the content of $0 and locate the desired command manually." - exit 1 -fi - -if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]] -then - is_pull_request=1 - BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST -else - is_pull_request=0 - BRANCH_NAME=$BUILDKITE_BRANCH -fi -export BRANCH_NAME=${BRANCH_NAME//\//-} - -if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] -then - is_release_branch=1 - enforce_daily_budget=0 -else - is_release_branch=0 - enforce_daily_budget=1 -fi - -if [[ -n ${DISABLE_RELEASE:-} ]] -then - is_release_branch=0 -fi - -set +x diff --git a/tests/buildkite/run-clang-tidy.sh b/tests/buildkite/run-clang-tidy.sh deleted file mode 100755 index 95ff010c20f1..000000000000 --- a/tests/buildkite/run-clang-tidy.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Run clang-tidy" - -source tests/buildkite/conftest.sh - -tests/ci_build/ci_build.sh clang_tidy \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - python3 tests/ci_build/tidy.py --cuda-archs 75 From fa96af191afd9bfbae855ae8c3f88771cafdd342 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 28 Oct 2024 23:24:38 -0700 Subject: [PATCH 11/45] Move more pipelines; refine stash_artifacts --- .github/workflows/main.yml | 105 ++++++++--------- .github/workflows/windows.yml | 61 ++++++++++ ops/docker/conda_env/win64_cpu_test.yml | 22 ---- ops/stash_artifacts.py | 144 ++++++++++++++++++++++++ ops/stash_artifacts.sh | 66 ----------- ops/task/build-win64-gpu.ps1 | 44 ++++++++ ops/task/enforce-ci.ps1 | 28 +++++ ops/task/enforce-ci.sh | 2 + ops/task/test-cpp-gpu.sh | 33 ++++++ ops/task/test-win64-gpu.ps1 | 28 +++++ tests/buildkite/build-containers.sh | 47 -------- tests/buildkite/build-win64-gpu.ps1 | 55 --------- tests/buildkite/conftest.ps1 | 13 --- tests/buildkite/test-cpp-gpu.sh | 24 ---- tests/buildkite/test-win64-gpu.ps1 | 39 ------- 15 files changed, 385 insertions(+), 326 deletions(-) create mode 100644 .github/workflows/windows.yml delete mode 100644 ops/docker/conda_env/win64_cpu_test.yml create mode 100644 ops/stash_artifacts.py delete mode 100755 ops/stash_artifacts.sh create mode 100644 ops/task/build-win64-gpu.ps1 create mode 100644 ops/task/enforce-ci.ps1 create mode 100755 ops/task/test-cpp-gpu.sh create mode 100644 ops/task/test-win64-gpu.ps1 delete mode 100755 tests/buildkite/build-containers.sh delete mode 100644 tests/buildkite/build-win64-gpu.ps1 delete mode 100644 tests/buildkite/conftest.ps1 delete mode 100755 tests/buildkite/test-cpp-gpu.sh delete mode 100644 tests/buildkite/test-win64-gpu.ps1 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 84967f0684a2..d1f1d2e3f0b6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,6 +34,8 @@ jobs: include: - container_id: xgb-ci.manylinux2014_aarch64 runner: linux-arm64-cpu + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker @@ -81,14 +83,16 @@ jobs: CONTAINER_ID: xgb-ci.cpu - run: bash ops/task/build-cpu.sh - name: Stash CLI executable - run: bash ops/stash_artifacts.sh ./xgboost - env: - COMMAND: upload - S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }} - PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu + run: | + python3 ops/stash_artifacts.py \ + --command upload \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu \ + -- ./xgboost 
build-cpu-arm64: name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel
+ needs: build-containers
 runs-on: - runs-on=${{ github.run_id }} - runner=linux-arm64-cpu
@@ -104,11 +108,12 @@
 CONTAINER_ID: xgb-ci.aarch64
 - run: bash ops/task/build-cpu-arm64.sh
 - name: Stash files
- run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl
- env:
- COMMAND: upload
- S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
- PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64
+ run: |
+ python3 ops/stash_artifacts.py \
+ --command upload \
+ --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+ --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 \
+ -- ./xgboost python-package/dist/*.whl
 build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel
@@ -132,11 +137,12 @@
 CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
 - run: bash ops/task/build-cuda.sh
 - name: Stash files
- run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl
- env:
- COMMAND: upload
- S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
- PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda
+ run: |
+ python3 ops/stash_artifacts.py \
+ --command upload \
+ --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+ --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \
+ -- build/testxgboost python-package/dist/*.whl
 build-cuda-with-rmm: name: Build CUDA with RMM
@@ -160,11 +166,12 @@
 CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
 - run: bash ops/task/build-cuda-with-rmm.sh
 - name: Stash files
- run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl
- env:
- COMMAND: upload
- S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
- PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm
+ run: |
+ python3 ops/stash_artifacts.py \
+ --command upload \
+ --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+ --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \
+ -- build/testxgboost
 build-jvm-manylinux2014: name: Build libxgboost4j.so targeting glibc 2.17
@@ -217,8 +224,8 @@
 CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
 - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }}
- test-gpu:
- name: Test GPU
+ test-cpp-gpu:
+ name: Run Google Tests
 needs: build-cuda
 runs-on: - runs-on=${{ github.run_id }}
@@ -229,47 +236,25 @@
 - uses: actions/checkout@v4
 with: submodules: "true"
- - name: Unstash gtest executable
- run: |
- bash ops/stash_artifacts.sh ./testxgboost
- chmod +x ./testxgboost
- env:
- COMMAND: download
- S3_BUCKET: ${{ env.RUNS_ON_S3_BUCKET_CACHE }}
- PREFIX: ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda
 - name: Fetch container from cache
 run: bash ops/docker_build.sh
 env: CONTAINER_ID: xgb-ci.gpu
- - name: Run gtest
+ - name: Unstash gtest
 run: |
- nvidia-smi
- python3 ops/docker_run.py \
- --container-id xgb-ci.gpu \
- --use-gpus \
- --run-args='--privileged' \
- -- ./testxgboost
-
- build-test-gpu-win64:
- name: Build GPU (Windows)
- runs-on:
- - runs-on=${{ github.run_id }}
- - runner=windows-gpu
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: "true"
- - name: Build and run gtest
- shell: powershell
+ python3 ops/stash_artifacts.py \
+ --command download \
+ --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
+ --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \
+ -- build/testxgboost
+ chmod +x build/testxgboost
+ - run: bash ops/task/test-cpp-gpu.sh build-cuda
+ - name: Unstash gtest
 run: |
- nvidia-smi
- nvcc --version
- git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
- mkdir build
- cd build
- cmake ..
-G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON -DCMAKE_CUDA_ARCHITECTURES=75 -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - cmake --build . --config Release -- /m /nodeReuse:false "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - & .\testxgboost.exe - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + python3 ops/stash_artifacts.py \ + --command download \ + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \ + -- build/testxgboost + chmod +x build/testxgboost + - run: bash ops/task/test-cpp-gpu.sh build-cuda-with-rmm diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 000000000000..6edc14711258 --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,61 @@ +name: Nextgen XGBoost CI Windows + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: powershell + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} + # TODO(hcho3): Remove + RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-m3ikdpczirva + +jobs: + build-win64-gpu: + name: Build XGBoost for Windows with CUDA + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-cpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: powershell ops/task/build-win64-gpu.ps1 + - name: Stash files + run: | + conda activate + python ops/stash_artifacts.py ` + --command upload ` + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} ` + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu ` + -- build/testxgboost.exe xgboost.exe ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) + test-win64-gpu: + name: Test XGBoost on Windows + needs: build-win64-gpu + runs-on: + - runs-on=${{ github.run_id }} + - runner=windows-gpu + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Unstash files + run: | + conda activate + python ops/stash_artifacts.py ` + --command download ` + --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} ` + --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu ` + -- build/testxgboost.exe xgboost.exe python-package/dist/*.whl + - run: powershell ops/task/test-win64-gpu.ps1 diff --git a/ops/docker/conda_env/win64_cpu_test.yml b/ops/docker/conda_env/win64_cpu_test.yml deleted file mode 100644 index d69dd2a6ef85..000000000000 --- a/ops/docker/conda_env/win64_cpu_test.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: win64_env -channels: -- conda-forge -dependencies: -- python=3.10 -- wheel -- numpy -- scipy -- scikit-learn>=1.4.1 -- pandas -- matplotlib -- dask -- distributed -- python-graphviz -- pytest -- jsonschema -- hypothesis -- python-graphviz -- pip -- py-ubjson -- loky -- pyarrow diff --git a/ops/stash_artifacts.py b/ops/stash_artifacts.py new file mode 100644 index 000000000000..405804b499c6 --- /dev/null +++ b/ops/stash_artifacts.py @@ -0,0 +1,144 @@ +""" +Stash an artifact in an S3 bucket for later use + +Note. 
This script takes in all inputs via command-line
+ arguments.
+"""
+
+import argparse
+import os
+import subprocess
+from pathlib import Path
+from urllib.parse import SplitResult, urlsplit, urlunsplit
+
+
+def resolve(x: Path) -> Path:
+ return x.expanduser().resolve()
+
+
+def path_equals(a: Path, b: Path) -> bool:
+ return resolve(a) == resolve(b)
+
+
+def compute_s3_url(s3_bucket: str, prefix: str, artifact: Path) -> str:
+ filename = artifact.name
+ relative_path = resolve(artifact).relative_to(Path.cwd())
+ if resolve(artifact.parent) == resolve(Path.cwd()):
+ full_prefix = prefix
+ else:
+ full_prefix = f"{prefix}/{str(relative_path.parent)}"
+ return f"s3://{s3_bucket}/{full_prefix}/{filename}"
+
+
+def aws_s3_upload(src: Path, dest: str) -> None:
+ cli_args = ["aws", "s3", "cp", "--no-progress", str(src), dest]
+ print(" ".join(cli_args))
+ subprocess.run(
+ cli_args,
+ check=True,
+ encoding="utf-8",
+ )
+
+
+def aws_s3_download(src: str, dest: Path) -> None:
+ cli_args = ["aws", "s3", "cp", "--no-progress", src, str(dest)]
+ print(" ".join(cli_args))
+ subprocess.run(
+ cli_args,
+ check=True,
+ encoding="utf-8",
+ )
+
+
+def aws_s3_download_with_wildcard(src: str, dest: Path) -> None:
+ parsed_src = urlsplit(src)
+ src_dir = urlunsplit(
+ SplitResult(
+ scheme="s3",
+ netloc=parsed_src.netloc,
+ path=os.path.dirname(parsed_src.path),
+ query="",
+ fragment="",
+ )
+ )
+ dest_dir = dest.parent
+ src_glob = os.path.basename(parsed_src.path)
+ cli_args = [
+ "aws",
+ "s3",
+ "cp",
+ "--recursive",
+ "--no-progress",
+ "--exclude",
+ "*",
+ "--include",
+ src_glob,
+ src_dir,
+ str(dest_dir),
+ ]
+ print(" ".join(cli_args))
+ subprocess.run(
+ cli_args,
+ check=True,
+ encoding="utf-8",
+ )
+
+
+def upload(args):
+ print(f"Uploading artifacts with prefix {args.prefix}...")
+ for artifact in args.artifacts:
+ artifact_path = Path(artifact)
+ s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path)
+ aws_s3_upload(artifact_path, s3_url)
+
+
+def download(args):
+ print(f"Downloading artifacts with prefix {args.prefix}...")
+ for artifact in args.artifacts:
+ artifact_path = Path(artifact)
+ print(f"mkdir -p {str(artifact_path.parent)}")
+ artifact_path.parent.mkdir(parents=True, exist_ok=True)
+ s3_url = compute_s3_url(args.s3_bucket, args.prefix, artifact_path)
+ if "*" in artifact:
+ aws_s3_download_with_wildcard(s3_url, artifact_path)
+ else:
+ aws_s3_download(s3_url, artifact_path)
+
+
+if __name__ == "__main__":
+ # Ensure that the current working directory is the project root
+ if not (Path.cwd() / "ops").is_dir() or not path_equals(
+ Path(__file__).parent, Path.cwd() / "ops"
+ ):
+ x = Path(__file__).name
+ raise RuntimeError(f"Script {x} must be run at the project's root directory")
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--command",
+ type=str,
+ choices=["upload", "download"],
+ required=True,
+ help="Whether to upload or download the artifact (upload/download)",
+ )
+ parser.add_argument(
+ "--s3-bucket",
+ type=str,
+ required=True,
+ help="Name of the S3 bucket to store the artifact",
+ )
+ parser.add_argument(
+ "--prefix",
+ type=str,
+ required=True,
+ help=(
+ "Where the artifact would be stored. The artifact will be stored in "
+ "s3://[s3-bucket]/[prefix]."
+ ), + ) + parser.add_argument("artifacts", type=str, nargs="+", metavar="artifact") + parsed_args = parser.parse_args() + if parsed_args.command == "upload": + upload(parsed_args) + elif parsed_args.command == "download": + download(parsed_args) diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh deleted file mode 100755 index f091af3cf50b..000000000000 --- a/ops/stash_artifacts.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -## Stash an artifact in an S3 bucket for later use -## -## Note. This script takes in all inputs via environment variables -## except the path to the artifact(s). - -set -euo pipefail - -ENV_VAR_DOC=$( -cat <<-EOF -Inputs - - COMMAND: Whether to upload or download the artifact. Either "upload" or - "download" - - S3_BUCKET: Name of the S3 bucket to store the artifact - - PREFIX: Where the artifact would be stored. The artifact will be stored - in s3://{S3_BUCKET}/{prefix}/. -EOF -) - -if [ "$#" -lt 1 ]; then - echo "Usage: $0 [artifact] [artifact ...]" - exit 1 -fi - -for arg in "COMMAND" "S3_BUCKET" "PREFIX" -do - if [[ -z "${!arg:-}" ]] - then - echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}" - exit 1 - fi -done - -compute_s3_url() { # args: artifact - S3_URL="s3://${S3_BUCKET}/${PREFIX}/"$(basename "$1") -} - -aws_s3_cp() { # args: src, dest - set -x - aws s3 cp --no-progress "$1" "$2" - set +x - return 0 -} - -if [[ "$COMMAND" == "upload" ]] -then - echo "Uploading artifacts with prefix $PREFIX..." - for artifact in "$@" - do - compute_s3_url "${artifact}" - aws_s3_cp "${artifact}" "${S3_URL}" - done -elif [[ "$COMMAND" == "download" ]] -then - echo "Downloading artifacts with prefix $PREFIX..." - for artifact in "$@" - do - compute_s3_url "${artifact}" - aws_s3_cp "${S3_URL}" "${artifact}" - done -else - echo "Unrecognized command: $COMMAND" - exit 2 -fi - diff --git a/ops/task/build-win64-gpu.ps1 b/ops/task/build-win64-gpu.ps1 new file mode 100644 index 000000000000..0b49d143dd5b --- /dev/null +++ b/ops/task/build-win64-gpu.ps1 @@ -0,0 +1,44 @@ +$ErrorActionPreference = "Stop" + +. ops/task/enforce-ci.ps1 + +Write-Host "--- Build libxgboost on Windows with CUDA" + +nvcc --version +if ( $is_release_branch -eq 0 ) { + $arch_flag = "-DGPU_COMPUTE_VER=75" +} else { + $arch_flag = "" +} + +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet +mkdir build +cd build +cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` + -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ` + -DCMAKE_PREFIX_PATH="$(Get-Location)/../cccl" ${arch_flag} +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +cmake --build . --config Release -- /m /nodeReuse:false ` + "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Build binary wheel" +cd ../python-package +conda activate +pip install --user -v "pip>=23" +pip --version +pip wheel --no-deps -v . --wheel-dir dist/ +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +python ../ops/rename_whl.py ` + --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` + --commit-hash $Env:GITHUB_SHA ` + --platform-tag win_amd64 +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Upload Python wheel" +cd .. 
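+# Illustrative note: $is_release_branch is set by ops/task/enforce-ci.ps1,
+# dot-sourced at the top of this script; it is 1 only when BRANCH_NAME is
+# "master" or matches "release_.+", so pull-request builds skip this upload.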
+if ( $is_release_branch -eq 1 ) { + aws s3 cp (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) ` + s3://xgboost-nightly-builds/$Env:BRANCH_NAME/ --acl public-read --no-progress + if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +} diff --git a/ops/task/enforce-ci.ps1 b/ops/task/enforce-ci.ps1 new file mode 100644 index 000000000000..9183764b9a13 --- /dev/null +++ b/ops/task/enforce-ci.ps1 @@ -0,0 +1,28 @@ +## Ensure that a script is running inside the CI. +## Usage: . ops/task/enforce-ci.ps1 + +if ( -Not $Env:GITHUB_ACTION ) { + $script_name = (Split-Path -Path $PSCommandPath -Leaf) + Write-Host "$script_name is not meant to run locally; it should run inside GitHub Actions." + Write-Host "Please inspect the content of $script_name and locate the desired command manually." + exit 1 +} + +if ( -Not $Env:BRANCH_NAME ) { + Write-Host "Make sure to define environment variable BRANCH_NAME." + exit 2 +} + +if ( $Env:GITHUB_BASE_REF ) { + $is_pull_request = 1 +} else { + $is_pull_request = 0 +} + +if ( ($Env:BRANCH_NAME -eq "master") -or ($Env:BRANCH_NAME -match "release_.+") ) { + $is_release_branch = 1 + $enforce_daily_budget = 0 +} else { + $is_release_branch = 0 + $enforce_daily_budget = 1 +} diff --git a/ops/task/enforce-ci.sh b/ops/task/enforce-ci.sh index 1e50dc045cb1..dfed11914c9a 100755 --- a/ops/task/enforce-ci.sh +++ b/ops/task/enforce-ci.sh @@ -30,8 +30,10 @@ fi if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* || $BRANCH_NAME == "federated-secure" ]] then is_release_branch=1 + enforce_daily_budget=0 else is_release_branch=0 + enforce_daily_budget=1 fi if [[ -n ${DISABLE_RELEASE:-} ]] diff --git a/ops/task/test-cpp-gpu.sh b/ops/task/test-cpp-gpu.sh new file mode 100755 index 000000000000..57090551ecad --- /dev/null +++ b/ops/task/test-cpp-gpu.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -euo pipefail + +source ops/task/enforce-ci.sh + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {build-cuda,build-cuda-with-rmm}" + exit 1 +fi +arg=$1 + +case "${arg}" in + build-cuda) + echo "--- Run Google Tests with CUDA, using a GPU" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ + -- build/testxgboost + ;; + + build-cuda-with-rmm) + echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ + -- build/testxgboost --use-rmm-pool + ;; + + *) + echo "Unrecognized arg: ${arg}" + exit 2 + ;; +esac diff --git a/ops/task/test-win64-gpu.ps1 b/ops/task/test-win64-gpu.ps1 new file mode 100644 index 000000000000..21d8f6e7b533 --- /dev/null +++ b/ops/task/test-win64-gpu.ps1 @@ -0,0 +1,28 @@ +$ErrorActionPreference = "Stop" + +. 
ops/task/enforce-ci.ps1 + +Write-Host "--- Test XGBoost on Windows with CUDA" + +nvcc --version + +Write-Host "--- Run Google Tests" +build/testxgboost.exe +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Set up Python env" +conda activate +$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) +mamba env create -n ${env_name} --file=ops/docker/conda_env/win64_test.yml +conda activate ${env_name} +python -m pip install ` + (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } + +Write-Host "--- Run Python tests" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } +Write-Host "--- Run Python tests with GPU" +python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` + tests/python-gpu +if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh deleted file mode 100755 index aa8f572483a3..000000000000 --- a/tests/buildkite/build-containers.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -set -euo pipefail -set -x - -if [ "$#" -lt 1 ] -then - echo "Usage: $0 [container to build]" - exit 1 -fi -container=$1 - -source tests/buildkite/conftest.sh - -echo "--- Build container ${container}" - -BUILD_ARGS="" - -case "${container}" in - cpu) - ;; - - gpu|gpu_build_rockylinux8) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - ;; - - gpu_dev_ver) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$DEV_RAPIDS_VERSION" - ;; - - jvm_gpu_build) - BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - ;; - - *) - echo "Unrecognized container ID: ${container}" - exit 2 - ;; -esac - -# Run a no-op command. This will simply build the container and push it to the private registry -tests/ci_build/ci_build.sh ${container} ${BUILD_ARGS} bash diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1 deleted file mode 100644 index 9114d3237751..000000000000 --- a/tests/buildkite/build-win64-gpu.ps1 +++ /dev/null @@ -1,55 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Build libxgboost on Windows with CUDA" - -nvcc --version -if ( $is_release_branch -eq 0 ) { - $arch_flag = "-DGPU_COMPUTE_VER=75" -} else { - $arch_flag = "" -} -mkdir build -cd build -cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON ` - -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ${arch_flag} -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -cmake --build . --config Release -- /m /nodeReuse:false ` - "/consoleloggerparameters:ShowCommandLine;Verbosity=minimal" -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Build binary wheel" -cd ../python-package -conda activate -& pip install --user -v "pip>=23" -& pip --version -& pip wheel --no-deps -v . --wheel-dir dist/ -Get-ChildItem . 
-Filter dist/*.whl | -Foreach-Object { - & python ../tests/ci_build/rename_whl.py ` - --wheel-path $_.FullName ` - --commit-hash $Env:BUILDKITE_COMMIT ` - --platform-tag win_amd64 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Upload Python wheel" -cd .. -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & buildkite-agent artifact upload python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} -if ( $is_release_branch -eq 1 ) { - Get-ChildItem . -Filter python-package/dist/*.whl | - Foreach-Object { - & aws s3 cp python-package/dist/$_ s3://xgboost-nightly-builds/$Env:BUILDKITE_BRANCH/ ` - --acl public-read --no-progress - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - } -} - -Write-Host "--- Stash C++ test executables" -& buildkite-agent artifact upload build/testxgboost.exe -& buildkite-agent artifact upload xgboost.exe diff --git a/tests/buildkite/conftest.ps1 b/tests/buildkite/conftest.ps1 deleted file mode 100644 index bd623caf0c03..000000000000 --- a/tests/buildkite/conftest.ps1 +++ /dev/null @@ -1,13 +0,0 @@ -if ( $Env:BUILDKITE_PULL_REQUEST -and ($Env:BUILDKITE_PULL_REQUEST -ne "false") ) { - $is_pull_request = 1 -} else { - $is_pull_request = 0 -} - -if ( ($Env:BUILDKITE_BRANCH -eq "master") -or ($Env:BUILDKITE_BRANCH -match "release_.+") ) { - $is_release_branch = 1 - $enforce_daily_budget = 0 -} else { - $is_release_branch = 0 - $enforce_daily_budget = 1 -} diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh deleted file mode 100755 index d7197db2efce..000000000000 --- a/tests/buildkite/test-cpp-gpu.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Run Google Tests with CUDA, using a GPU" -buildkite-agent artifact download "build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost - -echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" -rm -rfv build/ -buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --use-rmm-pool diff --git a/tests/buildkite/test-win64-gpu.ps1 b/tests/buildkite/test-win64-gpu.ps1 deleted file mode 100644 index 95a51b50228d..000000000000 --- a/tests/buildkite/test-win64-gpu.ps1 +++ /dev/null @@ -1,39 +0,0 @@ -$ErrorActionPreference = "Stop" - -. tests/buildkite/conftest.ps1 - -Write-Host "--- Test XGBoost on Windows with CUDA" - -New-Item python-package/dist -ItemType Directory -ea 0 -New-Item build -ItemType Directory -ea 0 -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "build/testxgboost.exe" . --step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -buildkite-agent artifact download "xgboost.exe" . 
--step build-win64-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -nvcc --version - -Write-Host "--- Run Google Tests" -& build/testxgboost.exe -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - -Write-Host "--- Set up Python env" -conda activate -$env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) -mamba env create -n ${env_name} --file=tests/ci_build/conda_env/win64_test.yml -conda activate ${env_name} -Get-ChildItem . -Filter python-package/dist/*.whl | -Foreach-Object { - & python -m pip install python-package/dist/$_ - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -} - -Write-Host "--- Run Python tests" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace tests/python -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -Write-Host "--- Run Python tests with GPU" -python -X faulthandler -m pytest -v -s -rxXs --fulltrace -m "(not slow) and (not mgpu)"` - tests/python-gpu -if ($LASTEXITCODE -ne 0) { throw "Last command failed" } From f387555cbe8dbfa384ca2ed294176dc4fff96dae Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 30 Oct 2024 10:42:47 -0700 Subject: [PATCH 12/45] Simplify stash_artifacts --- .github/workflows/main.yml | 59 ++++++++++++++--------------------- .github/workflows/windows.yml | 22 ++++++------- ops/stash_artifacts.ps1 | 47 ++++++++++++++++++++++++++++ ops/stash_artifacts.sh | 39 +++++++++++++++++++++++ 4 files changed, 120 insertions(+), 47 deletions(-) create mode 100644 ops/stash_artifacts.ps1 create mode 100755 ops/stash_artifacts.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d1f1d2e3f0b6..d0e33f87d70a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,6 @@ env: BRANCH_NAME: >- ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} USE_DOCKER_CACHE: 1 - ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} jobs: build-containers: @@ -83,12 +82,10 @@ jobs: CONTAINER_ID: xgb-ci.cpu - run: bash ops/task/build-cpu.sh - name: Stash CLI executable - run: | - python3 ops/stash_artifacts.py \ - --command upload \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu \ - -- ./xgboost + run: bash ops/stash_artifacts.sh ./xgboost + env: + COMMAND: upload + KEY: build-cpu build-cpu-arm64: name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel @@ -108,12 +105,10 @@ jobs: CONTAINER_ID: xgb-ci.aarch64 - run: bash ops/task/build-cpu-arm64.sh - name: Stash files - run: | - python3 ops/stash_artifacts.py \ - --command upload \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cpu-arm64 \ - -- ./xgboost python-package/dist/*.whl + run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cpu-arm64 build-cuda: name: Build CUDA + manylinux_2_28_x86_64 wheel @@ -137,12 +132,10 @@ jobs: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - run: bash ops/task/build-cuda.sh - name: Stash files - run: | - python3 ops/stash_artifacts.py \ - --command upload \ - --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \ - --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \ - -- build/testxgboost python-package/dist/*.whl + run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cuda build-cuda-with-rmm: name: Build CUDA with RMM @@ -166,12 +159,10 @@ jobs: CONTAINER_ID: 
xgb-ci.manylinux_2_28_x86_64
     - run: bash ops/task/build-cuda-with-rmm.sh
     - name: Stash files
-      run: |
-        python3 ops/stash_artifacts.py \
-          --command upload \
-          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-          --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \
-          -- build/testxgboost
+      run: bash ops/stash_artifacts.sh build/testxgboost
+      env:
+        COMMAND: upload
+        KEY: build-cuda-with-rmm
 
   build-jvm-manylinux2014:
     name: Build libxgboost4j.so targeting glibc 2.17
@@ -242,19 +233,17 @@ jobs:
         CONTAINER_ID: xgb-ci.gpu
     - name: Unstash gtest
       run: |
-        python3 ops/stash_artifacts.py \
-          --command download \
-          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-          --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda \
-          -- build/testxgboost
+        bash ops/stash_artifacts.sh build/testxgboost
         chmod +x build/testxgboost
+      env:
+        COMMAND: download
+        KEY: build-cuda
     - run: bash ops/task/test-cpp-gpu.sh build-cuda
     - name: Unstash gtest
       run: |
-        python3 ops/stash_artifacts.py \
-          --command download \
-          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
-          --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-cuda-with-rmm \
-          -- build/testxgboost
+        bash ops/stash_artifacts.sh build/testxgboost
         chmod +x build/testxgboost
+      env:
+        COMMAND: download
+        KEY: build-cuda-with-rmm
     - run: bash ops/task/test-cpp-gpu.sh build-cuda-with-rmm
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 6edc14711258..76388302f49f 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -33,13 +33,12 @@ jobs:
     - run: powershell ops/task/build-win64-gpu.ps1
     - name: Stash files
       run: |
-        conda activate
-        python ops/stash_artifacts.py `
-          --command upload `
-          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} `
-          --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu `
-          -- build/testxgboost.exe xgboost.exe `
+        powershell ops/stash_artifacts.ps1 `
+          build/testxgboost.exe xgboost.exe `
           (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName)
+      env:
+        COMMAND: upload
+        KEY: build-win64-gpu
   test-win64-gpu:
     name: Test XGBoost on Windows
     needs: build-win64-gpu
@@ -52,10 +51,9 @@ jobs:
         submodules: "true"
     - name: Unstash files
       run: |
-        conda activate
-        python ops/stash_artifacts.py `
-          --command download `
-          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} `
-          --prefix ${{ env.ARTIFACT_STASH_PREFIX }}/build-win64-gpu `
-          -- build/testxgboost.exe xgboost.exe python-package/dist/*.whl
+        powershell ops/stash_artifacts.ps1 `
+          build/testxgboost.exe xgboost.exe python-package/dist/*.whl
+      env:
+        COMMAND: download
+        KEY: build-win64-gpu
     - run: powershell ops/task/test-win64-gpu.ps1
diff --git a/ops/stash_artifacts.ps1 b/ops/stash_artifacts.ps1
new file mode 100644
index 000000000000..2f8cbaf0a855
--- /dev/null
+++ b/ops/stash_artifacts.ps1
@@ -0,0 +1,47 @@
+[CmdletBinding()]
+Param(
+  [Parameter(
+    Mandatory=$true,
+    Position=0,
+    ValueFromRemainingArguments=$true
+  )][string[]]$artifacts
+)
+
+## Convenience wrapper for ops/stash_artifacts.py
+## Meant to be used inside GitHub Actions
+
+$ENV_VAR_DOC = @'
+Inputs
+  - COMMAND: Either "upload" or "download"
+  - KEY: Unique string to identify a group of artifacts
+'@
+
+$ErrorActionPreference = "Stop"
+
+. 
ops/task/enforce-ci.ps1 + +foreach ($env in "COMMAND", "KEY", "GITHUB_REPOSITORY", "GITHUB_RUN_ID", + "RUNS_ON_S3_BUCKET_CACHE") { + $val = [Environment]::GetEnvironmentVariable($env) + if ($val -eq $null) { + Write-Host "Error: $env must be set.`n${ENV_VAR_DOC}" + exit 1 + } +} + +$artifact_stash_prefix = "cache/${Env:GITHUB_REPOSITORY}/stash/${Env:GITHUB_RUN_ID}" + +conda activate + +Write-Host @" +python ops/stash_artifacts.py ` + --command "${Env:COMMAND}" ` + --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` + --prefix "${artifact_stash_prefix}/${Env:KEY}" ` + -- $artifacts +"@ +python ops/stash_artifacts.py ` + --command "${Env:COMMAND}" ` + --s3-bucket "${Env:RUNS_ON_S3_BUCKET_CACHE}" ` + --prefix "${artifact_stash_prefix}/${Env:KEY}" ` + -- $artifacts diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh new file mode 100755 index 000000000000..c796831a963d --- /dev/null +++ b/ops/stash_artifacts.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +## Convenience wrapper for ops/stash_artifacts.py +## Meant to be used inside GitHub Actions + +ENV_VAR_DOC=$( +cat <<-EOF +Inputs + - COMMAND: Either "upload" or "download" + - KEY: Unique string to identify a group of artifacts +EOF +) + +set -euo pipefail + +source ops/task/enforce-ci.sh + +if [ "$#" -lt 1 ]; then + echo "Usage: $0 [artifact] [artifact ...]" + exit 1 +fi + +for arg in "COMMAND" "KEY" "GITHUB_REPOSITORY" "GITHUB_RUN_ID" "RUNS_ON_S3_BUCKET_CACHE" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${ENV_VAR_DOC}" + exit 1 + fi +done + +artifact_stash_prefix="cache/${GITHUB_REPOSITORY}/stash/${GITHUB_RUN_ID}" + +set -x +python3 ops/stash_artifacts.py \ + --command "${COMMAND}" \ + --s3-bucket "${RUNS_ON_S3_BUCKET_CACHE}" \ + --prefix "${artifact_stash_prefix}/${KEY}" \ + -- "$@" From 91eee2d761b852eab47d1c71a86db5afb0ef5419 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 30 Oct 2024 12:10:05 -0700 Subject: [PATCH 13/45] Migrate Python tests --- .github/workflows/main.yml | 83 ++++++++++++++--- ops/matrix/ci_container.yml | 12 ++- ops/task/test-cpp-gpu.sh | 17 +++- ops/task/test-python.sh | 82 +++++++++++++++++ tests/buildkite/test-cpp-mgpu.sh | 17 ---- tests/buildkite/test-python-cpu-arm64.sh | 11 --- tests/buildkite/test-python-cpu.sh | 16 ---- tests/buildkite/test-python-gpu.sh | 59 ------------ tests/ci_build/test_python.sh | 111 ----------------------- 9 files changed, 176 insertions(+), 232 deletions(-) create mode 100755 ops/task/test-python.sh delete mode 100755 tests/buildkite/test-cpp-mgpu.sh delete mode 100755 tests/buildkite/test-python-cpu-arm64.sh delete mode 100755 tests/buildkite/test-python-cpu.sh delete mode 100755 tests/buildkite/test-python-gpu.sh delete mode 100755 tests/ci_build/test_python.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d0e33f87d70a..00e9b5abb844 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -216,11 +216,23 @@ jobs: - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }} test-cpp-gpu: - name: Run Google Tests - needs: build-cuda + name: Run Google Tests with GPU(s) + needs: [build-cuda, build-cuda-with-rmm] runs-on: - runs-on=${{ github.run_id }} - - runner=linux-amd64-gpu + - runner=${{ matrix.runner }} + strategy: + matrix: + include: + - suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - suite: gpu-rmm + runner: linux-amd64-gpu + artifact_from: build-cuda-with-rmm + - suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: # Restart Docker daemon so that it 
recognizes the ephemeral disks
    - run: sudo systemctl restart docker
    - uses: actions/checkout@v4
      with:
        submodules: "true"
    - name: Fetch container from cache
      run: bash ops/docker_build.sh
      env:
        CONTAINER_ID: xgb-ci.gpu
    - name: Unstash gtest
      run: |
        bash ops/stash_artifacts.sh build/testxgboost
        chmod +x build/testxgboost
      env:
        COMMAND: download
-        KEY: build-cuda
-    - run: bash ops/task/test-cpp-gpu.sh build-cuda
-    - name: Unstash gtest
-      run: |
-        bash ops/stash_artifacts.sh build/testxgboost
-        chmod +x build/testxgboost
-      env:
-        COMMAND: download
-        KEY: build-cuda-with-rmm
-    - run: bash ops/task/test-cpp-gpu.sh build-cuda-with-rmm
+        KEY: ${{ matrix.artifact_from }}
+    - run: bash ops/task/test-cpp-gpu.sh ${{ matrix.suite }}
+
+  test-python:
+    name: Run Python tests
+    needs: [build-cuda]
+    runs-on:
+    - runs-on=${{ github.run_id }}
+    - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        include:
+          - description: "single GPU"
+            container: xgb-ci.gpu
+            suite: gpu
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda
+          - description: "single GPU, nightly deps"
+            container: xgb-ci.gpu_dev_ver
+            suite: gpu
+            runner: linux-amd64-gpu
+            artifact_from: build-cuda
+          - description: "multiple GPUs"
+            container: xgb-ci.gpu
+            suite: mgpu
+            runner: linux-amd64-mgpu
+            artifact_from: build-cuda
+          - description: "multiple GPUs, nightly deps"
+            container: xgb-ci.gpu_dev_ver
+            suite: mgpu
+            runner: linux-amd64-mgpu
+            artifact_from: build-cuda
+          - description: "CPU"
+            container: xgb-ci.cpu
+            suite: cpu
+            runner: linux-amd64-cpu
+            artifact_from: build-cuda
+          - description: "CPU ARM64"
+            container: xgb-ci.aarch64
+            suite: cpu-arm64
+            runner: linux-arm64-cpu
+            artifact_from: build-cpu-arm64
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: ${{ matrix.container }}
+      - name: Unstash Python wheel
+        run: bash ops/stash_artifacts.sh python-package/dist/*.whl
+        env:
+          COMMAND: download
+          KEY: build-cuda
+      - name: Run Python tests, ${{ matrix.description }}
+        run: bash ops/task/test-python.sh ${{ matrix.suite }} ${{ matrix.container }}
diff --git a/ops/matrix/ci_container.yml b/ops/matrix/ci_container.yml
index d57d63d99e5c..fb0ae62325cd 100644
--- a/ops/matrix/ci_container.yml
+++ b/ops/matrix/ci_container.yml
@@ -17,14 +17,22 @@ xgb-ci.gpu:
     NCCL_VERSION_ARG: "2.22.3-1"
     RAPIDS_VERSION_ARG: "24.10"
 
-xgb-ci.cpu:
-  container_def: cpu
+xgb-ci.gpu_dev_ver:
+  container_def: gpu
+  build_args:
+    CUDA_VERSION_ARG: "12.5.1"
+    NCCL_VERSION_ARG: "2.22.3-1"
+    RAPIDS_VERSION_ARG: "24.12"
+    RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly"
 
 xgb-ci.clang_tidy:
   container_def: clang_tidy
   build_args:
     CUDA_VERSION_ARG: "12.5.1"
 
+xgb-ci.cpu:
+  container_def: cpu
+
 xgb-ci.aarch64:
   container_def: aarch64
 
diff --git a/ops/task/test-cpp-gpu.sh b/ops/task/test-cpp-gpu.sh
index 57090551ecad..96d11bc9940b 100755
--- a/ops/task/test-cpp-gpu.sh
+++ b/ops/task/test-cpp-gpu.sh
@@ -6,26 +6,33 @@ source ops/task/enforce-ci.sh
 
 if [[ "$#" -lt 1 ]]
 then
-  echo "Usage: $0 {build-cuda,build-cuda-with-rmm}"
+  echo "Usage: $0 {gpu,gpu-rmm,mgpu}"
   exit 1
 fi
 arg=$1
 
 case "${arg}" in
-  build-cuda)
-    echo "--- Run Google Tests with CUDA, using a GPU"
+  gpu)
+    echo "--- Run Google Tests, using a single GPU"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
       --run-args='--privileged' \
       -- build/testxgboost
     ;;
 
-  build-cuda-with-rmm)
-    echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
+  gpu-rmm)
+    echo "--- Run Google Tests, using a single GPU, RMM enabled"
     python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \
       --run-args='--privileged' \
       -- 
build/testxgboost --use-rmm-pool ;; + mgpu) + echo "--- Run Google Tests, using multiple GPUs" + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged --shm-size=4g' \ + -- build/testxgboost --gtest_filter=*MGPU* + ;; + *) echo "Unrecognized arg: ${arg}" exit 2 diff --git a/ops/task/test-python.sh b/ops/task/test-python.sh new file mode 100755 index 000000000000..99f8b0b42277 --- /dev/null +++ b/ops/task/test-python.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +set -euo pipefail + +source ops/task/enforce-ci.sh + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} {container_id}" + exit 1 +fi + +suite="$1" +container_id="$2" + +tee test-python-wrapper.sh <<-'EOF' +#!/bin/bash +set -euox pipefail + +source activate "$1" +export PYSPARK_DRIVER_PYTHON=$(which python) +export PYSPARK_PYTHON=$(which python) +export SPARK_TESTING=1 + +pip install -v ./python-package/dist/*.whl +EOF + +case "$suite" in + gpu) + echo "-- Run Python tests, using a single GPU" + echo " + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + " | tee -a test-python-wrapper.sh + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + --run-args='--privileged' \ + -- bash test-python-wrapper.sh gpu_test + ;; + + mgpu) + echo "-- Run Python tests, using multiple GPUs" + echo " + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated + " | tee -a test-python-wrapper.sh + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + --run-args='--privileged --shm-size=4g' \ + -- bash test-python-wrapper.sh gpu_test + ;; + + cpu) + echo "-- Run Python tests (CPU)" + echo " + export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 + pytest -v -s -rxXs --fulltrace --durations=0 tests/python + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated + " | tee -a test-python-wrapper.sh + python3 ops/docker_run.py --container-id "${container_id}" \ + -- bash test-python-wrapper.sh linux_cpu_test + ;; + + cpu-arm64) + echo "-- Run Python tests (CPU, ARM64)" + echo " + pytest -v -s -rxXs --fulltrace --durations=0 \\ + tests/python/test_basic.py tests/python/test_basic_models.py \\ + tests/python/test_model_compatibility.py + " | tee -a test-python-wrapper.sh + python3 ops/docker_run.py --container-id "${container_id}" \ + -- bash test-python-wrapper.sh aarch64_test + ;; + + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh deleted file mode 100755 index 65614b191d04..000000000000 --- a/tests/buildkite/test-cpp-mgpu.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -echo "--- Run Google Tests with CUDA, using multiple GPUs" -buildkite-agent artifact download 
"build/testxgboost" . --step build-cuda -chmod +x build/testxgboost -tests/ci_build/ci_build.sh gpu --use-gpus \ - --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ - --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ - build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/buildkite/test-python-cpu-arm64.sh b/tests/buildkite/test-python-cpu-arm64.sh deleted file mode 100755 index 68a428034073..000000000000 --- a/tests/buildkite/test-python-cpu-arm64.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Test Python CPU ARM64" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cpu-arm64 -buildkite-agent artifact download "xgboost" . --step build-cpu-arm64 -chmod +x ./xgboost -tests/ci_build/ci_build.sh aarch64 tests/ci_build/test_python.sh cpu-arm64 diff --git a/tests/buildkite/test-python-cpu.sh b/tests/buildkite/test-python-cpu.sh deleted file mode 100755 index 6c53dc2821bc..000000000000 --- a/tests/buildkite/test-python-cpu.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Test CPU code in Python env" - -source tests/buildkite/conftest.sh - -mkdir -pv python-package/dist -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "xgboost" . --step build-cpu -chmod +x ./xgboost - -export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/cpu) -set_buildkite_env_vars_in_container -tests/ci_build/ci_build.sh cpu tests/ci_build/test_python.sh cpu diff --git a/tests/buildkite/test-python-gpu.sh b/tests/buildkite/test-python-gpu.sh deleted file mode 100755 index d7bd729a2e01..000000000000 --- a/tests/buildkite/test-python-gpu.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -source tests/buildkite/conftest.sh - -echo "--- Fetch build artifacts" -buildkite-agent artifact download "python-package/dist/*.whl" . --step build-cuda -buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda -chmod +x build/testxgboost - -# Allocate extra space in /dev/shm to enable NCCL -export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' - -if [[ -z "${USE_DEPS_DEV_VER-}" ]] -then - container_tag='gpu' - rapids_version=${RAPIDS_VERSION} -else - container_tag='gpu_dev_ver' - rapids_version=${DEV_RAPIDS_VERSION} -fi - -command_wrapper="tests/ci_build/ci_build.sh ${container_tag} --use-gpus --build-arg "` - `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=${rapids_version} --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" - -# Run specified test suite -case "$suite" in - gpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/gpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, single GPU" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - mgpu) - export BUILDKITE_ANALYTICS_TOKEN=$(get_aws_secret buildkite/test_analytics/mgpu) - set_buildkite_env_vars_in_container - echo "--- Test XGBoost Python package, 4 GPUs" - $command_wrapper tests/ci_build/test_python.sh $suite - ;; - - *) - echo "Usage: $0 {gpu|mgpu} [extra args to pass to pytest]" - exit 1 - ;; -esac diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh deleted file mode 100755 index a1a023046e5b..000000000000 --- a/tests/ci_build/test_python.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -set -e - -if [ "$#" -lt 1 ] -then - suite='' - args='' -else - suite=$1 - shift 1 - args="$@" -fi - -# Install XGBoost Python package -function install_xgboost { - wheel_found=0 - pip install --upgrade pip --user - for file in python-package/dist/*.whl - do - if [ -e "${file}" ] - then - pip install --user "${file}" - wheel_found=1 - break # need just one - fi - done - if [ "$wheel_found" -eq 0 ] - then - pushd . - cd python-package - pip install --user -v . 
- popd - fi -} - -function setup_pyspark_envs { - export PYSPARK_DRIVER_PYTHON=`which python` - export PYSPARK_PYTHON=`which python` - export SPARK_TESTING=1 -} - -function unset_pyspark_envs { - unset PYSPARK_DRIVER_PYTHON - unset PYSPARK_PYTHON - unset SPARK_TESTING -} - -function uninstall_xgboost { - pip uninstall -y xgboost -} - -# Run specified test suite -case "$suite" in - gpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "not mgpu" ${args} tests/python-gpu - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - mgpu) - source activate gpu_test - set -x - install_xgboost - setup_pyspark_envs - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m "mgpu" ${args} tests/test_distributed/test_gpu_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu) - source activate linux_cpu_test - set -x - install_xgboost - export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/test_distributed/test_federated - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - cpu-arm64) - source activate aarch64_test - set -x - install_xgboost - setup_pyspark_envs - pytest -v -s -rxXs --fulltrace --durations=0 ${args} tests/python/test_basic.py tests/python/test_basic_models.py tests/python/test_model_compatibility.py - unset_pyspark_envs - uninstall_xgboost - set +x - ;; - - *) - echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64} [extra args to pass to pytest]" - exit 1 - ;; -esac From 26fff3826da530931ea48833a018ef3bf686def7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 30 Oct 2024 16:59:57 -0700 Subject: [PATCH 14/45] Mass renaming; Migrate R GPU pkg build + MacOS --- .github/runs-on.yml | 4 +- .github/workflows/macos.yml | 24 ++ .github/workflows/main.yml | 31 +- .github/workflows/windows.yml | 5 +- dev/prepare_jvm_release.py | 2 +- {tests/ci_build => ops}/build_jvm_doc.sh | 11 +- .../ci_build => ops}/build_r_pkg_with_cuda.sh | 12 +- .../build-via-cmake.sh => build_via_cmake.sh} | 0 {dev => ops}/change_scala_version.py | 0 {tests/ci_build => ops}/change_version.py | 0 ops/{matrix => docker}/ci_container.yml | 2 +- ops/{matrix => docker}/docker_cache_ecr.yml | 0 .../{ => dockerfile}/Dockerfile.aarch64 | 0 .../{ => dockerfile}/Dockerfile.clang_tidy | 0 ops/docker/{ => dockerfile}/Dockerfile.cpu | 0 ops/docker/{ => dockerfile}/Dockerfile.gpu | 0 .../Dockerfile.gpu_build_r_rockylinux8 | 0 .../Dockerfile.gpu_build_rockylinux8 | 0 ops/docker/{ => dockerfile}/Dockerfile.i386 | 0 ops/docker/{ => dockerfile}/Dockerfile.jvm | 0 .../{ => dockerfile}/Dockerfile.jvm_cross | 0 .../{ => dockerfile}/Dockerfile.jvm_gpu_build | 0 .../Dockerfile.manylinux2014_aarch64 | 0 .../Dockerfile.manylinux2014_x86_64 | 0 .../Dockerfile.manylinux_2_28_x86_64 | 0 ops/docker/entrypoint.sh | 11 +- 
ops/{matrix => docker}/extract_build_args.jq | 0 ops/{matrix => docker}/extract_build_args.sh | 8 +- ops/docker_build.py | 7 +- ops/docker_build.sh | 8 +- {tests/ci_build => ops}/lint_cmake.sh | 0 {tests/ci_build => ops}/lint_cpp.py | 0 {tests/ci_build => ops}/lint_python.py | 0 {tests/ci_build => ops}/lint_r.R | 0 .../patches => patch}/cpu_only_pypkg.patch | 0 .../manylinux2014_warning.patch | 0 .../patches => patch}/remove_nccl_dep.patch | 0 ops/{task => pipeline}/build-cpu-arm64.sh | 4 +- ops/{task => pipeline}/build-cpu.sh | 6 +- ops/{task => pipeline}/build-cuda-with-rmm.sh | 14 +- ops/{task => pipeline}/build-cuda.sh | 14 +- .../pipeline}/build-gpu-rpkg.sh | 12 +- .../pipeline}/build-jvm-doc.sh | 6 +- .../pipeline}/build-jvm-macos-m1.sh | 8 +- .../build-jvm-manylinux2014.sh | 2 +- ops/{task => pipeline}/build-manylinux2014.sh | 10 +- ops/{task => pipeline}/build-win64-gpu.ps1 | 10 +- ops/{task => pipeline}/enforce-ci.ps1 | 2 +- ops/{task => pipeline}/enforce-ci.sh | 2 +- ops/{task => pipeline}/run-clang-tidy.sh | 2 +- ops/{task => pipeline}/test-cpp-gpu.sh | 2 +- ops/{task => pipeline}/test-python.sh | 5 +- ops/{task => pipeline}/test-win64-gpu.ps1 | 2 +- ops/stash_artifacts.ps1 | 2 +- ops/stash_artifacts.sh | 2 +- {tests/ci_build => ops}/test_r_package.py | 0 {tests/ci_build => ops}/test_tidy.cc | 0 {tests/ci_build => ops}/test_utils.py | 0 {tests/ci_build => ops}/tidy.py | 0 .../update-rapids.sh => ops/update_rapids.sh | 0 {tests/ci_build => ops}/verify_link.sh | 0 tests/buildkite/infrastructure/README.md | 106 ------ .../agent-iam-policy-template.yml | 32 -- .../aws-stack-creator/create_stack.py | 127 ------- .../aws-stack-creator/metadata.py | 114 ------ .../infrastructure/common_blocks/utils.py | 97 ----- .../buildkite/infrastructure/requirements.txt | 2 - .../service-user/create_service_user.py | 44 --- .../service-user/service-user-template.yml | 349 ------------------ .../create_worker_image_pipelines.py | 85 ----- .../ec2-image-builder-pipeline-template.yml | 108 ------ .../linux-amd64-gpu-bootstrap.yml | 24 -- .../worker-image-pipeline/metadata.py | 18 - .../worker-image-pipeline/run_pipelines.py | 22 -- .../windows-gpu-bootstrap.yml | 71 ---- tests/buildkite/pipeline-mac-m1.yml | 13 - tests/buildkite/pipeline-mgpu.yml | 16 - tests/buildkite/pipeline-nightly.yml | 37 -- tests/buildkite/pipeline-win64.yml | 24 -- tests/buildkite/pipeline.yml | 70 ---- .../test-integration-jvm-packages.sh | 13 - tests/buildkite/test-macos-m1-clang11.sh | 25 -- tests/ci_build/build_jvm_packages.sh | 9 +- tests/ci_build/ci_build.sh | 248 ------------- tests/ci_build/deploy_jvm_packages.sh | 2 +- tests/ci_build/entrypoint.sh | 43 --- tests/ci_build/initialize_maven.sh | 19 - tests/ci_build/jenkins_tools.Groovy | 38 -- tests/ci_build/test_jvm_cross.sh | 62 ---- 89 files changed, 137 insertions(+), 1911 deletions(-) create mode 100644 .github/workflows/macos.yml rename {tests/ci_build => ops}/build_jvm_doc.sh (88%) rename {tests/ci_build => ops}/build_r_pkg_with_cuda.sh (73%) rename ops/{task/build-via-cmake.sh => build_via_cmake.sh} (100%) rename {dev => ops}/change_scala_version.py (100%) rename {tests/ci_build => ops}/change_version.py (100%) rename ops/{matrix => docker}/ci_container.yml (95%) rename ops/{matrix => docker}/docker_cache_ecr.yml (100%) rename ops/docker/{ => dockerfile}/Dockerfile.aarch64 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.clang_tidy (100%) rename ops/docker/{ => dockerfile}/Dockerfile.cpu (100%) rename ops/docker/{ => dockerfile}/Dockerfile.gpu (100%) rename 
ops/docker/{ => dockerfile}/Dockerfile.gpu_build_r_rockylinux8 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.gpu_build_rockylinux8 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.i386 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.jvm (100%) rename ops/docker/{ => dockerfile}/Dockerfile.jvm_cross (100%) rename ops/docker/{ => dockerfile}/Dockerfile.jvm_gpu_build (100%) rename ops/docker/{ => dockerfile}/Dockerfile.manylinux2014_aarch64 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.manylinux2014_x86_64 (100%) rename ops/docker/{ => dockerfile}/Dockerfile.manylinux_2_28_x86_64 (100%) rename ops/{matrix => docker}/extract_build_args.jq (100%) rename ops/{matrix => docker}/extract_build_args.sh (68%) rename {tests/ci_build => ops}/lint_cmake.sh (100%) rename {tests/ci_build => ops}/lint_cpp.py (100%) rename {tests/ci_build => ops}/lint_python.py (100%) rename {tests/ci_build => ops}/lint_r.R (100%) rename ops/{task/patches => patch}/cpu_only_pypkg.patch (100%) rename ops/{task/patches => patch}/manylinux2014_warning.patch (100%) rename ops/{task/patches => patch}/remove_nccl_dep.patch (100%) rename ops/{task => pipeline}/build-cpu-arm64.sh (96%) rename ops/{task => pipeline}/build-cpu.sh (92%) rename ops/{task => pipeline}/build-cuda-with-rmm.sh (91%) rename ops/{task => pipeline}/build-cuda.sh (93%) rename {tests/buildkite => ops/pipeline}/build-gpu-rpkg.sh (53%) rename {tests/buildkite => ops/pipeline}/build-jvm-doc.sh (70%) rename {tests/buildkite => ops/pipeline}/build-jvm-macos-m1.sh (85%) rename ops/{task => pipeline}/build-jvm-manylinux2014.sh (96%) rename ops/{task => pipeline}/build-manylinux2014.sh (88%) rename ops/{task => pipeline}/build-win64-gpu.ps1 (93%) rename ops/{task => pipeline}/enforce-ci.ps1 (94%) rename ops/{task => pipeline}/enforce-ci.sh (94%) rename ops/{task => pipeline}/run-clang-tidy.sh (83%) rename ops/{task => pipeline}/test-cpp-gpu.sh (96%) rename ops/{task => pipeline}/test-python.sh (98%) rename ops/{task => pipeline}/test-win64-gpu.ps1 (96%) rename {tests/ci_build => ops}/test_r_package.py (100%) rename {tests/ci_build => ops}/test_tidy.cc (100%) rename {tests/ci_build => ops}/test_utils.py (100%) rename {tests/ci_build => ops}/tidy.py (100%) rename tests/buildkite/update-rapids.sh => ops/update_rapids.sh (100%) rename {tests/ci_build => ops}/verify_link.sh (100%) delete mode 100644 tests/buildkite/infrastructure/README.md delete mode 100644 tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml delete mode 100644 tests/buildkite/infrastructure/aws-stack-creator/create_stack.py delete mode 100644 tests/buildkite/infrastructure/aws-stack-creator/metadata.py delete mode 100644 tests/buildkite/infrastructure/common_blocks/utils.py delete mode 100644 tests/buildkite/infrastructure/requirements.txt delete mode 100644 tests/buildkite/infrastructure/service-user/create_service_user.py delete mode 100644 tests/buildkite/infrastructure/service-user/service-user-template.yml delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/metadata.py delete mode 100644 tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py delete mode 100644 
tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml delete mode 100644 tests/buildkite/pipeline-mac-m1.yml delete mode 100644 tests/buildkite/pipeline-nightly.yml delete mode 100644 tests/buildkite/pipeline-win64.yml delete mode 100755 tests/buildkite/test-integration-jvm-packages.sh delete mode 100755 tests/buildkite/test-macos-m1-clang11.sh delete mode 100755 tests/ci_build/ci_build.sh delete mode 100755 tests/ci_build/entrypoint.sh delete mode 100755 tests/ci_build/initialize_maven.sh delete mode 100644 tests/ci_build/jenkins_tools.Groovy delete mode 100755 tests/ci_build/test_jvm_cross.sh diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 720ba76bb836..e21895ee8c3b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -14,7 +14,7 @@ images: runners: linux-amd64-cpu: - cpu: 32 + cpu: 16 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: linux-amd64 linux-amd64-gpu: @@ -24,7 +24,7 @@ runners: family: ["g4dn.12xlarge"] image: linux-amd64 linux-arm64-cpu: - cpu: 32 + cpu: 16 family: ["c6g", "c7g"] image: ubuntu24-full-arm64 windows-gpu: diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 000000000000..2bb3e1aba46c --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,24 @@ +name: Nextgen XGBoost CI, MacOS + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + mac-m1-jvm: + name: "Build libxgboost4j.dylib for MacOS M1" + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + with: + submodules: "true" + - run: bash ops/pipeline/build-jvm-macos-m1.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 00e9b5abb844..276fa45ba533 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,11 +20,13 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} + - spot=false strategy: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 - xgb-ci.gpu + - xgb-ci.gpu_dev_ver - xgb-ci.cpu - xgb-ci.clang_tidy - xgb-ci.manylinux_2_28_x86_64 @@ -62,7 +64,7 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.clang_tidy - - run: bash ops/task/run-clang-tidy.sh + - run: bash ops/pipeline/run-clang-tidy.sh build-cpu: name: Build CPU @@ -80,7 +82,7 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.cpu - - run: bash ops/task/build-cpu.sh + - run: bash ops/pipeline/build-cpu.sh - name: Stash CLI executable run: bash ops/stash_artifacts.sh ./xgboost env: @@ -103,7 +105,7 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.aarch64 - - run: bash ops/task/build-cpu-arm64.sh + - run: bash ops/pipeline/build-cpu-arm64.sh - name: Stash files run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl env: @@ -116,6 +118,7 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu + - spot=false steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -130,9 +133,11 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/task/build-cuda.sh + - run: bash ops/pipeline/build-cuda.sh - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost python-package/dist/*.whl + 
run: |
+        bash ops/stash_artifacts.sh \
+          build/testxgboost ./xgboost python-package/dist/*.whl
       env:
         COMMAND: upload
         KEY: build-cuda
@@ -157,7 +162,7 @@ jobs:
       run: bash ops/docker_build.sh
       env:
         CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64
-    - run: bash ops/task/build-cuda-with-rmm.sh
+    - run: bash ops/pipeline/build-cuda-with-rmm.sh
     - name: Stash files
       run: bash ops/stash_artifacts.sh build/testxgboost
       env:
@@ -188,7 +193,7 @@ jobs:
       run: bash ops/docker_build.sh
       env:
         CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
-    - run: bash ops/task/build-jvm-manylinux2014.sh ${{ matrix.arch }}
+    - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }}
 
   build-manylinux2014:
     name: Build manylinux2014_${{ matrix.arch }} wheel
@@ -213,7 +218,7 @@ jobs:
       run: bash ops/docker_build.sh
       env:
         CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
-    - run: bash ops/task/build-manylinux2014.sh ${{ matrix.arch }}
+    - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }}
 
   test-cpp-gpu:
     name: Run Google Tests with GPU(s)
@@ -250,7 +255,7 @@ jobs:
       env:
         COMMAND: download
         KEY: ${{ matrix.artifact_from }}
-    - run: bash ops/task/test-cpp-gpu.sh ${{ matrix.suite }}
+    - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }}
 
   test-python:
     name: Run Python tests
@@ -302,9 +307,11 @@ jobs:
       env:
         CONTAINER_ID: ${{ matrix.container }}
     - name: Unstash Python wheel
-      run: bash ops/stash_artifacts.sh python-package/dist/*.whl
+      run: |
+        bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost
+        chmod +x ./xgboost
       env:
         COMMAND: download
-        KEY: build-cuda
+        KEY: ${{ matrix.artifact_from }}
     - name: Run Python tests, ${{ matrix.description }}
-      run: bash ops/task/test-python.sh ${{ matrix.suite }} ${{ matrix.container }}
+      run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }}
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 76388302f49f..0fc50815d683 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -26,11 +26,12 @@ jobs:
     runs-on:
     - runs-on=${{ github.run_id }}
     - runner=windows-cpu
+    - spot=false
     steps:
     - uses: actions/checkout@v4
      with:
        submodules: "true"
-    - run: powershell ops/task/build-win64-gpu.ps1
+    - run: powershell ops/pipeline/build-win64-gpu.ps1
     - name: Stash files
       run: |
         powershell ops/stash_artifacts.ps1 `
@@ -56,4 +57,4 @@ jobs:
       env:
         COMMAND: download
         KEY: build-win64-gpu
-    - run: powershell ops/task/test-win64-gpu.ps1
+    - run: powershell ops/pipeline/test-win64-gpu.ps1
diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py
index 0b4594e2d2c0..927cb4945950 100644
--- a/dev/prepare_jvm_release.py
+++ b/dev/prepare_jvm_release.py
@@ -203,7 +203,7 @@ def main():
     )
     print(
         "5. 
Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n" - " python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" + " python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" " GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true" ) print( diff --git a/tests/ci_build/build_jvm_doc.sh b/ops/build_jvm_doc.sh similarity index 88% rename from tests/ci_build/build_jvm_doc.sh rename to ops/build_jvm_doc.sh index a536b0efeeb3..6f785f488027 100755 --- a/tests/ci_build/build_jvm_doc.sh +++ b/ops/build_jvm_doc.sh @@ -1,15 +1,14 @@ #!/bin/bash -if [ $# -ne 1 ]; then +## Build docs for the JVM packages and package it in a tarball + +if [[ $# -ne 1 ]] +then echo "Usage: $0 [branch name]" exit 1 fi -set -e -set -x - -# Initialize local Maven repository -./tests/ci_build/initialize_maven.sh +set -euo pipefail rm -rf build/ cd jvm-packages diff --git a/tests/ci_build/build_r_pkg_with_cuda.sh b/ops/build_r_pkg_with_cuda.sh similarity index 73% rename from tests/ci_build/build_r_pkg_with_cuda.sh rename to ops/build_r_pkg_with_cuda.sh index 78a2afc1cdf7..d0a7c9295195 100755 --- a/tests/ci_build/build_r_pkg_with_cuda.sh +++ b/ops/build_r_pkg_with_cuda.sh @@ -1,8 +1,12 @@ #!/bin/bash -set -e -set -x -if [ "$#" -ne 1 ] +## Build XGBoost R package with GPU support and package it in a tarball. +## Users will be able to install it without having CTK installed +## (only a compatible NVIDIA driver is needed). + +set -euo pipefail + +if [[ "$#" -ne 1 ]] then echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]" exit 1 @@ -10,7 +14,7 @@ fi commit_hash="$1" -python tests/ci_build/test_r_package.py --task=pack +python3 ops/test_r_package.py --task=pack mv xgboost/ xgboost_rpack/ mkdir build diff --git a/ops/task/build-via-cmake.sh b/ops/build_via_cmake.sh similarity index 100% rename from ops/task/build-via-cmake.sh rename to ops/build_via_cmake.sh diff --git a/dev/change_scala_version.py b/ops/change_scala_version.py similarity index 100% rename from dev/change_scala_version.py rename to ops/change_scala_version.py diff --git a/tests/ci_build/change_version.py b/ops/change_version.py similarity index 100% rename from tests/ci_build/change_version.py rename to ops/change_version.py diff --git a/ops/matrix/ci_container.yml b/ops/docker/ci_container.yml similarity index 95% rename from ops/matrix/ci_container.yml rename to ops/docker/ci_container.yml index fb0ae62325cd..f21122231c0b 100644 --- a/ops/matrix/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -1,7 +1,7 @@ ## List of CI containers with definitions and build arguments # Each container will be built using the definition from -# ops/docker/Dockerfile.CONTAINER_DEF +# ops/docker/dockerfile/Dockerfile.CONTAINER_DEF xgb-ci.gpu_build_rockylinux8: container_def: gpu_build_rockylinux8 diff --git a/ops/matrix/docker_cache_ecr.yml b/ops/docker/docker_cache_ecr.yml similarity index 100% rename from ops/matrix/docker_cache_ecr.yml rename to ops/docker/docker_cache_ecr.yml diff --git a/ops/docker/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 similarity index 100% rename from ops/docker/Dockerfile.aarch64 rename to ops/docker/dockerfile/Dockerfile.aarch64 diff --git a/ops/docker/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy similarity index 100% rename from ops/docker/Dockerfile.clang_tidy rename to ops/docker/dockerfile/Dockerfile.clang_tidy diff --git a/ops/docker/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu similarity index 100% 
rename from ops/docker/Dockerfile.cpu rename to ops/docker/dockerfile/Dockerfile.cpu diff --git a/ops/docker/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu similarity index 100% rename from ops/docker/Dockerfile.gpu rename to ops/docker/dockerfile/Dockerfile.gpu diff --git a/ops/docker/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 similarity index 100% rename from ops/docker/Dockerfile.gpu_build_r_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 diff --git a/ops/docker/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 similarity index 100% rename from ops/docker/Dockerfile.gpu_build_rockylinux8 rename to ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 diff --git a/ops/docker/Dockerfile.i386 b/ops/docker/dockerfile/Dockerfile.i386 similarity index 100% rename from ops/docker/Dockerfile.i386 rename to ops/docker/dockerfile/Dockerfile.i386 diff --git a/ops/docker/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm similarity index 100% rename from ops/docker/Dockerfile.jvm rename to ops/docker/dockerfile/Dockerfile.jvm diff --git a/ops/docker/Dockerfile.jvm_cross b/ops/docker/dockerfile/Dockerfile.jvm_cross similarity index 100% rename from ops/docker/Dockerfile.jvm_cross rename to ops/docker/dockerfile/Dockerfile.jvm_cross diff --git a/ops/docker/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build similarity index 100% rename from ops/docker/Dockerfile.jvm_gpu_build rename to ops/docker/dockerfile/Dockerfile.jvm_gpu_build diff --git a/ops/docker/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 similarity index 100% rename from ops/docker/Dockerfile.manylinux2014_aarch64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 diff --git a/ops/docker/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 similarity index 100% rename from ops/docker/Dockerfile.manylinux2014_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 diff --git a/ops/docker/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 similarity index 100% rename from ops/docker/Dockerfile.manylinux_2_28_x86_64 rename to ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh index a0c5f56bb52d..babe4359e8e1 100755 --- a/ops/docker/entrypoint.sh +++ b/ops/docker/entrypoint.sh @@ -1,12 +1,8 @@ #!/usr/bin/env bash -# This script is a wrapper creating the same user inside container as the one -# running the ci_build.sh outside the container. It also set the home directory -# for the user inside container to match the same absolute path as the workspace -# outside of container. Do not run this manually. It does not make sense. It is -# intended to be called by ci_build.sh only. 
+# This wrapper script creates, inside the container, a user with the same UID
+# and GID as the user invoking the container on the host, so that files
+# written to the mounted workspace keep the host user's ownership.
 
-set -e
+set -euo pipefail
 
 COMMAND=("$@")
 
@@ -19,7 +15,8 @@ else
   rm /this_is_writable_file_system
 fi
 
-if [[ -n $CI_BUILD_UID ]] && [[ -n $CI_BUILD_GID ]]; then
+if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]]
+then
   groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true
   useradd -o -m -g "${CI_BUILD_GID}" -u "${CI_BUILD_UID}" \
     "${CI_BUILD_USER}" || true
diff --git a/ops/matrix/extract_build_args.jq b/ops/docker/extract_build_args.jq
similarity index 100%
rename from ops/matrix/extract_build_args.jq
rename to ops/docker/extract_build_args.jq
diff --git a/ops/matrix/extract_build_args.sh b/ops/docker/extract_build_args.sh
similarity index 68%
rename from ops/matrix/extract_build_args.sh
rename to ops/docker/extract_build_args.sh
index ec4621bc42b2..0fa7b132b760 100755
--- a/ops/matrix/extract_build_args.sh
+++ b/ops/docker/extract_build_args.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-## Extract container definition and build args from ops/matrix/ci_container.yml,
+## Extract container definition and build args from ops/docker/ci_container.yml,
 ## given the container ID.
 
 if [ "$#" -ne 1 ]; then
@@ -9,13 +9,13 @@ fi
 CONTAINER_ID="$1"
 
 CONTAINER_DEF=$(
-  yq -o json ops/matrix/ci_container.yml |
+  yq -o json ops/docker/ci_container.yml |
   jq -r --arg container_id "${CONTAINER_ID}" '.[$container_id].container_def'
 )
 BUILD_ARGS=$(
-  yq -o json ops/matrix/ci_container.yml |
+  yq -o json ops/docker/ci_container.yml |
   jq -r --arg container_id "${CONTAINER_ID}" \
-    'include "ops/matrix/extract_build_args";
+    'include "ops/docker/extract_build_args";
      compute_build_args(.; $container_id)'
 )
 echo "CONTAINER_DEF='${CONTAINER_DEF}' BUILD_ARGS='${BUILD_ARGS}'"
diff --git a/ops/docker_build.py b/ops/docker_build.py
index dd2871c3a6ed..922d528814a4 100644
--- a/ops/docker_build.py
+++ b/ops/docker_build.py
@@ -70,7 +70,9 @@ def docker_build(
 
 def main(args: argparse.Namespace) -> None:
     # Dockerfile to be used in docker build
-    dockerfile_path = SCRIPT_DIR / "docker" / f"Dockerfile.{args.container_def}"
+    dockerfile_path = (
+        SCRIPT_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}"
+    )
     docker_context_path = SCRIPT_DIR / "docker"
 
     build_args = parse_build_args(args.build_arg)
@@ -93,7 +95,8 @@ def main(args: argparse.Namespace) -> None:
         required=True,
         help=(
             "String uniquely identifying the container definition. The container "
-            "definition will be fetched from docker/Dockerfile.CONTAINER_DEF."
+            "definition will be fetched from "
+            "docker/dockerfile/Dockerfile.CONTAINER_DEF."
         ),
     )
     parser.add_argument(
diff --git a/ops/docker_build.sh b/ops/docker_build.sh
index c8c0680aea05..0539f817ba8e 100755
--- a/ops/docker_build.sh
+++ b/ops/docker_build.sh
@@ -2,7 +2,7 @@
 ## Build a CI container and cache the layers in AWS ECR (Elastic Container Registry).
 ## This script provides a convenient wrapper for ops/docker_build.py.
 ## Build-time variables (--build-arg) and container definition are fetched from
-## ops/matrix/ci_container.yml.
+## ops/docker/ci_container.yml.
 ##
 ## Note. This script takes in all inputs via environment variables.
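 ##
 ## Example invocation (an illustrative sketch, not prescribed by the script
 ## itself; the values shown are the ones used by the workflows above, and
 ## CONTAINER_ID must match an entry in ops/docker/ci_container.yml):
 ##
 ##   export CONTAINER_ID=xgb-ci.cpu
 ##   export USE_DOCKER_CACHE=1
 ##   bash ops/docker_build.sh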
@@ -48,7 +48,7 @@ do done # Fetch CONTAINER_DEF and BUILD_ARGS -source <(ops/matrix/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 +source <(ops/docker/extract_build_args.sh ${CONTAINER_ID} | tee /dev/stderr) 2>&1 if [[ "${USE_DOCKER_CACHE:-}" != "1" ]] # Any value other than 1 is considered false then @@ -59,8 +59,8 @@ if [[ ${USE_DOCKER_CACHE} -eq 0 ]] then echo "USE_DOCKER_CACHE not set; caching disabled" else - DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/matrix/docker_cache_ecr.yml) - DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/matrix/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_ID=$(yq ".DOCKER_CACHE_ECR_ID" ops/docker/docker_cache_ecr.yml) + DOCKER_CACHE_ECR_REGION=$(yq ".DOCKER_CACHE_ECR_REGION" ops/docker/docker_cache_ecr.yml) DOCKER_CACHE_REPO="${DOCKER_CACHE_ECR_ID}.dkr.ecr.${DOCKER_CACHE_ECR_REGION}.amazonaws.com" echo "Using AWS ECR; repo URL = ${DOCKER_CACHE_REPO}" # Login for Docker registry diff --git a/tests/ci_build/lint_cmake.sh b/ops/lint_cmake.sh similarity index 100% rename from tests/ci_build/lint_cmake.sh rename to ops/lint_cmake.sh diff --git a/tests/ci_build/lint_cpp.py b/ops/lint_cpp.py similarity index 100% rename from tests/ci_build/lint_cpp.py rename to ops/lint_cpp.py diff --git a/tests/ci_build/lint_python.py b/ops/lint_python.py similarity index 100% rename from tests/ci_build/lint_python.py rename to ops/lint_python.py diff --git a/tests/ci_build/lint_r.R b/ops/lint_r.R similarity index 100% rename from tests/ci_build/lint_r.R rename to ops/lint_r.R diff --git a/ops/task/patches/cpu_only_pypkg.patch b/ops/patch/cpu_only_pypkg.patch similarity index 100% rename from ops/task/patches/cpu_only_pypkg.patch rename to ops/patch/cpu_only_pypkg.patch diff --git a/ops/task/patches/manylinux2014_warning.patch b/ops/patch/manylinux2014_warning.patch similarity index 100% rename from ops/task/patches/manylinux2014_warning.patch rename to ops/patch/manylinux2014_warning.patch diff --git a/ops/task/patches/remove_nccl_dep.patch b/ops/patch/remove_nccl_dep.patch similarity index 100% rename from ops/task/patches/remove_nccl_dep.patch rename to ops/patch/remove_nccl_dep.patch diff --git a/ops/task/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh similarity index 96% rename from ops/task/build-cpu-arm64.sh rename to ops/pipeline/build-cpu-arm64.sh index 4a8c96e0e941..8a5db56d9eeb 100755 --- a/ops/task/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -6,12 +6,12 @@ WHEEL_TAG=manylinux_2_28_aarch64 echo "--- Build CPU code targeting ARM64" -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ - -- ops/task/build-via-cmake.sh \ + -- ops/build_via_cmake.sh \ --conda-env=aarch64_test \ -DUSE_OPENMP=ON \ -DHIDE_CXX_SYMBOL=ON diff --git a/ops/task/build-cpu.sh b/ops/pipeline/build-cpu.sh similarity index 92% rename from ops/task/build-cpu.sh rename to ops/pipeline/build-cpu.sh index 7f8c69cd43bf..60346203d85f 100755 --- a/ops/task/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -2,7 +2,7 @@ set -euox pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh echo "--- Build CPU code" @@ -18,7 +18,7 @@ echo "--- Run Google Test with sanitizer enabled" sudo sysctl vm.mmap_rnd_bits=28 python3 ops/docker_run.py \ --container-id xgb-ci.cpu \ - -- ops/task/build-via-cmake.sh \ + -- ops/build_via_cmake.sh \ -DUSE_SANITIZER=ON \ -DENABLED_SANITIZERS="address;leak;undefined" \ -DCMAKE_BUILD_TYPE=Debug \ @@ 
-35,7 +35,7 @@ python3 ops/docker_run.py \
 echo "--- Run Google Test"
 python3 ops/docker_run.py \
   --container-id xgb-ci.cpu \
-  -- ops/task/build-via-cmake.sh \
+  -- ops/build_via_cmake.sh \
     -DCMAKE_PREFIX_PATH=/opt/grpc \
     -DPLUGIN_FEDERATED=ON
 python3 ops/docker_run.py \
diff --git a/ops/task/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh
similarity index 91%
rename from ops/task/build-cuda-with-rmm.sh
rename to ops/pipeline/build-cuda-with-rmm.sh
index 901e66a8f649..ab5420002f46 100755
--- a/ops/task/build-cuda-with-rmm.sh
+++ b/ops/pipeline/build-cuda-with-rmm.sh
@@ -4,21 +4,21 @@ set -euo pipefail
 
 WHEEL_TAG=manylinux_2_28_x86_64
 
-source ops/task/enforce-ci.sh
+source ops/pipeline/enforce-ci.sh
 
 echo "--- Build with CUDA and RMM"
 
-if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
-then
+#if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+#then
   arch_flag="-DGPU_COMPUTE_VER=75"
-else
-  arch_flag=""
-fi
+#else
+#  arch_flag=""
+#fi
 
 echo "--- Build libxgboost from the source"
 python3 ops/docker_run.py \
   --container-id xgb-ci.gpu_build_rockylinux8 \
-  -- ops/task/build-via-cmake.sh \
+  -- ops/build_via_cmake.sh \
    -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \
    -DUSE_CUDA=ON \
    -DUSE_OPENMP=ON \
diff --git a/ops/task/build-cuda.sh b/ops/pipeline/build-cuda.sh
similarity index 93%
rename from ops/task/build-cuda.sh
rename to ops/pipeline/build-cuda.sh
index c98c041d8187..690c7f25f69e 100755
--- a/ops/task/build-cuda.sh
+++ b/ops/pipeline/build-cuda.sh
@@ -4,22 +4,22 @@ set -euo pipefail
 
 WHEEL_TAG=manylinux_2_28_x86_64
 
-source ops/task/enforce-ci.sh
+source ops/pipeline/enforce-ci.sh
 
 echo "--- Build with CUDA"
 
-if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
-then
+# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
+#then
   arch_flag="-DGPU_COMPUTE_VER=75"
-else
-  arch_flag=""
-fi
+#else
+#  arch_flag=""
+#fi
 
 echo "--- Build libxgboost from the source"
 git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet
 python3 ops/docker_run.py \
   --container-id xgb-ci.gpu_build_rockylinux8 \
-  -- ops/task/build-via-cmake.sh \
+  -- ops/build_via_cmake.sh \
    -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \
    -DUSE_CUDA=ON \
    -DUSE_OPENMP=ON \
diff --git a/tests/buildkite/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh
similarity index 53%
rename from tests/buildkite/build-gpu-rpkg.sh
rename to ops/pipeline/build-gpu-rpkg.sh
index 83bcd9eb9c7b..4df0c029568c 100755
--- a/tests/buildkite/build-gpu-rpkg.sh
+++ b/ops/pipeline/build-gpu-rpkg.sh
@@ -2,15 +2,13 @@
 
 set -euo pipefail
 
-source tests/buildkite/conftest.sh
+source ops/pipeline/enforce-ci.sh
 
 echo "--- Build XGBoost R package with CUDA"
-
-tests/ci_build/ci_build.sh gpu_build_r_rockylinux8 \
-  --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
-  --build-arg R_VERSION_ARG=${R_VERSION} \
-  tests/ci_build/build_r_pkg_with_cuda.sh \
-  ${BUILDKITE_COMMIT}
+python3 ops/docker_run.py \
+  --container-id xgb-ci.gpu_build_r_rockylinux8 \
+  -- ops/build_r_pkg_with_cuda.sh \
+  ${GITHUB_SHA}
 
 if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
diff --git a/tests/buildkite/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh
similarity index 70%
rename from tests/buildkite/build-jvm-doc.sh
rename to ops/pipeline/build-jvm-doc.sh
index d168eb8cc58d..7f5eb0ac7b8a 100755
--- a/tests/buildkite/build-jvm-doc.sh
+++ b/ops/pipeline/build-jvm-doc.sh
@@ -2,10 +2,12 @@
 
 set -euo pipefail
 
-source tests/buildkite/conftest.sh
+source ops/pipeline/enforce-ci.sh
 
 echo "--- Build JVM 
packages doc" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_doc.sh ${BRANCH_NAME} +python3 ops/docker_run.py \ + --container-id jvm \ + -- ops/build_jvm_doc.sh ${BRANCH_NAME} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Upload JVM packages doc" diff --git a/tests/buildkite/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh similarity index 85% rename from tests/buildkite/build-jvm-macos-m1.sh rename to ops/pipeline/build-jvm-macos-m1.sh index 1d2e5e8703bc..d50c1a1a1b1d 100644 --- a/tests/buildkite/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-m1.sh @@ -2,7 +2,7 @@ set -euo pipefail -source tests/buildkite/conftest.sh +source ops/pipeline/enforce-ci.sh # Display system info echo "--- Display system information" @@ -12,6 +12,8 @@ sysctl -n machdep.cpu.brand_string uname -m set +x +brew install ninja libomp + # Build XGBoost4J binary echo "--- Build libxgboost4j.dylib" set -x @@ -28,9 +30,9 @@ set +x echo "--- Upload libxgboost4j.dylib" set -x pushd lib -libname=libxgboost4j_m1_${BUILDKITE_COMMIT}.dylib +libname=libxgboost4j_m1_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -buildkite-agent artifact upload ${libname} + if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then aws s3 cp ${libname} \ diff --git a/ops/task/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh similarity index 96% rename from ops/task/build-jvm-manylinux2014.sh rename to ops/pipeline/build-jvm-manylinux2014.sh index 88bdb256821f..c009de93e62c 100644 --- a/ops/task/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [ $# -ne 1 ]; then echo "Usage: $0 {x86_64,aarch64}" diff --git a/ops/task/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh similarity index 88% rename from ops/task/build-manylinux2014.sh rename to ops/pipeline/build-manylinux2014.sh index 7b71b51a0587..5b1935097d9d 100755 --- a/ops/task/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [ $# -ne 1 ]; then echo "Usage: $0 {x86_64,aarch64}" @@ -18,8 +18,8 @@ python_bin="/opt/python/cp310-cp310/bin/python" echo "--- Build binary wheel for ${WHEEL_TAG}" # Patch to add warning about manylinux2014 variant -patch -p0 < ops/task/patches/remove_nccl_dep.patch -patch -p0 < ops/task/patches/manylinux2014_warning.patch +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/manylinux2014_warning.patch python3 ops/docker_run.py \ --container-id ${image} \ -- bash -c \ @@ -40,8 +40,8 @@ mv -v wheelhouse/*.whl python-package/dist/ echo "--- Build binary wheel for ${WHEEL_TAG} (CPU only)" # Patch to rename pkg to xgboost-cpu -patch -p0 < ops/task/patches/remove_nccl_dep.patch -patch -p0 < ops/task/patches/cpu_only_pypkg.patch +patch -p0 < ops/patch/remove_nccl_dep.patch +patch -p0 < ops/patch/cpu_only_pypkg.patch python3 ops/docker_run.py \ --container-id ${image} \ -- bash -c \ diff --git a/ops/task/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 similarity index 93% rename from ops/task/build-win64-gpu.ps1 rename to ops/pipeline/build-win64-gpu.ps1 index 0b49d143dd5b..48863528684a 100644 --- a/ops/task/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -1,15 +1,15 @@ $ErrorActionPreference = "Stop" -. ops/task/enforce-ci.ps1 +. 
ops/pipeline/enforce-ci.ps1 Write-Host "--- Build libxgboost on Windows with CUDA" nvcc --version -if ( $is_release_branch -eq 0 ) { +#if ( $is_release_branch -eq 0 ) { $arch_flag = "-DGPU_COMPUTE_VER=75" -} else { - $arch_flag = "" -} +#} else { +# $arch_flag = "" +#} git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet mkdir build diff --git a/ops/task/enforce-ci.ps1 b/ops/pipeline/enforce-ci.ps1 similarity index 94% rename from ops/task/enforce-ci.ps1 rename to ops/pipeline/enforce-ci.ps1 index 9183764b9a13..0528472be6cb 100644 --- a/ops/task/enforce-ci.ps1 +++ b/ops/pipeline/enforce-ci.ps1 @@ -1,5 +1,5 @@ ## Ensure that a script is running inside the CI. -## Usage: . ops/task/enforce-ci.ps1 +## Usage: . ops/pipeline/enforce-ci.ps1 if ( -Not $Env:GITHUB_ACTION ) { $script_name = (Split-Path -Path $PSCommandPath -Leaf) diff --git a/ops/task/enforce-ci.sh b/ops/pipeline/enforce-ci.sh similarity index 94% rename from ops/task/enforce-ci.sh rename to ops/pipeline/enforce-ci.sh index dfed11914c9a..48a48f2dc730 100755 --- a/ops/task/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -1,7 +1,7 @@ #!/bin/bash ## Ensure that a script is running inside the CI. -## Usage: source ops/task/enforce-ci.sh +## Usage: source ops/pipeline/enforce-ci.sh set -euo pipefail diff --git a/ops/task/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh similarity index 83% rename from ops/task/run-clang-tidy.sh rename to ops/pipeline/run-clang-tidy.sh index da12a8808a2a..9af3273b0dbe 100755 --- a/ops/task/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -4,7 +4,7 @@ set -euox pipefail echo "--- Run clang-tidy" -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ diff --git a/ops/task/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh similarity index 96% rename from ops/task/test-cpp-gpu.sh rename to ops/pipeline/test-cpp-gpu.sh index 96d11bc9940b..51d097fbdbdf 100755 --- a/ops/task/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 1 ]] then diff --git a/ops/task/test-python.sh b/ops/pipeline/test-python.sh similarity index 98% rename from ops/task/test-python.sh rename to ops/pipeline/test-python.sh index 99f8b0b42277..f0c9c81cb554 100755 --- a/ops/task/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -2,7 +2,7 @@ set -euo pipefail -source ops/task/enforce-ci.sh +source ops/pipeline/enforce-ci.sh if [[ "$#" -lt 2 ]] then @@ -15,9 +15,10 @@ container_id="$2" tee test-python-wrapper.sh <<-'EOF' #!/bin/bash +source activate "$1" + set -euox pipefail -source activate "$1" export PYSPARK_DRIVER_PYTHON=$(which python) export PYSPARK_PYTHON=$(which python) export SPARK_TESTING=1 diff --git a/ops/task/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 similarity index 96% rename from ops/task/test-win64-gpu.ps1 rename to ops/pipeline/test-win64-gpu.ps1 index 21d8f6e7b533..e4a55c77b2bd 100644 --- a/ops/task/test-win64-gpu.ps1 +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -1,6 +1,6 @@ $ErrorActionPreference = "Stop" -. ops/task/enforce-ci.ps1 +. ops/pipeline/enforce-ci.ps1 Write-Host "--- Test XGBoost on Windows with CUDA" diff --git a/ops/stash_artifacts.ps1 b/ops/stash_artifacts.ps1 index 2f8cbaf0a855..57a58d884226 100644 --- a/ops/stash_artifacts.ps1 +++ b/ops/stash_artifacts.ps1 @@ -18,7 +18,7 @@ Inputs $ErrorActionPreference = "Stop" -. ops/task/enforce-ci.ps1 +. 
ops/pipeline/enforce-ci.ps1
 
 foreach ($env in "COMMAND", "KEY", "GITHUB_REPOSITORY", "GITHUB_RUN_ID",
                  "RUNS_ON_S3_BUCKET_CACHE") {
diff --git a/ops/stash_artifacts.sh b/ops/stash_artifacts.sh
index c796831a963d..c2a16f42a26c 100755
--- a/ops/stash_artifacts.sh
+++ b/ops/stash_artifacts.sh
@@ -13,7 +13,7 @@ EOF
 
 set -euo pipefail
 
-source ops/task/enforce-ci.sh
+source ops/pipeline/enforce-ci.sh
 
 if [ "$#" -lt 1 ]; then
   echo "Usage: $0 [artifact] [artifact ...]"
diff --git a/tests/ci_build/test_r_package.py b/ops/test_r_package.py
similarity index 100%
rename from tests/ci_build/test_r_package.py
rename to ops/test_r_package.py
diff --git a/tests/ci_build/test_tidy.cc b/ops/test_tidy.cc
similarity index 100%
rename from tests/ci_build/test_tidy.cc
rename to ops/test_tidy.cc
diff --git a/tests/ci_build/test_utils.py b/ops/test_utils.py
similarity index 100%
rename from tests/ci_build/test_utils.py
rename to ops/test_utils.py
diff --git a/tests/ci_build/tidy.py b/ops/tidy.py
similarity index 100%
rename from tests/ci_build/tidy.py
rename to ops/tidy.py
diff --git a/tests/buildkite/update-rapids.sh b/ops/update_rapids.sh
similarity index 100%
rename from tests/buildkite/update-rapids.sh
rename to ops/update_rapids.sh
diff --git a/tests/ci_build/verify_link.sh b/ops/verify_link.sh
similarity index 100%
rename from tests/ci_build/verify_link.sh
rename to ops/verify_link.sh
diff --git a/tests/buildkite/infrastructure/README.md b/tests/buildkite/infrastructure/README.md
deleted file mode 100644
index cc3e552e70ff..000000000000
--- a/tests/buildkite/infrastructure/README.md
+++ /dev/null
@@ -1,106 +0,0 @@
-BuildKite CI Infrastructure
-===========================
-
-# Worker image builder (`worker-image-pipeline/`)
-
-Use EC2 Image Builder to build machine images in a deterministic fashion.
-The machine images are used to initialize workers in the CI/CD pipelines.
-
-## Editing bootstrap scripts
-
-Currently, we create two pipelines for machine images: one for Linux workers and another
-for Windows workers.
-You can edit the bootstrap scripts to change how the worker machines are initialized.
-
-* `linux-amd64-gpu-bootstrap.yml`: Bootstrap script for Linux worker machines
-* `windows-gpu-bootstrap.yml`: Bootstrap script for Windows worker machines
-
-## Creating and running Image Builder pipelines
-
-Run the following commands to create and run pipelines in the EC2 Image Builder service:
-```bash
-python worker-image-pipeline/create_worker_image_pipelines.py --aws-region us-west-2
-python worker-image-pipeline/run_pipelines.py --aws-region us-west-2
-```
-Go to the AWS CloudFormation console and verify the existence of two CloudFormation stacks:
-* `buildkite-windows-gpu-worker`
-* `buildkite-linux-amd64-gpu-worker`
-
-Then go to the EC2 Image Builder console to check the status of the image builds. You may
-want to inspect the log output should a build fail.
-Once the new machine images are done building, see the next section to deploy the new
-images to the worker machines.
-
-# Elastic CI Stack for AWS (`aws-stack-creator/`)
-
-Use EC2 Autoscaling groups to launch worker machines in EC2. BuildKite periodically sends
-messages to the Autoscaling groups to increase or decrease the number of workers according
-to the number of outstanding testing jobs.
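The scaling activity described above can also be checked without the AWS console. Below is a minimal sketch using the AWS CLI (a hypothetical read-only query, assuming credentials for the CI account and that the Auto Scaling group names contain `buildkite`, as the stack names in the next section suggest):

```bash
# Sketch: show the desired/min/max capacity of the Buildkite worker
# Auto Scaling groups that BuildKite scales based on the job backlog.
# Assumes the AWS CLI and autoscaling:Describe* permissions.
aws autoscaling describe-auto-scaling-groups \
    --region us-west-2 \
    --query "AutoScalingGroups[?contains(AutoScalingGroupName, 'buildkite')].{Name: AutoScalingGroupName, Desired: DesiredCapacity, Min: MinSize, Max: MaxSize}" \
    --output table
```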
- -## Deploy an updated CI stack with new machine images - -First, edit `aws-stack-creator/metadata.py` to update the `AMI_ID` fields: -```python -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": "...", - }, - "linux-amd64-mgpu": { - "us-west-2": "...", - }, - "windows-gpu": { - "us-west-2": "...", - }, - "windows-cpu": { - "us-west-2": "...", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "...", - }, - "pipeline-loader": { - "us-west-2": "...", - }, - "linux-arm64-cpu": { - "us-west-2": "...", - }, -} -``` -AMI IDs uniquely identify the machine images in the EC2 service. -Go to the EC2 Image Builder console to find the AMI IDs for the new machine images -(see the previous section), and update the following fields: - -* `AMI_ID["linux-amd64-gpu"]["us-west-2"]`: - Use the latest output from the `buildkite-linux-amd64-gpu-worker` pipeline -* `AMI_ID["linux-amd64-mgpu"]["us-west-2"]`: - Should be identical to `AMI_ID["linux-amd64-gpu"]["us-west-2"]` -* `AMI_ID["windows-gpu"]["us-west-2"]`: - Use the latest output from the `buildkite-windows-gpu-worker` pipeline -* `AMI_ID["windows-cpu"]["us-west-2"]`: - Should be identical to `AMI_ID["windows-gpu"]["us-west-2"]` - -Next, visit https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml -to look up the AMI IDs for the following fields: - -* `AMI_ID["linux-amd64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field - `Mappings/AWSRegion2AMI/us-west-2/linuxamd64` -* `AMI_ID["pipeline-loader"]["us-west-2"]`: - Should be identical to `AMI_ID["linux-amd64-cpu"]["us-west-2"]` -* `AMI_ID["linux-arm64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field - `Mappings/AWSRegion2AMI/us-west-2/linuxarm64` - -Finally, run the following commands to deploy the new machine images: -``` -python aws-stack-creator/create_stack.py --aws-region us-west-2 --agent-token AGENT_TOKEN -``` -Go to the AWS CloudFormation console and verify the existence of the following -CloudFormation stacks: -* `buildkite-pipeline-loader-autoscaling-group` -* `buildkite-linux-amd64-cpu-autoscaling-group` -* `buildkite-linux-amd64-gpu-autoscaling-group` -* `buildkite-linux-amd64-mgpu-autoscaling-group` -* `buildkite-linux-arm64-cpu-autoscaling-group` -* `buildkite-windows-cpu-autoscaling-group` -* `buildkite-windows-gpu-autoscaling-group` diff --git a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml b/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml deleted file mode 100644 index 7f15b1fbcd4f..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/agent-iam-policy-template.yml +++ /dev/null @@ -1,32 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "Buildkite agent's IAM policy" - -Resources: - BuildkiteAgentManagedPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:*", - "s3-object-lambda:*" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "lambda:InvokeFunction", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": "secretsmanager:GetSecretValue", - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py b/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py deleted file mode 100644 index 8f8db348a073..000000000000 --- 
a/tests/buildkite/infrastructure/aws-stack-creator/create_stack.py +++ /dev/null @@ -1,127 +0,0 @@ -import argparse -import copy -import os -import re -import sys - -import boto3 -import botocore -from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS - -current_dir = os.path.dirname(__file__) -sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import create_or_update_stack, wait - -TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" - - -def get_availability_zones(*, aws_region): - client = boto3.client("ec2", region_name=aws_region) - r = client.describe_availability_zones( - Filters=[ - {"Name": "region-name", "Values": [aws_region]}, - {"Name": "zone-type", "Values": ["availability-zone"]}, - ] - ) - return sorted([x["ZoneName"] for x in r["AvailabilityZones"]]) - - -def get_default_vpc(*, aws_region): - ec2 = boto3.resource("ec2", region_name=aws_region) - default_vpc_id = None - for x in ec2.vpcs.filter(Filters=[{"Name": "is-default", "Values": ["true"]}]): - return x - - # Create default VPC if not exist - client = boto3.client("ec2", region_name=aws_region) - r = client.create_default_vpc() - default_vpc_id = r["Vpc"]["VpcId"] - - return ec2.Vpc(default_vpc_id) - - -def format_params(args, *, stack_id, agent_iam_policy): - default_vpc = get_default_vpc(aws_region=args.aws_region) - azs = get_availability_zones(aws_region=args.aws_region) - # For each of the first two availability zones (AZs), choose the default subnet - subnets = [ - x.id - for x in default_vpc.subnets.filter( - Filters=[ - {"Name": "default-for-az", "Values": ["true"]}, - {"Name": "availability-zone", "Values": azs[:2]}, - ] - ) - ] - assert len(subnets) == 2 - - params = copy.deepcopy(STACK_PARAMS[stack_id]) - params["ImageId"] = AMI_ID[stack_id][args.aws_region] - params["BuildkiteQueue"] = stack_id - params["CostAllocationTagValue"] = f"buildkite-{stack_id}" - params["BuildkiteAgentToken"] = args.agent_token - params["VpcId"] = default_vpc.id - params["Subnets"] = ",".join(subnets) - params["ManagedPolicyARNs"] = agent_iam_policy - params.update(COMMON_STACK_PARAMS) - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-autoscaling-group" - - -def create_agent_iam_policy(args, *, client): - policy_stack_name = "buildkite-agent-iam-policy" - print(f"Creating stack {policy_stack_name} for agent IAM policy...") - with open( - os.path.join(current_dir, "agent-iam-policy-template.yml"), - encoding="utf-8", - ) as f: - policy_template = f.read() - promise = create_or_update_stack( - args, client=client, stack_name=policy_stack_name, template_body=policy_template - ) - wait(promise, client=client) - - cf = boto3.resource("cloudformation", region_name=args.aws_region) - policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy") - return policy.physical_resource_id - - -def main(args): - client = boto3.client("cloudformation", region_name=args.aws_region) - - agent_iam_policy = create_agent_iam_policy(args, client=client) - - promises = [] - - for stack_id in AMI_ID: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating elastic CI stack {stack_id_full}...") - - params = format_params( - args, stack_id=stack_id, agent_iam_policy=agent_iam_policy - ) - - promise = create_or_update_stack( - args, - client=client, - stack_name=stack_id_full, - template_url=TEMPLATE_URL, - params=params, - ) - promises.append(promise) - print(f"CI stack 
{stack_id_full} is in progress in the background") - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - parser.add_argument("--agent-token", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py deleted file mode 100644 index 5012aa738854..000000000000 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ /dev/null @@ -1,114 +0,0 @@ -AMI_ID = { - # Managed by XGBoost team - "linux-amd64-gpu": { - "us-west-2": "ami-0b4079c15bbbd0faf", - }, - "linux-amd64-mgpu": { - "us-west-2": "ami-0b4079c15bbbd0faf", - }, - "windows-gpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - "windows-cpu": { - "us-west-2": "ami-0123456bcf4cdfb82", - }, - # Managed by BuildKite - # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml - "linux-amd64-cpu": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "pipeline-loader": { - "us-west-2": "ami-0083e0ae73c175ec6", - }, - "linux-arm64-cpu": { - "us-west-2": "ami-0dbf1f9da54222f21", - }, -} - -STACK_PARAMS = { - "linux-amd64-gpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-mgpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "g4dn.12xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "1", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-gpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "g4dn.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "windows-cpu": { - "InstanceOperatingSystem": "windows", - "InstanceTypes": "c5a.2xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-amd64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c5a.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "16", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "pipeline-loader": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "t3a.micro", - "AgentsPerInstance": "1", - "MinSize": "2", - "MaxSize": "2", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, - "linux-arm64-cpu": { - "InstanceOperatingSystem": "linux", - "InstanceTypes": "c6g.4xlarge", - "AgentsPerInstance": "1", - "MinSize": "0", - "MaxSize": "8", - "OnDemandPercentage": "100", - "ScaleOutFactor": "1.0", - "ScaleInIdlePeriod": "60", # in seconds - }, -} - -COMMON_STACK_PARAMS = { - "BuildkiteAgentTimestampLines": "false", - "BuildkiteWindowsAdministrator": "true", - "AssociatePublicIpAddress": "true", - "ScaleOutForWaitingJobs": "false", - "EnableCostAllocationTags": "true", - "CostAllocationTagName": "CreatedBy", - "ECRAccessPolicy": "full", - "EnableSecretsPlugin": "false", - "EnableECRPlugin": "false", - "EnableDockerLoginPlugin": "false", - 
"EnableDockerUserNamespaceRemap": "false", - "BuildkiteAgentExperiments": "normalised-upload-paths,resolve-commit-after-checkout", -} diff --git a/tests/buildkite/infrastructure/common_blocks/utils.py b/tests/buildkite/infrastructure/common_blocks/utils.py deleted file mode 100644 index 27a0835e8dc0..000000000000 --- a/tests/buildkite/infrastructure/common_blocks/utils.py +++ /dev/null @@ -1,97 +0,0 @@ -import re - -import boto3 -import botocore - - -def stack_exists(args, *, stack_name): - client = boto3.client("cloudformation", region_name=args.aws_region) - waiter = client.get_waiter("stack_exists") - try: - waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1}) - return True - except botocore.exceptions.WaiterError as e: - return False - - -def create_or_update_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - - if stack_exists(args, stack_name=stack_name): - print(f"Stack {stack_name} already exists. Updating...") - try: - response = client.update_stack(**kwargs) - return {"StackName": stack_name, "Action": "update"} - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "ValidationError" and re.search( - "No updates are to be performed", e.response["Error"]["Message"] - ): - print(f"No update was made to {stack_name}") - return {"StackName": stack_name, "Action": "noop"} - else: - raise e - else: - kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False}) - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def replace_stack( - args, *, client, stack_name, template_url=None, template_body=None, params=None -): - """Delete an existing stack and create a new stack with identical name""" - - if not stack_exists(args, stack_name=stack_name): - raise ValueError(f"Stack {stack_name} does not exist") - r = client.delete_stack(StackName=stack_name) - delete_waiter = client.get_waiter("stack_delete_complete") - delete_waiter.wait(StackName=stack_name) - - kwargs = { - "StackName": stack_name, - "Capabilities": [ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - "CAPABILITY_AUTO_EXPAND", - ], - "OnFailure": "ROLLBACK", - "EnableTerminationProtection": False, - } - if template_url: - kwargs["TemplateURL"] = template_url - if template_body: - kwargs["TemplateBody"] = template_body - if params: - kwargs["Parameters"] = params - response = client.create_stack(**kwargs) - return {"StackName": stack_name, "Action": "create"} - - -def wait(promise, *, client): - stack_name = promise["StackName"] - print(f"Waiting for {stack_name}...") - if promise["Action"] == "create": - waiter = client.get_waiter("stack_create_complete") - waiter.wait(StackName=stack_name) - print(f"Finished creating stack {stack_name}") - elif promise["Action"] == "update": - waiter = client.get_waiter("stack_update_complete") - waiter.wait(StackName=stack_name) - print(f"Finished updating stack {stack_name}") - elif promise["Action"] != "noop": - raise ValueError(f"Invalid promise {promise}") diff --git a/tests/buildkite/infrastructure/requirements.txt b/tests/buildkite/infrastructure/requirements.txt deleted file mode 100644 index 3ce271ebbdd6..000000000000 --- 
a/tests/buildkite/infrastructure/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -boto3 -cfn_tools diff --git a/tests/buildkite/infrastructure/service-user/create_service_user.py b/tests/buildkite/infrastructure/service-user/create_service_user.py deleted file mode 100644 index ba08779bd159..000000000000 --- a/tests/buildkite/infrastructure/service-user/create_service_user.py +++ /dev/null @@ -1,44 +0,0 @@ -import argparse -import os - -import boto3 - -current_dir = os.path.dirname(__file__) - - -def main(args): - with open( - os.path.join(current_dir, "service-user-template.yml"), encoding="utf-8" - ) as f: - service_user_template = f.read() - - stack_id = "buildkite-elastic-ci-stack-service-user" - - print("Create a new IAM user with suitable permissions...") - client = boto3.client("cloudformation", region_name=args.aws_region) - response = client.create_stack( - StackName=stack_id, - TemplateBody=service_user_template, - Capabilities=[ - "CAPABILITY_IAM", - "CAPABILITY_NAMED_IAM", - ], - Parameters=[{"ParameterKey": "UserName", "ParameterValue": args.user_name}], - ) - waiter = client.get_waiter("stack_create_complete") - waiter.wait(StackName=stack_id) - user = boto3.resource("iam", region_name=args.aws_region).User(args.user_name) - key_pair = user.create_access_key_pair() - print("Finished creating an IAM users with suitable permissions.") - print(f"Access Key ID: {key_pair.access_key_id}") - print(f"Access Secret Access Key: {key_pair.secret_access_key}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - parser.add_argument( - "--user-name", type=str, default="buildkite-elastic-ci-stack-user" - ) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/service-user/service-user-template.yml b/tests/buildkite/infrastructure/service-user/service-user-template.yml deleted file mode 100644 index 2077cfe7b148..000000000000 --- a/tests/buildkite/infrastructure/service-user/service-user-template.yml +++ /dev/null @@ -1,349 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "Buildkite Elastic CI Stack CloudFormation service user" - -Parameters: - UserName: - Type: String - Default: buildkite-elastic-ci-stack-user - Description: Name of user to create - -Outputs: - UserNameOutput: - Value: !Ref CloudFormationServiceUser - UserArnOutput: - Value: !GetAtt CloudFormationServiceUser.Arn - -Resources: - CloudFormationServiceUser: - Type: AWS::IAM::User - Properties: - ManagedPolicyArns: - - !Ref SubstackCrudPolicy - - !Ref CrudPolicy - - !Ref ImageBuilderPolicy - UserName: !Ref UserName - - SubstackCrudPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": "cloudformation:*", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "serverlessrepo:GetApplication", - "serverlessrepo:GetCloudFormationTemplate", - "serverlessrepo:CreateCloudFormationTemplate" - ], - "Resource": "*" - } - ] - } - - CrudPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "ec2:DescribeAccountAttributes", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeInstances", - "ec2:DescribeInternetGateways", - "ec2:DescribeLaunchTemplateVersions", - "ec2:DescribeLaunchTemplates", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribeRouteTables", - "ec2:DescribeSecurityGroups", - 
"ec2:DescribeSubnets", - "ec2:DescribeVpcs", - "ec2:CreateTags" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateInternetGateway", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:DeleteInternetGateway" - ], - "Resource": "arn:aws:ec2:*:*:internet-gateway/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateLaunchTemplate", - "ec2:CreateLaunchTemplateVersion", - "ec2:DeleteLaunchTemplate" - ], - "Resource": "arn:aws:ec2:*:*:launch-template/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable", - "ec2:CreateRoute", - "ec2:CreateRouteTable", - "ec2:DeleteRoute", - "ec2:DeleteRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:route-table/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:AuthorizeSecurityGroupIngress", - "ec2:RevokeSecurityGroupIngress", - "ec2:CreateSecurityGroup", - "ec2:DeleteSecurityGroup" - ], - "Resource": "arn:aws:ec2:*:*:security-group/*" - }, - { - "Effect": "Allow", - "Action": "ec2:RunInstances", - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateSubnet", - "ec2:DeleteSubnet", - "ec2:AssociateRouteTable", - "ec2:DisassociateRouteTable" - ], - "Resource": "arn:aws:ec2:*:*:subnet/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateVpc", - "ec2:CreateSecurityGroup", - "ec2:ModifyVpcAttribute", - "ec2:AttachInternetGateway", - "ec2:DetachInternetGateway", - "ec2:CreateSubnet", - "ec2:CreateRouteTable", - "ec2:DeleteVpc" - ], - "Resource": "arn:aws:ec2:*:*:vpc/*" - }, - { - "Effect": "Allow", - "Action": [ - "ec2:CreateDefaultVpc", - "ec2:CreateDefaultSubnet" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:CreateInstanceProfile", - "iam:GetInstanceProfile", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:DeleteInstanceProfile" - ], - "Resource": "arn:aws:iam::*:instance-profile/*" - }, - { - "Effect": "Allow", - "Action": [ - "kms:DescribeKey", - "kms:CreateGrant", - "kms:Decrypt", - "kms:Encrypt" - ], - "Resource": "arn:aws:kms:*:*:key/*" - }, - { - "Effect": "Allow", - "Action": [ - "lambda:CreateFunction", - "lambda:GetFunction", - "lambda:GetFunctionCodeSigningConfig", - "lambda:AddPermission", - "lambda:RemovePermission", - "lambda:DeleteFunction", - "lambda:InvokeFunction", - "lambda:TagResource" - ], - "Resource": "arn:aws:lambda:*:*:function:*" - }, - { - "Effect": "Allow", - "Action": [ - "logs:CreateLogGroup", - "logs:PutRetentionPolicy", - "logs:DeleteLogGroup" - ], - "Resource": "arn:aws:logs:*:*:log-group:*" - }, - { - "Effect": "Allow", - "Action": [ - "s3:GetObject", - "s3:CreateBucket", - "s3:PutBucketAcl", - "s3:PutBucketLogging", - "s3:PutBucketTagging", - "s3:PutBucketVersioning" - ], - "Resource": "arn:aws:s3:::*" - }, - { - "Effect": "Allow", - "Action": [ - "ssm:GetParameter", - "ssm:PutParameter", - "ssm:DeleteParameter" - ], - "Resource": "arn:aws:ssm:*:*:parameter/*" - }, - { - "Effect": "Allow", - "Action": [ - "iam:ListPolicies", - "iam:ListInstanceProfiles", - "iam:ListRoles", - "iam:ListPolicyVersions", - "iam:ListRolePolicies", - "iam:ListAttachedRolePolicies", - "iam:ListInstanceProfileTags", - "iam:ListRoleTags", - "iam:ListInstanceProfilesForRole", - "iam:GetPolicyVersion", - "iam:GetPolicy", - "iam:GetInstanceProfile", - "iam:GetRole", - "iam:GetRolePolicy", - "iam:TagPolicy", - "iam:UntagPolicy", - "iam:TagInstanceProfile", - "iam:UntagInstanceProfile", - "iam:TagRole", - "iam:UntagRole", - "iam:CreateRole", - "iam:PassRole", - 
"iam:DeleteRole", - "iam:UpdateRoleDescription", - "iam:UpdateRole", - "iam:AddRoleToInstanceProfile", - "iam:RemoveRoleFromInstanceProfile", - "iam:CreateInstanceProfile", - "iam:DeleteInstanceProfile", - "iam:DetachRolePolicy", - "iam:SetDefaultPolicyVersion", - "iam:AttachRolePolicy", - "iam:UpdateAssumeRolePolicy", - "iam:PutRolePermissionsBoundary", - "iam:DeleteRolePermissionsBoundary", - "iam:CreatePolicy", - "iam:DeletePolicyVersion", - "iam:DeletePolicy", - "iam:PutRolePolicy", - "iam:DeleteRolePolicy" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "autoscaling:DescribeLifecycleHookTypes", - "autoscaling:DescribeTerminationPolicyTypes", - "autoscaling:DescribePolicies", - "autoscaling:DescribeWarmPool", - "autoscaling:DescribeScalingActivities", - "autoscaling:DescribeScalingProcessTypes", - "autoscaling:DescribeScheduledActions", - "autoscaling:DescribeAutoScalingGroups", - "autoscaling:DescribeAutoScalingInstances", - "autoscaling:DescribeLifecycleHooks", - "autoscaling:SetDesiredCapacity", - "autoscaling:PutLifecycleHook", - "autoscaling:DeleteLifecycleHook", - "autoscaling:SetInstanceProtection", - "autoscaling:CreateAutoScalingGroup", - "autoscaling:EnableMetricsCollection", - "autoscaling:UpdateAutoScalingGroup", - "autoscaling:DeleteAutoScalingGroup", - "autoscaling:PutScalingPolicy", - "autoscaling:DeletePolicy", - "autoscaling:BatchPutScheduledUpdateGroupAction", - "autoscaling:PutScheduledUpdateGroupAction", - "autoscaling:DeleteScheduledAction", - "autoscaling:PutWarmPool", - "autoscaling:DeleteWarmPool", - "autoscaling:TerminateInstanceInAutoScalingGroup", - "autoscaling:AttachInstances" - ], - "Resource": "*" - }, - { - "Effect": "Allow", - "Action": [ - "events:DescribeRule", - "events:PutRule", - "events:PutTargets", - "events:RemoveTargets", - "events:DeleteRule" - ], - "Resource": "arn:aws:events:*:*:rule/*" - } - ] - } - - ImageBuilderPolicy: - Type: AWS::IAM::ManagedPolicy - Properties: - PolicyDocument: - { - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "imagebuilder:CreateComponent", - "imagebuilder:GetComponent", - "imagebuilder:DeleteComponent", - "imagebuilder:CreateImageRecipe", - "imagebuilder:GetImageRecipe", - "imagebuilder:DeleteImageRecipe", - "imagebuilder:CreateImagePipeline", - "imagebuilder:GetImagePipeline", - "imagebuilder:DeleteImagePipeline", - "imagebuilder:CreateInfrastructureConfiguration", - "imagebuilder:GetInfrastructureConfiguration", - "imagebuilder:DeleteInfrastructureConfiguration", - "imagebuilder:CreateDistributionConfiguration", - "imagebuilder:GetDistributionConfiguration", - "imagebuilder:DeleteDistributionConfiguration", - "imagebuilder:TagResource", - "imagebuilder:StartImagePipelineExecution", - "ec2:DescribeImages", - "ec2:DescribeSnapshots", - "ec2:DescribeRegions", - "ec2:DescribeVolumes", - "ec2:DescribeKeyPairs", - "ec2:DescribeInstanceTypeOfferings" - ], - "Resource": "*" - } - ] - } diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py deleted file mode 100644 index 8051b991da51..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py +++ /dev/null @@ -1,85 +0,0 @@ -import argparse -import copy -import json -import os -import sys -from urllib.request import urlopen - -import boto3 -import cfn_flip -from metadata import IMAGE_PARAMS - -current_dir = os.path.dirname(__file__) 
-sys.path.append(os.path.join(current_dir, "..")) - -from common_blocks.utils import replace_stack, wait - -BUILDKITE_CF_TEMPLATE_URL = ( - "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml" -) - - -def format_params(*, stack_id, aws_region, ami_mapping): - params = copy.deepcopy(IMAGE_PARAMS[stack_id]) - with open( - os.path.join(current_dir, params["BootstrapScript"]), - encoding="utf-8", - ) as f: - bootstrap_script = f.read() - params["BaseImageId"] = ami_mapping[aws_region][params["BaseImageId"]] - params["BootstrapScript"] = bootstrap_script - return [{"ParameterKey": k, "ParameterValue": v} for k, v in params.items()] - - -def get_ami_mapping(): - with urlopen(BUILDKITE_CF_TEMPLATE_URL) as response: - buildkite_cf_template = response.read().decode("utf-8") - cfn_obj = json.loads(cfn_flip.to_json(buildkite_cf_template)) - return cfn_obj["Mappings"]["AWSRegion2AMI"] - - -def get_full_stack_id(stack_id): - return f"buildkite-{stack_id}-worker" - - -def main(args): - with open( - os.path.join(current_dir, "ec2-image-builder-pipeline-template.yml"), - encoding="utf-8", - ) as f: - ec2_image_pipeline_template = f.read() - - ami_mapping = get_ami_mapping() - - client = boto3.client("cloudformation", region_name=args.aws_region) - promises = [] - - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - print(f"Creating EC2 image builder stack {stack_id_full}...") - - params = format_params( - stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping - ) - - promise = replace_stack( - args, - client=client, - stack_name=stack_id_full, - template_body=ec2_image_pipeline_template, - params=params, - ) - promises.append(promise) - print( - f"EC2 image builder stack {stack_id_full} is in progress in the background" - ) - - for promise in promises: - wait(promise, client=client) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml b/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml deleted file mode 100644 index 8d3bafa72f08..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/ec2-image-builder-pipeline-template.yml +++ /dev/null @@ -1,108 +0,0 @@ ---- -AWSTemplateFormatVersion: "2010-09-09" -Description: "EC2 Image Builder pipelines to build workers" - -Parameters: - BaseImageId: - Type: String - Description: Base AMI to build a new image on top of. - - BootstrapScript: - Type: String - Description: Content of AMI customization script - - InstanceType: - Type: String - Description: Instance type for the Image Builder instances. 
- - InstanceOperatingSystem: - Type: String - Description: The operating system to run on the instance - AllowedValues: - - Linux - - Windows - Default: "Linux" - - VolumeSize: - Type: Number - Description: Size of EBS volume, in GiBs - -Conditions: - IsInstanceWindows: - !Equals [ !Ref InstanceOperatingSystem, "Windows" ] - -Resources: - # IAM role for the image builder instance - InstanceRole: - Type: AWS::IAM::Role - Properties: - AssumeRolePolicyDocument: - Version: "2012-10-17" - Statement: - - Effect: "Allow" - Principal: - Service: "ec2.amazonaws.com" - Action: "sts:AssumeRole" - ManagedPolicyArns: - - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - - arn:aws:iam::aws:policy/EC2InstanceProfileForImageBuilder - - arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess - - InstanceProfile: - Type: AWS::IAM::InstanceProfile - Properties: - Roles: - - !Ref InstanceRole - - # Component that runs the bootstrap script - BootstrapComponent: - Type: AWS::ImageBuilder::Component - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Platform: !Ref InstanceOperatingSystem - Version: "1.0.0" - Description: Execute a bootstrap script. - Data: !Ref BootstrapScript - - Recipe: - Type: AWS::ImageBuilder::ImageRecipe - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Components: - - ComponentArn: !Ref BootstrapComponent - ParentImage: !Ref BaseImageId - BlockDeviceMappings: - - DeviceName: !If [IsInstanceWindows, "/dev/sda1", "/dev/xvda"] - Ebs: - DeleteOnTermination: true - Encrypted: false - VolumeSize: !Ref VolumeSize - VolumeType: gp2 - Version: "1.0.0" - - Infrastructure: - Type: AWS::ImageBuilder::InfrastructureConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - InstanceProfileName: !Ref InstanceProfile - InstanceTypes: - - !Ref InstanceType - TerminateInstanceOnFailure: true - - # Copy to this region only - Distribution: - Type: AWS::ImageBuilder::DistributionConfiguration - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - Distributions: - - Region: !Ref AWS::Region - AmiDistributionConfiguration: {} - - # Composition of the above elements - Pipeline: - Type: AWS::ImageBuilder::ImagePipeline - Properties: - Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]] - DistributionConfigurationArn: !Ref Distribution - ImageRecipeArn: !Ref Recipe - InfrastructureConfigurationArn: !Ref Infrastructure diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml deleted file mode 100644 index 88403911cbc6..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/linux-amd64-gpu-bootstrap.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: BuildKiteLinuxAMD64GPUBootstrap -description: Set up worker image for linux-amd64-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecuteBash - inputs: - commands: - - | - yum groupinstall -y "Development tools" - yum install -y kernel-devel-$(uname -r) - dnf install -y kernel-modules-extra - aws s3 cp --recursive s3://ec2-linux-nvidia-drivers/latest/ . 
- chmod +x NVIDIA-Linux-x86_64*.run - ./NVIDIA-Linux-x86_64*.run --silent - - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | tee /etc/yum.repos.d/nvidia-container-toolkit.repo - yum install -y nvidia-container-toolkit - yum clean expire-cache - nvidia-ctk runtime configure --runtime=docker - systemctl restart docker diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py b/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py deleted file mode 100644 index 37100209fe2e..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/metadata.py +++ /dev/null @@ -1,18 +0,0 @@ -IMAGE_PARAMS = { - "linux-amd64-gpu": { - "BaseImageId": "linuxamd64", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "linux-amd64-gpu-bootstrap.yml", - "InstanceType": "g4dn.xlarge", - "InstanceOperatingSystem": "Linux", - "VolumeSize": "40", # in GiBs - }, - "windows-gpu": { - "BaseImageId": "windows", - # AMI ID is looked up from Buildkite's CloudFormation template - "BootstrapScript": "windows-gpu-bootstrap.yml", - "InstanceType": "g4dn.2xlarge", - "InstanceOperatingSystem": "Windows", - "VolumeSize": "120", # in GiBs - }, -} diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py b/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py deleted file mode 100644 index 9edb8b1a7c24..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py +++ /dev/null @@ -1,22 +0,0 @@ -import argparse - -import boto3 -from create_worker_image_pipelines import get_full_stack_id -from metadata import IMAGE_PARAMS - - -def main(args): - cf = boto3.resource("cloudformation", region_name=args.aws_region) - builder_client = boto3.client("imagebuilder", region_name=args.aws_region) - for stack_id in IMAGE_PARAMS: - stack_id_full = get_full_stack_id(stack_id) - pipeline_arn = cf.Stack(stack_id_full).Resource("Pipeline").physical_resource_id - print(f"Running pipeline {pipeline_arn} to generate a new AMI...") - r = builder_client.start_image_pipeline_execution(imagePipelineArn=pipeline_arn) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--aws-region", type=str, required=True) - args = parser.parse_args() - main(args) diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml deleted file mode 100644 index 0348e28c8709..000000000000 --- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: BuildKiteWindowsGPUBootstrap -description: Set up worker image for windows-gpu pipeline -schemaVersion: 1.0 - -phases: - - name: build - steps: - - name: SetupStep - action: ExecutePowerShell - inputs: - commands: - - | - $ErrorActionPreference = "Stop" - - choco --version - choco feature enable -n=allowGlobalConfirmation - - # CMake 3.29.2 - Write-Host '>>> Installing CMake 3.29.2...' - choco install cmake --version 3.29.2 --installargs "ADD_CMAKE_TO_PATH=System" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Notepad++ - Write-Host '>>> Installing Notepad++...' - choco install notepadplusplus - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Mambaforge - Write-Host '>>> Installing Mambaforge...' 
- choco install mambaforge /RegisterPython:1 /D:C:\tools\mambaforge - C:\tools\mambaforge\Scripts\conda.exe init --user --system - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - . "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - conda config --set auto_activate_base false - - # Install Java 11 - Write-Host '>>> Installing Java 11...' - choco install openjdk11 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Maven - Write-Host '>>> Installing Maven...' - choco install maven - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install GraphViz - Write-Host '>>> Installing GraphViz...' - choco install graphviz - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install Visual Studio 2022 Community - Write-Host '>>> Installing Visual Studio 2022 Community...' - choco install visualstudio2022community ` - --params "--wait --passive --norestart" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install visualstudio2022-workload-nativedesktop --params ` - "--wait --passive --norestart --includeOptional" - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install CUDA 12.4 - Write-Host '>>> Installing CUDA 12.4...' - choco install cuda --version=12.4.1.551 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - - # Install R - Write-Host '>>> Installing R...' - choco install r.project --version=4.3.2 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } - choco install rtools --version=4.3.5550 - if ($LASTEXITCODE -ne 0) { throw "Last command failed" } diff --git a/tests/buildkite/pipeline-mac-m1.yml b/tests/buildkite/pipeline-mac-m1.yml deleted file mode 100644 index 57b1b1d12010..000000000000 --- a/tests/buildkite/pipeline-mac-m1.yml +++ /dev/null @@ -1,13 +0,0 @@ -steps: - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - - label: ":macos: Build libxgboost4j.dylib for MacOS M1" - command: "tests/buildkite/build-jvm-macos-m1.sh" - key: mac-m1-jvm - agents: - queue: mac-mini-m1 - - label: ":macos: Build and Test XGBoost for MacOS M1 with Clang 11" - command: "tests/buildkite/test-macos-m1-clang11.sh" - key: mac-m1-appleclang11 - agents: - queue: mac-mini-m1 diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index cbb573c3682c..4246425de0ca 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -24,25 +24,9 @@ steps: queue: linux-amd64-cpu - wait #### -------- BUILD -------- - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - label: ":console: Build and test JVM packages with CUDA" command: "tests/buildkite/build-jvm-packages-gpu.sh" key: build-jvm-packages-gpu agents: queue: linux-amd64-mgpu - wait - #### -------- TEST -------- - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-mgpu.sh" - key: test-cpp-mgpu - agents: - queue: linux-amd64-mgpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-nightly.yml b/tests/buildkite/pipeline-nightly.yml deleted file mode 100644 index c8cc459acc1e..000000000000 --- a/tests/buildkite/pipeline-nightly.yml +++ /dev/null @@ -1,37 +0,0 @@ -# Nightly CI pipeline, to test against dev versions of dependencies - -env: - 
DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 - USE_DEPS_DEV_VER: "1" - # Use dev versions of RAPIDS and other dependencies -steps: - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh gpu_dev_ver" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - - - label: ":console: Build CUDA" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - wait - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Test Python package, 4 GPUs" - command: "tests/buildkite/test-python-gpu.sh mgpu" - key: test-python-mgpu - agents: - queue: linux-amd64-mgpu diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml deleted file mode 100644 index 83a61981e716..000000000000 --- a/tests/buildkite/pipeline-win64.yml +++ /dev/null @@ -1,24 +0,0 @@ -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- BUILD -------- - - label: ":windows: Build XGBoost for Windows with CUDA" - command: "tests/buildkite/build-win64-gpu.ps1" - key: build-win64-gpu - agents: - queue: windows-cpu - - - wait - - #### -------- TEST -------- - - label: ":windows: Test XGBoost on Windows" - command: "tests/buildkite/test-win64-gpu.ps1" - key: test-win64-gpu - agents: - queue: windows-gpu diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 19e9c6e2b9e5..65225649a3af 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -21,89 +21,19 @@ steps: queue: linux-amd64-cpu - wait #### -------- BUILD -------- - - label: ":console: Run clang-tidy" - command: "tests/buildkite/run-clang-tidy.sh" - key: run-clang-tidy - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU" - command: "tests/buildkite/build-cpu.sh" - key: build-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Build CPU ARM64 + manylinux_2_28_aarch64 wheel" - command: "tests/buildkite/build-cpu-arm64.sh" - key: build-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Build CUDA + manylinux_2_28_x86_64 wheel" - command: "tests/buildkite/build-cuda.sh" - key: build-cuda - agents: - queue: linux-amd64-cpu - - label: ":console: Build CUDA with RMM" - command: "tests/buildkite/build-cuda-with-rmm.sh" - key: build-cuda-with-rmm - agents: - queue: linux-amd64-cpu - - label: ":console: Build R package with CUDA" - command: "tests/buildkite/build-gpu-rpkg.sh" - key: build-gpu-rpkg - agents: - queue: linux-amd64-cpu - label: ":console: Build JVM packages" timeout_in_minutes: 30 command: "tests/buildkite/build-jvm-packages.sh" key: build-jvm-packages agents: queue: linux-amd64-cpu - - label: ":console: Build libxgboost4j.so for Linux ARM64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-arm64-manylinux2014.sh" - key: build-jvm-linux-arm64-manylinux2014 - agents: - queue: linux-arm64-cpu - - label: ":console: Build libxgboost4j.so for Linux 
x86_64 (targeting glibc 2.17)" - command: "tests/buildkite/build-jvm-linux-x86_64-manylinux2014.sh" - key: build-jvm-linux-x86_64-manylinux2014 - agents: - queue: linux-amd64-cpu - label: ":console: Build JVM package doc" command: "tests/buildkite/build-jvm-doc.sh" key: build-jvm-doc agents: queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_x86_64 wheel" - command: "tests/buildkite/build-manylinux2014.sh x86_64" - key: build-manylinux2014-x86_64 - agents: - queue: linux-amd64-cpu - - label: ":console: Build manylinux2014_aarch64 wheel" - command: "tests/buildkite/build-manylinux2014.sh aarch64" - key: build-manylinux2014-aarch64 - agents: - queue: linux-arm64-cpu - wait #### -------- TEST -------- - - label: ":console: Test Python package, CPU" - command: "tests/buildkite/test-python-cpu.sh" - key: test-python-cpu - agents: - queue: linux-amd64-cpu - - label: ":console: Test Python package, CPU ARM64" - command: "tests/buildkite/test-python-cpu-arm64.sh" - key: test-python-cpu-arm64 - agents: - queue: linux-arm64-cpu - - label: ":console: Test Python package, single GPU" - command: "tests/buildkite/test-python-gpu.sh gpu" - key: test-python-gpu - agents: - queue: linux-amd64-gpu - - label: ":console: Run Google Tests" - command: "tests/buildkite/test-cpp-gpu.sh" - key: test-cpp-gpu - agents: - queue: linux-amd64-gpu - label: ":console: Run integration tests with JVM packages" command: "tests/buildkite/test-integration-jvm-packages.sh" key: test-integration-jvm-packages diff --git a/tests/buildkite/test-integration-jvm-packages.sh b/tests/buildkite/test-integration-jvm-packages.sh deleted file mode 100755 index 51f74afe9006..000000000000 --- a/tests/buildkite/test-integration-jvm-packages.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Test XGBoost4J on a machine with JDK ${JDK_VERSION}, Spark ${SPARK_VERSION}" -buildkite-agent artifact download "jvm-packages/xgboost4j/target/*.jar" . --step build-jvm-packages -buildkite-agent artifact download "jvm-packages/xgboost4j-spark/target/*.jar" . --step build-jvm-packages -buildkite-agent artifact download "jvm-packages/xgboost4j-example/target/*.jar" . --step build-jvm-packages -export CI_DOCKER_EXTRA_PARAMS_INIT='-e RUN_INTEGRATION_TEST=1' -tests/ci_build/ci_build.sh jvm_cross --build-arg JDK_VERSION=${JDK_VERSION} \ - --build-arg SPARK_VERSION=${SPARK_VERSION} tests/ci_build/test_jvm_cross.sh diff --git a/tests/buildkite/test-macos-m1-clang11.sh b/tests/buildkite/test-macos-m1-clang11.sh deleted file mode 100755 index 6824cb7b14b4..000000000000 --- a/tests/buildkite/test-macos-m1-clang11.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -# Display system info -echo "--- Display system information" -set -x -system_profiler SPSoftwareDataType -sysctl -n machdep.cpu.brand_string -uname -m -set +x - -# Ensure that XGBoost can be built with Clang 11 -echo "--- Build and Test XGBoost with MacOS M1, Clang 11" -set -x -LLVM11_PATH=$(brew --prefix llvm\@11) -mkdir build -pushd build -cmake .. 
-GNinja -DCMAKE_C_COMPILER=${LLVM11_PATH}/bin/clang \
-  -DCMAKE_CXX_COMPILER=${LLVM11_PATH}/bin/clang++ -DGOOGLE_TEST=ON \
-  -DUSE_DMLC_GTEST=ON
-ninja -v
-./testxgboost
diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh
index 97c056403f0a..23811f817bd7 100755
--- a/tests/ci_build/build_jvm_packages.sh
+++ b/tests/ci_build/build_jvm_packages.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
-set -e
-set -x
+set -euo pipefail
 
 spark_version=$1
 use_cuda=$2
@@ -13,9 +12,6 @@ if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then
   gpu_options="$use_cuda -Pgpu"
 fi
 
-# Initialize local Maven repository
-./tests/ci_build/initialize_maven.sh
-
 rm -rf build/
 cd jvm-packages
 
@@ -25,11 +21,10 @@ fi
 
 if [ "x$use_scala213" != "x" ]; then
   cd ..
-  python dev/change_scala_version.py --scala-version 2.13 --purge-artifacts
+  python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts
   cd jvm-packages
 fi
 
 mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options
 
 set +x
-set +e
diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh
deleted file mode 100755
index a2f2d6063160..000000000000
--- a/tests/ci_build/ci_build.sh
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/env bash
-#
-# Execute command within a docker container
-#
-# Usage: ci_build.sh <CONTAINER_TYPE> [--use-gpus]
-#                    [--dockerfile <DOCKERFILE_PATH>] [-it]
-#                    [--build-arg <BUILD_ARG>] <COMMAND>
-#
-# CONTAINER_TYPE: Type of the docker container used to run the build: e.g.,
-#                 (cpu | gpu)
-#
-# --use-gpus: Whether to grant the container access to NVIDIA GPUs.
-#
-# DOCKERFILE_PATH: (Optional) Path to the Dockerfile used for docker build. If
-#                  this optional value is not supplied (via the --dockerfile
-#                  flag), will use Dockerfile.CONTAINER_TYPE by default
-#
-# BUILD_ARG: (Optional) an argument to be passed to docker build
-#
-# COMMAND: Command to be executed in the docker container
-#
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-
-# Get the command line arguments.
-CONTAINER_TYPE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
-shift 1
-
-# Dockerfile to be used in docker build
-DOCKERFILE_PATH="${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}"
-DOCKER_CONTEXT_PATH="${SCRIPT_DIR}"
-
-GPU_FLAG=''
-if [[ "$1" == "--use-gpus" ]]; then
-  echo "Using NVIDIA GPUs"
-  GPU_FLAG='--gpus all'
-  shift 1
-fi
-
-if [[ "$1" == "--dockerfile" ]]; then
-  DOCKERFILE_PATH="$2"
-  DOCKER_CONTEXT_PATH=$(dirname "${DOCKERFILE_PATH}")
-  echo "Using custom Dockerfile path: ${DOCKERFILE_PATH}"
-  echo "Using custom docker build context path: ${DOCKER_CONTEXT_PATH}"
-  shift 2
-fi
-
-if [[ -n "${CI_DOCKER_EXTRA_PARAMS_INIT}" ]]
-then
-  IFS=' ' read -r -a CI_DOCKER_EXTRA_PARAMS <<< "${CI_DOCKER_EXTRA_PARAMS_INIT}"
-fi
-
-if [[ "$1" == "-it" ]]; then
-  CI_DOCKER_EXTRA_PARAMS+=('-it')
-  shift 1
-fi
-
-while [[ "$1" == "--build-arg" ]]; do
-  CI_DOCKER_BUILD_ARG+=" $1"
-  CI_DOCKER_BUILD_ARG+=" $2"
-  shift 2
-done
-
-if [[ ! -f "${DOCKERFILE_PATH}" ]]; then
-  echo "Invalid Dockerfile path: \"${DOCKERFILE_PATH}\""
-  exit 1
-fi
-
-COMMAND=("$@")
-
-# Validate command line arguments.
-if [ "$#" -lt 1 ] || [ ! -e "${SCRIPT_DIR}/Dockerfile.${CONTAINER_TYPE}" ]; then
-  supported_container_types=$( ls -1 ${SCRIPT_DIR}/Dockerfile.* | \
-      sed -n 's/.*Dockerfile\.\([^\/]*\)/\1/p' | tr '\n' ' ' )
-  echo "Usage: $(basename $0) CONTAINER_TYPE COMMAND"
-  echo "       CONTAINER_TYPE can be one of [${supported_container_types}]"
-  echo "       COMMAND is a command (with arguments) to run inside"
-  echo "       the container."
- exit 1 -fi - -# Helper function to traverse directories up until given file is found. -function upsearch () { - test / == "$PWD" && return || \ - test -e "$1" && echo "$PWD" && return || \ - cd .. && upsearch "$1" -} - -# Set up WORKSPACE. Jenkins will set them for you or we pick -# reasonable defaults if you run it outside of Jenkins. -WORKSPACE="${WORKSPACE:-${SCRIPT_DIR}/../../}" - -# Determine the docker image name -DOCKER_IMG_NAME="xgb-ci.${CONTAINER_TYPE}" - -# Append cuda version if available -CUDA_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CUDA_VERSION_ARG=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+') -# Append jdk version if available -JDK_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'JDK_VERSION=[0-9]+' | grep -o -E '[0-9]+') -# Append cmake version if available -CMAKE_VERSION=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'CMAKE_VERSION=[0-9]+\.[0-9]+' | grep -o -E '[0-9]+\.[0-9]+') -# Append R version if available -USE_R35=$(echo "${CI_DOCKER_BUILD_ARG}" | grep -o -E 'USE_R35=[0-9]+' | grep -o -E '[0-9]+$') -if [[ ${USE_R35} == "1" ]]; then - USE_R35="_r35" -elif [[ ${USE_R35} == "0" ]]; then - USE_R35="_no_r35" -fi -DOCKER_IMG_NAME=$DOCKER_IMG_NAME$CUDA_VERSION$JDK_VERSION$CMAKE_VERSION$USE_R35 - -# Under Jenkins matrix build, the build tag may contain characters such as -# commas (,) and equal signs (=), which are not valid inside docker image names. -DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g') - -# Convert to all lower-case, as per requirement of Docker image names -DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | tr '[:upper:]' '[:lower:]') - -# Bash on Ubuntu on Windows -UBUNTU_ON_WINDOWS=$([ -e /proc/version ] && grep -l Microsoft /proc/version || echo "") -# MSYS, Git Bash, etc. -MSYS=$([ -e /proc/version ] && grep -l MINGW /proc/version || echo "") - -if [[ -z "$UBUNTU_ON_WINDOWS" ]] && [[ -z "$MSYS" ]] && [[ ! "$OSTYPE" == "darwin"* ]]; then - USER_IDS="-e CI_BUILD_UID=$( id -u ) -e CI_BUILD_GID=$( id -g ) -e CI_BUILD_USER=$( id -un ) -e CI_BUILD_GROUP=$( id -gn ) -e CI_BUILD_HOME=${WORKSPACE}" -fi - -# Print arguments. 
-cat < Date: Thu, 31 Oct 2024 22:36:51 -0700 Subject: [PATCH 15/45] Fix dmlc/xgboost#10752 --- .github/workflows/main.yml | 19 +++++++++++++++++++ ops/docker/ci_container.yml | 6 ++++++ ops/docker/dockerfile/Dockerfile.cpu | 3 +-- ops/docker/dockerfile/Dockerfile.gpu | 3 +-- ops/pipeline/build-cuda-with-rmm.sh | 2 +- ops/pipeline/build-cuda.sh | 3 ++- ops/pipeline/build-gpu-rpkg.sh | 4 ++-- ops/pipeline/build-jvm-doc.sh | 2 +- ops/pipeline/build-jvm-macos-m1.sh | 2 +- ops/pipeline/build-jvm-manylinux2014.sh | 2 +- ops/pipeline/build-manylinux2014.sh | 2 +- ops/pipeline/run-clang-tidy.sh | 2 +- ops/pipeline/test-cpp-gpu.sh | 2 +- ops/pipeline/test-python.sh | 18 +++++++++++++----- ops/{tidy.py => run_clang_tidy.py} | 0 .../test_gpu_with_dask/test_gpu_with_dask.py | 2 ++ 16 files changed, 53 insertions(+), 19 deletions(-) rename ops/{tidy.py => run_clang_tidy.py} (100%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 276fa45ba533..a05661d22c80 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,6 +25,7 @@ jobs: matrix: container_id: - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_r_rockylinux8 - xgb-ci.gpu - xgb-ci.gpu_dev_ver - xgb-ci.cpu @@ -220,6 +221,24 @@ jobs: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-gpu + steps: + # Restart Docker daemon so that it recognized the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh + test-cpp-gpu: name: Run Google Tests with GPU(s) needs: [build-cuda, build-cuda-with-rmm] diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index f21122231c0b..1b3a60adc014 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -10,6 +10,12 @@ xgb-ci.gpu_build_rockylinux8: NCCL_VERSION_ARG: "2.22.3-1" RAPIDS_VERSION_ARG: "24.10" +xgb-ci.gpu_build_r_rockylinux8: + container_def: gpu_build_r_rockylinux8 + build_args: + CUDA_VERSION_ARG: "12.5.1" + R_VERSION_ARG: "4.3.2" + xgb-ci.gpu: container_def: gpu build_args: diff --git a/ops/docker/dockerfile/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu index 22db93572207..64b28026a89c 100644 --- a/ops/docker/dockerfile/Dockerfile.cpu +++ b/ops/docker/dockerfile/Dockerfile.cpu @@ -41,8 +41,7 @@ RUN git clone -b v1.65.4 https://github.com/grpc/grpc.git \ COPY conda_env/linux_cpu_test.yml /scripts/ RUN mamba create -n linux_cpu_test && \ mamba env update -n linux_cpu_test --file=/scripts/linux_cpu_test.yml && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n linux_cpu_test pip install buildkite-test-collector + mamba clean --all --yes # Install lightweight sudo (not bound to TTY) RUN set -ex; \ diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu index 461f1d99dd54..eac35c3aaa90 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -33,8 +33,7 @@ RUN \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ "pyspark>=3.4.0" cloudpickle cuda-python && \ - mamba clean --all --yes && \ - conda run --no-capture-output -n 
gpu_test pip install buildkite-test-collector + mamba clean --all --yes ENV GOSU_VERSION=1.10 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index ab5420002f46..24523bd875c0 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail WHEEL_TAG=manylinux_2_28_x86_64 diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 690c7f25f69e..9dc7dfad0224 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail WHEEL_TAG=manylinux_2_28_x86_64 @@ -16,6 +16,7 @@ echo "--- Build with CUDA" #fi echo "--- Build libxgboost from the source" +set -x git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index 4df0c029568c..c7d3f7fa4235 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -1,13 +1,13 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh echo "--- Build XGBoost R package with CUDA" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_r_rockylinux8 \ - -- tests/ci_build/build_r_pkg_with_cuda.sh \ + -- ops/build_r_pkg_with_cuda.sh \ ${GITHUB_SHA} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh index 7f5eb0ac7b8a..7b029a4e7e26 100755 --- a/ops/pipeline/build-jvm-doc.sh +++ b/ops/pipeline/build-jvm-doc.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh index d50c1a1a1b1d..29a11451428c 100644 --- a/ops/pipeline/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-m1.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index c009de93e62c..99216d6f6272 100644 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index 5b1935097d9d..3f04c0f7e7f4 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index 9af3273b0dbe..b669f12ebf9e 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -8,4 +8,4 @@ source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ - -- python3 tests/ci_build/tidy.py --cuda-archs 75 + -- python3 ops/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 51d097fbdbdf..8ff66a554e0c 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -euox pipefail source ops/pipeline/enforce-ci.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 
f0c9c81cb554..b33b38ac187c 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -13,7 +13,7 @@ fi suite="$1" container_id="$2" -tee test-python-wrapper.sh <<-'EOF' +cat > test-python-wrapper.sh <<-'EOF' #!/bin/bash source activate "$1" @@ -32,7 +32,9 @@ case "$suite" in echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ --run-args='--privileged' \ -- bash test-python-wrapper.sh gpu_test @@ -46,7 +48,9 @@ case "$suite" in pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ --run-args='--privileged --shm-size=4g' \ -- bash test-python-wrapper.sh gpu_test @@ -60,7 +64,9 @@ case "$suite" in pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" \ -- bash test-python-wrapper.sh linux_cpu_test ;; @@ -71,7 +77,9 @@ case "$suite" in pytest -v -s -rxXs --fulltrace --durations=0 \\ tests/python/test_basic.py tests/python/test_basic_models.py \\ tests/python/test_model_compatibility.py - " | tee -a test-python-wrapper.sh + " >> test-python-wrapper.sh + set -x + cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" \ -- bash test-python-wrapper.sh aarch64_test ;; diff --git a/ops/tidy.py b/ops/run_clang_tidy.py similarity index 100% rename from ops/tidy.py rename to ops/run_clang_tidy.py diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index c729761b8dd4..50d6f4e43ffc 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -99,6 +99,8 @@ def is_df(part: T) -> T: cp.testing.assert_allclose(predt.values.compute(), single_node) # Make sure the output can be integrated back to original dataframe + X.columns = X.columns.astype("object") + # Work around https://github.com/dmlc/xgboost/issues/10752 X["predict"] = predictions X["inplace_predict"] = series_predictions From 80a883ea037cf087bf9bf7c2db47eac8f2c5e764 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 31 Oct 2024 23:24:09 -0700 Subject: [PATCH 16/45] Fix clang-tidy + rpkg build --- .github/workflows/main.yml | 2 +- ops/{ => clang-tidy}/run_clang_tidy.py | 8 ++++---- ops/{ => clang-tidy}/test_tidy.cc | 0 ops/docker/ci_container.yml | 2 +- ops/{ => lint}/lint_cmake.sh | 0 ops/{ => lint}/lint_cpp.py | 0 ops/{ => lint}/lint_python.py | 0 ops/{ => lint}/lint_r.R | 0 ops/pipeline/enforce-ci.sh | 4 ---- ops/pipeline/run-clang-tidy.sh | 2 +- ops/test_r_package.py | 2 +- ops/test_utils.py 
| 2 +- 12 files changed, 9 insertions(+), 13 deletions(-) rename ops/{ => clang-tidy}/run_clang_tidy.py (97%) rename ops/{ => clang-tidy}/test_tidy.cc (100%) rename ops/{ => lint}/lint_cmake.sh (100%) rename ops/{ => lint}/lint_cpp.py (100%) rename ops/{ => lint}/lint_python.py (100%) rename ops/{ => lint}/lint_r.R (100%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a05661d22c80..c0885eaa2ffc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -226,7 +226,7 @@ jobs: needs: build-containers runs-on: - runs-on=${{ github.run_id }} - - runner=linux-amd64-gpu + - runner=linux-amd64-cpu steps: # Restart Docker daemon so that it recognized the ephemeral disks - run: sudo systemctl restart docker diff --git a/ops/run_clang_tidy.py b/ops/clang-tidy/run_clang_tidy.py similarity index 97% rename from ops/run_clang_tidy.py rename to ops/clang-tidy/run_clang_tidy.py index 13bbedc0b4b5..24cb270393e8 100755 --- a/ops/run_clang_tidy.py +++ b/ops/clang-tidy/run_clang_tidy.py @@ -19,7 +19,7 @@ def call(args: list[str]) -> tuple[int, int, str, list[str]]: # `workspace` is a name used in the CI container. Normally we should keep the dir # as `xgboost`. matched = re.search( - "(workspace|xgboost)/.*(src|tests|include)/.*warning:", error_msg, re.MULTILINE + "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", error_msg, re.MULTILINE ) if matched is None: @@ -265,7 +265,7 @@ def test_tidy(args: argparse.Namespace) -> None: """ root_path = os.path.abspath(os.path.curdir) tidy_file = os.path.join(root_path, ".clang-tidy") - test_file_path = os.path.join(root_path, "tests", "ci_build", "test_tidy.cc") + test_file_path = os.path.join(root_path, "ops", "clang-tidy", "test_tidy.cc") tidy_config = "--config-file=" + tidy_file if not args.tidy_version: @@ -274,8 +274,8 @@ def test_tidy(args: argparse.Namespace) -> None: tidy = "clang-tidy-" + str(args.tidy_version) cmd = [tidy, tidy_config, test_file_path] (proc_code, tidy_status, error_msg, _) = call(cmd) - assert proc_code == 0 - assert tidy_status == 1 + if proc_code != 0 or tidy_status != 1: + raise RuntimeError(error_msg) print("clang-tidy is working.") diff --git a/ops/test_tidy.cc b/ops/clang-tidy/test_tidy.cc similarity index 100% rename from ops/test_tidy.cc rename to ops/clang-tidy/test_tidy.cc diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index 1b3a60adc014..3612529607b7 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -13,7 +13,7 @@ xgb-ci.gpu_build_rockylinux8: xgb-ci.gpu_build_r_rockylinux8: container_def: gpu_build_r_rockylinux8 build_args: - CUDA_VERSION_ARG: "12.5.1" + CUDA_VERSION_ARG: "12.4.1" R_VERSION_ARG: "4.3.2" xgb-ci.gpu: diff --git a/ops/lint_cmake.sh b/ops/lint/lint_cmake.sh similarity index 100% rename from ops/lint_cmake.sh rename to ops/lint/lint_cmake.sh diff --git a/ops/lint_cpp.py b/ops/lint/lint_cpp.py similarity index 100% rename from ops/lint_cpp.py rename to ops/lint/lint_cpp.py diff --git a/ops/lint_python.py b/ops/lint/lint_python.py similarity index 100% rename from ops/lint_python.py rename to ops/lint/lint_python.py diff --git a/ops/lint_r.R b/ops/lint/lint_r.R similarity index 100% rename from ops/lint_r.R rename to ops/lint/lint_r.R diff --git a/ops/pipeline/enforce-ci.sh b/ops/pipeline/enforce-ci.sh index 48a48f2dc730..eefb6450b98d 100755 --- a/ops/pipeline/enforce-ci.sh +++ b/ops/pipeline/enforce-ci.sh @@ -5,8 +5,6 @@ set -euo pipefail -set -x - if [[ -z ${GITHUB_ACTION:-} ]] then echo "$0 is not meant to run 
locally; it should run inside GitHub Actions." @@ -40,5 +38,3 @@ if [[ -n ${DISABLE_RELEASE:-} ]] then is_release_branch=0 fi - -set +x diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index b669f12ebf9e..496b601bfdfb 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -8,4 +8,4 @@ source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ - -- python3 ops/run_clang_tidy.py --cuda-archs 75 + -- python3 ops/clang-tidy/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/test_r_package.py b/ops/test_r_package.py index 5ca7fa69b21a..3ce886c1bc41 100644 --- a/ops/test_r_package.py +++ b/ops/test_r_package.py @@ -42,7 +42,7 @@ def pkgroot(path: str) -> None: else: would_remove = output.stdout.decode("utf-8").strip().split("\n") - if would_remove and not all(f.find("tests/ci_build") != -1 for f in would_remove): + if would_remove and not all(f.find("ops") != -1 for f in would_remove): raise ValueError( "\n".join(would_remove) + "\nPlease cleanup the working git repository." ) diff --git a/ops/test_utils.py b/ops/test_utils.py index adcd05d5a124..f05fed4dc7f8 100644 --- a/ops/test_utils.py +++ b/ops/test_utils.py @@ -75,7 +75,7 @@ def print_time() -> None: ROOT = os.path.normpath( os.path.join( - os.path.dirname(os.path.abspath(__file__)), os.path.pardir, os.path.pardir + os.path.dirname(os.path.abspath(__file__)), os.path.pardir ) ) R_PACKAGE = os.path.join(ROOT, "R-package") From d86deda4dc6f38ca04557e7edf18c292819df21f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 4 Nov 2024 18:41:08 -0800 Subject: [PATCH 17/45] [WIP] Properly handle shaded JARs --- jvm-packages/pom.xml | 111 ++---------- jvm-packages/xgboost4j-spark-gpu/pom.xml | 3 + jvm-packages/xgboost4j-spark/pom.xml | 2 + jvm-packages/xgboost4j-tester/generate_pom.py | 162 ------------------ jvm-packages/xgboost4j-tester/get_iris.py | 10 -- .../java/ml/dmlc/xgboost4j/tester/App.java | 26 --- .../build_python_wheels_macos.sh | 3 +- 7 files changed, 16 insertions(+), 301 deletions(-) delete mode 100644 jvm-packages/xgboost4j-tester/generate_pom.py delete mode 100644 jvm-packages/xgboost4j-tester/get_iris.py delete mode 100644 jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java rename tests/ci_build/build_python_wheels.sh => ops/build_python_wheels_macos.sh (98%) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c4dba142b0e1..af7aec0a6982 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -19,8 +19,16 @@ - CodingCat - codingcat@apache.org + Bobby Wang + wbo4958@gmail.com + + + Jiaming Yuan + jm.yuan@outlook.com + + + Hyunsu Cho + chohyu01@cs.washington.edu @@ -106,13 +114,6 @@ release - - xgboost4j - xgboost4j-example - xgboost4j-spark - xgboost4j-flink - xgboost4j-spark-gpu - @@ -192,98 +193,6 @@ - - assembly - - - - org.apache.maven.plugins - maven-assembly-plugin - 3.7.1 - - - jar-with-dependencies - - true - - - - make-assembly - package - - single - - - - - - - - - release-to-github - - - github.repo - Temporary Staging Repository - file://${project.build.directory}/mvn-repo - - - - github - - - xgboost4j - xgboost4j-example - xgboost4j-spark - xgboost4j-flink - xgboost4j-spark-gpu - - - - - com.github.github - site-maven-plugin - 0.12 - - Maven artifacts for ${project.version} - true - ${project.build.directory}/mvn-repo - refs/heads/maven-repo - - *-with-dependencies.jar - - xgboost - CodingCat - true - - - - - - site - - deploy - - - - - org.apache.maven.plugins - 
maven-deploy-plugin - 3.1.3 - - internal.repo::default::file://${project.build.directory}/mvn-repo - - - - org.apache.maven.plugins - maven-surefire-plugin - - true - - - - - release-to-s3 diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index 9722da39f801..4eed2648229a 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -23,9 +23,12 @@ org.apache.maven.plugins maven-shade-plugin + true + true false + ml.dmlc:xgboost4j_${scala.binary.version} ml.dmlc:xgboost4j-spark_${scala.binary.version} diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index f1791ab90d1a..d62b5808c0e0 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -23,6 +23,8 @@ org.apache.maven.plugins maven-shade-plugin + true + true false diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py deleted file mode 100644 index ad729b3a64cb..000000000000 --- a/jvm-packages/xgboost4j-tester/generate_pom.py +++ /dev/null @@ -1,162 +0,0 @@ -import sys - -pom_template = """ - - - - 4.0.0 - - ml.dmlc - xgboost4j-tester_{scala_binary_version} - 1.0-SNAPSHOT - - xgboost4j-tester - - - UTF-8 - {maven_compiler_source} - {maven_compiler_target} - 4.13.2 - {spark_version} - {scala_version} - 3.2.15 - {scala_binary_version} - 5.6.0 - - - - - com.esotericsoftware - kryo - ${{kryo.version}} - - - org.scala-lang - scala-compiler - ${{scala.version}} - - - org.scala-lang - scala-reflect - ${{scala.version}} - - - org.scala-lang - scala-library - ${{scala.version}} - - - commons-logging - commons-logging - 1.2 - - - com.fasterxml.jackson.core - jackson-databind - 2.14.2 - - - org.scalatest - scalatest_${{scala.binary.version}} - ${{scalatest.version}} - test - - - org.apache.spark - spark-core_${{scala.binary.version}} - ${{spark.version}} - provided - - - org.apache.spark - spark-sql_${{scala.binary.version}} - ${{spark.version}} - provided - - - org.apache.spark - spark-mllib_${{scala.binary.version}} - ${{spark.version}} - provided - - - junit - junit - ${{junit.version}} - test - - - ml.dmlc - xgboost4j_${{scala.binary.version}} - {xgboost4j_version} - - - ml.dmlc - xgboost4j_${{scala.binary.version}} - {xgboost4j_version} - tests - test-jar - test - - - ml.dmlc - xgboost4j-spark_${{scala.binary.version}} - {xgboost4j_version} - - - ml.dmlc - xgboost4j-example_${{scala.binary.version}} - {xgboost4j_version} - - - - - - - org.apache.maven.plugins - maven-assembly-plugin - - - jar-with-dependencies - - - - ml.dmlc.xgboost4j.tester.App - - - - - - package - - single - - - - - - org.apache.maven.plugins - maven-surefire-plugin - - - ml.dmlc:xgboost4j_${{scala.binary.version}} - - - - - - -""" - -if __name__ == '__main__': - if len(sys.argv) != 7: - print('Usage: {} [xgboost4j version] [maven compiler source level] [maven compiler target level] [spark version] [scala version] [scala binary version]'.format(sys.argv[0])) - sys.exit(1) - with open('pom.xml', 'w') as f: - print(pom_template.format(xgboost4j_version=sys.argv[1], - maven_compiler_source=sys.argv[2], - maven_compiler_target=sys.argv[3], - spark_version=sys.argv[4], - scala_version=sys.argv[5], - scala_binary_version=sys.argv[6]), file=f) diff --git a/jvm-packages/xgboost4j-tester/get_iris.py b/jvm-packages/xgboost4j-tester/get_iris.py deleted file mode 100644 index 728c149b0260..000000000000 --- a/jvm-packages/xgboost4j-tester/get_iris.py +++ /dev/null @@ 
-1,10 +0,0 @@ -import numpy as np -import pandas -from sklearn.datasets import load_iris - -X, y = load_iris(return_X_y=True) -y = y.astype(np.int32) -df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width']) -class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'} -df['class'] = np.vectorize(class_id_to_name.get)(y) -df.to_csv('./iris.csv', float_format='%.1f', header=False, index=False) diff --git a/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java b/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java deleted file mode 100644 index 917f5062061c..000000000000 --- a/jvm-packages/xgboost4j-tester/src/main/java/ml/dmlc/xgboost4j/tester/App.java +++ /dev/null @@ -1,26 +0,0 @@ -package ml.dmlc.xgboost4j.tester; - -import ml.dmlc.xgboost4j.java.example.*; - -import java.io.IOException; -import ml.dmlc.xgboost4j.java.XGBoostError; - -public class App { - public static void main(String[] args) throws IOException, XGBoostError { - String[] args2 = new String[0]; - System.out.println("BoostFromPrediction"); - BoostFromPrediction.main(args2); - System.out.println("CrossValidation"); - CrossValidation.main(args2); - System.out.println("CustomObjective"); - CustomObjective.main(args2); - System.out.println("ExternalMemory"); - ExternalMemory.main(args2); - System.out.println("GeneralizedLinearModel"); - GeneralizedLinearModel.main(args2); - System.out.println("PredictFirstNtree"); - PredictFirstNtree.main(args2); - System.out.println("PredictLeafIndices"); - PredictLeafIndices.main(args2); - } -} diff --git a/tests/ci_build/build_python_wheels.sh b/ops/build_python_wheels_macos.sh similarity index 98% rename from tests/ci_build/build_python_wheels.sh rename to ops/build_python_wheels_macos.sh index d9927905cf83..f2d1c692c8cb 100644 --- a/tests/ci_build/build_python_wheels.sh +++ b/ops/build_python_wheels_macos.sh @@ -1,7 +1,6 @@ #!/bin/bash -set -e -set -x +set -euox pipefail if [[ $# -ne 2 ]]; then echo "Usage: $0 [platform_id] [commit ID]" From 125b7e94d55f992dc9c6e6bb3e087788baae55ee Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 12 Nov 2024 18:19:00 -0800 Subject: [PATCH 18/45] [CI] Pin Dask to 2024.10.0 (dmlc/xgboost#10995) --- ops/docker/conda_env/linux_cpu_test.yml | 4 ++-- ops/docker/conda_env/macos_cpu_test.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ops/docker/conda_env/linux_cpu_test.yml b/ops/docker/conda_env/linux_cpu_test.yml index e9d05c2f70d1..1ec2a5447604 100644 --- a/ops/docker/conda_env/linux_cpu_test.yml +++ b/ops/docker/conda_env/linux_cpu_test.yml @@ -17,8 +17,8 @@ dependencies: - scikit-learn>=1.4.1 - pandas - matplotlib -- dask -- distributed +- dask<=2024.10.0 +- distributed<=2024.10.0 - python-graphviz - hypothesis>=6.46 - astroid diff --git a/ops/docker/conda_env/macos_cpu_test.yml b/ops/docker/conda_env/macos_cpu_test.yml index f1fcb6b99993..29ff99e3504f 100644 --- a/ops/docker/conda_env/macos_cpu_test.yml +++ b/ops/docker/conda_env/macos_cpu_test.yml @@ -14,8 +14,8 @@ dependencies: - scikit-learn>=1.4.1 - pandas - matplotlib -- dask -- distributed +- dask<=2024.10.0 +- distributed<=2024.10.0 - graphviz - python-graphviz - hypothesis From f3ccc6f17a8d405538bc28d17e3547ac1a336925 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 12 Nov 2024 18:21:52 -0800 Subject: [PATCH 19/45] Refactor JVM tests --- .github/workflows/main.yml | 34 ++++++-- jvm-packages/xgboost4j-example/pom.xml | 6 ++ 
.../java/example/BasicWalkThrough.java | 8 +- .../xgboost4j/java/example/EarlyStopping.java | 4 +- .../scala/example/BasicWalkThrough.scala | 10 +-- .../java/example/JavaExamplesTest.java | 44 ++++++++++ .../scala/example/ScalaExamplesTest.scala | 40 +++++++++ ops/change_scala_version.py | 3 + ops/docker/ci_container.yml | 3 + ops/docker/dockerfile/Dockerfile.jvm_cross | 53 ------------ ops/pipeline/build-jvm-macos-m1.sh | 1 + ops/pipeline/build-jvm-manylinux2014.sh | 1 + ops/pipeline/build-test-jvm-packages-impl.sh | 84 +++++++++++++++++++ ops/pipeline/build-test-jvm-packages.sh | 46 ++++++++++ tests/buildkite/build-jvm-packages-gpu.sh | 20 ----- tests/buildkite/build-jvm-packages.sh | 26 ------ tests/buildkite/deploy-jvm-packages.sh | 14 ---- tests/buildkite/enforce_daily_budget.py | 14 ---- tests/buildkite/enforce_daily_budget.sh | 15 ---- tests/buildkite/pipeline-mgpu.yml | 32 ------- tests/buildkite/pipeline.yml | 48 ----------- tests/ci_build/build_jvm_packages.sh | 30 ------- tests/ci_build/deploy_jvm_packages.sh | 37 -------- 23 files changed, 269 insertions(+), 304 deletions(-) create mode 100644 jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java create mode 100644 jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala delete mode 100644 ops/docker/dockerfile/Dockerfile.jvm_cross create mode 100755 ops/pipeline/build-test-jvm-packages-impl.sh create mode 100755 ops/pipeline/build-test-jvm-packages.sh delete mode 100755 tests/buildkite/build-jvm-packages-gpu.sh delete mode 100755 tests/buildkite/build-jvm-packages.sh delete mode 100755 tests/buildkite/deploy-jvm-packages.sh delete mode 100644 tests/buildkite/enforce_daily_budget.py delete mode 100755 tests/buildkite/enforce_daily_budget.sh delete mode 100644 tests/buildkite/pipeline-mgpu.yml delete mode 100644 tests/buildkite/pipeline.yml delete mode 100755 tests/ci_build/build_jvm_packages.sh delete mode 100755 tests/ci_build/deploy_jvm_packages.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c0885eaa2ffc..74a1cc135908 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,6 +32,7 @@ jobs: - xgb-ci.clang_tidy - xgb-ci.manylinux_2_28_x86_64 - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm runner: [linux-amd64-cpu] include: - container_id: xgb-ci.manylinux2014_aarch64 @@ -39,7 +40,7 @@ jobs: - container_id: xgb-ci.aarch64 runner: linux-arm64-cpu steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -197,7 +198,7 @@ jobs: - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} build-manylinux2014: - name: Build manylinux2024_${{ matrix.arch }} wheel + name: Build manylinux2014_${{ matrix.arch }} wheel needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -228,7 +229,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -239,6 +240,29 @@ jobs: CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - run: bash ops/pipeline/build-gpu-rpkg.sh + build-test-jvm-packages: + name: Build and test JVM packages + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - 
runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm + - run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.12 + - run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.13 + test-cpp-gpu: name: Run Google Tests with GPU(s) needs: [build-cuda, build-cuda-with-rmm] @@ -258,7 +282,7 @@ jobs: runner: linux-amd64-mgpu artifact_from: build-cuda steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: @@ -316,7 +340,7 @@ jobs: runner: linux-arm64-cpu artifact_from: build-cpu-arm64 steps: - # Restart Docker daemon so that it recognized the ephemeral disks + # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4 with: diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index eda453041fa3..4ea4b691987d 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -40,5 +40,11 @@ xgboost4j-flink_2.12 ${project.version} + + junit + junit + ${junit.version} + test + diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java index 8a74b74dab7e..0daf2c2179b1 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/BasicWalkThrough.java @@ -62,8 +62,10 @@ public static void saveDumpModel(String modelPath, String[] modelInfos) throws I public static void main(String[] args) throws IOException, XGBoostError { // load file from text file, also binary buffer generated by xgboost4j - DMatrix trainMat = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm"); - DMatrix testMat = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm"); + DMatrix trainMat = new DMatrix( + "../../demo/data/agaricus.txt.train?format=libsvm&indexing_mode=1"); + DMatrix testMat = new DMatrix( + "../../demo/data/agaricus.txt.test?format=libsvm&indexing_mode=1"); HashMap params = new HashMap(); params.put("eta", 1.0); @@ -113,7 +115,7 @@ public static void main(String[] args) throws IOException, XGBoostError { System.out.println("start build dmatrix from csr sparse data ..."); //build dmatrix from CSR Sparse Matrix DataLoader.CSRSparseData spData = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); DMatrix trainMat2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, DMatrix.SparseType.CSR, 127); diff --git a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java index 9e52c12fdf3c..61e752f85aa9 100644 --- a/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java +++ b/jvm-packages/xgboost4j-example/src/main/java/ml/dmlc/xgboost4j/java/example/EarlyStopping.java @@ -29,9 
+29,9 @@ public class EarlyStopping { public static void main(String[] args) throws IOException, XGBoostError { DataLoader.CSRSparseData trainCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train"); DataLoader.CSRSparseData testCSR = - DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test?format=libsvm"); + DataLoader.loadSVMFile("../../demo/data/agaricus.txt.test"); Map paramMap = new HashMap() { { diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala index 4629fa352ec4..975d890a24b7 100644 --- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala +++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/BasicWalkThrough.scala @@ -36,8 +36,8 @@ object BasicWalkThrough { } def main(args: Array[String]): Unit = { - val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm") - val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm") + val trainMax = new DMatrix("../../demo/data/agaricus.txt.train?format=libsvm&indexing_mode=1") + val testMax = new DMatrix("../../demo/data/agaricus.txt.test?format=libsvm&indexing_mode=1") val params = new mutable.HashMap[String, Any]() params += "eta" -> 1.0 @@ -61,7 +61,7 @@ object BasicWalkThrough { } booster.saveModel(file.getAbsolutePath + "/xgb.model") // dump model with feature map - val modelInfos = booster.getModelDump(file.getAbsolutePath + "/featmap.txt", false) + val modelInfos = booster.getModelDump("../../demo/data/featmap.txt", false) saveDumpModel(file.getAbsolutePath + "/dump.raw.txt", modelInfos) // save dmatrix into binary buffer testMax.saveBinary(file.getAbsolutePath + "/dtest.buffer") @@ -76,9 +76,9 @@ object BasicWalkThrough { // build dmatrix from CSR Sparse Matrix println("start build dmatrix from csr sparse data ...") - val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train?format=libsvm") + val spData = DataLoader.loadSVMFile("../../demo/data/agaricus.txt.train") val trainMax2 = new DMatrix(spData.rowHeaders, spData.colIndex, spData.data, - JDMatrix.SparseType.CSR) + JDMatrix.SparseType.CSR, 127) trainMax2.setLabel(spData.labels) // specify watchList diff --git a/jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java b/jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java new file mode 100644 index 000000000000..da57d1ebb28b --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/test/java/ml/dmlc/xgboost4j/java/example/JavaExamplesTest.java @@ -0,0 +1,44 @@ +/* + Copyright (c) 2024 by Contributors + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package ml.dmlc.xgboost4j.java.example; + +import java.io.IOException; +import ml.dmlc.xgboost4j.java.XGBoostError; +import org.junit.Test; + + +public class JavaExamplesTest { + + @Test + public void testExamples() throws XGBoostError, IOException { + String[] args = {""}; + System.out.println("BasicWalkThrough"); + BasicWalkThrough.main(args); + System.out.println("BoostFromPrediction"); + BoostFromPrediction.main(args); + System.out.println("CrossValidation"); + CrossValidation.main(args); + System.out.println("CustomObjective"); + CustomObjective.main(args); + System.out.println("EarlyStopping"); + EarlyStopping.main(args); + System.out.println("ExternalMemory"); + ExternalMemory.main(args); + System.out.println("GeneralizedLinearModel"); + GeneralizedLinearModel.main(args); + System.out.println("PredictFirstNtree"); + PredictFirstNtree.main(args); + System.out.println("PredictLeafIndices"); + PredictLeafIndices.main(args); + } +} diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala new file mode 100644 index 000000000000..d7705f90e5ce --- /dev/null +++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/ScalaExamplesTest.scala @@ -0,0 +1,40 @@ +/* + Copyright (c) 2024 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package ml.dmlc.xgboost4j.scala.example + +import org.scalatest.funsuite.AnyFunSuite + +class ScalaExamplesTest extends AnyFunSuite { + test("Smoke test for Scala examples") { + val args = Array("") + println("BasicWalkThrough") + BasicWalkThrough.main(args) + println("BoostFromPrediction") + BoostFromPrediction.main(args) + println("CrossValidation") + CrossValidation.main(args) + println("CustomObjective") + CustomObjective.main(args) + println("ExternalMemory") + ExternalMemory.main(args) + println("GeneralizedLinearModel") + GeneralizedLinearModel.main(args) + println("PredictFirstNTree") + PredictFirstNTree.main(args) + println("PredictLeafIndices") + PredictLeafIndices.main(args) + } +} diff --git a/ops/change_scala_version.py b/ops/change_scala_version.py index c8a9b54ccf91..3489479dd464 100644 --- a/ops/change_scala_version.py +++ b/ops/change_scala_version.py @@ -20,6 +20,9 @@ def main(args): if target.is_dir(): print(f"Removing {target}...") shutil.rmtree(target) + for target in pathlib.Path("jvm-packages/").glob("**/*.so"): + print(f"Removing {target}...") + target.unlink() # Update pom.xml for pom in pathlib.Path("jvm-packages/").glob("**/pom.xml"): diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index 3612529607b7..d042e35549f9 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -50,3 +50,6 @@ xgb-ci.manylinux2014_x86_64: xgb-ci.manylinux2014_aarch64: container_def: manylinux2014_aarch64 + +xgb-ci.jvm: + container_def: jvm diff --git a/ops/docker/dockerfile/Dockerfile.jvm_cross b/ops/docker/dockerfile/Dockerfile.jvm_cross deleted file mode 100644 index 3ebdb3c6686d..000000000000 --- a/ops/docker/dockerfile/Dockerfile.jvm_cross +++ /dev/null @@ -1,53 +0,0 @@ -FROM ubuntu:22.04 -ARG JDK_VERSION_ARG=8 -ARG SPARK_VERSION_ARG=3.5.1 - -# Environment -ENV DEBIAN_FRONTEND=noninteractive - -# Install all basic requirements -RUN \ - apt-get update && \ - apt-get install -y software-properties-common && \ - add-apt-repository ppa:openjdk-r/ppa && \ - apt-get update && \ - apt-get install -y tar unzip wget openjdk-$JDK_VERSION_ARG-jdk libgomp1 && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Miniforge3-24.3.0-0-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/miniforge && \ - /opt/miniforge/bin/pip install awscli && \ - # Maven - wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.9.7/binaries/apache-maven-3.9.7-bin.tar.gz && \ - tar xvf apache-maven-3.9.7-bin.tar.gz -C /opt && \ - ln -s /opt/apache-maven-3.9.7/ /opt/maven && \ - # Spark with scala 2.12 - mkdir -p /opt/spark-scala-2.12 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz && \ - tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3.tgz --strip-components=1 -C /opt/spark-scala-2.12 && \ - # Spark with scala 2.13 - mkdir -p /opt/spark-scala-2.13 && \ - wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION_ARG/spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz && \ - tar xvf spark-$SPARK_VERSION_ARG-bin-hadoop3-scala2.13.tgz --strip-components=1 -C /opt/spark-scala-2.13 - -ENV PATH=/opt/miniforge/bin:/opt/spark/bin:/opt/maven/bin:$PATH - -# Install Python packages -RUN pip install numpy scipy pandas scikit-learn - -ENV GOSU_VERSION=1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x 
/usr/local/bin/gosu && \ - gosu nobody true - -# Set default JDK version -RUN update-java-alternatives -v -s java-1.$JDK_VERSION_ARG.0-openjdk-amd64 - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh index 29a11451428c..75785aa03eba 100644 --- a/ops/pipeline/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-m1.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Build libxgboost4j.dylib targeting MacOS set -euox pipefail diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index 99216d6f6272..93fa03d2eb0b 100644 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -1,4 +1,5 @@ #!/bin/bash +## Build libxgboost4j.so targeting glibc 2.17 systems set -euox pipefail diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh new file mode 100755 index 000000000000..717868521408 --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -0,0 +1,84 @@ +#!/bin/bash +## Build and test JVM packages. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) + - USE_CUDA: Set to 1 to enable CUDA + - CUDA_ARCH: Semicolon separated list of GPU compute capability targets + (e.g. '35;61') Only applicable if USE_CUDA=1 + - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided + libxgboost4j.so. (Usually Maven will invoke create_jni.py to + build it from scratch.) When using this option, make sure to + place libxgboost4j.so in lib/ directory. +EOF +) + +set -euo pipefail + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +# Set Scala version +if [[ "${SCALA_VERSION}" == "2.12" || "${SCALA_VERSION}" == "2.13" ]] +then + python ops/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts +else + echo "Error: SCALA_VERSION must be either 2.12 or 2.13" + exit 2 +fi + +# If SKIP_NATIVE_BUILD is set, copy in libxgboost4j.so from lib/ +if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + echo "Using externally provided libxgboost4j.so. Locating one from lib/..." 
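+  # Assumed layout: the xgboost4j JAR bundles native libraries found under
+  # src/main/resources/lib/<os>/<arch>/, so staging the prebuilt .so at the
+  # path below lets Maven skip the native build (create_jni.py).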
+ cp -v lib/libxgboost4j.so ./jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +fi + +cd jvm-packages/ + +# Ensure that XGBoost4J-Spark is compatible with multiple versions of Spark +if [[ "${USE_CUDA:-}" != "1" && "${SCALA_VERSION}" == "2.12" ]] +then + for spark_version in 3.1.3 3.2.4 3.3.4 3.4.3 + do + mvn --no-transfer-progress clean package -Dspark.version=${spark_version} \ + -pl xgboost4j,xgboost4j-spark + done +fi + +set +x +mvn_options="" +if [[ "${USE_CUDA:-}" == "1" ]] +then + mvn_options="${mvn_options} -Pgpu" +fi +if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] +then + mvn_options="${mvn_options} -Dskip.native.build=true" +fi +set -x + +if [[ -n "${CUDA_ARCH:-}" ]] +then + export GPU_ARCH_FLAG="-DGPU_COMPUTE_VER='${CUDA_ARCH}'" +fi + +mvn --no-transfer-progress clean install ${mvn_options} + +# Integration tests +if [[ "${USE_CUDA:-}" != "1" ]] +then + mvn --no-transfer-progress test -pl xgboost4j-example +fi diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh new file mode 100755 index 000000000000..30a11d444d1b --- /dev/null +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -0,0 +1,46 @@ +#!/bin/bash +## Build and test JVM packages. +## +## Note. This script takes in all inputs via environment variables. + +INPUT_DOC=$( +cat <<-EOF +Inputs + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) + - USE_CUDA: Set to 1 to enable CUDA + - CUDA_ARCH: Semicolon separated list of GPU compute capability targets + (e.g. '35;61') Only applicable if USE_CUDA=1 + - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided + libxgboost4j.so. (Usually Maven will invoke create_jni.py to + build it from scratch.) When using this option, make sure to + place libxgboost4j.so in lib/ directory. 
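+
+Example (illustrative; mirrors the invocation in .github/workflows/main.yml):
+  SCALA_VERSION=2.12 bash ops/pipeline/build-test-jvm-packages.sh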
+EOF +) + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +for arg in "SCALA_VERSION" +do + if [[ -z "${!arg:-}" ]] + then + echo -e "Error: $arg must be set.\n${INPUT_DOC}" + exit 1 + fi +done + +set -x + +run_args="-e SCALA_VERSION=${SCALA_VERSION}" +for arg in "USE_CUDA" "CUDA_ARCH" "SKIP_NATIVE_BUILD" +do + if [[ -n "${!arg:-}" ]] + then + run_args="${run_args} -e ${arg}=${!arg}" + fi +done +echo "${run_args}" + +python3 ops/docker_run.py --container-id xgb-ci.jvm \ + --run-args "${run_args}" -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/tests/buildkite/build-jvm-packages-gpu.sh b/tests/buildkite/build-jvm-packages-gpu.sh deleted file mode 100755 index 76ffafbcfdd7..000000000000 --- a/tests/buildkite/build-jvm-packages-gpu.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with CUDA" - -if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -then - arch_flag="-DGPU_COMPUTE_VER=75" -else - arch_flag="" -fi - -tests/ci_build/ci_build.sh jvm_gpu_build --use-gpus \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} -Duse.cuda=ON ${arch_flag} diff --git a/tests/buildkite/build-jvm-packages.sh b/tests/buildkite/build-jvm-packages.sh deleted file mode 100755 index 338a599f7e15..000000000000 --- a/tests/buildkite/build-jvm-packages.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -echo "--- Build and test XGBoost JVM packages with Scala 2.12" -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} - -echo "--- Stash XGBoost4J JARs (Scala 2.12)" -buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" - -echo "--- Build and test XGBoost JVM packages with Scala 2.13" - -tests/ci_build/ci_build.sh jvm tests/ci_build/build_jvm_packages.sh \ - ${SPARK_VERSION} "" "" "true" - -echo "--- Stash XGBoost4J JARs (Scala 2.13)" -buildkite-agent artifact upload "jvm-packages/xgboost4j/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-spark/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-flink/target/*.jar" -buildkite-agent artifact upload "jvm-packages/xgboost4j-example/target/*.jar" diff --git a/tests/buildkite/deploy-jvm-packages.sh b/tests/buildkite/deploy-jvm-packages.sh deleted file mode 100755 index 812a6c5cafec..000000000000 --- a/tests/buildkite/deploy-jvm-packages.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -source tests/buildkite/conftest.sh - -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then - echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" - tests/ci_build/ci_build.sh jvm_gpu_build \ - --build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \ - --build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \ - tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION} -fi diff --git a/tests/buildkite/enforce_daily_budget.py b/tests/buildkite/enforce_daily_budget.py deleted file mode 100644 index af1b1ce484b8..000000000000 --- a/tests/buildkite/enforce_daily_budget.py +++ /dev/null @@ -1,14 +0,0 @@ -import json -import argparse - -if __name__ == "__main__": 
- parser = argparse.ArgumentParser() - parser.add_argument("--response", type=str, required=True) - args = parser.parse_args() - with open(args.response, "r") as f: - payload = f.read() - response = json.loads(payload) - if response["approved"]: - print(f"Testing approved. Reason: {response['reason']}") - else: - raise RuntimeError(f"Testing rejected. Reason: {response['reason']}") diff --git a/tests/buildkite/enforce_daily_budget.sh b/tests/buildkite/enforce_daily_budget.sh deleted file mode 100755 index 8212f07c1b24..000000000000 --- a/tests/buildkite/enforce_daily_budget.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "--- Enforce daily budget" - -source tests/buildkite/conftest.sh - -if [[ $enforce_daily_budget == 0 ]] -then - echo "Automatically approving all test jobs for trunk branches" -else - aws lambda invoke --function-name XGBoostCICostWatcher --invocation-type RequestResponse --region us-west-2 response.json - python3 tests/buildkite/enforce_daily_budget.py --response response.json -fi diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml deleted file mode 100644 index 4246425de0ca..000000000000 --- a/tests/buildkite/pipeline-mgpu.yml +++ /dev/null @@ -1,32 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" - DISABLE_RELEASE: "1" - # Skip uploading artifacts to S3 bucket - # Also, don't build all CUDA archs; just build sm_75 -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - - "tests/buildkite/build-containers.sh jvm_gpu_build" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build and test JVM packages with CUDA" - command: "tests/buildkite/build-jvm-packages-gpu.sh" - key: build-jvm-packages-gpu - agents: - queue: linux-amd64-mgpu - - wait diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml deleted file mode 100644 index 65225649a3af..000000000000 --- a/tests/buildkite/pipeline.yml +++ /dev/null @@ -1,48 +0,0 @@ -env: - DOCKER_CACHE_ECR_ID: "492475357299" - DOCKER_CACHE_ECR_REGION: "us-west-2" -steps: - - label: ":moneybag: Enforce daily budget" - command: "tests/buildkite/enforce_daily_budget.sh" - key: enforce-daily-budget - agents: - queue: pipeline-loader - - wait - - block: ":rocket: Run this test job" - if: build.pull_request.id != null || build.branch =~ /^dependabot\// - #### -------- CONTAINER BUILD -------- - - label: ":docker: Build containers" - commands: - - "tests/buildkite/build-containers.sh cpu" - - "tests/buildkite/build-containers.sh gpu" - - "tests/buildkite/build-containers.sh gpu_build_rockylinux8" - key: build-containers - agents: - queue: linux-amd64-cpu - - wait - #### -------- BUILD -------- - - label: ":console: Build JVM packages" - timeout_in_minutes: 30 - command: "tests/buildkite/build-jvm-packages.sh" - key: build-jvm-packages - agents: - queue: linux-amd64-cpu - - label: ":console: Build JVM package doc" - command: "tests/buildkite/build-jvm-doc.sh" - key: build-jvm-doc - agents: - queue: linux-amd64-cpu - - wait - #### 
-------- TEST -------- - - label: ":console: Run integration tests with JVM packages" - command: "tests/buildkite/test-integration-jvm-packages.sh" - key: test-integration-jvm-packages - agents: - queue: linux-amd64-cpu - - wait - #### -------- DEPLOY JVM -------- - - label: ":console: Deploy JVM packages" - command: "tests/buildkite/deploy-jvm-packages.sh" - key: deploy-jvm-packages - agents: - queue: linux-amd64-cpu diff --git a/tests/ci_build/build_jvm_packages.sh b/tests/ci_build/build_jvm_packages.sh deleted file mode 100755 index 23811f817bd7..000000000000 --- a/tests/ci_build/build_jvm_packages.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -spark_version=$1 -use_cuda=$2 -gpu_arch=$3 -use_scala213=$4 - -gpu_options="" -if [ "x$use_cuda" == "x-Duse.cuda=ON" ]; then - gpu_options="$use_cuda -Pgpu" -fi - -rm -rf build/ -cd jvm-packages - -if [ "x$gpu_arch" != "x" ]; then - export GPU_ARCH_FLAG=$gpu_arch -fi - -if [ "x$use_scala213" != "x" ]; then - cd .. - python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts - cd jvm-packages -fi - -mvn --no-transfer-progress package -Dspark.version=${spark_version} $gpu_options - -set +x diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh deleted file mode 100755 index d8c50d297686..000000000000 --- a/tests/ci_build/deploy_jvm_packages.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -set -e -set -x - -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" - exit 1 -fi - -spark_version=$1 - -# Initialize local Maven repository -./tests/ci_build/initialize_maven.sh - -cd jvm-packages -rm -rf $(find . -name target) -rm -rf ../build/ - -# Re-build package -# Maven profiles: -# `default` includes modules: xgboost4j, xgboost4j-spark, xgboost4j-flink, xgboost4j-example -# `gpu` includes modules: xgboost4j-gpu, xgboost4j-spark-gpu, sets `use.cuda = ON` -# `scala-2.13` sets the scala binary version to the 2.13 -# `release-to-s3` sets maven deployment targets - -# Deploy to S3 bucket xgboost-maven-repo -mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests -# Deploy scala 2.13 to S3 bucket xgboost-maven-repo -cd .. 
-python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts -cd jvm-packages/ -mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests - - -set +x -set +e From 67d0cc6fc028569c41fb8342c05b38cdddb1888d Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 23 Oct 2024 22:38:52 +0800 Subject: [PATCH 20/45] [jvm-packages] resolve spark compatibility issue (#10917) --------- Co-authored-by: Hyunsu Cho --- .../scala/spark/XGBoostEstimator.scala | 9 ++- .../apache/spark/ml/xgboost/SparkUtils.scala | 55 +++++++++++++++++-- .../xgboost4j/scala/spark/XGBoostSuite.scala | 45 ++++++++------- 3 files changed, 81 insertions(+), 28 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala index 6978b82da8fc..98b70a63c4f6 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostEstimator.scala @@ -561,7 +561,11 @@ private[spark] trait XGBoostModel[M <: XGBoostModel[M]] extends Model[M] with ML val featureName = getFeaturesCol val missing = getMissing - val output = dataset.toDF().mapPartitions { rowIter => + // Here, we use RDD instead of DF to avoid different encoders for different + // spark versions for the compatibility issue. + // 3.5+, Encoders.row(schema) + // 3.5-, RowEncoder(schema) + val outRDD = dataset.asInstanceOf[Dataset[Row]].rdd.mapPartitions { rowIter => rowIter.grouped(inferBatchSize).flatMap { batchRow => val features = batchRow.iterator.map(row => row.getAs[Vector]( row.fieldIndex(featureName))) @@ -573,8 +577,9 @@ private[spark] trait XGBoostModel[M <: XGBoostModel[M]] extends Model[M] with ML dm.delete() } } + } + val output = dataset.sparkSession.createDataFrame(outRDD, schema) - }(Encoders.row(schema)) bBooster.unpersist(blocking = false) postTransform(output, pred).toDF() } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala index 8bc88434a443..4402f8efca19 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/ml/xgboost/SparkUtils.scala @@ -16,14 +16,15 @@ package org.apache.spark.ml.xgboost -import org.apache.spark.SparkContext +import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.ml.classification.ProbabilisticClassifierParams import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.ml.param.Params -import org.apache.spark.ml.util.{DatasetUtils, DefaultParamsReader, DefaultParamsWriter, SchemaUtils} +import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, MetadataUtils, SchemaUtils} import org.apache.spark.ml.util.DefaultParamsReader.Metadata -import org.apache.spark.sql.Dataset -import org.apache.spark.sql.types.{DataType, DoubleType, StructType} +import org.apache.spark.sql.{Column, Dataset, Row} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{DataType, DoubleType, IntegerType, StructType} import org.json4s.{JObject, JValue} import ml.dmlc.xgboost4j.scala.spark.params.NonXGBoostParams @@ -57,8 +58,52 @@ trait XGBProbabilisticClassifierParams[T <: Params] /** Utils to 
access the spark internal functions */ object SparkUtils { + private def checkClassificationLabels( + labelCol: String, + numClasses: Option[Int]): Column = { + val casted = col(labelCol).cast(DoubleType) + numClasses match { + case Some(2) => + when(casted.isNull || casted.isNaN, raise_error(lit("Labels MUST NOT be Null or NaN"))) + .when(casted =!= 0 && casted =!= 1, + raise_error(concat(lit("Labels MUST be in {0, 1}, but got "), casted))) + .otherwise(casted) + + case _ => + val n = numClasses.getOrElse(Int.MaxValue) + require(0 < n && n <= Int.MaxValue) + when(casted.isNull || casted.isNaN, raise_error(lit("Labels MUST NOT be Null or NaN"))) + .when(casted < 0 || casted >= n, + raise_error(concat(lit(s"Labels MUST be in [0, $n), but got "), casted))) + .when(casted =!= casted.cast(IntegerType), + raise_error(concat(lit("Labels MUST be Integers, but got "), casted))) + .otherwise(casted) + } + } + + // Copied from DatasetUtils of Spark to be compatible with Spark below 3.4 def getNumClasses(dataset: Dataset[_], labelCol: String, maxNumClasses: Int = 100): Int = { - DatasetUtils.getNumClasses(dataset, labelCol, maxNumClasses) + MetadataUtils.getNumClasses(dataset.schema(labelCol)) match { + case Some(n: Int) => n + case None => + // Get number of classes from dataset itself. + val maxLabelRow: Array[Row] = dataset + .select(max(checkClassificationLabels(labelCol, Some(maxNumClasses)))) + .take(1) + if (maxLabelRow.isEmpty || maxLabelRow(0).get(0) == null) { + throw new SparkException("ML algorithm was given empty dataset.") + } + val maxDoubleLabel: Double = maxLabelRow.head.getDouble(0) + require((maxDoubleLabel + 1).isValidInt, s"Classifier found max label value =" + + s" $maxDoubleLabel but requires integers in range [0, ... ${Int.MaxValue})") + val numClasses = maxDoubleLabel.toInt + 1 + require(numClasses <= maxNumClasses, s"Classifier inferred $numClasses from label values" + + s" in column $labelCol, but this exceeded the max numClasses ($maxNumClasses) allowed" + + s" to be inferred from values.
To avoid this error for labels with > $maxNumClasses" + + s" classes, specify numClasses explicitly in the metadata; this can be done by applying" + + s" StringIndexer to the label column.") + numClasses + } } def checkNumericType(schema: StructType, colName: String, msg: String = ""): Unit = { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala index 3a45cf4448c0..f9a7c0c1060d 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostSuite.scala @@ -100,29 +100,32 @@ class XGBoostSuite extends AnyFunSuite with PerTest { .config("spark.executor.cores", 4) .config("spark.executor.resource.gpu.amount", 1) .config("spark.task.resource.gpu.amount", 0.25) - val ss = builder.getOrCreate() - - try { - val df = ss.range(1, 10) - val rdd = df.rdd - - val runtimeParams = new XGBoostClassifier( - Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) - .getRuntimeParameters(true) - assert(runtimeParams.runOnGpu) - - val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, runtimeParams, - rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]]) - - val taskResources = finalRDD.getResourceProfile().taskResources - assert(taskResources.contains("cpus")) - assert(taskResources.get("cpus").get.amount == 3) - - assert(taskResources.contains("gpu")) - assert(taskResources.get("gpu").get.amount == 1.0) - } finally { + if (ss.version < "3.4.1") { + // Pass + ss.stop() + } else { + try { + val df = ss.range(1, 10) + val rdd = df.rdd + + val runtimeParams = new XGBoostClassifier( + Map("device" -> "cuda")).setNumWorkers(1).setNumRound(1) + .getRuntimeParameters(true) + assert(runtimeParams.runOnGpu) + + val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, runtimeParams, + rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]]) + + val taskResources = finalRDD.getResourceProfile().taskResources + assert(taskResources.contains("cpus")) + assert(taskResources.get("cpus").get.amount == 3) + + assert(taskResources.contains("gpu")) + assert(taskResources.get("gpu").get.amount == 1.0) + } finally { + ss.stop() + } } } } From b65e2ccf8ae7b9f7cba095fad6f866bac00d6f12 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 14 Nov 2024 12:49:20 -0800 Subject: [PATCH 21/45] Test GPU JVM packages --- .github/workflows/main.yml | 74 ++++++++++++++++---- jvm-packages/create_jni.py | 16 ++--- ops/docker/ci_container.yml | 20 ++++-- ops/pipeline/build-cuda.sh | 2 + ops/pipeline/build-jvm-gpu.sh | 33 +++++++++ ops/pipeline/build-jvm-macos-m1.sh | 0 ops/pipeline/build-jvm-manylinux2014.sh | 0 ops/pipeline/build-test-jvm-packages-impl.sh | 30 +++++--- ops/pipeline/build-test-jvm-packages.sh | 22 +----- ops/pipeline/build-win64-gpu.ps1 | 2 + ops/pipeline/test-cpp-gpu.sh | 10 ++- ops/pipeline/test-jvm-gpu.sh | 18 +++++ ops/pipeline/test-python.sh | 21 ++++-- 13 files changed, 183 insertions(+), 65 deletions(-) create mode 100755 ops/pipeline/build-jvm-gpu.sh mode change 100644 => 100755 ops/pipeline/build-jvm-macos-m1.sh mode change 100644 => 100755 ops/pipeline/build-jvm-manylinux2014.sh create mode 100755 ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 74a1cc135908..1b78bdb69df1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,6
+33,7 @@ jobs: - xgb-ci.manylinux_2_28_x86_64 - xgb-ci.manylinux2014_x86_64 - xgb-ci.jvm + - xgb-ci.jvm_gpu_build runner: [linux-amd64-cpu] include: - container_id: xgb-ci.manylinux2014_aarch64 @@ -171,8 +172,8 @@ jobs: COMMAND: upload KEY: build-cuda-with-rmm - build-jvm-manylinux2014: - name: Build libxgboost4j.so targeting gblic 2.17 + build-manylinux2014: + name: Build manylinux2014_${{ matrix.arch }} wheel needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -184,7 +185,6 @@ jobs: runner: linux-arm64-cpu - arch: x86_64 runner: linux-amd64-cpu - steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -195,10 +195,28 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} - build-manylinux2014: - name: Build manylinux2014_${{ matrix.arch }} wheel + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh + + build-jvm-manylinux2014: + name: Build libxgboost4j.so targeting glibc 2.17 needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -220,10 +238,10 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - build-gpu-rpkg: - name: Build GPU-enabled R package + build-jvm-gpu: + name: Build libxgboost4j.so with CUDA needs: build-containers runs-on: - runs-on=${{ github.run_id }} @@ -237,8 +255,13 @@ jobs: - name: Fetch container from cache run: bash ops/docker_build.sh env: - CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - - run: bash ops/pipeline/build-gpu-rpkg.sh + CONTAINER_ID: xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: upload + KEY: build-jvm-gpu build-test-jvm-packages: name: Build and test JVM packages @@ -256,10 +279,12 @@ jobs: run: bash ops/docker_build.sh env: CONTAINER_ID: xgb-ci.jvm - - run: bash ops/pipeline/build-test-jvm-packages.sh + - name: Build and test JVM packages (Scala 2.12) + run: bash ops/pipeline/build-test-jvm-packages.sh env: SCALA_VERSION: 2.12 - - run: bash ops/pipeline/build-test-jvm-packages.sh + - name: Build and test JVM packages (Scala 2.13) + run: bash ops/pipeline/build-test-jvm-packages.sh env: SCALA_VERSION: 2.13 @@ -358,3 +383,26 @@ jobs: KEY: ${{ matrix.artifact_from }} - name: Run Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} + + test-jvm-packages-gpu: + name: Test JVM packages with CUDA + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from 
cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 6be7b451ce14..fbd9b4ce5672 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -32,7 +32,7 @@ def cd(path): path = normpath(path) cwd = os.getcwd() os.chdir(path) - print("cd " + path) + print("cd " + path, flush=True) try: yield path finally: @@ -41,7 +41,7 @@ def cd(path): def maybe_makedirs(path): path = normpath(path) - print("mkdir -p " + path) + print("mkdir -p " + path, flush=True) try: os.makedirs(path) except OSError as e: @@ -50,14 +50,14 @@ def maybe_makedirs(path): def run(command, **kwargs): - print(command) + print(command, flush=True) subprocess.run(command, shell=True, check=True, env=os.environ, **kwargs) def cp(source, target): source = normpath(source) target = normpath(target) - print("cp {0} {1}".format(source, target)) + print("cp {0} {1}".format(source, target), flush=True) shutil.copy(source, target) @@ -78,7 +78,7 @@ def native_build(args): subprocess.check_output("/usr/libexec/java_home").strip().decode() ) - print("building Java wrapper") + print("building Java wrapper", flush=True) with cd(".."): build_dir = "build-gpu" if cli_args.use_cuda == "ON" else "build" maybe_makedirs(build_dir) @@ -123,7 +123,7 @@ def native_build(args): run("cmake .. " + " ".join(args + [generator])) break except subprocess.CalledProcessError as e: - print(f"Failed to build with generator: {generator}", e) + print(f"Failed to build with generator: {generator}", e, flush=True) with cd(os.path.pardir): shutil.rmtree(build_dir) maybe_makedirs(build_dir) @@ -132,7 +132,7 @@ def native_build(args): run("cmake --build . 
--config Release" + maybe_parallel_build) - print("copying native library") + print("copying native library", flush=True) library_name, os_folder = { "Windows": ("xgboost4j.dll", "windows"), "Darwin": ("libxgboost4j.dylib", "macos"), @@ -153,7 +153,7 @@ def native_build(args): maybe_makedirs(output_folder) cp("../lib/" + library_name, output_folder) - print("copying train/test files") + print("copying train/test files", flush=True) # for xgboost4j maybe_makedirs("xgboost4j/src/test/resources") diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index d042e35549f9..90c9e6c8c800 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -6,8 +6,8 @@ xgb-ci.gpu_build_rockylinux8: container_def: gpu_build_rockylinux8 build_args: - CUDA_VERSION_ARG: "12.5.1" - NCCL_VERSION_ARG: "2.22.3-1" + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" RAPIDS_VERSION_ARG: "24.10" xgb-ci.gpu_build_r_rockylinux8: @@ -19,22 +19,22 @@ xgb-ci.gpu_build_r_rockylinux8: xgb-ci.gpu: container_def: gpu build_args: - CUDA_VERSION_ARG: "12.5.1" - NCCL_VERSION_ARG: "2.22.3-1" + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" RAPIDS_VERSION_ARG: "24.10" xgb-ci.gpu_dev_ver: container_def: gpu build_args: - CUDA_VERSION_ARG: "12.5.1" - NCCL_VERSION_ARG: "2.22.3-1" + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" RAPIDS_VERSION_ARG: "24.12" RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" xgb-ci.clang_tidy: container_def: clang_tidy build_args: - CUDA_VERSION_ARG: "12.5.1" + CUDA_VERSION_ARG: "12.4.1" xgb-ci.cpu: container_def: cpu @@ -53,3 +53,9 @@ xgb-ci.manylinux2014_aarch64: xgb-ci.jvm: container_def: jvm + +xgb-ci.jvm_gpu_build: + container_def: jvm_gpu_build + build_args: + CUDA_VERSION_ARG: "12.4.1" + NCCL_VERSION_ARG: "2.23.4-1" diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 9dc7dfad0224..bcda081b338e 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -17,6 +17,8 @@ echo "--- Build with CUDA" echo "--- Build libxgboost from the source" set -x +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh new file mode 100755 index 000000000000..ee12fbd78289 --- /dev/null +++ b/ops/pipeline/build-jvm-gpu.sh @@ -0,0 +1,33 @@ +#!/bin/bash +## Build libxgboost4j.so with CUDA + +set -euo pipefail + +source ops/pipeline/enforce-ci.sh + +echo "--- Build libxgboost4j.so with CUDA" + +# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +#then + arch_flag="-DGPU_COMPUTE_VER=75" +#else +# arch_flag="" +#fi + +COMMAND=$( +cat <<-EOF +cd build-gpu/ && \ +cmake .. 
-DCMAKE_PREFIX_PATH=/workspace/cccl -GNinja -DUSE_CUDA=ON -DUSE_NCCL=ON \ + -DJVM_BINDINGS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ${arch_flag} && \ + ninja +EOF +) + +set -x +mkdir -p build-gpu/ +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ +git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet --depth 1 +python3 ops/docker_run.py \ + --container-id xgb-ci.jvm_gpu_build \ + -- bash -c "${COMMAND}" diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-m1.sh old mode 100644 new mode 100755 diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh old mode 100644 new mode 100755 diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index 717868521408..180788436d9b 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -8,8 +8,6 @@ cat <<-EOF Inputs - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) - USE_CUDA: Set to 1 to enable CUDA - - CUDA_ARCH: Semicolon separated list of GPU compute capability targets - (e.g. '35;61') Only applicable if USE_CUDA=1 - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided libxgboost4j.so. (Usually Maven will invoke create_jni.py to build it from scratch.) When using this option, make sure to @@ -40,10 +38,31 @@ else fi # If SKIP_NATIVE_BUILD is set, copy in libxgboost4j.so from lib/ +# Also copy in other files needed for testing. (Usually create_jni.py would perform this +# step, but we need to do it manually here.) if [[ "${SKIP_NATIVE_BUILD:-}" == "1" ]] then echo "Using externally provided libxgboost4j.so. Locating one from lib/..." - cp -v lib/libxgboost4j.so ./jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + mkdir -p jvm-packages/xgboost4j/src/test/resources + mkdir -p jvm-packages/xgboost4j-spark/src/test/resources + mkdir -p jvm-packages/xgboost4j-spark-gpu/src/test/resources + + # Generate machine.txt.* files from the CLI regression demo + # TODO(hcho3): Remove once CLI is removed + pushd demo/CLI/regression + python3 mapfeat.py + python3 mknfold.py machine.txt 1 + popd + + cp -v demo/data/agaricus.* \ + jvm-packages/xgboost4j/src/test/resources + cp -v demo/CLI/regression/machine.txt.t* demo/data/agaricus.* \ + jvm-packages/xgboost4j-spark/src/test/resources + cp -v demo/data/veterans_lung_cancer.csv \ + jvm-packages/xgboost4j-spark/src/test/resources/rank.train.csv \ + jvm-packages/xgboost4j-spark-gpu/src/test/resources fi cd jvm-packages/ @@ -70,11 +89,6 @@ then fi set -x -if [[ -n "${CUDA_ARCH:-}" ]] -then - export GPU_ARCH_FLAG="-DGPU_COMPUTE_VER='${CUDA_ARCH}'" -fi - mvn --no-transfer-progress clean install ${mvn_options} # Integration tests diff --git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh index 30a11d444d1b..1feddf2bff98 100755 --- a/ops/pipeline/build-test-jvm-packages.sh +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -6,14 +6,7 @@ INPUT_DOC=$( cat <<-EOF Inputs - - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) - - USE_CUDA: Set to 1 to enable CUDA - - CUDA_ARCH: Semicolon separated list of GPU compute capability targets - (e.g. 
'35;61') Only applicable if USE_CUDA=1 - - SKIP_NATIVE_BUILD: Set to 1 to have the JVM packages use an externally provided - libxgboost4j.so. (Usually Maven will invoke create_jni.py to - build it from scratch.) When using this option, make sure to - place libxgboost4j.so in lib/ directory. + - SCALA_VERSION: Scala version, either 2.12 or 2.13 (Required) EOF ) @@ -32,15 +25,6 @@ done set -x -run_args="-e SCALA_VERSION=${SCALA_VERSION}" -for arg in "USE_CUDA" "CUDA_ARCH" "SKIP_NATIVE_BUILD" -do - if [[ -n "${!arg:-}" ]] - then - run_args="${run_args} -e ${arg}=${!arg}" - fi -done -echo "${run_args}" - python3 ops/docker_run.py --container-id xgb-ci.jvm \ - --run-args "${run_args}" -- ops/pipeline/build-test-jvm-packages-impl.sh + --run-args "-e SCALA_VERSION=${SCALA_VERSION}" \ + -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 index 48863528684a..c691a55f954c 100644 --- a/ops/pipeline/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -11,6 +11,8 @@ nvcc --version # $arch_flag = "" #} +# Work around https://github.com/NVIDIA/cccl/issues/1956 +# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet mkdir build cd build diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 8ff66a554e0c..b66162d66a50 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -15,21 +15,25 @@ case "${arg}" in gpu) echo "--- Run Google Tests, using a single GPU" python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- build/testxgboost ;; gpu-rmm) echo "--- Run Google Tests, using a single GPU, RMM enabled" python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- build/testxgboost --use-rmm-pool ;; mgpu) echo "--- Run Google Tests, using multiple GPUs" python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged --shm-size=4g' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--shm-size=4g' \ -- build/testxgboost --gtest_filter=*MGPU* ;; diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh new file mode 100755 index 000000000000..272b55ad0d1a --- /dev/null +++ b/ops/pipeline/test-jvm-gpu.sh @@ -0,0 +1,18 @@ +#!/bin/bash +## Test JVM packages with CUDA. Note: this script assumes that +## the user has already built libxgboost4j.so with CUDA support +## and placed it in the lib/ directory.
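The header of the new test-jvm-gpu.sh above encodes a contract: a CUDA-enabled libxgboost4j.so must already sit under lib/, produced by the build-jvm-gpu job and restored through stash_artifacts.sh. A minimal pre-flight check illustrating that assumption (a sketch only, not part of the patch; the message text and exit code are illustrative):

# Sketch: fail fast when the prebuilt native library is missing.
if [[ ! -f lib/libxgboost4j.so ]]
then
  echo "lib/libxgboost4j.so not found; run ops/pipeline/build-jvm-gpu.sh first"
  exit 1
fi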
+ +set -euo pipefail + +# source ops/pipeline/enforce-ci.sh + +SCALA_VERSION=2.12 + +set -x + +python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ + -- nvidia-smi +python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ + -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index b33b38ac187c..3002e878cf6e 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -31,12 +31,15 @@ case "$suite" in echo "-- Run Python tests, using a single GPU" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--privileged' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- bash test-python-wrapper.sh gpu_test ;; @@ -44,15 +47,19 @@ case "$suite" in echo "-- Run Python tests, using multiple GPUs" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark + pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--privileged --shm-size=4g' \ + -- nvidia-smi + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" + python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + --run-args='--shm-size=4g' \ -- bash test-python-wrapper.sh gpu_test ;; From f4d94a19b903d4bfd6458b90f0f8201616f2765d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Nov 2024 13:49:28 +0800 Subject: [PATCH 22/45] Disable the host numa virtual memory allocator for now. (#10934) --- src/common/device_helpers.cu | 5 +++++ tests/cpp/common/test_device_vector.cu | 11 +++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 608a535cd8cb..01e81b16ee0b 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,6 +7,11 @@ namespace dh { PinnedMemory::PinnedMemory() { + // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. 
+ // See https://github.com/dmlc/xgboost/issues/10933 + this->impl_.emplace(); + return; + #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index d7a03e41a64b..ec1a420bd349 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -31,6 +31,9 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); + if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); + } detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -110,14 +113,6 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor; PinnedMemory pinned; -#if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); -#else // defined(xgboost_IS_WIN) - if (major >= 12 && minor >= 5) { - ASSERT_TRUE(pinned.IsVm()); - } else { - ASSERT_FALSE(pinned.IsVm()); - } -#endif // defined(xgboost_IS_WIN) } } // namespace dh From e1c7e24e1db4cd72759531b90cc8bd4df05847ea Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 14 Nov 2024 23:39:19 -0800 Subject: [PATCH 23/45] Run GPU tests with privilege escalation --- ops/docker/dockerfile/Dockerfile.gpu | 3 ++- ops/pipeline/test-cpp-gpu.sh | 4 +++- ops/pipeline/test-jvm-gpu.sh | 2 +- ops/pipeline/test-python.sh | 17 +++++++---------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu index eac35c3aaa90..beb1710d2d7a 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -28,7 +28,8 @@ RUN \ mamba create -y -n gpu_test -c ${RAPIDSAI_CONDA_CHANNEL_ARG} -c conda-forge -c nvidia \ python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cuda-version=$CUDA_SHORT_VER \ "nccl>=${NCCL_SHORT_VER}" \ - dask \ + "dask<=2024.10.0" \ + "distributed<=2024.10.0" \ "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel \ python-kubernetes urllib3 graphviz hypothesis loky \ diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index b66162d66a50..98f467250dd0 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -17,6 +17,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ -- build/testxgboost ;; @@ -25,6 +26,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ + --run-args='--privileged' \ -- build/testxgboost --use-rmm-pool ;; @@ -33,7 +35,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--shm-size=4g' \ + --run-args='--shm-size=4g --privileged' \ -- build/testxgboost --gtest_filter=*MGPU* ;; diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index 272b55ad0d1a..108ec749674b 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -14,5 +14,5 @@ set -x python3 
ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ - --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --privileged" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 3002e878cf6e..3997f416ec46 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -31,15 +31,14 @@ case "$suite" in echo "-- Run Python tests, using a single GPU" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ + --run-args='--privileged' \ -- bash test-python-wrapper.sh gpu_test ;; @@ -47,19 +46,17 @@ case "$suite" in echo "-- Run Python tests, using multiple GPUs" echo " python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark - pytest -v -sx -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated " >> test-python-wrapper.sh set -x cat test-python-wrapper.sh python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- bash -c "source activate gpu_test && python -c 'from numba import cuda; cuda.detect()'" - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--shm-size=4g' \ + --run-args='--shm-size=4g --privileged' \ -- bash test-python-wrapper.sh gpu_test ;; From 11b0427b277d0577d971feafba2b48b8aaee16d3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 16 Nov 2024 22:50:38 -0800 Subject: [PATCH 24/45] Reboot after install driver; use proprietary driver for now --- ops/packer/linux/bootstrap.sh | 32 ++++++++++++++--------------- ops/packer/linux/install_drivers.sh | 14 +++++++++++++ ops/packer/linux/linux.pkr.hcl | 13 +++++++++++- 3 files changed, 42 insertions(+), 17 deletions(-) create mode 100644 ops/packer/linux/install_drivers.sh diff --git a/ops/packer/linux/bootstrap.sh b/ops/packer/linux/bootstrap.sh index 9dbda19c3baa..57be6e14b507 100644 --- a/ops/packer/linux/bootstrap.sh +++ b/ops/packer/linux/bootstrap.sh @@ -1,21 +1,6 @@ #!/bin/bash set -euo pipefail -## Install basic tools -echo 'debconf debconf/frontend select Noninteractive' 
| sudo debconf-set-selections -sudo apt-get update -sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip - -## Install CUDA 12.5 + driver -echo "Installilng CUDA and driver..." -wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin -sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 -wget -nv https://developer.download.nvidia.com/compute/cuda/12.5.1/local_installers/cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb -sudo dpkg -i cuda-repo-ubuntu2404-12-5-local_12.5.1-555.42.06-1_amd64.deb -sudo cp /var/cuda-repo-ubuntu2404-12-5-local/cuda-*-keyring.gpg /usr/share/keyrings/ -sudo apt-get update -sudo apt-get install -y cuda-toolkit-12-5 nvidia-driver-555-open cuda-drivers-555 - ## Install Docker # Add Docker's official GPG key: sudo install -m 0755 -d /etc/apt/keyrings @@ -31,6 +16,12 @@ sudo apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plug # Allow users to use Docker without sudo sudo usermod -aG docker ubuntu +# Start Docker daemon +sudo systemctl is-active --quiet docker.service || sudo systemctl start docker.service +sudo systemctl is-enabled --quiet docker.service || sudo systemctl enable docker.service +sleep 10 # Docker daemon takes time to come up after installing +sudo docker info + ## Install NVIDIA Container Toolkit curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ @@ -41,12 +32,21 @@ sudo apt-get install -y nvidia-container-toolkit sudo nvidia-ctk runtime configure --runtime=docker sudo systemctl restart docker +sleep 10 +sudo docker run --rm --gpus all ubuntu nvidia-smi +sudo systemctl stop docker + ## Install AWS CLI v2 wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O awscliv2.zip -unzip awscliv2.zip +unzip -q awscliv2.zip sudo ./aws/install +rm -rf ./aws/ ./awscliv2.zip ## Install jq and yq sudo apt update && sudo apt install jq +mkdir yq/ +pushd yq/ wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \ tar xz && sudo mv ./yq_linux_amd64 /usr/bin/yq +popd +rm -rf yq/ diff --git a/ops/packer/linux/install_drivers.sh b/ops/packer/linux/install_drivers.sh new file mode 100644 index 000000000000..07309be836a8 --- /dev/null +++ b/ops/packer/linux/install_drivers.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -euo pipefail + +## Install basic tools +echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections +sudo apt-get update +sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip + +## Install CUDA Toolkit 12.6 (Driver will be installed later) +wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install cuda-toolkit-12-6 cuda-drivers-565 +rm cuda-keyring_1.1-1_all.deb diff --git a/ops/packer/linux/linux.pkr.hcl b/ops/packer/linux/linux.pkr.hcl index 1dc11f9bac03..c6990894764a 100644 --- a/ops/packer/linux/linux.pkr.hcl +++ b/ops/packer/linux/linux.pkr.hcl @@ -63,6 +63,17 @@ build { sources = ["source.amazon-ebs.runs-on-linux"] provisioner "shell" { - script = "bootstrap.sh" + script = "install_drivers.sh" + pause_after = "30s" + } + + provisioner "shell" { + expect_disconnect = true + inline = ["echo 
'Reboot VM'", "sudo reboot"] + } + + provisioner "shell" { + pause_before = "1m0s" + script = "bootstrap.sh" } } From 49eaec14339c7efcf88088e8478f49422a327a0e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 16 Nov 2024 22:54:59 -0800 Subject: [PATCH 25/45] Try removing --privileged flag --- ops/pipeline/test-cpp-gpu.sh | 4 +--- ops/pipeline/test-jvm-gpu.sh | 2 +- ops/pipeline/test-python.sh | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/ops/pipeline/test-cpp-gpu.sh b/ops/pipeline/test-cpp-gpu.sh index 98f467250dd0..b66162d66a50 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -17,7 +17,6 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged' \ -- build/testxgboost ;; @@ -26,7 +25,6 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--privileged' \ -- build/testxgboost --use-rmm-pool ;; @@ -35,7 +33,7 @@ case "${arg}" in python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.gpu --use-gpus \ - --run-args='--shm-size=4g --privileged' \ + --run-args='--shm-size=4g' \ -- build/testxgboost --gtest_filter=*MGPU* ;; diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index 108ec749674b..272b55ad0d1a 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -14,5 +14,5 @@ set -x python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id xgb-ci.jvm_gpu_build --use-gpus \ - --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1 --privileged" \ + --run-args "-e SCALA_VERSION=${SCALA_VERSION} -e USE_CUDA=1 -e SKIP_NATIVE_BUILD=1" \ -- ops/pipeline/build-test-jvm-packages-impl.sh diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 3997f416ec46..02907253bc0f 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -38,7 +38,6 @@ case "$suite" in python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--privileged' \ -- bash test-python-wrapper.sh gpu_test ;; @@ -56,7 +55,7 @@ case "$suite" in python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ -- nvidia-smi python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--shm-size=4g --privileged' \ + --run-args='--shm-size=4g' \ -- bash test-python-wrapper.sh gpu_test ;; From d3482e1b6d67698de299dc153cb6441f1c798d9b Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sun, 17 Nov 2024 02:19:16 -0800 Subject: [PATCH 26/45] Revert "Disable the host numa virtual memory allocator for now. (#10934)" This reverts commit f4d94a19b903d4bfd6458b90f0f8201616f2765d. --- src/common/device_helpers.cu | 5 ----- tests/cpp/common/test_device_vector.cu | 11 ++++++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 01e81b16ee0b..608a535cd8cb 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,11 +7,6 @@ namespace dh { PinnedMemory::PinnedMemory() { - // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. 
- // See https://github.com/dmlc/xgboost/issues/10933 - this->impl_.emplace(); - return; - #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index ec1a420bd349..d7a03e41a64b 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -31,9 +31,6 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); - if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { - GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); - } detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -113,6 +110,14 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor; PinnedMemory pinned; +#if defined(xgboost_IS_WIN) ASSERT_FALSE(pinned.IsVm()); +#else // defined(xgboost_IS_WIN) + if (major >= 12 && minor >= 5) { + ASSERT_TRUE(pinned.IsVm()); + } else { + ASSERT_FALSE(pinned.IsVm()); + } +#endif // defined(xgboost_IS_WIN) } } // namespace dh From 130d303f40c25fc3fb3b07469bbb59f80118a8cf Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 09:30:31 -0800 Subject: [PATCH 27/45] Build JVM docs --- .github/workflows/main.yml | 25 +++++++++++++++++-- .github/workflows/windows.yml | 1 - doc/jvm/api.rst | 1 + jvm-packages/pom.xml | 16 ++++++++++++ .../build-jvm-doc-impl.sh} | 18 +++++++------ ops/pipeline/build-jvm-doc.sh | 7 ++++-- 6 files changed, 56 insertions(+), 12 deletions(-) rename ops/{build_jvm_doc.sh => pipeline/build-jvm-doc-impl.sh} (57%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1b78bdb69df1..47e195267d49 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,6 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} - - spot=false strategy: matrix: container_id: @@ -121,7 +120,6 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=linux-amd64-cpu - - spot=false steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker @@ -263,6 +261,29 @@ jobs: COMMAND: upload KEY: build-jvm-gpu + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/build-jvm-doc.sh + build-test-jvm-packages: name: Build and test JVM packages needs: build-containers diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 0fc50815d683..73a258158b12 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -26,7 +26,6 @@ jobs: runs-on: - runs-on=${{ github.run_id }} - runner=windows-cpu - - spot=false steps: - uses: actions/checkout@v4 with: diff --git a/doc/jvm/api.rst b/doc/jvm/api.rst index b9e7821aa6fa..3d56cb2c9aa4 100644 --- a/doc/jvm/api.rst +++ b/doc/jvm/api.rst @@ -5,4 +5,5 @@ API Docs for the JVM 
packages * `XGBoost4J Java API <../jvm_docs/javadocs/index.html>`_ * `XGBoost4J Scala API <../jvm_docs/scaladocs/xgboost4j/index.html>`_ * `XGBoost4J-Spark Scala API <../jvm_docs/scaladocs/xgboost4j-spark/index.html>`_ +* `XGBoost4J-Spark-GPU Scala API <../jvm_docs/scaladocs/xgboost4j-spark-gpu/index.html>`_ * `XGBoost4J-Flink Scala API <../jvm_docs/scaladocs/xgboost4j-flink/index.html>`_ diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index af7aec0a6982..815e8b473139 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -112,6 +112,22 @@ + + docs + + ON + true + true + true + + + xgboost4j + xgboost4j-spark + xgboost4j-spark-gpu + xgboost4j-flink + + + release diff --git a/ops/build_jvm_doc.sh b/ops/pipeline/build-jvm-doc-impl.sh similarity index 57% rename from ops/build_jvm_doc.sh rename to ops/pipeline/build-jvm-doc-impl.sh index 6f785f488027..c334b8ad91d1 100755 --- a/ops/build_jvm_doc.sh +++ b/ops/pipeline/build-jvm-doc-impl.sh @@ -1,6 +1,7 @@ #!/bin/bash - ## Build docs for the JVM packages and package it in a tarball +## Build docs for the JVM packages and package it in a tarball +## Note: this script assumes that the user has already built libxgboost4j.so +## and placed it in the lib/ directory. if [[ $# -ne 1 ]] then @@ -10,23 +11,26 @@ fi set -euo pipefail -rm -rf build/ -cd jvm-packages - branch_name=$1 +# Copy in libxgboost4j.so +mkdir -p jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ +cp -v lib/libxgboost4j.so jvm-packages/xgboost4j/src/main/resources/lib/linux/x86_64/ + +cd jvm-packages/ # Install JVM packages in local Maven repository -mvn --no-transfer-progress install -DskipTests +mvn --no-transfer-progress install -Pdocs # Build Scaladocs -mvn --no-transfer-progress scala:doc -DskipTests +mvn --no-transfer-progress scala:doc -Pdocs # Build Javadocs -mvn --no-transfer-progress javadoc:javadoc -DskipTests +mvn --no-transfer-progress javadoc:javadoc -Pdocs # Package JVM docs in a tarball mkdir -p tmp/scaladocs cp -rv xgboost4j/target/site/apidocs/ ./tmp/javadocs/ cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ +cp -rv xgboost4j-spark-gpu/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark-gpu/ cp -rv xgboost4j-flink/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-flink/ cd tmp diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh index 7b029a4e7e26..0c1afe46e212 100755 --- a/ops/pipeline/build-jvm-doc.sh +++ b/ops/pipeline/build-jvm-doc.sh @@ -1,4 +1,7 @@ #!/bin/bash +## Build docs for the JVM packages and package it in a tarball +## Note: this script assumes that the user has already built libxgboost4j.so +## and placed it in the lib/ directory.
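The docs profile added to pom.xml above is what lets the doc build skip tests and the native build; build-jvm-doc-impl.sh then drives mvn install, scala:doc, and javadoc:javadoc with -Pdocs and takes the branch name as its single argument. A hedged sketch of invoking it locally, assuming lib/libxgboost4j.so already exists ("master" is an illustrative branch name and only labels the output):

# Sketch: local JVM doc build, mirroring the steps in build-jvm-doc-impl.sh.
bash ops/pipeline/build-jvm-doc-impl.sh master
ls jvm-packages/tmp/   # javadocs/ and scaladocs/ are staged here before packaging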
set -euox pipefail @@ -6,8 +9,8 @@ source ops/pipeline/enforce-ci.sh echo "--- Build JVM packages doc" python3 ops/docker_run.py \ - --container-id jvm \ - -- ops/build_jvm_doc.sh ${BRANCH_NAME} + --container-id xgb-ci.jvm_gpu_build \ + -- ops/pipeline/build-jvm-doc-impl.sh ${BRANCH_NAME} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] then echo "--- Upload JVM packages doc" From a45b24fd9608f2bf05dfd4282bad05a03eb10373 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 10:37:48 -0800 Subject: [PATCH 28/45] Re-org test scripts --- .../build-gpu-rpkg-impl.sh} | 0 ops/pipeline/build-gpu-rpkg.sh | 2 +- ops/pipeline/test-python-impl.sh | 60 ++++++++++++++ ops/pipeline/test-python.sh | 81 +------------------ 4 files changed, 63 insertions(+), 80 deletions(-) rename ops/{build_r_pkg_with_cuda.sh => pipeline/build-gpu-rpkg-impl.sh} (100%) create mode 100755 ops/pipeline/test-python-impl.sh diff --git a/ops/build_r_pkg_with_cuda.sh b/ops/pipeline/build-gpu-rpkg-impl.sh similarity index 100% rename from ops/build_r_pkg_with_cuda.sh rename to ops/pipeline/build-gpu-rpkg-impl.sh diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index c7d3f7fa4235..e85826f36a26 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -7,7 +7,7 @@ source ops/pipeline/enforce-ci.sh echo "--- Build XGBoost R package with CUDA" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_r_rockylinux8 \ - -- ops/build_r_pkg_with_cuda.sh \ + -- ops/pipeline/build-gpu-rpkg-impl.sh \ ${GITHUB_SHA} if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] diff --git a/ops/pipeline/test-python-impl.sh b/ops/pipeline/test-python-impl.sh new file mode 100755 index 000000000000..bd71cfb06435 --- /dev/null +++ b/ops/pipeline/test-python-impl.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -euo pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {gpu|mgpu|cpu|cpu-arm64}" + exit 1 +fi + +suite="$1" + +set -x + +export PYSPARK_DRIVER_PYTHON=$(which python) +export PYSPARK_PYTHON=$(which python) +export SPARK_TESTING=1 + +pip install -v ./python-package/dist/*.whl + +case "$suite" in + gpu) + echo "-- Run Python tests, using a single GPU" + source activate gpu_test + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu + ;; + mgpu) + echo "-- Run Python tests, using multiple GPUs" + source activate gpu_test + python -c 'from cupy.cuda import jitify; jitify._init_module()' + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ + tests/test_distributed/test_gpu_federated + ;; + cpu) + echo "-- Run Python tests (CPU)" + source activate linux_cpu_test + export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 + pytest -v -s -rxXs --fulltrace --durations=0 tests/python + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark + pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated + ;; + cpu-arm64) + echo "-- Run Python tests (CPU, ARM64)" + source activate aarch64_test + pytest -v -s -rxXs --fulltrace --durations=0 \ + tests/python/test_basic.py 
tests/python/test_basic_models.py \ + tests/python/test_model_compatibility.py + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh index 02907253bc0f..047a6f411d6d 100755 --- a/ops/pipeline/test-python.sh +++ b/ops/pipeline/test-python.sh @@ -13,82 +13,5 @@ fi suite="$1" container_id="$2" -cat > test-python-wrapper.sh <<-'EOF' -#!/bin/bash -source activate "$1" - -set -euox pipefail - -export PYSPARK_DRIVER_PYTHON=$(which python) -export PYSPARK_PYTHON=$(which python) -export SPARK_TESTING=1 - -pip install -v ./python-package/dist/*.whl -EOF - -case "$suite" in - gpu) - echo "-- Run Python tests, using a single GPU" - echo " - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- bash test-python-wrapper.sh gpu_test - ;; - - mgpu) - echo "-- Run Python tests, using multiple GPUs" - echo " - python -c 'from cupy.cuda import jitify; jitify._init_module()' - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/test_distributed/test_gpu_federated - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - -- nvidia-smi - python3 ops/docker_run.py --container-id "${container_id}" --use-gpus \ - --run-args='--shm-size=4g' \ - -- bash test-python-wrapper.sh gpu_test - ;; - - cpu) - echo "-- Run Python tests (CPU)" - echo " - export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 - pytest -v -s -rxXs --fulltrace --durations=0 tests/python - pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask - pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_spark - pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_federated - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" \ - -- bash test-python-wrapper.sh linux_cpu_test - ;; - - cpu-arm64) - echo "-- Run Python tests (CPU, ARM64)" - echo " - pytest -v -s -rxXs --fulltrace --durations=0 \\ - tests/python/test_basic.py tests/python/test_basic_models.py \\ - tests/python/test_model_compatibility.py - " >> test-python-wrapper.sh - set -x - cat test-python-wrapper.sh - python3 ops/docker_run.py --container-id "${container_id}" \ - -- bash test-python-wrapper.sh aarch64_test - ;; - - *) - echo "Unrecognized argument: $suite" - exit 1 - ;; -esac +python3 ops/docker_run.py --container-id "${container_id}" \ + -- bash ops/pipeline/test-python-impl.sh "${suite}" From 6dc0df2c3d2aefb1ff168f3ef5213199682c9ca5 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 10:52:22 -0800 Subject: [PATCH 29/45] More reorg --- dev/prepare_jvm_release.py | 2 +- ops/pipeline/build-cpu-arm64.sh | 6 ++--- ops/pipeline/build-cpu.sh | 4 +-- ops/pipeline/build-cuda-with-rmm.sh | 6 ++--- ops/pipeline/build-cuda.sh | 8 +++--- ops/pipeline/build-gpu-rpkg-impl.sh | 2 +- 
ops/pipeline/build-manylinux2014.sh | 4 +-- .../build-python-wheels-macos.sh} | 2 +- ops/pipeline/build-test-jvm-packages-impl.sh | 2 +- ops/pipeline/build-win64-gpu.ps1 | 2 +- ops/pipeline/run-clang-tidy.sh | 2 +- ops/pipeline/test-python-impl.sh | 25 ++++++++++++++----- ops/{ => script}/build_via_cmake.sh | 0 ops/{ => script}/change_scala_version.py | 0 ops/{ => script}/change_version.py | 0 ops/{ => script}/format_wheel_meta.py | 0 ops/{lint => script}/lint_cmake.sh | 0 ops/{lint => script}/lint_cpp.py | 0 ops/{lint => script}/lint_python.py | 23 ++++++++--------- ops/{lint => script}/lint_r.R | 0 ops/{ => script}/rename_whl.py | 0 ops/{clang-tidy => script}/run_clang_tidy.py | 2 +- ops/{ => script}/test_r_package.py | 0 ops/{clang-tidy => script}/test_tidy.cc | 0 ops/{ => script}/test_utils.py | 2 +- ops/{ => script}/update_rapids.sh | 0 ops/{ => script}/verify_link.sh | 0 27 files changed, 52 insertions(+), 40 deletions(-) rename ops/{build_python_wheels_macos.sh => pipeline/build-python-wheels-macos.sh} (97%) rename ops/{ => script}/build_via_cmake.sh (100%) rename ops/{ => script}/change_scala_version.py (100%) rename ops/{ => script}/change_version.py (100%) rename ops/{ => script}/format_wheel_meta.py (100%) rename ops/{lint => script}/lint_cmake.sh (100%) rename ops/{lint => script}/lint_cpp.py (100%) rename ops/{lint => script}/lint_python.py (95%) rename ops/{lint => script}/lint_r.R (100%) rename ops/{ => script}/rename_whl.py (100%) rename ops/{clang-tidy => script}/run_clang_tidy.py (99%) rename ops/{ => script}/test_r_package.py (100%) rename ops/{clang-tidy => script}/test_tidy.cc (100%) rename ops/{ => script}/test_utils.py (99%) rename ops/{ => script}/update_rapids.sh (100%) rename ops/{ => script}/verify_link.sh (100%) diff --git a/dev/prepare_jvm_release.py b/dev/prepare_jvm_release.py index 927cb4945950..c5a72724f707 100644 --- a/dev/prepare_jvm_release.py +++ b/dev/prepare_jvm_release.py @@ -203,7 +203,7 @@ def main(): ) print( "5. Remove the Scala 2.12 artifacts and build Scala 2.13 artifacts:\n" - " python ops/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" + " python ops/script/change_scala_version.py --scala-version 2.13 --purge-artifacts\n" " GPG_TTY=$(tty) mvn deploy -Prelease -DskipTests -Dskip.native.build=true" ) print( diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh index 8a5db56d9eeb..4be57557ea36 100755 --- a/ops/pipeline/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -11,7 +11,7 @@ source ops/pipeline/enforce-ci.sh echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ --conda-env=aarch64_test \ -DUSE_OPENMP=ON \ -DHIDE_CXX_SYMBOL=ON @@ -26,7 +26,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -35,7 +35,7 @@ echo "--- Audit binary wheel to ensure it's compliant with ${WHEEL_TAG} standard python3 ops/docker_run.py \ --container-id xgb-ci.aarch64 \ -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh index 60346203d85f..22384d056f15 100755 --- a/ops/pipeline/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -18,7 +18,7 @@ echo "--- Run Google Test with sanitizer enabled" sudo sysctl vm.mmap_rnd_bits=28 python3 ops/docker_run.py \ --container-id xgb-ci.cpu \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DUSE_SANITIZER=ON \ -DENABLED_SANITIZERS="address;leak;undefined" \ -DCMAKE_BUILD_TYPE=Debug \ @@ -35,7 +35,7 @@ python3 ops/docker_run.py \ echo "--- Run Google Test" python3 ops/docker_run.py \ --container-id xgb-ci.cpu \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH=/opt/grpc \ -DPLUGIN_FEDERATED=ON python3 ops/docker_run.py \ diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index 24523bd875c0..1da0e5e61827 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -18,7 +18,7 @@ echo "--- Build with CUDA with RMM" echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake" \ -DUSE_CUDA=ON \ -DUSE_OPENMP=ON \ @@ -36,7 +36,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . --wheel-dir dist/" -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -46,7 +46,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.$WHEEL_TAG \ -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index bcda081b338e..0487fb209dbe 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -22,7 +22,7 @@ set -x git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ - -- ops/build_via_cmake.sh \ + -- ops/script/build_via_cmake.sh \ -DCMAKE_PREFIX_PATH="/opt/grpc;/workspace/cccl" \ -DUSE_CUDA=ON \ -DUSE_OPENMP=ON \ @@ -39,7 +39,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ -- bash -c \ "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -49,7 +49,7 @@ python3 ops/docker_run.py \ --container-id xgb-ci.manylinux_2_28_x86_64 \ -- auditwheel repair \ --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -68,7 +68,7 @@ then # Generate the meta info which includes xgboost version and the commit info python3 ops/docker_run.py \ --container-id xgb-ci.gpu_build_rockylinux8 \ - -- python ops/format_wheel_meta.py \ + -- python ops/script/format_wheel_meta.py \ --wheel-path python-package/dist/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} \ diff --git a/ops/pipeline/build-gpu-rpkg-impl.sh b/ops/pipeline/build-gpu-rpkg-impl.sh index d0a7c9295195..2815b8f448f1 100755 --- a/ops/pipeline/build-gpu-rpkg-impl.sh +++ b/ops/pipeline/build-gpu-rpkg-impl.sh @@ -14,7 +14,7 @@ fi commit_hash="$1" -python3 ops/test_r_package.py --task=pack +python3 ops/script/test_r_package.py --task=pack mv xgboost/ xgboost_rpack/ mkdir build diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index 3f04c0f7e7f4..7802fa555187 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -30,7 +30,7 @@ git checkout python-package/pyproject.toml python-package/xgboost/core.py python3 ops/docker_run.py \ --container-id ${image} \ -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} @@ -51,7 +51,7 @@ git checkout python-package/pyproject.toml # discard the patch python3 ops/docker_run.py \ --container-id ${image} \ -- auditwheel repair --plat ${WHEEL_TAG} python-package/dist/xgboost_cpu-*.whl -python3 ops/rename_whl.py \ +python3 ops/script/rename_whl.py \ --wheel-path wheelhouse/xgboost_cpu-*.whl \ --commit-hash ${GITHUB_SHA} \ --platform-tag ${WHEEL_TAG} diff --git a/ops/build_python_wheels_macos.sh b/ops/pipeline/build-python-wheels-macos.sh similarity index 97% rename from ops/build_python_wheels_macos.sh rename to ops/pipeline/build-python-wheels-macos.sh index f2d1c692c8cb..3715ec9e7e0f 100644 --- a/ops/build_python_wheels_macos.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -43,7 +43,7 @@ export CIBW_REPAIR_WHEEL_COMMAND_MACOS="delocate-wheel --require-archs {delocate python -m pip install cibuildwheel python -m cibuildwheel python-package --output-dir wheelhouse -python tests/ci_build/rename_whl.py \ +python ops/script/rename_whl.py \ --wheel-path wheelhouse/*.whl \ --commit-hash ${commit_id} \ --platform-tag ${wheel_tag} diff --git a/ops/pipeline/build-test-jvm-packages-impl.sh b/ops/pipeline/build-test-jvm-packages-impl.sh index 180788436d9b..3290bf0f17c9 100755 --- a/ops/pipeline/build-test-jvm-packages-impl.sh +++ b/ops/pipeline/build-test-jvm-packages-impl.sh @@ -31,7 +31,7 @@ set -x # Set Scala version if [[ "${SCALA_VERSION}" == "2.12" || "${SCALA_VERSION}" == "2.13" ]] then - python ops/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts + python ops/script/change_scala_version.py --scala-version ${SCALA_VERSION} --purge-artifacts else echo "Error: SCALA_VERSION must be either 2.12 or 2.13" exit 2 diff --git 
a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 index c691a55f954c..cc5380a7c7c2 100644 --- a/ops/pipeline/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -31,7 +31,7 @@ pip install --user -v "pip>=23" pip --version pip wheel --no-deps -v . --wheel-dir dist/ if ($LASTEXITCODE -ne 0) { throw "Last command failed" } -python ../ops/rename_whl.py ` +python ../ops/script/rename_whl.py ` --wheel-path (Get-ChildItem dist/*.whl | Select-Object -Expand FullName) ` --commit-hash $Env:GITHUB_SHA ` --platform-tag win_amd64 diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index 496b601bfdfb..a9ff039ee4ca 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -8,4 +8,4 @@ source ops/pipeline/enforce-ci.sh python3 ops/docker_run.py \ --container-id xgb-ci.clang_tidy \ - -- python3 ops/clang-tidy/run_clang_tidy.py --cuda-archs 75 + -- python3 ops/script/run_clang_tidy.py --cuda-archs 75 diff --git a/ops/pipeline/test-python-impl.sh b/ops/pipeline/test-python-impl.sh index bd71cfb06435..be1cb410c96c 100755 --- a/ops/pipeline/test-python-impl.sh +++ b/ops/pipeline/test-python-impl.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -euo pipefail +set -eo pipefail if [[ "$#" -lt 1 ]] then @@ -10,7 +10,24 @@ fi suite="$1" -set -x +# Cannot set -u before Conda env activation +case "$suite" in + gpu|mgpu) + source activate gpu_test + ;; + cpu) + source activate linux_cpu_test + ;; + cpu-arm64) + source activate aarch64_test + ;; + *) + echo "Unrecognized argument: $suite" + exit 1 + ;; +esac + +set -xu export PYSPARK_DRIVER_PYTHON=$(which python) export PYSPARK_PYTHON=$(which python) @@ -21,13 +38,11 @@ pip install -v ./python-package/dist/*.whl case "$suite" in gpu) echo "-- Run Python tests, using a single GPU" - source activate gpu_test python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --fulltrace --durations=0 -m 'not mgpu' tests/python-gpu ;; mgpu) echo "-- Run Python tests, using multiple GPUs" - source activate gpu_test python -c 'from cupy.cuda import jitify; jitify._init_module()' pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' tests/python-gpu pytest -v -s -rxXs --fulltrace --durations=0 -m 'mgpu' \ @@ -39,7 +54,6 @@ case "$suite" in ;; cpu) echo "-- Run Python tests (CPU)" - source activate linux_cpu_test export RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE=1 pytest -v -s -rxXs --fulltrace --durations=0 tests/python pytest -v -s -rxXs --fulltrace --durations=0 tests/test_distributed/test_with_dask @@ -48,7 +62,6 @@ case "$suite" in ;; cpu-arm64) echo "-- Run Python tests (CPU, ARM64)" - source activate aarch64_test pytest -v -s -rxXs --fulltrace --durations=0 \ tests/python/test_basic.py tests/python/test_basic_models.py \ tests/python/test_model_compatibility.py diff --git a/ops/build_via_cmake.sh b/ops/script/build_via_cmake.sh similarity index 100% rename from ops/build_via_cmake.sh rename to ops/script/build_via_cmake.sh diff --git a/ops/change_scala_version.py b/ops/script/change_scala_version.py similarity index 100% rename from ops/change_scala_version.py rename to ops/script/change_scala_version.py diff --git a/ops/change_version.py b/ops/script/change_version.py similarity index 100% rename from ops/change_version.py rename to ops/script/change_version.py diff --git a/ops/format_wheel_meta.py b/ops/script/format_wheel_meta.py similarity index 100% rename from ops/format_wheel_meta.py rename to ops/script/format_wheel_meta.py diff --git a/ops/lint/lint_cmake.sh b/ops/script/lint_cmake.sh 
similarity index 100% rename from ops/lint/lint_cmake.sh rename to ops/script/lint_cmake.sh diff --git a/ops/lint/lint_cpp.py b/ops/script/lint_cpp.py similarity index 100% rename from ops/lint/lint_cpp.py rename to ops/script/lint_cpp.py diff --git a/ops/lint/lint_python.py b/ops/script/lint_python.py similarity index 95% rename from ops/lint/lint_python.py rename to ops/script/lint_python.py index c8d0f47709ab..29339d6e04d0 100644 --- a/ops/lint/lint_python.py +++ b/ops/script/lint_python.py @@ -16,8 +16,6 @@ class LintersPaths: BLACK = ( # core "python-package/", - # CI - "tests/ci_build/tidy.py", # tests "tests/python/test_config.py", "tests/python/test_callback.py", @@ -66,10 +64,11 @@ class LintersPaths: "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/script/run_clang_tidy.py", + "ops/script/lint_python.py", + "ops/script/test_r_package.py", + "ops/script/test_utils.py", + "ops/script/change_version.py", ) ISORT = ( @@ -79,7 +78,7 @@ class LintersPaths: "tests/test_distributed/", "tests/python/", "tests/python-gpu/", - "tests/ci_build/", + "ops/script/", # demo "demo/", # misc @@ -123,11 +122,11 @@ class LintersPaths: "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "tests/ci_build/tidy.py", - "tests/ci_build/lint_python.py", - "tests/ci_build/test_r_package.py", - "tests/ci_build/test_utils.py", - "tests/ci_build/change_version.py", + "ops/script/run_clang_tidy.py", + "ops/script/lint_python.py", + "ops/script/test_r_package.py", + "ops/script/test_utils.py", + "ops/script/change_version.py", ) diff --git a/ops/lint/lint_r.R b/ops/script/lint_r.R similarity index 100% rename from ops/lint/lint_r.R rename to ops/script/lint_r.R diff --git a/ops/rename_whl.py b/ops/script/rename_whl.py similarity index 100% rename from ops/rename_whl.py rename to ops/script/rename_whl.py diff --git a/ops/clang-tidy/run_clang_tidy.py b/ops/script/run_clang_tidy.py similarity index 99% rename from ops/clang-tidy/run_clang_tidy.py rename to ops/script/run_clang_tidy.py index 24cb270393e8..aaeccdaf3718 100755 --- a/ops/clang-tidy/run_clang_tidy.py +++ b/ops/script/run_clang_tidy.py @@ -265,7 +265,7 @@ def test_tidy(args: argparse.Namespace) -> None: """ root_path = os.path.abspath(os.path.curdir) tidy_file = os.path.join(root_path, ".clang-tidy") - test_file_path = os.path.join(root_path, "ops", "clang-tidy", "test_tidy.cc") + test_file_path = os.path.join(root_path, "ops", "script", "test_tidy.cc") tidy_config = "--config-file=" + tidy_file if not args.tidy_version: diff --git a/ops/test_r_package.py b/ops/script/test_r_package.py similarity index 100% rename from ops/test_r_package.py rename to ops/script/test_r_package.py diff --git a/ops/clang-tidy/test_tidy.cc b/ops/script/test_tidy.cc similarity index 100% rename from ops/clang-tidy/test_tidy.cc rename to ops/script/test_tidy.cc diff --git a/ops/test_utils.py b/ops/script/test_utils.py similarity index 99% rename from ops/test_utils.py rename to ops/script/test_utils.py index f05fed4dc7f8..adcd05d5a124 100644 --- a/ops/test_utils.py +++ b/ops/script/test_utils.py @@ -75,7 +75,7 @@ def print_time() -> None: ROOT = os.path.normpath( os.path.join( - os.path.dirname(os.path.abspath(__file__)), os.path.pardir + os.path.dirname(os.path.abspath(__file__)), os.path.pardir, os.path.pardir ) ) R_PACKAGE = os.path.join(ROOT, 
"R-package") diff --git a/ops/update_rapids.sh b/ops/script/update_rapids.sh similarity index 100% rename from ops/update_rapids.sh rename to ops/script/update_rapids.sh diff --git a/ops/verify_link.sh b/ops/script/verify_link.sh similarity index 100% rename from ops/verify_link.sh rename to ops/script/verify_link.sh From d1d82bc827e54530e1d77745ea294717f8c3cc06 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Nov 2024 13:49:28 +0800 Subject: [PATCH 30/45] Disable the host numa virtual memory allocator for now. (#10934) --- src/common/device_helpers.cu | 5 +++++ tests/cpp/common/test_device_vector.cu | 11 +++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu index 608a535cd8cb..01e81b16ee0b 100644 --- a/src/common/device_helpers.cu +++ b/src/common/device_helpers.cu @@ -7,6 +7,11 @@ namespace dh { PinnedMemory::PinnedMemory() { + // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now. + // See https://github.com/dmlc/xgboost/issues/10933 + this->impl_.emplace(); + return; + #if defined(xgboost_IS_WIN) this->impl_.emplace(); #else diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index d7a03e41a64b..ec1a420bd349 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -31,6 +31,9 @@ class TestVirtualMem : public ::testing::TestWithParam { public: void Run() { auto type = this->GetParam(); + if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) { + GTEST_SKIP_("Host numa might require special system capabilities, skipping for now."); + } detail::GrowOnlyVirtualMemVec vec{type}; auto prop = xgboost::cudr::MakeAllocProp(type); auto gran = xgboost::cudr::GetAllocGranularity(&prop); @@ -110,14 +113,6 @@ TEST(TestVirtualMem, Version) { xgboost::curt::DrVersion(&major, &minor); LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." 
From d1d82bc827e54530e1d77745ea294717f8c3cc06 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sat, 2 Nov 2024 13:49:28 +0800
Subject: [PATCH 30/45] Disable the host numa virtual memory allocator for
 now. (#10934)

---
 src/common/device_helpers.cu           |  5 +++++
 tests/cpp/common/test_device_vector.cu | 11 +++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/common/device_helpers.cu b/src/common/device_helpers.cu
index 608a535cd8cb..01e81b16ee0b 100644
--- a/src/common/device_helpers.cu
+++ b/src/common/device_helpers.cu
@@ -7,6 +7,11 @@
 namespace dh {

 PinnedMemory::PinnedMemory() {
+  // Use the `GrowOnlyPinnedMemoryImpl` as the only option for now.
+  // See https://github.com/dmlc/xgboost/issues/10933
+  this->impl_.emplace();
+  return;
+
 #if defined(xgboost_IS_WIN)
   this->impl_.emplace();
 #else
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
index d7a03e41a64b..ec1a420bd349 100644
--- a/tests/cpp/common/test_device_vector.cu
+++ b/tests/cpp/common/test_device_vector.cu
@@ -31,6 +31,9 @@ class TestVirtualMem : public ::testing::TestWithParam {
  public:
   void Run() {
     auto type = this->GetParam();
+    if (type == CU_MEM_LOCATION_TYPE_HOST_NUMA) {
+      GTEST_SKIP_("Host numa might require special system capabilities, skipping for now.");
+    }
     detail::GrowOnlyVirtualMemVec vec{type};
     auto prop = xgboost::cudr::MakeAllocProp(type);
     auto gran = xgboost::cudr::GetAllocGranularity(&prop);
@@ -110,14 +113,6 @@ TEST(TestVirtualMem, Version) {
   xgboost::curt::DrVersion(&major, &minor);
   LOG(INFO) << "Latest supported CUDA version by the driver:" << major << "." << minor;
   PinnedMemory pinned;
-#if defined(xgboost_IS_WIN)
   ASSERT_FALSE(pinned.IsVm());
-#else // defined(xgboost_IS_WIN)
-  if (major >= 12 && minor >= 5) {
-    ASSERT_TRUE(pinned.IsVm());
-  } else {
-    ASSERT_FALSE(pinned.IsVm());
-  }
-#endif // defined(xgboost_IS_WIN)
 }
 } // namespace dh

From ad75358f735b94c13e99fbe633517ad1bdcfedde Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Mon, 18 Nov 2024 15:15:47 -0800
Subject: [PATCH 31/45] Fix run flags to Python tests

---
 ops/pipeline/test-python.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ops/pipeline/test-python.sh b/ops/pipeline/test-python.sh
index 047a6f411d6d..507deb37d9c0 100755
--- a/ops/pipeline/test-python.sh
+++ b/ops/pipeline/test-python.sh
@@ -13,5 +13,13 @@ fi
 suite="$1"
 container_id="$2"

-python3 ops/docker_run.py --container-id "${container_id}" \
+if [[ "$suite" == "gpu" || "$suite" == "mgpu" ]]
+then
+  gpu_option="--use-gpus"
+else
+  gpu_option=""
+fi
+
+python3 ops/docker_run.py --container-id "${container_id}" ${gpu_option} \
+  --run-args='--shm-size=4g' \
   -- bash ops/pipeline/test-python-impl.sh "${suite}"
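[Annotation on PATCH 31] Only the gpu/mgpu suites ask ops/docker_run.py to expose
GPUs, while every suite now gets a larger /dev/shm (Docker's default is 64 MB, which
is tight for the PySpark- and Ray-based tests that test-python-impl.sh runs). As a
rough sketch of the container invocation the wrapper is expected to assemble --
assuming docker_run.py maps --use-gpus to Docker's `--gpus all` and forwards
--run-args verbatim, which is an assumption about the wrapper, not something shown
in this series:

    import subprocess

    # Hypothetical expansion of `bash ops/pipeline/test-python.sh gpu xgb-ci.gpu`.
    subprocess.run(
        [
            "docker", "run", "--gpus", "all", "--shm-size=4g",
            "xgb-ci.gpu",
            "bash", "ops/pipeline/test-python-impl.sh", "gpu",
        ],
        check=True,
    )

    # For the cpu/cpu-arm64 suites, the same command would be issued
    # without the `--gpus all` pair.
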
From 06d060b88f23973adc0a4d2d363060130491cf05 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Mon, 18 Nov 2024 16:26:08 -0800
Subject: [PATCH 32/45] Separate workflow for JVM packages

---
 .github/workflows/jvm_tests.yml | 161 ++++++++++++++++++++++++++++++++
 .github/workflows/main.yml      | 123 +-----------------------
 2 files changed, 162 insertions(+), 122 deletions(-)
 create mode 100644 .github/workflows/jvm_tests.yml

diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml
new file mode 100644
index 000000000000..5894a4bead7d
--- /dev/null
+++ b/.github/workflows/jvm_tests.yml
@@ -0,0 +1,161 @@
+name: XGBoost CI (JVM packages)
+
+on: [push, pull_request]
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  BRANCH_NAME: >-
+    ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
+  USE_DOCKER_CACHE: 1
+
+jobs:
+  build-containers:
+    name: Build CI containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        container_id:
+          - xgb-ci.manylinux2014_x86_64
+          - xgb-ci.jvm
+          - xgb-ci.jvm_gpu_build
+        runner: [linux-amd64-cpu]
+        include:
+          - container_id: xgb-ci.manylinux2014_aarch64
+            runner: linux-arm64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Build ${{ matrix.container_id }}
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: ${{ matrix.container_id }}
+
+  build-jvm-manylinux2014:
+    name: Build libxgboost4j.so targeting glibc 2.17
+    needs: build-containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=${{ matrix.runner }}
+    strategy:
+      matrix:
+        include:
+          - arch: aarch64
+            runner: linux-arm64-cpu
+          - arch: x86_64
+            runner: linux-amd64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }}
+      - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }}
+
+  build-jvm-gpu:
+    name: Build libxgboost4j.so with CUDA
+    needs: build-containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.jvm_gpu_build
+      - run: bash ops/pipeline/build-jvm-gpu.sh
+      - name: Stash files
+        run: bash ops/stash_artifacts.sh lib/libxgboost4j.so
+        env:
+          COMMAND: upload
+          KEY: build-jvm-gpu
+
+  build-jvm-docs:
+    name: Build docs for JVM packages
+    needs: [build-jvm-gpu]
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.jvm_gpu_build
+      - name: Unstash files
+        run: bash ops/stash_artifacts.sh lib/libxgboost4j.so
+        env:
+          COMMAND: download
+          KEY: build-jvm-gpu
+      - run: bash ops/pipeline/build-jvm-doc.sh
+
+  build-test-jvm-packages:
+    name: Build and test JVM packages
+    needs: build-containers
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-cpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.jvm
+      - name: Build and test JVM packages (Scala 2.12)
+        run: bash ops/pipeline/build-test-jvm-packages.sh
+        env:
+          SCALA_VERSION: 2.12
+      - name: Build and test JVM packages (Scala 2.13)
+        run: bash ops/pipeline/build-test-jvm-packages.sh
+        env:
+          SCALA_VERSION: 2.13
+
+  test-jvm-packages-gpu:
+    name: Test JVM packages with CUDA
+    needs: [build-jvm-gpu]
+    runs-on:
+      - runs-on=${{ github.run_id }}
+      - runner=linux-amd64-mgpu
+    steps:
+      # Restart Docker daemon so that it recognizes the ephemeral disks
+      - run: sudo systemctl restart docker
+      - uses: actions/checkout@v4
+        with:
+          submodules: "true"
+      - name: Fetch container from cache
+        run: bash ops/docker_build.sh
+        env:
+          CONTAINER_ID: xgb-ci.jvm_gpu_build
+      - name: Unstash files
+        run: bash ops/stash_artifacts.sh lib/libxgboost4j.so
+        env:
+          COMMAND: download
+          KEY: build-jvm-gpu
+      - run: bash ops/pipeline/test-jvm-gpu.sh
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 47e195267d49..80e6db40cfb6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -31,8 +31,6 @@ jobs:
           - xgb-ci.clang_tidy
           - xgb-ci.manylinux_2_28_x86_64
           - xgb-ci.manylinux2014_x86_64
-          - xgb-ci.jvm
-          - xgb-ci.jvm_gpu_build
         runner: [linux-amd64-cpu]
         include:
           - container_id: xgb-ci.manylinux2014_aarch64
            runner: linux-arm64-cpu
@@ -213,102 +211,6 @@ jobs:
           CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8
       - run: bash ops/pipeline/build-gpu-rpkg.sh

-  build-jvm-manylinux2014:
-    name: Build libxgboost4j.so targeting glibc 2.17
-    needs: build-containers
-    runs-on:
-      - runs-on=${{ github.run_id }}
-      - runner=${{ matrix.runner }}
-    strategy:
-      matrix:
-        include:
-          - arch: aarch64
-            runner: linux-arm64-cpu
-          - arch: x86_64
-            runner: linux-amd64-cpu
-    steps:
-      # Restart Docker daemon so that it recognizes the ephemeral disks
-      - run: sudo systemctl restart docker
-      - uses: actions/checkout@v4
-        with:
-          submodules: "true"
-      - name: Fetch container from cache
-        run: bash ops/docker_build.sh
-
env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - - build-jvm-gpu: - name: Build libxgboost4j.so with CUDA - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - run: bash ops/pipeline/build-jvm-gpu.sh - - name: Stash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: upload - KEY: build-jvm-gpu - - build-jvm-docs: - name: Build docs for JVM packages - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/build-jvm-doc.sh - - build-test-jvm-packages: - name: Build and test JVM packages - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm - - name: Build and test JVM packages (Scala 2.12) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.12 - - name: Build and test JVM packages (Scala 2.13) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.13 - test-cpp-gpu: name: Run Google Tests with GPU(s) needs: [build-cuda, build-cuda-with-rmm] @@ -348,7 +250,7 @@ jobs: test-python: name: Run Python tests - needs: [build-cuda] + needs: [build-cuda, build-cpu-arm64] runs-on: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} @@ -404,26 +306,3 @@ jobs: KEY: ${{ matrix.artifact_from }} - name: Run Python tests, ${{ matrix.description }} run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} - - test-jvm-packages-gpu: - name: Test JVM packages with CUDA - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-mgpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/test-jvm-gpu.sh From 588dd67f9b1dc9671a69bd63dc9178201be7a448 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 16:29:44 -0800 Subject: [PATCH 33/45] Rename workflow files --- .github/workflows/{jvm_tests.yml => jvm_tests_runs_on.yml} | 0 .github/workflows/{main.yml => main_runs_on.yml} | 0 .github/workflows/{windows.yml => windows_runs_on.yml} | 0 3 files changed, 0 insertions(+), 0 
deletions(-) rename .github/workflows/{jvm_tests.yml => jvm_tests_runs_on.yml} (100%) rename .github/workflows/{main.yml => main_runs_on.yml} (100%) rename .github/workflows/{windows.yml => windows_runs_on.yml} (100%) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests_runs_on.yml similarity index 100% rename from .github/workflows/jvm_tests.yml rename to .github/workflows/jvm_tests_runs_on.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main_runs_on.yml similarity index 100% rename from .github/workflows/main.yml rename to .github/workflows/main_runs_on.yml diff --git a/.github/workflows/windows.yml b/.github/workflows/windows_runs_on.yml similarity index 100% rename from .github/workflows/windows.yml rename to .github/workflows/windows_runs_on.yml From cb8f63f4991c09ee1c13fd1a2c25ee081191352d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 17:52:21 -0800 Subject: [PATCH 34/45] Consolidate workflow defs --- .github/workflows/i386.yml | 6 +- .github/workflows/jvm_tests.yml | 238 ++++++++-- .github/workflows/jvm_tests_runs_on.yml | 161 ------- .github/workflows/lint.yml | 144 ++++++ .github/workflows/macos.yml | 24 - .github/workflows/main.yml | 431 +++++++++++------- .github/workflows/main_runs_on.yml | 308 ------------- .github/workflows/misc.yml | 133 ++++++ .github/workflows/python_tests.yml | 204 +-------- ...hon_wheels.yml => python_wheels_macos.yml} | 28 +- .github/workflows/r_tests.yml | 84 ++-- .github/workflows/scorecards.yml | 2 +- .github/workflows/sycl_tests.yml | 94 ++++ .github/workflows/update_rapids.yml | 2 +- .../{windows_runs_on.yml => windows.yml} | 7 +- ops/{docker => }/conda_env/aarch64_test.yml | 0 ops/{docker => }/conda_env/cpp_test.yml | 0 ops/{docker => }/conda_env/jvm_tests.yml | 0 ops/{docker => }/conda_env/linux_cpu_test.yml | 0 .../conda_env/linux_sycl_test.yml | 0 ops/{docker => }/conda_env/macos_cpu_test.yml | 0 ops/{docker => }/conda_env/python_lint.yml | 0 ops/{docker => }/conda_env/sdist_test.yml | 0 ops/{docker => }/conda_env/win64_test.yml | 0 ops/docker/dockerfile/Dockerfile.aarch64 | 2 +- ops/docker/dockerfile/Dockerfile.clang_tidy | 2 +- ops/docker/dockerfile/Dockerfile.cpu | 2 +- ops/docker/dockerfile/Dockerfile.gpu | 2 +- .../Dockerfile.gpu_build_r_rockylinux8 | 2 +- ops/docker/dockerfile/Dockerfile.jvm | 2 +- .../dockerfile/Dockerfile.jvm_gpu_build | 2 +- .../Dockerfile.manylinux2014_aarch64 | 2 +- .../Dockerfile.manylinux2014_x86_64 | 2 +- .../Dockerfile.manylinux_2_28_x86_64 | 2 +- ops/docker_build.py | 6 +- ops/docker_run.py | 4 +- ...m1.sh => build-jvm-macos-apple-silicon.sh} | 8 +- ops/pipeline/build-jvm-macos-intel.sh | 44 ++ ops/pipeline/build-python-wheels-macos.sh | 1 - ops/pipeline/test-win64-gpu.ps1 | 2 +- ops/script/build_via_cmake.sh | 11 +- ops/script/lint_cmake.sh | 2 +- ops/script/run_clang_tidy.py | 4 +- .../test_gpu_with_dask/test_gpu_with_dask.py | 2 +- 44 files changed, 970 insertions(+), 1000 deletions(-) delete mode 100644 .github/workflows/jvm_tests_runs_on.yml create mode 100644 .github/workflows/lint.yml delete mode 100644 .github/workflows/macos.yml delete mode 100644 .github/workflows/main_runs_on.yml create mode 100644 .github/workflows/misc.yml rename .github/workflows/{python_wheels.yml => python_wheels_macos.yml} (55%) create mode 100644 .github/workflows/sycl_tests.yml rename .github/workflows/{windows_runs_on.yml => windows.yml} (93%) rename ops/{docker => }/conda_env/aarch64_test.yml (100%) rename ops/{docker => }/conda_env/cpp_test.yml (100%) rename 
ops/{docker => }/conda_env/jvm_tests.yml (100%) rename ops/{docker => }/conda_env/linux_cpu_test.yml (100%) rename ops/{docker => }/conda_env/linux_sycl_test.yml (100%) rename ops/{docker => }/conda_env/macos_cpu_test.yml (100%) rename ops/{docker => }/conda_env/python_lint.yml (100%) rename ops/{docker => }/conda_env/sdist_test.yml (100%) rename ops/{docker => }/conda_env/win64_test.yml (100%) rename ops/pipeline/{build-jvm-macos-m1.sh => build-jvm-macos-apple-silicon.sh} (85%) create mode 100755 ops/pipeline/build-jvm-macos-intel.sh mode change 100644 => 100755 ops/pipeline/build-python-wheels-macos.sh mode change 100644 => 100755 ops/script/lint_cmake.sh diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index aec7e9d31087..aa71147e2581 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,7 +19,7 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Set up Docker Buildx @@ -30,7 +30,7 @@ jobs: uses: docker/build-push-action@v6 with: context: . - file: tests/ci_build/Dockerfile.i386 + file: ops/docker/dockerfile/Dockerfile.i386 push: true tags: localhost:5000/xgboost/build-32bit:latest cache-from: type=gha @@ -40,4 +40,4 @@ jobs: docker run --rm -v $PWD:/workspace -w /workspace \ -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ localhost:5000/xgboost/build-32bit:latest \ - tests/ci_build/build_via_cmake.sh + bash ops/script/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index dcbd9de55b50..f9385fa4acaf 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -1,44 +1,193 @@ -name: XGBoost-JVM-Tests +name: XGBoost CI (JVM packages) on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - test-with-jvm: - name: Test JVM on OS ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + matrix: + container_id: + - xgb-ci.manylinux2014_x86_64 + - xgb-ci.jvm + - xgb-ci.jvm_gpu_build + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + build-jvm-manylinux2014: + name: >- + Build libxgboost4j.so targeting glibc 2.17 + (arch ${{ matrix.arch }}, runner ${{ matrix.runner }}) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: 
actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} + + build-jvm-gpu: + name: Build libxgboost4j.so with CUDA + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - run: bash ops/pipeline/build-jvm-gpu.sh + - name: Stash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: upload + KEY: build-jvm-gpu + + build-jvm-mac: + name: "Build libxgboost4j.dylib for ${{ matrix.description }}" + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - description: "MacOS (Apple Silicon)" + script: ops/pipeline/build-jvm-macos-apple-silicon.sh + runner: macos-14 + - description: "MacOS (Intel)" + script: ops/pipeline/build-jvm-macos-intel.sh + runner: macos-13 + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - run: bash ${{ matrix.script }} + + build-jvm-docs: + name: Build docs for JVM packages + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/build-jvm-doc.sh + + build-test-jvm-packages: + name: Build and test JVM packages (Linux) + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm + - name: Build and test JVM packages (Scala 2.12) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.12 + - name: Build and test JVM packages (Scala 2.13) + run: bash ops/pipeline/build-test-jvm-packages.sh + env: + SCALA_VERSION: 2.13 + + build-test-jvm-packages-other-os: + name: Build and test JVM packages (${{ matrix.os }}) timeout-minutes: 30 runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [windows-latest, ubuntu-latest, macos-13] + os: [windows-latest, macos-13] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: actions/setup-java@b36c23c0d998641eff861008f374ee103c25ac73 # v4.4.0 + - uses: actions/setup-java@v4.5.0 with: distribution: 'temurin' java-version: '8' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest activate-environment: jvm_tests - environment-file: tests/ci_build/conda_env/jvm_tests.yml + 
environment-file: ops/conda_env/jvm_tests.yml use-mamba: true - name: Cache Maven packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache@v4.1.2 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} @@ -49,52 +198,41 @@ jobs: cd jvm-packages mvn test -B -pl :xgboost4j_2.12 - - name: Test XGBoost4J (Core, Spark, Examples) - run: | - rm -rfv build/ - cd jvm-packages - mvn -B test - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows - - - name: Extract branch name - shell: bash - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - (matrix.os == 'windows-latest' || matrix.os == 'macos-13') - - name: Publish artifact xgboost4j.dll to S3 run: | cd lib/ Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'windows-latest' + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` + --acl public-read --region us-west-2 + if: matrix.os == 'windows-latest' + # if: | + # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + # matrix.os == 'windows-latest' env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - name: Publish artifact libxgboost4j.dylib to S3 - shell: bash -l {0} - run: | - cd lib/ - mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib - ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 - if: | - (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - matrix.os == 'macos-13' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - - - name: Build and Test XGBoost4J with scala 2.13 - run: | - rm -rfv build/ - cd jvm-packages - mvn -B clean install test -Pdefault,scala-2.13 - if: matrix.os == 'ubuntu-latest' # Distributed training doesn't work on Windows + test-jvm-packages-gpu: + name: Test JVM packages with CUDA + needs: [build-jvm-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-mgpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.jvm_gpu_build + - name: Unstash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: download + KEY: build-jvm-gpu + - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/jvm_tests_runs_on.yml b/.github/workflows/jvm_tests_runs_on.yml deleted file mode 100644 index 5894a4bead7d..000000000000 --- a/.github/workflows/jvm_tests_runs_on.yml +++ /dev/null @@ -1,161 +0,0 @@ -name: XGBoost CI (JVM packages) - -on: [push, pull_request] - -permissions: - 
contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - BRANCH_NAME: >- - ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - -jobs: - build-containers: - name: Build CI containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - container_id: - - xgb-ci.manylinux2014_x86_64 - - xgb-ci.jvm - - xgb-ci.jvm_gpu_build - runner: [linux-amd64-cpu] - include: - - container_id: xgb-ci.manylinux2014_aarch64 - runner: linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} - - build-jvm-manylinux2014: - name: Build libxgboost4j.so targeting glibc 2.17 - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - arch: aarch64 - runner: linux-arm64-cpu - - arch: x86_64 - runner: linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-jvm-manylinux2014.sh ${{ matrix.arch }} - - build-jvm-gpu: - name: Build libxgboost4j.so with CUDA - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - run: bash ops/pipeline/build-jvm-gpu.sh - - name: Stash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: upload - KEY: build-jvm-gpu - - build-jvm-docs: - name: Build docs for JVM packages - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/build-jvm-doc.sh - - build-test-jvm-packages: - name: Build and test JVM packages - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm - - name: Build and test JVM packages (Scala 2.12) - run: bash ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.12 - - name: Build and test JVM packages (Scala 2.13) - run: bash 
ops/pipeline/build-test-jvm-packages.sh - env: - SCALA_VERSION: 2.13 - - test-jvm-packages-gpu: - name: Test JVM packages with CUDA - needs: [build-jvm-gpu] - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-mgpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.jvm_gpu_build - - name: Unstash files - run: bash ops/stash_artifacts.sh lib/libxgboost4j.so - env: - COMMAND: download - KEY: build-jvm-gpu - - run: bash ops/pipeline/test-jvm-gpu.sh diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000000..caceb3e3893b --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,144 @@ +name: XGBoost CI (Lint) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + include: + - container_id: xgb-ci.clang_tidy + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + clang-tidy: + name: Run clang-tidy + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.clang_tidy + - run: bash ops/pipeline/run-clang-tidy.sh + + python-mypy-lint: + runs-on: ubuntu-latest + name: Type and format checks for the Python package + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: python_lint + environment-file: ops/conda_env/python_lint.yml + use-mamba: true + - name: Display Conda env + shell: bash -el {0} + run: | + conda info + conda list + - name: Run mypy + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 + - name: Run formatter + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 + - name: Run pylint + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 + + cpp-lint: + runs-on: ubuntu-latest + name: Code linting for C++ + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: actions/setup-python@v5.3.0 + with: + python-version: "3.10" + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint + - name: Run lint + run: | + python3 
ops/script/lint_cpp.py + bash ops/script/lint_cmake.sh + + lintr: + runs-on: ${{ matrix.os }} + name: Run R linters on OS ${{ matrix.os }}, R ${{ matrix.r }} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + r: "release" + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: r-lib/actions/setup-r@v2.11.0 + with: + r-version: ${{ matrix.r }} + + - name: Cache R packages + uses: actions/cache@v4.1.2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + + - name: Run lintr + run: | + MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ + Rscript ops/script/lint_r.R $(pwd) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml deleted file mode 100644 index 2bb3e1aba46c..000000000000 --- a/.github/workflows/macos.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Nextgen XGBoost CI, MacOS - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - BRANCH_NAME: >- - ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - -jobs: - mac-m1-jvm: - name: "Build libxgboost4j.dylib for MacOS M1" - runs-on: macos-14 - steps: - - uses: actions/checkout@v4 - with: - submodules: "true" - - run: bash ops/pipeline/build-jvm-macos-m1.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3c0a67b4f463..77208a146443 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,193 +1,294 @@ -# This is a basic workflow to help you get started with Actions +name: XGBoost CI -name: XGBoost-CI - -# Controls when the action will run. 
Triggers the workflow on push or pull request -# events but only for the master branch on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -# A workflow run is made up of one or more jobs that can run sequentially or in parallel +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + USE_DOCKER_CACHE: 1 + jobs: - gtest-cpu: - name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} + build-containers: + name: Build CI containers (${{ matrix.container_id }}) + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: - fail-fast: false matrix: - os: [macos-12] + container_id: + - xgb-ci.gpu_build_rockylinux8 + - xgb-ci.gpu_build_r_rockylinux8 + - xgb-ci.gpu + - xgb-ci.gpu_dev_ver + - xgb-ci.cpu + - xgb-ci.manylinux_2_28_x86_64 + - xgb-ci.manylinux2014_x86_64 + runner: [linux-amd64-cpu] + include: + - container_id: xgb-ci.manylinux2014_aarch64 + runner: linux-arm64-cpu + - container_id: xgb-ci.aarch64 + runner: linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Build ${{ matrix.container_id }} + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + + build-cpu: + name: Build CPU + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.cpu + - run: bash ops/pipeline/build-cpu.sh + - name: Stash CLI executable + run: bash ops/stash_artifacts.sh ./xgboost + env: + COMMAND: upload + KEY: build-cpu + + build-cpu-arm64: + name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-arm64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.aarch64 + - run: bash ops/pipeline/build-cpu-arm64.sh + - name: Stash files + run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cpu-arm64 + + build-cuda: + name: Build CUDA + manylinux_2_28_x86_64 wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. 
-DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/pipeline/build-cuda.sh + - name: Stash files + run: | + bash ops/stash_artifacts.sh \ + build/testxgboost ./xgboost python-package/dist/*.whl + env: + COMMAND: upload + KEY: build-cuda - gtest-cpu-nonomp: - name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} + build-cuda-with-rmm: + name: Build CUDA with RMM + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 + - run: bash ops/pipeline/build-cuda-with-rmm.sh + - name: Stash files + run: bash ops/stash_artifacts.sh build/testxgboost + env: + COMMAND: upload + KEY: build-cuda-with-rmm + + build-manylinux2014: + name: Build manylinux2014_${{ matrix.arch }} wheel + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] + include: + - arch: aarch64 + runner: linux-arm64-cpu + - arch: x86_64 + runner: linux-amd64-cpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} + - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} + + build-gpu-rpkg: + name: Build GPU-enabled R package + needs: build-containers + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 + - run: bash ops/pipeline/build-gpu-rpkg.sh - gtest-cpu-sycl: - name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} + test-cpp-gpu: + name: >- + Run Google Tests with GPUs + (Suite ${{ matrix.suite }}, Runner ${{ matrix.runner }}) + needs: [build-cuda, build-cuda-with-rmm] + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: fail-fast: false matrix: - os: [ubuntu-latest] - python-version: ["3.10"] + include: + - suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - suite: gpu-rmm + runner: linux-amd64-gpu + artifact_from: build-cuda-with-rmm + - suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL - run: | - cd build - ./testxgboost --gtest_filter=-Sycl* + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: xgb-ci.gpu + - name: Unstash gtest + run: | + bash ops/stash_artifacts.sh build/testxgboost + chmod +x build/testxgboost + env: + COMMAND: download + KEY: ${{ matrix.artifact_from }} + - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - c-api-demo: - name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} - defaults: - run: - shell: bash -l {0} + test-python: + name: Run Python tests (${{ matrix.description }}) + needs: [build-cuda, build-cpu-arm64] + runs-on: + - runs-on=${{ github.run_id }} + - runner=${{ matrix.runner }} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: tests/ci_build/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. 
-DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd - - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./tests/ci_build/verify_link.sh ./demo/c-api/build/basic/api-demo - ./tests/ci_build/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo - - cpp-lint: - runs-on: ubuntu-latest - name: Code linting for C++ + include: + - description: "single GPU" + container: xgb-ci.gpu + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: "single GPU, nightly deps" + container: xgb-ci.gpu_dev_ver + suite: gpu + runner: linux-amd64-gpu + artifact_from: build-cuda + - description: "multiple GPUs" + container: xgb-ci.gpu + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: "multiple GPUs, nightly deps" + container: xgb-ci.gpu_dev_ver + suite: mgpu + runner: linux-amd64-mgpu + artifact_from: build-cuda + - description: "CPU" + container: xgb-ci.cpu + suite: cpu + runner: linux-amd64-cpu + artifact_from: build-cuda + - description: "CPU ARM64" + container: xgb-ci.aarch64 + suite: cpu-arm64 + runner: linux-arm64-cpu + artifact_from: build-cpu-arm64 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 tests/ci_build/lint_cpp.py - sh ./tests/ci_build/lint_cmake.sh + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container }} + - name: Unstash Python wheel + run: | + bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost + chmod +x ./xgboost + env: + COMMAND: download + KEY: ${{ matrix.artifact_from }} + - name: Run Python tests, ${{ matrix.description }} + run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/main_runs_on.yml b/.github/workflows/main_runs_on.yml deleted file mode 100644 index 80e6db40cfb6..000000000000 --- a/.github/workflows/main_runs_on.yml +++ /dev/null @@ -1,308 +0,0 @@ -name: Nextgen XGBoost CI - -on: [push, pull_request] - -permissions: - contents: read # to fetch code (actions/checkout) - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -env: - BRANCH_NAME: >- - ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} - USE_DOCKER_CACHE: 1 - -jobs: - build-containers: - name: Build CI containers - runs-on: - - runs-on=${{ github.run_id }} - 
- runner=${{ matrix.runner }} - strategy: - matrix: - container_id: - - xgb-ci.gpu_build_rockylinux8 - - xgb-ci.gpu_build_r_rockylinux8 - - xgb-ci.gpu - - xgb-ci.gpu_dev_ver - - xgb-ci.cpu - - xgb-ci.clang_tidy - - xgb-ci.manylinux_2_28_x86_64 - - xgb-ci.manylinux2014_x86_64 - runner: [linux-amd64-cpu] - include: - - container_id: xgb-ci.manylinux2014_aarch64 - runner: linux-arm64-cpu - - container_id: xgb-ci.aarch64 - runner: linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Build ${{ matrix.container_id }} - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} - - clang-tidy: - name: Run clang-tidy - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.clang_tidy - - run: bash ops/pipeline/run-clang-tidy.sh - - build-cpu: - name: Build CPU - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.cpu - - run: bash ops/pipeline/build-cpu.sh - - name: Stash CLI executable - run: bash ops/stash_artifacts.sh ./xgboost - env: - COMMAND: upload - KEY: build-cpu - - build-cpu-arm64: - name: Build CPU ARM64 + manylinux_2_28_aarch64 wheel - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-arm64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.aarch64 - - run: bash ops/pipeline/build-cpu-arm64.sh - - name: Stash files - run: bash ops/stash_artifacts.sh ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - KEY: build-cpu-arm64 - - build-cuda: - name: Build CUDA + manylinux_2_28_x86_64 wheel - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/pipeline/build-cuda.sh - - name: Stash files - run: | - bash ops/stash_artifacts.sh \ - build/testxgboost ./xgboost python-package/dist/*.whl - env: - COMMAND: upload - KEY: build-cuda - - build-cuda-with-rmm: - name: Build CUDA with RMM - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container 
from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_rockylinux8 - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux_2_28_x86_64 - - run: bash ops/pipeline/build-cuda-with-rmm.sh - - name: Stash files - run: bash ops/stash_artifacts.sh build/testxgboost - env: - COMMAND: upload - KEY: build-cuda-with-rmm - - build-manylinux2014: - name: Build manylinux2014_${{ matrix.arch }} wheel - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - arch: aarch64 - runner: linux-arm64-cpu - - arch: x86_64 - runner: linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.manylinux2014_${{ matrix.arch }} - - run: bash ops/pipeline/build-manylinux2014.sh ${{ matrix.arch }} - - build-gpu-rpkg: - name: Build GPU-enabled R package - needs: build-containers - runs-on: - - runs-on=${{ github.run_id }} - - runner=linux-amd64-cpu - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu_build_r_rockylinux8 - - run: bash ops/pipeline/build-gpu-rpkg.sh - - test-cpp-gpu: - name: Run Google Tests with GPU(s) - needs: [build-cuda, build-cuda-with-rmm] - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - suite: gpu - runner: linux-amd64-gpu - artifact_from: build-cuda - - suite: gpu-rmm - runner: linux-amd64-gpu - artifact_from: build-cuda-with-rmm - - suite: mgpu - runner: linux-amd64-mgpu - artifact_from: build-cuda - steps: - # Restart Docker daemon so that it recognizes the ephemeral disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: xgb-ci.gpu - - name: Unstash gtest - run: | - bash ops/stash_artifacts.sh build/testxgboost - chmod +x build/testxgboost - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - - run: bash ops/pipeline/test-cpp-gpu.sh ${{ matrix.suite }} - - test-python: - name: Run Python tests - needs: [build-cuda, build-cpu-arm64] - runs-on: - - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - matrix: - include: - - description: "single GPU" - container: xgb-ci.gpu - suite: gpu - runner: linux-amd64-gpu - artifact_from: build-cuda - - description: "single GPU, nightly deps" - container: xgb-ci.gpu_dev_ver - suite: gpu - runner: linux-amd64-gpu - artifact_from: build-cuda - - description: "multiple GPUs" - container: xgb-ci.gpu - suite: mgpu - runner: linux-amd64-mgpu - artifact_from: build-cuda - - description: "multiple GPUs, nightly deps" - container: xgb-ci.gpu_dev_ver - suite: mgpu - runner: linux-amd64-mgpu - artifact_from: build-cuda - - description: "CPU" - container: xgb-ci.cpu - suite: cpu - runner: linux-amd64-cpu - artifact_from: build-cuda - - description: "CPU ARM64" - container: xgb-ci.aarch64 - suite: cpu-arm64 - runner: linux-arm64-cpu - artifact_from: build-cpu-arm64 - steps: - # Restart Docker daemon so that it recognizes the ephemeral 
disks - - run: sudo systemctl restart docker - - uses: actions/checkout@v4 - with: - submodules: "true" - - name: Fetch container from cache - run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container }} - - name: Unstash Python wheel - run: | - bash ops/stash_artifacts.sh python-package/dist/*.whl ./xgboost - chmod +x ./xgboost - env: - COMMAND: download - KEY: ${{ matrix.artifact_from }} - - name: Run Python tests, ${{ matrix.description }} - run: bash ops/pipeline/test-python.sh ${{ matrix.suite }} ${{ matrix.container }} diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml new file mode 100644 index 000000000000..7294faa0d93b --- /dev/null +++ b/.github/workflows/misc.yml @@ -0,0 +1,133 @@ +name: XGBoost CI (misc) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu: + name: Test Google C++ test (CPU) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [macos-13] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + brew install ninja libomp + - name: Build gtest binary + run: | + mkdir build + cd build + cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo + ninja -v + - name: Run gtest binary + run: | + cd build + ./testxgboost + ctest -R TestXGBoostCLI --extra-verbose + + gtest-cpu-nonomp: + name: Test Google C++ unittest (CPU Non-OMP) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends ninja-build + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON + ninja -v + - name: Run gtest binary + run: | + cd build + ctest --extra-verbose + + c-api-demo: + name: Test installing XGBoost lib + building the C API demo + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + python-version: ["3.10"] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: cpp_test + environment-file: ops/conda_env/cpp_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost static library + run: | + mkdir build + cd build + cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja + ninja -v install + cd - + - name: Build and run C API demo with static + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + cd .. + rm -rf ./build + popd + + - name: Build and install XGBoost shared library + run: | + cd build + cmake .. 
-DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON + ninja -v install + ./testxgboost + cd - + - name: Build and run C API demo with shared + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + popd + ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo + ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 907cf98e1011..c43d8b056c8d 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -1,4 +1,4 @@ -name: XGBoost-Python-Tests +name: XGBoost CI (Python tests) on: [push, pull_request] @@ -14,54 +14,23 @@ concurrency: cancel-in-progress: true jobs: - python-mypy-lint: - runs-on: ubuntu-latest - name: Type and format checks for the Python package - strategy: - matrix: - os: [ubuntu-latest] - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: tests/ci_build/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Run mypy - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - run: | - python tests/ci_build/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - run: | - python tests/ci_build/lint_python.py --format=0 --type-check=0 --pylint=1 - python-sdist-test-on-Linux: - # Mismatched glibcxx version between system and conda forge. 
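+    # Keep the sdist build inside a conda env: the system toolchain and
+    # conda-forge ship mismatched glibcxx versions, so mixing them breaks the
+    # build. Rough local repro (a sketch, assuming the ops/conda_env/sdist_test.yml
+    # env is active; uses the standard PyPA `build` tool):
+    #   cd python-package
+    #   python -m build --sdist
+    #   pip install -v dist/xgboost-*.tar.gz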
runs-on: ${{ matrix.os }} name: Test installing XGBoost Python source package on ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest activate-environment: sdist_test - environment-file: tests/ci_build/conda_env/sdist_test.yml + environment-file: ops/conda_env/sdist_test.yml use-mamba: true - name: Display Conda env run: | @@ -82,18 +51,19 @@ jobs: runs-on: ${{ matrix.os }} name: Test installing XGBoost Python source package on ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [macos-13, windows-latest] python-version: ["3.10"] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Install osx system dependencies if: matrix.os == 'macos-13' run: | brew install ninja libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: auto-update-conda: true python-version: ${{ matrix.python-version }} @@ -115,25 +85,25 @@ jobs: python -c 'import xgboost' python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} + name: Test XGBoost Python package on ${{ matrix.os }} + runs-on: ${{ matrix.os }} timeout-minutes: 60 strategy: + fail-fast: false matrix: - config: - - {os: macos-13} + os: [macos-13] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest activate-environment: macos_cpu_test - environment-file: tests/ci_build/conda_env/macos_cpu_test.yml + environment-file: ops/conda_env/macos_cpu_test.yml use-mamba: true - name: Display Conda env @@ -167,159 +137,21 @@ jobs: run: | pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - python-tests-on-win: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 60 - strategy: - matrix: - config: - - {os: windows-latest, python-version: '3.10'} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - auto-update-conda: true - python-version: ${{ matrix.config.python-version }} - activate-environment: win64_env - environment-file: tests/ci_build/conda_env/win64_cpu_test.yml - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Windows - run: | - mkdir build_msvc - cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON - cmake --build . --config Release --parallel $(nproc) - - - name: Install Python package - run: | - cd python-package - python --version - pip wheel -v . 
--wheel-dir dist/ - pip install ./dist/*.whl - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - python-tests-on-ubuntu: - name: Test XGBoost Python package on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_cpu_test - environment-file: tests/ci_build/conda_env/linux_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask - - - name: Test PySpark Interface - shell: bash -l {0} - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark - - python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }} - runs-on: ${{ matrix.config.os }} - timeout-minutes: 90 - strategy: - matrix: - config: - - {os: ubuntu-latest, python-version: "3.10"} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: tests/ci_build/conda_env/linux_sycl_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . 
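+  # NOTE: the SYCL Python tests now live in .github/workflows/sycl_tests.yml.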
- - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ - - python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Set up Python 3.10 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@v5.3.0 with: python-version: "3.10" diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels_macos.yml similarity index 55% rename from .github/workflows/python_wheels.yml rename to .github/workflows/python_wheels_macos.yml index 3b7a8072c109..a4cff8eb0e6f 100644 --- a/.github/workflows/python_wheels.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -1,9 +1,9 @@ -name: XGBoost-Python-Wheels +name: Build Python wheels targeting MacOS on: [push, pull_request] permissions: - contents: read # to fetch code (actions/checkout) + contents: read # to fetch code (actions/checkout) defaults: run: @@ -13,11 +13,16 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + jobs: - python-wheels: + python-wheels-macos: name: Build wheel for ${{ matrix.platform_id }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: include: - os: macos-13 @@ -25,31 +30,26 @@ jobs: - os: macos-14 platform_id: macosx_arm64 steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@68fa6aeb1ccb0596d311f2b34ec74ec21ee68e54 + uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - name: Install libomp run: brew install libomp - - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3.0.4 + - uses: conda-incubator/setup-miniconda@v3.1.0 with: miniforge-variant: Miniforge3 miniforge-version: latest python-version: "3.10" use-mamba: true - name: Build wheels - run: bash tests/ci_build/build_python_wheels.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Extract branch name - run: | - echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" - id: extract_branch - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - name: Upload Python wheel - if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') run: | python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/ --acl public-read --region us-west-2 + python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index c56d1f8ef943..3885c126f11e 100644 --- 
a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -13,78 +13,46 @@ concurrency: cancel-in-progress: true jobs: - lintr: - runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: - matrix: - config: - - {os: ubuntu-latest, r: 'release'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - - steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 - with: - r-version: ${{ matrix.config.r }} - - - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript tests/ci_build/lint_r.R $(pwd) - test-Rpkg: - runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} + runs-on: ${{ matrix.os }} + name: Test R on OS ${{ matrix.os }}, R ${{ matrix.r }}, Compiler ${{ matrix.compiler }}, Build ${{ matrix.build }} strategy: fail-fast: false matrix: - config: - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} + include: + - os: windows-latest + r: release + compiler: mingw + build: autotools + - os: ubuntu-latest + r: release + compiler: none + build: cmake env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} steps: - name: Install system dependencies run: | sudo apt update sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.config.os == 'ubuntu-latest' - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + if: matrix.os == 'ubuntu-latest' + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - - uses: r-lib/actions/setup-r@929c772977a3a13c8733b363bf5a2f685c25dd91 # v2.9.0 + - uses: r-lib/actions/setup-r@v2.11.0 with: - r-version: ${{ matrix.config.r }} + r-version: ${{ matrix.r }} - name: Cache R packages - uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 + uses: actions/cache@v4.1.2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - uses: actions/setup-python@v5.3.0 with: python-version: "3.10" architecture: 'x64' @@ -98,13 +66,13 @@ jobs: - name: Test R run: | - python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" 
--task=check - if: matrix.config.compiler != 'none' + python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler != 'none' - name: Test R run: | - python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check - if: matrix.config.compiler == 'none' + python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler == 'none' test-R-on-Debian: name: Test R package on Debian @@ -123,7 +91,7 @@ jobs: run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' @@ -135,7 +103,7 @@ jobs: - name: Test R shell: bash -l {0} run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check + python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - uses: dorny/paths-filter@v3 id: changes @@ -147,4 +115,4 @@ jobs: - name: Run document check if: steps.changes.outputs.r_package == 'true' run: | - python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc + python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 85a9abb57e1b..8ab77ec4c382 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -22,7 +22,7 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + uses: actions/checkout@v4.2.2 with: persist-credentials: false diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml new file mode 100644 index 000000000000..54ebcb5f9532 --- /dev/null +++ b/.github/workflows/sycl_tests.yml @@ -0,0 +1,94 @@ +name: XGBoost CI (oneAPI) + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BRANCH_NAME: >- + ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} + +jobs: + gtest-cpu-sycl: + name: Test Google C++ unittest (CPU SYCL) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. 
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + make -j$(nproc) + - name: Run gtest binary for SYCL + run: | + cd build + ./testxgboost --gtest_filter=Sycl* + - name: Run gtest binary for non SYCL + run: | + cd build + ./testxgboost --gtest_filter=-Sycl* + + python-sycl-tests-on-ubuntu: + name: Test XGBoost Python package with SYCL on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + timeout-minutes: 90 + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + use-mamba: true + + - name: Display Conda env + run: | + conda info + conda list + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + make -j$(nproc) + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 5e229db4c050..636661db46b8 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,7 +25,7 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - uses: actions/checkout@v4.2.2 with: submodules: 'true' - name: Check latest RAPIDS and update conftest.sh diff --git a/.github/workflows/windows_runs_on.yml b/.github/workflows/windows.yml similarity index 93% rename from .github/workflows/windows_runs_on.yml rename to .github/workflows/windows.yml index 73a258158b12..3dc9c4962646 100644 --- a/.github/workflows/windows_runs_on.yml +++ b/.github/workflows/windows.yml @@ -1,4 +1,4 @@ -name: Nextgen XGBoost CI Windows +name: XGBoost CI (Windows) on: [push, pull_request] @@ -27,7 +27,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=windows-cpu steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.2.2 with: submodules: "true" - run: powershell ops/pipeline/build-win64-gpu.ps1 @@ -39,6 +39,7 @@ jobs: env: COMMAND: upload KEY: build-win64-gpu + test-win64-gpu: name: Test XGBoost on Windows needs: build-win64-gpu @@ -46,7 +47,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=windows-gpu steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v4.2.2 with: submodules: "true" - name: Unstash files diff --git a/ops/docker/conda_env/aarch64_test.yml b/ops/conda_env/aarch64_test.yml similarity index 100% rename from ops/docker/conda_env/aarch64_test.yml rename to ops/conda_env/aarch64_test.yml diff --git a/ops/docker/conda_env/cpp_test.yml b/ops/conda_env/cpp_test.yml similarity index 100% rename from ops/docker/conda_env/cpp_test.yml rename to ops/conda_env/cpp_test.yml diff --git a/ops/docker/conda_env/jvm_tests.yml b/ops/conda_env/jvm_tests.yml similarity index 100% rename from ops/docker/conda_env/jvm_tests.yml rename to ops/conda_env/jvm_tests.yml diff --git a/ops/docker/conda_env/linux_cpu_test.yml b/ops/conda_env/linux_cpu_test.yml similarity index 100% rename from ops/docker/conda_env/linux_cpu_test.yml rename to ops/conda_env/linux_cpu_test.yml diff --git 
a/ops/docker/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml similarity index 100% rename from ops/docker/conda_env/linux_sycl_test.yml rename to ops/conda_env/linux_sycl_test.yml diff --git a/ops/docker/conda_env/macos_cpu_test.yml b/ops/conda_env/macos_cpu_test.yml similarity index 100% rename from ops/docker/conda_env/macos_cpu_test.yml rename to ops/conda_env/macos_cpu_test.yml diff --git a/ops/docker/conda_env/python_lint.yml b/ops/conda_env/python_lint.yml similarity index 100% rename from ops/docker/conda_env/python_lint.yml rename to ops/conda_env/python_lint.yml diff --git a/ops/docker/conda_env/sdist_test.yml b/ops/conda_env/sdist_test.yml similarity index 100% rename from ops/docker/conda_env/sdist_test.yml rename to ops/conda_env/sdist_test.yml diff --git a/ops/docker/conda_env/win64_test.yml b/ops/conda_env/win64_test.yml similarity index 100% rename from ops/docker/conda_env/win64_test.yml rename to ops/conda_env/win64_test.yml diff --git a/ops/docker/dockerfile/Dockerfile.aarch64 b/ops/docker/dockerfile/Dockerfile.aarch64 index 8d6cfaca39fa..9dff2a05230b 100644 --- a/ops/docker/dockerfile/Dockerfile.aarch64 +++ b/ops/docker/dockerfile/Dockerfile.aarch64 @@ -32,7 +32,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.clang_tidy b/ops/docker/dockerfile/Dockerfile.clang_tidy index c9528015c17e..de7d9bd3f254 100644 --- a/ops/docker/dockerfile/Dockerfile.clang_tidy +++ b/ops/docker/dockerfile/Dockerfile.clang_tidy @@ -44,7 +44,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.cpu b/ops/docker/dockerfile/Dockerfile.cpu index 64b28026a89c..a426ce5da30c 100644 --- a/ops/docker/dockerfile/Dockerfile.cpu +++ b/ops/docker/dockerfile/Dockerfile.cpu @@ -51,7 +51,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu b/ops/docker/dockerfile/Dockerfile.gpu index d8be4d3b07ef..96a532fc2ff1 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu +++ b/ops/docker/dockerfile/Dockerfile.gpu @@ -48,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 index 7c1d4e8ef642..2d18b1eeb315 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_r_rockylinux8 @@ -52,7 +52,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.jvm b/ops/docker/dockerfile/Dockerfile.jvm index c4584747f5db..9fd62e52de93 
100644 --- a/ops/docker/dockerfile/Dockerfile.jvm +++ b/ops/docker/dockerfile/Dockerfile.jvm @@ -37,7 +37,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.jvm_gpu_build b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build index 7f0168df467f..4983493a6878 100644 --- a/ops/docker/dockerfile/Dockerfile.jvm_gpu_build +++ b/ops/docker/dockerfile/Dockerfile.jvm_gpu_build @@ -48,7 +48,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 index 52baff43bb6f..7800033f552d 100644 --- a/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_aarch64 @@ -11,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 index fdfcbd277360..8214b598d8d4 100644 --- a/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux2014_x86_64 @@ -11,7 +11,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 index 5e264e2f16e6..f5dac54b9b8f 100644 --- a/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 +++ b/ops/docker/dockerfile/Dockerfile.manylinux_2_28_x86_64 @@ -9,7 +9,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/docker_build.py b/ops/docker_build.py index 922d528814a4..b096d9201d0f 100644 --- a/ops/docker_build.py +++ b/ops/docker_build.py @@ -9,7 +9,7 @@ import sys from typing import Optional -from docker_run import SCRIPT_DIR, fancy_print_cli_args +from docker_run import OPS_DIR, fancy_print_cli_args def parse_build_args(raw_build_args: list[str]) -> list[dict[str, str]]: @@ -71,9 +71,9 @@ def docker_build( def main(args: argparse.Namespace) -> None: # Dockerfile to be used in docker build dockerfile_path = ( - SCRIPT_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" + OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}" ) - docker_context_path = SCRIPT_DIR / "docker" + docker_context_path = OPS_DIR build_args = parse_build_args(args.build_arg) diff --git a/ops/docker_run.py b/ops/docker_run.py index 161c81b477b0..41ec9acb17c2 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -12,8 +12,8 @@ import sys import textwrap -SCRIPT_DIR = pathlib.Path(__file__).expanduser().resolve().parent -PROJECT_ROOT_DIR = SCRIPT_DIR.parent 
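+# This module sits in ops/, so name the path anchor for what it is. Other CI
+# scripts resolve paths against it; for example, docker_build.py (updated in
+# this same patch) does:
+#   from docker_run import OPS_DIR
+#   dockerfile_path = OPS_DIR / "docker" / "dockerfile" / f"Dockerfile.{args.container_def}"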
+OPS_DIR = pathlib.Path(__file__).expanduser().resolve().parent +PROJECT_ROOT_DIR = OPS_DIR.parent LINEWIDTH = 88 TEXT_WRAPPER = textwrap.TextWrapper( width=LINEWIDTH, diff --git a/ops/pipeline/build-jvm-macos-m1.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh similarity index 85% rename from ops/pipeline/build-jvm-macos-m1.sh rename to ops/pipeline/build-jvm-macos-apple-silicon.sh index 75785aa03eba..0c0aa6300729 100755 --- a/ops/pipeline/build-jvm-macos-m1.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -1,5 +1,5 @@ #!/bin/bash -## Build libxgboost4j.dylib targeting MacOS +## Build libxgboost4j.dylib targeting MacOS (Apple Silicon) set -euox pipefail @@ -34,11 +34,11 @@ pushd lib libname=libxgboost4j_m1_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -then +# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +# then aws s3 cp ${libname} \ s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ --acl public-read --no-progress -fi +# fi popd set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh new file mode 100755 index 000000000000..ee71a8b13078 --- /dev/null +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -0,0 +1,44 @@ +#!/bin/bash +## Build libxgboost4j.dylib targeting MacOS (Intel) + +set -euox pipefail + +source ops/pipeline/enforce-ci.sh + +# Display system info +echo "--- Display system information" +set -x +system_profiler SPSoftwareDataType +sysctl -n machdep.cpu.brand_string +uname -m +set +x + +brew install ninja libomp + +# Build XGBoost4J binary +echo "--- Build libxgboost4j.dylib" +set -x +mkdir build +pushd build +export JAVA_HOME=$(/usr/libexec/java_home) +cmake .. -GNinja -DJVM_BINDINGS=ON -DUSE_OPENMP=ON -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 +ninja -v +popd +rm -rf build +otool -L lib/libxgboost.dylib +set +x + +echo "--- Upload libxgboost4j.dylib" +set -x +pushd lib +libname=libxgboost4j_intel_${GITHUB_SHA}.dylib +mv -v libxgboost4j.dylib ${libname} + +# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +# then + aws s3 cp ${libname} \ + s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ + --acl public-read --no-progress +# fi +popd +set +x diff --git a/ops/pipeline/build-python-wheels-macos.sh b/ops/pipeline/build-python-wheels-macos.sh old mode 100644 new mode 100755 index 3715ec9e7e0f..697514c0c3ad --- a/ops/pipeline/build-python-wheels-macos.sh +++ b/ops/pipeline/build-python-wheels-macos.sh @@ -30,7 +30,6 @@ if [[ "$platform_id" == macosx_* ]]; then # Set up environment variables to configure cibuildwheel export CIBW_BUILD=cp${cpython_ver}-${platform_id} export CIBW_ARCHS=${cibw_archs} - export CIBW_ENVIRONMENT=${setup_env_var} export CIBW_TEST_SKIP='*-macosx_arm64' export CIBW_BUILD_VERBOSITY=3 else diff --git a/ops/pipeline/test-win64-gpu.ps1 b/ops/pipeline/test-win64-gpu.ps1 index e4a55c77b2bd..2416d53b3f85 100644 --- a/ops/pipeline/test-win64-gpu.ps1 +++ b/ops/pipeline/test-win64-gpu.ps1 @@ -13,7 +13,7 @@ if ($LASTEXITCODE -ne 0) { throw "Last command failed" } Write-Host "--- Set up Python env" conda activate $env_name = -join("win64_", (New-Guid).ToString().replace("-", "")) -mamba env create -n ${env_name} --file=ops/docker/conda_env/win64_test.yml +mamba env create -n ${env_name} --file=ops/conda_env/win64_test.yml conda activate ${env_name} python -m pip install ` (Get-ChildItem python-package/dist/*.whl | Select-Object -Expand FullName) diff --git a/ops/script/build_via_cmake.sh 
b/ops/script/build_via_cmake.sh index 857ebbbec0c2..86e3677f4392 100755 --- a/ops/script/build_via_cmake.sh +++ b/ops/script/build_via_cmake.sh @@ -2,9 +2,16 @@ set -euo pipefail -if [[ "$1" == --conda-env=* ]] +if [[ "$#" -lt 1 ]] then - conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -) + conda_env="" +else + conda_env="$1" +fi + +if [[ "${conda_env}" == --conda-env=* ]] +then + conda_env=$(echo "${conda_env}" | sed 's/^--conda-env=//g' -) echo "Activating Conda environment ${conda_env}" shift 1 cmake_args="$@" diff --git a/ops/script/lint_cmake.sh b/ops/script/lint_cmake.sh old mode 100644 new mode 100755 index d67ecd0844ed..55aeb20e8fb2 --- a/ops/script/lint_cmake.sh +++ b/ops/script/lint_cmake.sh @@ -1,6 +1,6 @@ #!/bin/bash -set -e +set -euo pipefail cmake_files=$( find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \ diff --git a/ops/script/run_clang_tidy.py b/ops/script/run_clang_tidy.py index aaeccdaf3718..dca5d1069598 100755 --- a/ops/script/run_clang_tidy.py +++ b/ops/script/run_clang_tidy.py @@ -19,7 +19,9 @@ def call(args: list[str]) -> tuple[int, int, str, list[str]]: # `workspace` is a name used in the CI container. Normally we should keep the dir # as `xgboost`. matched = re.search( - "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", error_msg, re.MULTILINE + "(workspace|xgboost)/.*(ops|src|tests|include)/.*warning:", + error_msg, + re.MULTILINE, ) if matched is None: diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index b7be3c44c1df..5746f33044e9 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -101,7 +101,7 @@ def is_df(part: T) -> T: X.columns = X.columns.astype("object") # Make sure the output can be integrated back to original dataframe X.columns = X.columns.astype("object") - # Work around https://github.com/dmlc/xgboost/issues/10752 + # Work around https://github.com/dmlc/xgboost/issues/10752 X["predict"] = predictions X["inplace_predict"] = series_predictions From 9f9db2b54d1ea836a485b9bb70110ff94a922f17 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 18:51:05 -0800 Subject: [PATCH 35/45] Various fixes --- .github/dependabot.yml | 35 +++++++++++++++ .github/lock.yml | 32 ++++++++++++++ .github/workflows/freebsd.yml | 34 ++++++++++++++ .github/workflows/r_nold.yml | 44 +++++++++++++++++++ .github/workflows/sycl_tests.yml | 23 +++++----- ops/conda_env/linux_sycl_test.yml | 1 + .../Dockerfile.gpu_build_rockylinux8 | 2 +- ops/pipeline/build-jvm-doc-impl.sh | 2 +- 8 files changed, 160 insertions(+), 13 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .github/lock.yml create mode 100644 .github/workflows/freebsd.yml create mode 100644 .github/workflows/r_nold.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000000..1a8098071ba3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,35 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "maven" + directory: "/jvm-packages" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-gpu" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-example" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-spark" + schedule: + interval: "monthly" + - package-ecosystem: "maven" + directory: "/jvm-packages/xgboost4j-spark-gpu" + schedule: + interval: "monthly" + - package-ecosystem: "github-actions" + directory: / + schedule: + interval: "monthly" diff --git a/.github/lock.yml b/.github/lock.yml new file mode 100644 index 000000000000..f916abe5a367 --- /dev/null +++ b/.github/lock.yml @@ -0,0 +1,32 @@ +# Configuration for lock-threads - https://github.com/dessant/lock-threads + +# Number of days of inactivity before a closed issue or pull request is locked +daysUntilLock: 90 + +# Issues and pull requests with these labels will not be locked. Set to `[]` to disable +exemptLabels: + - feature-request + +# Label to add before locking, such as `outdated`. Set to `false` to disable +lockLabel: false + +# Comment to post before locking. Set to `false` to disable +lockComment: false + +# Assign `resolved` as the reason for locking. Set to `false` to disable +setLockReason: true + +# Limit to only `issues` or `pulls` +# only: issues + +# Optionally, specify configuration settings just for `issues` or `pulls` +# issues: +# exemptLabels: +# - help-wanted +# lockLabel: outdated + +# pulls: +# daysUntilLock: 30 + +# Repository to extend settings from +# _extends: repo diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml new file mode 100644 index 000000000000..d3208a1294d1 --- /dev/null +++ b/.github/workflows/freebsd.yml @@ -0,0 +1,34 @@ +name: FreeBSD + +on: [push, pull_request] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test: + runs-on: ubuntu-latest + timeout-minutes: 20 + name: A job to run test in FreeBSD + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y cmake git ninja googletest + + run: | + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON + ninja -v + ./testxgboost diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml new file mode 100644 index 000000000000..4b506927e06c --- /dev/null +++ b/.github/workflows/r_nold.yml @@ -0,0 +1,44 @@ +# Run expensive R tests with the help of rhub. 
Only triggered by a pull request review +# See discussion at https://github.com/dmlc/xgboost/pull/6378 + +name: XGBoost-R-noLD + +on: + pull_request_review_comment: + types: [created] + +permissions: + contents: read # to fetch code (actions/checkout) + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + test-R-noLD: + if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) + timeout-minutes: 120 + runs-on: ubuntu-latest + container: + image: rhub/debian-gcc-devel-nold + steps: + - name: Install git and system packages + shell: bash + run: | + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + submodules: 'true' + + - name: Install dependencies + shell: bash -l {0} + run: | + /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + + - name: Run R tests + shell: bash + run: | + cd R-package && \ + /tmp/R-devel/bin/R CMD INSTALL . && \ + /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 54ebcb5f9532..b317050fc652 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -5,6 +5,10 @@ on: [push, pull_request] permissions: contents: read # to fetch code (actions/checkout) +defaults: + run: + shell: bash -l {0} + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true @@ -37,20 +41,16 @@ jobs: conda info conda list - name: Build and install XGBoost - shell: bash -l {0} run: | mkdir build cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX - make -j$(nproc) - - name: Run gtest binary for SYCL - run: | - cd build - ./testxgboost --gtest_filter=Sycl* - - name: Run gtest binary for non SYCL + cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja + ninja + - name: Run gtest run: | cd build - ./testxgboost --gtest_filter=-Sycl* + ./testxgboost python-sycl-tests-on-ubuntu: name: Test XGBoost Python package with SYCL on ${{ matrix.os }} @@ -82,8 +82,9 @@ jobs: run: | mkdir build cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - make -j$(nproc) + cmake .. 
-DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja + ninja - name: Install Python package run: | cd python-package diff --git a/ops/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml index 5b3a15f7e3b1..f1ce49492d42 100644 --- a/ops/conda_env/linux_sycl_test.yml +++ b/ops/conda_env/linux_sycl_test.yml @@ -18,6 +18,7 @@ dependencies: - pytest-timeout - pytest-cov - dask +- ninja - dpcpp_linux-64 - onedpl-devel - intel-openmp diff --git a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 index d021190b6744..ae79e88b15b3 100644 --- a/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 +++ b/ops/docker/dockerfile/Dockerfile.gpu_build_rockylinux8 @@ -76,7 +76,7 @@ RUN set -ex; \ # Default entry-point to use if running locally # It will preserve attributes of created files -COPY entrypoint.sh /scripts/ +COPY docker/entrypoint.sh /scripts/ WORKDIR /workspace ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/ops/pipeline/build-jvm-doc-impl.sh b/ops/pipeline/build-jvm-doc-impl.sh index c334b8ad91d1..4e95f284e25c 100755 --- a/ops/pipeline/build-jvm-doc-impl.sh +++ b/ops/pipeline/build-jvm-doc-impl.sh @@ -27,7 +27,7 @@ mvn --no-transfer-progress javadoc:javadoc -Pdocs # Package JVM docs in a tarball mkdir -p tmp/scaladocs -cp -rv xgboost4j/target/site/apidocs/ ./tmp/javadocs/ +cp -rv xgboost4j/target/reports/apidocs/ ./tmp/javadocs/ cp -rv xgboost4j/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j/ cp -rv xgboost4j-spark/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark/ cp -rv xgboost4j-spark-gpu/target/site/scaladocs/ ./tmp/scaladocs/xgboost4j-spark-gpu/ From dce4f7ba55b78a7754b6c9ee2e08460ea3ed825c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 18 Nov 2024 19:40:22 -0800 Subject: [PATCH 36/45] Disable dependabot for now --- .github/dependabot.yml | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index 1a8098071ba3..000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,35 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. 
-# Please see the documentation for all configuration options: -# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: - - package-ecosystem: "maven" - directory: "/jvm-packages" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-example" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark" - schedule: - interval: "monthly" - - package-ecosystem: "maven" - directory: "/jvm-packages/xgboost4j-spark-gpu" - schedule: - interval: "monthly" - - package-ecosystem: "github-actions" - directory: / - schedule: - interval: "monthly" From dfd5624d181ba550d2719eca06652f15702ffa79 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 08:09:48 -0800 Subject: [PATCH 37/45] Fixes --- .github/workflows/freebsd.yml | 32 ++-- .github/workflows/i386.yml | 44 +++--- .github/workflows/jvm_tests.yml | 91 +++++------ .github/workflows/lint.yml | 122 +++++++-------- .github/workflows/misc.yml | 172 ++++++++++---------- .github/workflows/python_tests.yml | 182 +++++++++++----------- .github/workflows/python_wheels_macos.yml | 46 +++--- .github/workflows/r_nold.yml | 40 ++--- .github/workflows/r_tests.yml | 152 +++++++++--------- .github/workflows/sycl_tests.yml | 58 +++---- .github/workflows/update_rapids.yml | 32 ++-- 11 files changed, 487 insertions(+), 484 deletions(-) diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index d3208a1294d1..d0eb13c20fb6 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -15,20 +15,20 @@ jobs: timeout-minutes: 20 name: A job to run test in FreeBSD steps: - - uses: actions/checkout@v4 - with: - submodules: 'true' - - name: Test in FreeBSD - id: test - uses: vmactions/freebsd-vm@v1 - with: - usesh: true - prepare: | - pkg install -y cmake git ninja googletest + - uses: actions/checkout@v4 + with: + submodules: 'true' + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y cmake git ninja googletest - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON - ninja -v - ./testxgboost + run: | + mkdir build + cd build + cmake .. -GNinja -DGOOGLE_TEST=ON + ninja -v + ./testxgboost diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index aa71147e2581..455d6ea91033 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -19,25 +19,25 @@ jobs: ports: - 5000:5000 steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3.7.1 - with: - driver-opts: network=host - - name: Build and push container - uses: docker/build-push-action@v6 - with: - context: . 
- file: ops/docker/dockerfile/Dockerfile.i386 - push: true - tags: localhost:5000/xgboost/build-32bit:latest - cache-from: type=gha - cache-to: type=gha,mode=max - - name: Build XGBoost - run: | - docker run --rm -v $PWD:/workspace -w /workspace \ - -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ - localhost:5000/xgboost/build-32bit:latest \ - bash ops/script/build_via_cmake.sh + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3.7.1 + with: + driver-opts: network=host + - name: Build and push container + uses: docker/build-push-action@v6 + with: + context: . + file: ops/docker/dockerfile/Dockerfile.i386 + push: true + tags: localhost:5000/xgboost/build-32bit:latest + cache-from: type=gha + cache-to: type=gha,mode=max + - name: Build XGBoost + run: | + docker run --rm -v $PWD:/workspace -w /workspace \ + -e CXXFLAGS='-Wno-error=overloaded-virtual -Wno-error=maybe-uninitialized -Wno-error=redundant-move' \ + localhost:5000/xgboost/build-32bit:latest \ + bash ops/script/build_via_cmake.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index f9385fa4acaf..ab21e2f19466 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -110,6 +110,9 @@ jobs: with: submodules: "true" - run: bash ${{ matrix.script }} + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} build-jvm-docs: name: Build docs for JVM packages @@ -169,50 +172,50 @@ jobs: os: [windows-latest, macos-13] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - uses: actions/setup-java@v4.5.0 - with: - distribution: 'temurin' - java-version: '8' - - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: jvm_tests - environment-file: ops/conda_env/jvm_tests.yml - use-mamba: true - - - name: Cache Maven packages - uses: actions/cache@v4.1.2 - with: - path: ~/.m2 - key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} - - - name: Test XGBoost4J (Core) - run: | - cd jvm-packages - mvn test -B -pl :xgboost4j_2.12 - - - name: Publish artifact xgboost4j.dll to S3 - run: | - cd lib/ - Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll - dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` - s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` - --acl public-read --region us-west-2 - if: matrix.os == 'windows-latest' - # if: | - # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - # matrix.os == 'windows-latest' - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: actions/setup-java@v4.5.0 + with: + distribution: 'temurin' + java-version: '8' + + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: jvm_tests + environment-file: ops/conda_env/jvm_tests.yml + use-mamba: true + + - name: Cache Maven packages + uses: actions/cache@v4.1.2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') 
}} + restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + + - name: Test XGBoost4J (Core) + run: | + cd jvm-packages + mvn test -B -pl :xgboost4j_2.12 + + - name: Publish artifact xgboost4j.dll to S3 + run: | + cd lib/ + Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll + dir + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` + s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` + --acl public-read --region us-west-2 + if: matrix.os == 'windows-latest' + # if: | + # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + # matrix.os == 'windows-latest' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} test-jvm-packages-gpu: name: Test JVM packages with CUDA diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index caceb3e3893b..59b3cecf57ed 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -58,52 +58,52 @@ jobs: runs-on: ubuntu-latest name: Type and format checks for the Python package steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: python_lint - environment-file: ops/conda_env/python_lint.yml - use-mamba: true - - name: Display Conda env - shell: bash -el {0} - run: | - conda info - conda list - - name: Run mypy - shell: bash -el {0} - run: | - python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 - - name: Run formatter - shell: bash -el {0} - run: | - python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 - - name: Run pylint - shell: bash -el {0} - run: | - python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: python_lint + environment-file: ops/conda_env/python_lint.yml + use-mamba: true + - name: Display Conda env + shell: bash -el {0} + run: | + conda info + conda list + - name: Run mypy + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=1 --pylint=0 + - name: Run formatter + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=1 --type-check=0 --pylint=0 + - name: Run pylint + shell: bash -el {0} + run: | + python ops/script/lint_python.py --format=0 --type-check=0 --pylint=1 cpp-lint: runs-on: ubuntu-latest name: Code linting for C++ steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: actions/setup-python@v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - name: Install Python packages - run: | - python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint - - name: Run lint - run: | - python3 ops/script/lint_cpp.py - bash ops/script/lint_cmake.sh + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: actions/setup-python@v5.3.0 + with: + python-version: "3.10" + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools cmakelint cpplint==1.6.1 pylint + - name: Run lint + run: | + python3 ops/script/lint_cpp.py + bash ops/script/lint_cmake.sh lintr: runs-on: ${{ matrix.os }} @@ -118,27 +118,27 @@ jobs: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: - - 
uses: actions/checkout@v4.2.2 - with: - submodules: 'true' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' - - uses: r-lib/actions/setup-r@v2.11.0 - with: - r-version: ${{ matrix.r }} + - uses: r-lib/actions/setup-r@v2.11.0 + with: + r-version: ${{ matrix.r }} - - name: Cache R packages - uses: actions/cache@v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + - name: Cache R packages + uses: actions/cache@v4.1.2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") - - name: Run lintr - run: | - MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ - Rscript ops/script/lint_r.R $(pwd) + - name: Run lintr + run: | + MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ + Rscript ops/script/lint_r.R $(pwd) diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index 7294faa0d93b..b1b92c1528b7 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -22,23 +22,23 @@ jobs: matrix: os: [macos-13] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Install system packages - run: | - brew install ninja libomp - - name: Build gtest binary - run: | - mkdir build - cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo - ninja -v - - name: Run gtest binary - run: | - cd build - ./testxgboost - ctest -R TestXGBoostCLI --extra-verbose + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + brew install ninja libomp + - name: Build gtest binary + run: | + mkdir build + cd build + cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON -DUSE_SANITIZER=ON -DENABLED_SANITIZERS=address -DCMAKE_BUILD_TYPE=RelWithDebInfo + ninja -v + - name: Run gtest binary + run: | + cd build + ./testxgboost + ctest -R TestXGBoostCLI --extra-verbose gtest-cpu-nonomp: name: Test Google C++ unittest (CPU Non-OMP) @@ -48,23 +48,23 @@ jobs: matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Install system packages - run: | - sudo apt-get install -y --no-install-recommends ninja-build - - name: Build and install XGBoost - shell: bash -l {0} - run: | - mkdir build - cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON - ninja -v - - name: Run gtest binary - run: | - cd build - ctest --extra-verbose + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends ninja-build + - name: Build and install XGBoost + shell: bash -l {0} + run: | + mkdir build + cd build + cmake .. 
-GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON + ninja -v + - name: Run gtest binary + run: | + cd build + ctest --extra-verbose c-api-demo: name: Test installing XGBoost lib + building the C API demo @@ -78,56 +78,56 @@ jobs: os: ["ubuntu-latest"] python-version: ["3.10"] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: cpp_test - environment-file: ops/conda_env/cpp_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost static library - run: | - mkdir build - cd build - cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja - ninja -v install - cd - - - name: Build and run C API demo with static - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - cd .. - rm -rf ./build - popd + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: cpp_test + environment-file: ops/conda_env/cpp_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost static library + run: | + mkdir build + cd build + cmake .. -DBUILD_STATIC_LIB=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja + ninja -v install + cd - + - name: Build and run C API demo with static + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + cd .. + rm -rf ./build + popd - - name: Build and install XGBoost shared library - run: | - cd build - cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON - ninja -v install - ./testxgboost - cd - - - name: Build and run C API demo with shared - run: | - pushd . - cd demo/c-api/ - mkdir build - cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX - ninja -v - ctest - popd - ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo - ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo + - name: Build and install XGBoost shared library + run: | + cd build + cmake .. -DBUILD_STATIC_LIB=OFF -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -GNinja -DPLUGIN_FEDERATED=ON -DGOOGLE_TEST=ON + ninja -v install + ./testxgboost + cd - + - name: Build and run C API demo with shared + run: | + pushd . + cd demo/c-api/ + mkdir build + cd build + cmake .. 
-GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + ninja -v + ctest + popd + ./ops/script/verify_link.sh ./demo/c-api/build/basic/api-demo + ./ops/script/verify_link.sh ./demo/c-api/build/external-memory/external-memory-demo diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index c43d8b056c8d..344e2f276b22 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -22,28 +22,28 @@ jobs: matrix: os: [ubuntu-latest] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: sdist_test - environment-file: ops/conda_env/sdist_test.yml - use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False - cd .. - python -c 'import xgboost' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: sdist_test + environment-file: ops/conda_env/sdist_test.yml + use-mamba: true + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + run: | + cd python-package + python --version + python -m build --sdist + pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False + cd .. + python -c 'import xgboost' python-sdist-test: # Use system toolchain instead of conda toolchain for macos and windows. @@ -56,33 +56,33 @@ jobs: os: [macos-13, windows-latest] python-version: ["3.10"] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Install osx system dependencies - if: matrix.os == 'macos-13' - run: | - brew install ninja libomp - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - auto-update-conda: true - python-version: ${{ matrix.python-version }} - activate-environment: test - - name: Install build - run: | - conda install -c conda-forge python-build - - name: Display Conda env - run: | - conda info - conda list - - name: Build and install XGBoost - run: | - cd python-package - python --version - python -m build --sdist - pip install -v ./dist/xgboost-*.tar.gz - cd .. - python -c 'import xgboost' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Install osx system dependencies + if: matrix.os == 'macos-13' + run: | + brew install ninja libomp + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + auto-update-conda: true + python-version: ${{ matrix.python-version }} + activate-environment: test + - name: Install build + run: | + conda install -c conda-forge python-build + - name: Display Conda env + run: | + conda info + conda list + - name: Build and install XGBoost + run: | + cd python-package + python --version + python -m build --sdist + pip install -v ./dist/xgboost-*.tar.gz + cd .. 
+ python -c 'import xgboost' python-tests-on-macos: name: Test XGBoost Python package on ${{ matrix.os }} @@ -94,48 +94,48 @@ jobs: os: [macos-13] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: macos_cpu_test - environment-file: ops/conda_env/macos_cpu_test.yml - use-mamba: true - - - name: Display Conda env - run: | - conda info - conda list - - - name: Build XGBoost on macos - run: | - brew install ninja - - mkdir build - cd build - # Set prefix, to use OpenMP library from Conda env - # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 - # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON - ninja - - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . - - - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python - - - name: Test Dask Interface - run: | - pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: macos_cpu_test + environment-file: ops/conda_env/macos_cpu_test.yml + use-mamba: true + + - name: Display Conda env + run: | + conda info + conda list + + - name: Build XGBoost on macos + run: | + brew install ninja + + mkdir build + cd build + # Set prefix, to use OpenMP library from Conda env + # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 + # to learn why we don't use libomp from Homebrew. + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON + ninja + + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . 
+ + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python + + - name: Test Dask Interface + run: | + pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask python-system-installation-on-ubuntu: name: Test XGBoost Python package System Installation on ${{ matrix.os }} diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index a4cff8eb0e6f..f0f5042660df 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -30,26 +30,26 @@ jobs: - os: macos-14 platform_id: macosx_arm64 steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Set up homebrew - uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 - - name: Install libomp - run: brew install libomp - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - python-version: "3.10" - use-mamba: true - - name: Build wheels - run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - - name: Upload Python wheel - # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') - run: | - python -m pip install awscli - python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Set up homebrew + uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8 + - name: Install libomp + run: brew install libomp + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + python-version: "3.10" + use-mamba: true + - name: Build wheels + run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} + - name: Upload Python wheel + # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + run: | + python -m pip install awscli + python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index 4b506927e06c..6ff4aa079e95 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -22,23 +22,23 @@ jobs: container: image: rhub/debian-gcc-devel-nold steps: - - name: Install git and system packages - shell: bash - run: | - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Run R tests - shell: bash - run: | - cd R-package && \ - /tmp/R-devel/bin/R CMD INSTALL . 
&& \ - /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" + - name: Install git and system packages + shell: bash + run: | + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + submodules: 'true' + + - name: Install dependencies + shell: bash -l {0} + run: | + /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + + - name: Run R tests + shell: bash + run: | + cd R-package && \ + /tmp/R-devel/bin/R CMD INSTALL . && \ + /tmp/R-devel/bin/R -q -e "library(testthat); setwd('tests'); source('testthat.R')" diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 3885c126f11e..f5e5152fa29a 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -32,47 +32,47 @@ jobs: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true steps: - - name: Install system dependencies - run: | - sudo apt update - sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev - if: matrix.os == 'ubuntu-latest' - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - uses: r-lib/actions/setup-r@v2.11.0 - with: - r-version: ${{ matrix.r }} - - - name: Cache R packages - uses: actions/cache@v4.1.2 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - - - uses: actions/setup-python@v5.3.0 - with: - python-version: "3.10" - architecture: 'x64' - - - uses: r-lib/actions/setup-tinytex@v2 - - - name: Install dependencies - shell: Rscript {0} - run: | - source("./R-package/tests/helper_scripts/install_deps.R") - - - name: Test R - run: | - python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check - if: matrix.compiler != 'none' - - - name: Test R - run: | - python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check - if: matrix.compiler == 'none' + - name: Install system dependencies + run: | + sudo apt update + sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev + if: matrix.os == 'ubuntu-latest' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - uses: r-lib/actions/setup-r@v2.11.0 + with: + r-version: ${{ matrix.r }} + + - name: Cache R packages + uses: actions/cache@v4.1.2 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + + - uses: actions/setup-python@v5.3.0 + with: + python-version: "3.10" + architecture: 'x64' + + - uses: r-lib/actions/setup-tinytex@v2 + + - name: Install dependencies + shell: Rscript {0} + run: | + source("./R-package/tests/helper_scripts/install_deps.R") + + - name: Test R + run: | + python ops/script/test_r_package.py --compiler='${{ matrix.compiler }}' --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler != 'none' + + - name: Test R + run: | + python ops/script/test_r_package.py --build-tool="${{ matrix.build }}" --task=check + if: matrix.compiler == 'none' test-R-on-Debian: name: Test R package on Debian @@ 
-81,38 +81,38 @@ jobs: image: rhub/debian-gcc-release steps: - - name: Install system dependencies - run: | - # Must run before checkout to have the latest git installed. - # No need to add pandoc, the container has it figured out. - apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y - - - name: Trust git cloning project sources - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - - name: Install dependencies - shell: bash -l {0} - run: | - Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - - - name: Test R - shell: bash -l {0} - run: | - python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - - - uses: dorny/paths-filter@v3 - id: changes - with: - filters: | - r_package: - - 'R-package/**' - - - name: Run document check - if: steps.changes.outputs.r_package == 'true' - run: | - python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc + - name: Install system dependencies + run: | + # Must run before checkout to have the latest git installed. + # No need to add pandoc, the container has it figured out. + apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git -y + + - name: Trust git cloning project sources + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + + - name: Install dependencies + shell: bash -l {0} + run: | + Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + + - name: Test R + shell: bash -l {0} + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check + + - uses: dorny/paths-filter@v3 + id: changes + with: + filters: | + r_package: + - 'R-package/**' + + - name: Run document check + if: steps.changes.outputs.r_package == 'true' + run: | + python3 ops/script/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index b317050fc652..467734607ea6 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -62,34 +62,34 @@ jobs: os: [ubuntu-latest] steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' - - uses: conda-incubator/setup-miniconda@v3.1.0 - with: - miniforge-variant: Miniforge3 - miniforge-version: latest - activate-environment: linux_sycl_test - environment-file: ops/conda_env/linux_sycl_test.yml - use-mamba: true + - uses: conda-incubator/setup-miniconda@v3.1.0 + with: + miniforge-variant: Miniforge3 + miniforge-version: latest + activate-environment: linux_sycl_test + environment-file: ops/conda_env/linux_sycl_test.yml + use-mamba: true - - name: Display Conda env - run: | - conda info - conda list - - name: Build XGBoost on Ubuntu - run: | - mkdir build - cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ - -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja - ninja - - name: Install Python package - run: | - cd python-package - python --version - pip install -v . 
- - name: Test Python package - run: | - pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ + - name: Display Conda env + run: | + conda info + conda list + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -GNinja + ninja + - name: Install Python package + run: | + cd python-package + python --version + pip install -v . + - name: Test Python package + run: | + pytest -s -v -rxXs --durations=0 ./tests/python-sycl/ diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 636661db46b8..03a39f72b660 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -25,20 +25,20 @@ jobs: name: Check latest RAPIDS runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4.2.2 - with: - submodules: 'true' - - name: Check latest RAPIDS and update conftest.sh - run: | - bash tests/buildkite/update-rapids.sh - - name: Create Pull Request - uses: peter-evans/create-pull-request@v7 - if: github.ref == 'refs/heads/master' - with: - add-paths: | - tests/buildkite - branch: create-pull-request/update-rapids - base: master - title: "[CI] Update RAPIDS to latest stable" - commit-message: "[CI] Update RAPIDS to latest stable" + - uses: actions/checkout@v4.2.2 + with: + submodules: 'true' + - name: Check latest RAPIDS and update conftest.sh + run: | + bash tests/buildkite/update-rapids.sh + - name: Create Pull Request + uses: peter-evans/create-pull-request@v7 + if: github.ref == 'refs/heads/master' + with: + add-paths: | + tests/buildkite + branch: create-pull-request/update-rapids + base: master + title: "[CI] Update RAPIDS to latest stable" + commit-message: "[CI] Update RAPIDS to latest stable" From 0ee55c2786f334c97f679cc09944c5991e24bfcd Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 09:00:27 -0800 Subject: [PATCH 38/45] Don't upload artifacts from pull requests --- ops/pipeline/build-cuda-with-rmm.sh | 10 +++++----- ops/pipeline/build-cuda.sh | 10 +++++----- ops/pipeline/build-jvm-gpu.sh | 10 +++++----- ops/pipeline/build-jvm-macos-apple-silicon.sh | 6 +++--- ops/pipeline/build-jvm-macos-intel.sh | 6 +++--- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ops/pipeline/build-cuda-with-rmm.sh b/ops/pipeline/build-cuda-with-rmm.sh index 1da0e5e61827..50bbf8b340f3 100755 --- a/ops/pipeline/build-cuda-with-rmm.sh +++ b/ops/pipeline/build-cuda-with-rmm.sh @@ -8,12 +8,12 @@ source ops/pipeline/enforce-ci.sh echo "--- Build with CUDA with RMM" -#if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -#then +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then arch_flag="-DGPU_COMPUTE_VER=75" -#else -# arch_flag="" -#fi +else + arch_flag="" +fi echo "--- Build libxgboost from the source" python3 ops/docker_run.py \ diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index 0487fb209dbe..4ed82618da23 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -8,12 +8,12 @@ source ops/pipeline/enforce-ci.sh echo "--- Build with CUDA" -# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -#then +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then arch_flag="-DGPU_COMPUTE_VER=75" -#else -# arch_flag="" -#fi +else + arch_flag="" +fi echo "--- Build libxgboost from the source" set -x diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index ee12fbd78289..6bcd2a327553 100755 --- 
a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -7,12 +7,12 @@ source ops/pipeline/enforce-ci.sh echo "--- Build libxgboost4j.so with CUDA" -# if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] -#then +if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] +then arch_flag="-DGPU_COMPUTE_VER=75" -#else -# arch_flag="" -#fi +else + arch_flag="" +fi COMMAND=$( cat <<-EOF diff --git a/ops/pipeline/build-jvm-macos-apple-silicon.sh b/ops/pipeline/build-jvm-macos-apple-silicon.sh index 0c0aa6300729..99ca20d7e1e3 100755 --- a/ops/pipeline/build-jvm-macos-apple-silicon.sh +++ b/ops/pipeline/build-jvm-macos-apple-silicon.sh @@ -34,11 +34,11 @@ pushd lib libname=libxgboost4j_m1_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -# then +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then aws s3 cp ${libname} \ s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ --acl public-read --no-progress -# fi +fi popd set +x diff --git a/ops/pipeline/build-jvm-macos-intel.sh b/ops/pipeline/build-jvm-macos-intel.sh index ee71a8b13078..ecf480d3c063 100755 --- a/ops/pipeline/build-jvm-macos-intel.sh +++ b/ops/pipeline/build-jvm-macos-intel.sh @@ -34,11 +34,11 @@ pushd lib libname=libxgboost4j_intel_${GITHUB_SHA}.dylib mv -v libxgboost4j.dylib ${libname} -# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] -# then +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then aws s3 cp ${libname} \ s3://xgboost-nightly-builds/${BRANCH_NAME}/libxgboost4j/ \ --acl public-read --no-progress -# fi +fi popd set +x From cb55d7a7c53b6f7dbadcf321026dd051bb4caba9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 09:39:10 -0800 Subject: [PATCH 39/45] Fix --- .github/workflows/jvm_tests.yml | 7 +++---- .github/workflows/python_wheels_macos.yml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index ab21e2f19466..8eecc83c0c19 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -209,10 +209,9 @@ jobs: python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll ` s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/libxgboost4j/ ` --acl public-read --region us-west-2 - if: matrix.os == 'windows-latest' - # if: | - # (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && - # matrix.os == 'windows-latest' + if: | + (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && + matrix.os == 'windows-latest' env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} diff --git a/.github/workflows/python_wheels_macos.yml b/.github/workflows/python_wheels_macos.yml index f0f5042660df..02f21593c220 100644 --- a/.github/workflows/python_wheels_macos.yml +++ b/.github/workflows/python_wheels_macos.yml @@ -46,7 +46,7 @@ jobs: - name: Build wheels run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }} - name: Upload Python wheel - # if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') run: | python -m pip install awscli python -m awscli s3 cp wheelhouse/*.whl s3://xgboost-nightly-builds/${{ env.BRANCH_NAME }}/ --acl public-read --region us-west-2 From 
6641f7d1c555387c226943f160433a66f0422ea0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 09:45:31 -0800 Subject: [PATCH 40/45] Fix merge conflict --- .github/runs-on.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 1911c527481d..e21895ee8c3b 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -34,7 +34,3 @@ runners: cpu: 32 family: ["c7i-flex", "c7i", "c7a", "c5", "c5a"] image: windows-amd64 -<<<<<<< HEAD -======= - ->>>>>>> upstream/master From 0727cf5f80be4ebdfd951c5b3db9e1d99988cf7f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 10:25:27 -0800 Subject: [PATCH 41/45] Fix --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 3dc9c4962646..afd9e65192ba 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -18,7 +18,7 @@ env: ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }} ARTIFACT_STASH_PREFIX: cache/${{ github.repository }}/stash/${{ github.run_id }} # TODO(hcho3): Remove - RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-m3ikdpczirva + RUNS_ON_S3_BUCKET_CACHE: runs-on-s3bucketcache-dv5n3gmnaius jobs: build-win64-gpu: From 3c2e5c88c4c92fe58c0cb1d24a655e8d220f09e4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 11:09:18 -0800 Subject: [PATCH 42/45] Remove unneeded matrix; set max-parallel --- .github/workflows/jvm_tests.yml | 1 + .github/workflows/lint.yml | 33 +++++++++--------------------- .github/workflows/main.yml | 3 +++ .github/workflows/misc.yml | 19 +++-------------- .github/workflows/python_tests.yml | 26 ++++++----------------- .github/workflows/sycl_tests.yml | 15 +++----------- 6 files changed, 26 insertions(+), 71 deletions(-) diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 8eecc83c0c19..659de52c30e0 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -21,6 +21,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} strategy: + max-parallel: 2 matrix: container_id: - xgb-ci.manylinux2014_x86_64 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 59b3cecf57ed..70d892b1061d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -15,26 +15,20 @@ env: jobs: build-containers: - name: Build CI containers (${{ matrix.container_id }}) + name: Build CI containers + env: + CONTAINER_ID: xgb-ci.clang_tidy runs-on: - runs-on=${{ github.run_id }} - - runner=${{ matrix.runner }} - strategy: - fail-fast: false - matrix: - include: - - container_id: xgb-ci.clang_tidy - runner: linux-amd64-cpu + - runner=linux-amd64-cpu steps: # Restart Docker daemon so that it recognizes the ephemeral disks - run: sudo systemctl restart docker - uses: actions/checkout@v4.2.2 with: submodules: "true" - - name: Build ${{ matrix.container_id }} + - name: Build ${{ env.CONTAINER_ID }} run: bash ops/docker_build.sh - env: - CONTAINER_ID: ${{ matrix.container_id }} clang-tidy: name: Run clang-tidy @@ -106,17 +100,10 @@ jobs: bash ops/script/lint_cmake.sh lintr: - runs-on: ${{ matrix.os }} - name: Run R linters on OS ${{ matrix.os }}, R ${{ matrix.r }} - strategy: - fail-fast: false - matrix: - include: - - os: ubuntu-latest - r: "release" + runs-on: ubuntu-latest + name: Run R linters on Ubuntu env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - steps: - uses: 
actions/checkout@v4.2.2 with: @@ -124,14 +111,14 @@ jobs: - uses: r-lib/actions/setup-r@v2.11.0 with: - r-version: ${{ matrix.r }} + r-version: "release" - name: Cache R packages uses: actions/cache@v4.1.2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.r }}-7-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-release-7-${{ hashFiles('R-package/DESCRIPTION') }} - name: Install dependencies shell: Rscript {0} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 77208a146443..15822c55f0d5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,6 +21,7 @@ jobs: - runs-on=${{ github.run_id }} - runner=${{ matrix.runner }} strategy: + max-parallel: 2 matrix: container_id: - xgb-ci.gpu_build_rockylinux8 @@ -203,6 +204,7 @@ jobs: - runner=${{ matrix.runner }} strategy: fail-fast: false + max-parallel: 2 matrix: include: - suite: gpu @@ -241,6 +243,7 @@ jobs: - runner=${{ matrix.runner }} strategy: fail-fast: false + max-parallel: 2 matrix: include: - description: "single GPU" diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml index b1b92c1528b7..1e6df46615d5 100644 --- a/.github/workflows/misc.yml +++ b/.github/workflows/misc.yml @@ -16,11 +16,7 @@ env: jobs: gtest-cpu: name: Test Google C++ test (CPU) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [macos-13] + runs-on: macos-13 steps: - uses: actions/checkout@v4.2.2 with: @@ -42,11 +38,7 @@ jobs: gtest-cpu-nonomp: name: Test Google C++ unittest (CPU Non-OMP) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4.2.2 with: @@ -68,15 +60,10 @@ jobs: c-api-demo: name: Test installing XGBoost lib + building the C API demo - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest defaults: run: shell: bash -l {0} - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - python-version: ["3.10"] steps: - uses: actions/checkout@v4.2.2 with: diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 344e2f276b22..bcc0f5b8ba81 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -15,12 +15,8 @@ concurrency: jobs: python-sdist-test-on-Linux: - runs-on: ${{ matrix.os }} - name: Test installing XGBoost Python source package on ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] + runs-on: ubuntu-latest + name: Test installing XGBoost Python source package steps: - uses: actions/checkout@v4.2.2 with: @@ -85,14 +81,9 @@ jobs: python -c 'import xgboost' python-tests-on-macos: - name: Test XGBoost Python package on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + name: Test XGBoost Python package on macos-13 + runs-on: macos-13 timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - os: [macos-13] - steps: - uses: actions/checkout@v4.2.2 with: @@ -138,13 +129,8 @@ jobs: pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_dask python-system-installation-on-ubuntu: - name: Test XGBoost Python package System Installation on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - + name: Test XGBoost Python package System Installation on Ubuntu + runs-on: ubuntu-latest steps: - uses: 
actions/checkout@v4.2.2 with: diff --git a/.github/workflows/sycl_tests.yml b/.github/workflows/sycl_tests.yml index 467734607ea6..7f6214016c00 100644 --- a/.github/workflows/sycl_tests.yml +++ b/.github/workflows/sycl_tests.yml @@ -20,11 +20,7 @@ env: jobs: gtest-cpu-sycl: name: Test Google C++ unittest (CPU SYCL) - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4.2.2 with: @@ -53,14 +49,9 @@ jobs: ./testxgboost python-sycl-tests-on-ubuntu: - name: Test XGBoost Python package with SYCL on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + name: Test XGBoost Python package with SYCL + runs-on: ubuntu-latest timeout-minutes: 90 - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest] - steps: - uses: actions/checkout@v4.2.2 with: From 32f7406b2eff7b14cba6d171a8a644393b175adf Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 16:36:32 -0800 Subject: [PATCH 43/45] Formatting fixes --- ops/docker/entrypoint.sh | 7 +++- ops/docker_build.py | 4 +-- ops/docker_run.py | 13 ------- ops/script/change_scala_version.py | 2 +- ops/script/format_wheel_meta.py | 6 ++-- ops/script/lint_cpp.py | 34 ++++++++++++------- ops/script/lint_python.py | 15 +++----- ops/script/rename_whl.py | 6 ++-- ops/stash_artifacts.py | 4 +-- .../test_gpu_with_dask/test_gpu_with_dask.py | 2 -- 10 files changed, 42 insertions(+), 51 deletions(-) diff --git a/ops/docker/entrypoint.sh b/ops/docker/entrypoint.sh index babe4359e8e1..40135c197c73 100755 --- a/ops/docker/entrypoint.sh +++ b/ops/docker/entrypoint.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash -# This wrapper script +# This wrapper script propagates the user information from the host +# to the container. This way, any files generated by processes running +# in the container will be accessible in the host. 
set -euo pipefail @@ -15,6 +17,9 @@ else rm /this_is_writable_file_system fi +## Assumption: the host passes correct user information via environment variables +## CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP + if [[ -n ${CI_BUILD_UID:-} ]] && [[ -n ${CI_BUILD_GID:-} ]] then groupadd -o -g "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" || true diff --git a/ops/docker_build.py b/ops/docker_build.py index b096d9201d0f..1fed975ce223 100644 --- a/ops/docker_build.py +++ b/ops/docker_build.py @@ -12,7 +12,7 @@ from docker_run import OPS_DIR, fancy_print_cli_args -def parse_build_args(raw_build_args: list[str]) -> list[dict[str, str]]: +def parse_build_args(raw_build_args: list[str]) -> dict[str, str]: parsed_build_args = dict() for arg in raw_build_args: try: @@ -28,7 +28,7 @@ def parse_build_args(raw_build_args: list[str]) -> list[dict[str, str]]: def docker_build( container_id: str, *, - build_args: list[dict[str, str]], + build_args: dict[str, str], dockerfile_path: pathlib.Path, docker_context_path: pathlib.Path, cache_from: Optional[str], diff --git a/ops/docker_run.py b/ops/docker_run.py index 41ec9acb17c2..7e61c5a14f39 100644 --- a/ops/docker_run.py +++ b/ops/docker_run.py @@ -28,19 +28,6 @@ def parse_run_args(raw_run_args: str) -> list[str]: return [x for x in raw_run_args.split() if x] -def compute_container_id(container_name: str, build_args: list[dict[str, str]]) -> str: - container_id = f"xgb-ci.{container_name}" - # For some build arguments, append special suffixies - for arg_name, suffix in [ - ("CUDA_VERSION_ARG", "cuda"), - ("RAPIDS_VERSION_ARG", "rapids"), - ("JDK_VERSION_ARG", "jdk"), - ]: - if arg_name in build_args: - container_id += f"_{suffix}{build_args[arg_name]}" - return container_id - - def get_user_ids() -> dict[str, str]: uid = os.getuid() gid = os.getgid() diff --git a/ops/script/change_scala_version.py b/ops/script/change_scala_version.py index 3489479dd464..ed475a1f9582 100644 --- a/ops/script/change_scala_version.py +++ b/ops/script/change_scala_version.py @@ -4,7 +4,7 @@ import shutil -def main(args): +def main(args: argparse.Namespace) -> None: if args.scala_version == "2.12": scala_ver = "2.12" scala_patchver = "2.12.18" diff --git a/ops/script/format_wheel_meta.py b/ops/script/format_wheel_meta.py index 570f7854cf62..a7def879905e 100644 --- a/ops/script/format_wheel_meta.py +++ b/ops/script/format_wheel_meta.py @@ -3,12 +3,12 @@ XGBoost Python package. 
""" +import argparse import json import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") @@ -37,7 +37,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format meta.json encoding the latest nightly version of the Python wheel" ) parser.add_argument( diff --git a/ops/script/lint_cpp.py b/ops/script/lint_cpp.py index d4775d6b6b3e..2d00b219ceab 100644 --- a/ops/script/lint_cpp.py +++ b/ops/script/lint_cpp.py @@ -2,6 +2,7 @@ import os import re import sys +from typing import TextIO import cpplint from cpplint import _cpplint_state @@ -9,7 +10,7 @@ CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"]) -def filepath_enumerate(paths): +def filepath_enumerate(paths: list[str]) -> list[str]: """Enumerate the file paths of all subfiles of the list of paths""" out = [] for path in paths: @@ -22,7 +23,7 @@ def filepath_enumerate(paths): return out -def get_header_guard_dmlc(filename): +def get_header_guard_dmlc(filename: str) -> str: """Get Header Guard Convention for DMLC Projects. For headers in include, directly use the path @@ -54,11 +55,10 @@ def get_header_guard_dmlc(filename): class Lint: - def __init__(self): + def __init__(self) -> None: self.project_name = "xgboost" - self.cpp_header_map = {} - self.cpp_src_map = {} - self.python_map = {} + self.cpp_header_map: dict[str, dict[str, int]] = {} + self.cpp_src_map: dict[str, dict[str, int]] = {} self.pylint_cats = set(["error", "warning", "convention", "refactor"]) # setup cpp lint @@ -78,7 +78,7 @@ def __init__(self): cpplint._SetCountingStyle("toplevel") cpplint._line_length = 100 - def process_cpp(self, path, suffix): + def process_cpp(self, path: str, suffix: str) -> None: """Process a cpp file.""" _cpplint_state.ResetErrorCounts() cpplint.ProcessFile(str(path), _cpplint_state.verbose_level) @@ -91,7 +91,9 @@ def process_cpp(self, path, suffix): self.cpp_src_map[str(path)] = errors @staticmethod - def _print_summary_map(strm, result_map, ftype): + def _print_summary_map( + strm: TextIO, result_map: dict[str, dict[str, int]], ftype: str + ) -> int: """Print summary of certain result map.""" if len(result_map) == 0: return 0 @@ -105,7 +107,7 @@ def _print_summary_map(strm, result_map, ftype): ) return len(result_map) - npass - def print_summary(self, strm): + def print_summary(self, strm: TextIO) -> int: """Print summary of lint.""" nerr = 0 nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header") @@ -122,7 +124,7 @@ def print_summary(self, strm): cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc -def process(fname, allow_type): +def process(fname: str, allow_type: list[str]) -> None: """Process a file.""" fname = str(fname) arr = fname.rsplit(".", 1) @@ -132,13 +134,19 @@ def process(fname, allow_type): _HELPER.process_cpp(fname, arr[-1]) -def main(): +def main() -> None: parser = argparse.ArgumentParser(description="run cpp lint") parser.add_argument( "path", nargs="*", help="Path to traverse", - default=["src", "include", os.path.join("R-package", "src"), "python-package", "plugin/sycl"], + default=[ + "src", + "include", + os.path.join("R-package", "src"), + "python-package", + "plugin/sycl", + ], ) parser.add_argument( "--exclude_path", @@ -149,7 +157,7 @@ def main(): args = parser.parse_args() excluded_paths = 
filepath_enumerate(args.exclude_path) - allow_type = [] + allow_type: list[str] = [] allow_type += CXX_SUFFIX for path in args.path: diff --git a/ops/script/lint_python.py b/ops/script/lint_python.py index a589385b2588..67343cc430ac 100644 --- a/ops/script/lint_python.py +++ b/ops/script/lint_python.py @@ -68,11 +68,7 @@ class LintersPaths: "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "ops/script/run_clang_tidy.py", - "ops/script/lint_python.py", - "ops/script/test_r_package.py", - "ops/script/test_utils.py", - "ops/script/change_version.py", + "ops/", ) ISORT = ( @@ -82,12 +78,13 @@ class LintersPaths: "tests/test_distributed/", "tests/python/", "tests/python-gpu/", - "ops/script/", # demo "demo/", # misc "dev/", "doc/", + # CI + "ops/", ) MYPY = ( @@ -129,11 +126,7 @@ class LintersPaths: "demo/guide-python/learning_to_rank.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI - "ops/script/run_clang_tidy.py", - "ops/script/lint_python.py", - "ops/script/test_r_package.py", - "ops/script/test_utils.py", - "ops/script/change_version.py", + "ops/", ) diff --git a/ops/script/rename_whl.py b/ops/script/rename_whl.py index 500196190b3d..d4467720c738 100644 --- a/ops/script/rename_whl.py +++ b/ops/script/rename_whl.py @@ -1,8 +1,8 @@ +import argparse import pathlib -from argparse import ArgumentParser -def main(args): +def main(args: argparse.Namespace) -> None: wheel_path = pathlib.Path(args.wheel_path).expanduser().resolve() if not wheel_path.exists(): raise ValueError(f"Wheel cannot be found at path {wheel_path}") @@ -43,7 +43,7 @@ def main(args): if __name__ == "__main__": - parser = ArgumentParser( + parser = argparse.ArgumentParser( description="Format a Python wheel's name using the git commit hash and platform tag" ) parser.add_argument( diff --git a/ops/stash_artifacts.py b/ops/stash_artifacts.py index 405804b499c6..827e448ac49e 100644 --- a/ops/stash_artifacts.py +++ b/ops/stash_artifacts.py @@ -84,7 +84,7 @@ def aws_s3_download_with_wildcard(src: str, dest: Path) -> None: ) -def upload(args): +def upload(args: argparse.Namespace) -> None: print(f"Uploading artifacts with prefix {args.prefix}...") for artifact in args.artifacts: artifact_path = Path(artifact) @@ -92,7 +92,7 @@ def upload(args): aws_s3_upload(artifact_path, s3_url) -def download(args): +def download(args: argparse.Namespace) -> None: print(f"Downloading artifacts with prefix {args.prefix}...") for artifact in args.artifacts: artifact_path = Path(artifact) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 5746f33044e9..3bc7d46eb721 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -100,8 +100,6 @@ def is_df(part: T) -> T: # Work around https://github.com/dmlc/xgboost/issues/10752 X.columns = X.columns.astype("object") # Make sure the output can be integrated back to original dataframe - X.columns = X.columns.astype("object") - # Work around https://github.com/dmlc/xgboost/issues/10752 X["predict"] = predictions X["inplace_predict"] = series_predictions From e148e29b0662f81b1ee3acab75cacf85e77fd0f8 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 16:53:02 -0800 Subject: [PATCH 44/45] Fix update_rapids.sh --- .github/workflows/update_rapids.yml | 2 +- ops/docker/ci_container.yml | 10 +++++++--- ops/script/update_rapids.sh | 9 ++++++--- 3 files changed, 14 
insertions(+), 7 deletions(-) diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 03a39f72b660..d6be99d00851 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -30,7 +30,7 @@ jobs: submodules: 'true' - name: Check latest RAPIDS and update conftest.sh run: | - bash tests/buildkite/update-rapids.sh + bash ops/script/update_rapids.sh - name: Create Pull Request uses: peter-evans/create-pull-request@v7 if: github.ref == 'refs/heads/master' diff --git a/ops/docker/ci_container.yml b/ops/docker/ci_container.yml index 90c9e6c8c800..f5eb7eb982df 100644 --- a/ops/docker/ci_container.yml +++ b/ops/docker/ci_container.yml @@ -3,12 +3,16 @@ # Each container will be built using the definition from # ops/docker/dockerfile/Dockerfile.CONTAINER_DEF +rapids_versions: + stable: &rapids_version "24.10" + dev: &dev_rapids_version "24.12" + xgb-ci.gpu_build_rockylinux8: container_def: gpu_build_rockylinux8 build_args: CUDA_VERSION_ARG: "12.4.1" NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.10" + RAPIDS_VERSION_ARG: *rapids_version xgb-ci.gpu_build_r_rockylinux8: container_def: gpu_build_r_rockylinux8 @@ -21,14 +25,14 @@ xgb-ci.gpu: build_args: CUDA_VERSION_ARG: "12.4.1" NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.10" + RAPIDS_VERSION_ARG: *rapids_version xgb-ci.gpu_dev_ver: container_def: gpu build_args: CUDA_VERSION_ARG: "12.4.1" NCCL_VERSION_ARG: "2.23.4-1" - RAPIDS_VERSION_ARG: "24.12" + RAPIDS_VERSION_ARG: *dev_rapids_version RAPIDSAI_CONDA_CHANNEL_ARG: "rapidsai-nightly" xgb-ci.clang_tidy: diff --git a/ops/script/update_rapids.sh b/ops/script/update_rapids.sh index f6a2675bdfa9..d7958ce70d86 100755 --- a/ops/script/update_rapids.sh +++ b/ops/script/update_rapids.sh @@ -7,7 +7,10 @@ echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION" DEV_RAPIDS_VERSION=$(date +%Y-%m-%d -d "20${LATEST_RAPIDS_VERSION//./-}-01 + 2 month" | cut -c3-7 | tr - .) echo "DEV_RAPIDS_VERSION = $DEV_RAPIDS_VERSION" -PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) +OPS_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")/.." 
; pwd -P ) +CONTAINER_YAML="$OPS_PATH/docker/ci_container.yml" -sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh -sed -i "s/^DEV_RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/DEV_RAPIDS_VERSION=${DEV_RAPIDS_VERSION}/" $PARENT_PATH/conftest.sh +sed -i "s/\&rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&rapids_version \"${LATEST_RAPIDS_VERSION}\"/" \ + "$CONTAINER_YAML" +sed -i "s/\&dev_rapids_version \"[[:digit:]]\+\.[[:digit:]]\+\"/\&dev_rapids_version \"${DEV_RAPIDS_VERSION}\"/" \ + "$CONTAINER_YAML" From d8abb3c70fc747328333e5ef7bf4b64bc745f961 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 19 Nov 2024 18:14:38 -0800 Subject: [PATCH 45/45] Add JVM deploy --- .github/workflows/jvm_tests.yml | 43 ++++++++++++++++++++++ ops/pipeline/build-win64-gpu.ps1 | 8 ++--- ops/pipeline/deploy-jvm-packages-impl.sh | 45 ++++++++++++++++++++++++ ops/pipeline/deploy-jvm-packages.sh | 21 +++++++++++ 4 files changed, 113 insertions(+), 4 deletions(-) create mode 100755 ops/pipeline/deploy-jvm-packages-impl.sh create mode 100755 ops/pipeline/deploy-jvm-packages.sh diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 659de52c30e0..549094d52e37 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -162,6 +162,11 @@ jobs: run: bash ops/pipeline/build-test-jvm-packages.sh env: SCALA_VERSION: 2.13 + - name: Stash files + run: bash ops/stash_artifacts.sh lib/libxgboost4j.so + env: + COMMAND: upload + KEY: build-test-jvm-packages build-test-jvm-packages-other-os: name: Build and test JVM packages (${{ matrix.os }}) @@ -239,3 +244,41 @@ jobs: COMMAND: download KEY: build-jvm-gpu - run: bash ops/pipeline/test-jvm-gpu.sh + + deploy-jvm-packages: + name: Deploy JVM packages to S3 (${{ matrix.variant }}) + needs: [build-jvm-gpu, build-test-jvm-packages, test-jvm-packages-gpu] + runs-on: + - runs-on=${{ github.run_id }} + - runner=linux-amd64-cpu + strategy: + fail-fast: false + matrix: + include: + - variant: cpu + container_id: xgb-ci.jvm + artifact_from: build-test-jvm-packages + - variant: gpu + container_id: xgb-ci.jvm_gpu_build + artifact_from: build-jvm-gpu + steps: + # Restart Docker daemon so that it recognizes the ephemeral disks + - run: sudo systemctl restart docker + - uses: actions/checkout@v4.2.2 + with: + submodules: "true" + - name: Fetch container from cache + run: bash ops/docker_build.sh + env: + CONTAINER_ID: ${{ matrix.container_id }} + - name: Unstash files + run: | + bash ops/stash_artifacts.sh lib/libxgboost4j.so + ls -lh lib/libxgboost4j.so + env: + COMMAND: download + KEY: ${{ matrix.artifact_from }} + - name: Deploy JVM packages to S3 + run: >- + bash ops/pipeline/deploy-jvm-packages.sh ${{ matrix.variant }} \ + ${{ matrix.container_id }} diff --git a/ops/pipeline/build-win64-gpu.ps1 b/ops/pipeline/build-win64-gpu.ps1 index cc5380a7c7c2..76cc955059b8 100644 --- a/ops/pipeline/build-win64-gpu.ps1 +++ b/ops/pipeline/build-win64-gpu.ps1 @@ -5,11 +5,11 @@ $ErrorActionPreference = "Stop" Write-Host "--- Build libxgboost on Windows with CUDA" nvcc --version -#if ( $is_release_branch -eq 0 ) { +if ( $is_release_branch -eq 0 ) { $arch_flag = "-DGPU_COMPUTE_VER=75" -#} else { -# $arch_flag = "" -#} +} else { + $arch_flag = "" +} # Work around https://github.com/NVIDIA/cccl/issues/1956 # TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ diff --git a/ops/pipeline/deploy-jvm-packages-impl.sh b/ops/pipeline/deploy-jvm-packages-impl.sh new file 
mode 100755 index 000000000000..36fd23a583d6 --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages-impl.sh @@ -0,0 +1,45 @@ +#!/bin/bash +## Deploy JVM packages to xgboost-maven-repo S3 bucket + +set -euox pipefail + +if [[ "$#" -lt 1 ]] +then + echo "Usage: $0 {cpu,gpu}" + exit 1 +fi + +variant="$1" + +maven_options="-DskipTests -Dmaven.test.skip=true -Dskip.native.build=true" +case "$variant" in + cpu) + # CPU variant + for scala_version in 2.12 2.13 + do + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + pushd jvm-packages + mvn --no-transfer-progress deploy -Pdefault,release-to-s3 ${maven_options} + mvn clean + mvn clean -Pdefault,release-to-s3 + popd + done + ;; + gpu) + # GPU variant + for scala_version in 2.12 2.13 + do + python ops/script/change_scala_version.py --scala-version ${scala_version} --purge-artifacts + pushd jvm-packages + mvn --no-transfer-progress install -Pgpu ${maven_options} + mvn --no-transfer-progress deploy -Pgpu,release-to-s3 -pl xgboost4j-spark-gpu ${maven_options} + mvn clean + mvn clean -Pgpu,release-to-s3 + popd + done + ;; + *) + echo "Unrecognized argument: $variant" + exit 2 + ;; +esac diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh new file mode 100755 index 000000000000..866b6dded393 --- /dev/null +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +set -euox pipefail + +source ops/pipeline/enforce-ci.sh + +if [[ "$#" -lt 2 ]] +then + echo "Usage: $0 {cpu,gpu} {container_id}" + exit 1 +fi + +variant="$1" +container_id="$2" + +# if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +# then + echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo" + python3 ops/docker_run.py --container-id "${container_id}" \ + -- ops/pipeline/deploy-jvm-packages-impl.sh "${variant}" +# fi
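
The deploy path added in this final patch ties together pieces introduced earlier in the series: ops/docker_build.sh builds or fetches the pinned CI container named by CONTAINER_ID, ops/docker/entrypoint.sh maps the host user into the container so generated files remain accessible on the host, and the thin deploy-jvm-packages.sh wrapper dispatches into deploy-jvm-packages-impl.sh through ops/docker_run.py. A minimal local sketch of that flow follows; it is illustrative only and assumes that xgb-ci.jvm is registered in ops/docker/ci_container.yml, that AWS credentials are exported in the environment, and that ops/pipeline/enforce-ci.sh (its guard logic is not shown in this series) permits a run outside CI, which it may not.

    # Sketch, not a documented entry point. The variant and container id
    # mirror the deploy-jvm-packages matrix in .github/workflows/jvm_tests.yml.
    export CONTAINER_ID=xgb-ci.jvm
    bash ops/docker_build.sh                          # materialize the CI container
    bash ops/pipeline/deploy-jvm-packages.sh cpu "${CONTAINER_ID}"

Since the wrapper sources enforce-ci.sh before doing anything else, it is presumably meant for CI runners; outside CI the guard would likely stop the script before the Maven deploy in deploy-jvm-packages-impl.sh is reached.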