Merge branch 'main' into microphysics_graupel

C2SM · Sep 9, 2024 · d8b4e2f · d8b4e2f
2 parents 3f1c149 + c33682e
commit d8b4e2f
Show file tree

Hide file tree

Showing 11 changed files with 456 additions and 446 deletions.
diff --git a/ci/base.yml b/ci/base.yml
@@ -3,47 +3,68 @@ include:
 
 stages:
   - baseimage
-  - build
+  - image
   - test
   - benchmark
 
-.py310: &py310
+variables:
   PYVERSION_PREFIX: py310
   PYVERSION: 3.10.9
 
 # Base image build step with SHA256 checksum for caching
-build_baseimage:
-  extends: .container-builder-cscs-zen2
+.build_baseimage:
   stage: baseimage
   before_script:
-    - DOCKER_TAG=`sha256sum $DOCKERFILE | head -c 16`
-    - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/base/icon4py:$DOCKER_TAG-$PYVERSION
+    # Include the build arguments in the hash, since the Dockerfile is parameterized by them
+    - DOCKER_TAG=`echo "$(cat $DOCKERFILE) $DOCKER_BUILD_ARGS" | sha256sum | head -c 16`
+    - export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/$ARCH/base/icon4py:$DOCKER_TAG-$PYVERSION
     - echo "BASE_IMAGE_${PYVERSION_PREFIX}=$PERSIST_IMAGE_NAME" >> build.env
   artifacts:
     reports:
       dotenv: build.env
   variables:
     DOCKERFILE: ci/docker/base.Dockerfile
+    # Set to 'always' to force a rebuild even if the target tag already exists ('if-not-exists' is the default, so this variable could also be omitted)
     CSCS_REBUILD_POLICY: if-not-exists
-    DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "CI_PROJECT_DIR=$CI_PROJECT_DIR"]'
-    <<: *py310
+    DOCKER_BUILD_ARGS: '["ARCH=$ARCH", "BASE_IMAGE=$BASE_IMAGE", "HPC_SDK_VERSION=$HPC_SDK_VERSION", "HPC_SDK_NAME=$HPC_SDK_NAME", "CUPY_PACKAGE=$CUPY_PACKAGE", "PYVERSION=$PYVERSION", "CI_PROJECT_DIR=$CI_PROJECT_DIR"]'
+build_baseimage_x86_64:
+  extends: [.container-builder-cscs-zen2, .build_baseimage]
+  variables:
+    # x86_64 test target is Daint-gpu through Sarus:
+    # the base image does not need to provide the cuda runtime
+    BASE_IMAGE: "ubuntu:20.04"
+    HPC_SDK_VERSION: 22.11
+    HPC_SDK_NAME: "nvhpc_2022_2211_Linux_${ARCH}_cuda_11.8"
+    CUPY_PACKAGE: cupy-cuda11x
+build_baseimage_aarch64:
+  extends: [.container-builder-cscs-gh200, .build_baseimage]
+  variables:
+    # aarch64 test target is Todi through Container Engine:
+    # the base image should provide the cuda runtime, therefore we use the cuda base image
+    BASE_IMAGE: "docker.io/nvidia/cuda:12.4.1-base-ubuntu20.04"
+    HPC_SDK_VERSION: 24.5
+    HPC_SDK_NAME: "nvhpc_2024_245_Linux_${ARCH}_cuda_12.4"
+    CUPY_PACKAGE: cupy-cuda12x
+  # TODO: re-enable CI job when Todi is back in operational state
+  when: manual
 
-build_image:
-  stage: build
-  extends: .container-builder-cscs-zen2
-  needs: ["build_baseimage"]
+.build_image:
+  stage: image
   variables:
       # Unique image name based on commit SHA
-      PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
+      PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
       DOCKERFILE: ci/docker/checkout.Dockerfile
       DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}"]'
-      <<: *py310
+build_image_x86_64:
+  extends: [.container-builder-cscs-zen2, .build_image]
+  needs: [build_baseimage_x86_64]
+build_image_aarch64:
+  extends: [.container-builder-cscs-gh200, .build_image]
+  needs: [build_baseimage_aarch64]
 
 .test_template:
-  extends: .container-runner-daint-gpu
-  needs: ["build_image"]
   timeout: 8h
-  image: $CSCS_REGISTRY_PATH/public/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
+  image: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
   before_script:
     - python -c "import cupy"
     - cd /icon4py
@@ -55,11 +76,26 @@ build_image:
     CRAY_CUDA_MPS: 1
     NUM_PROCESSES: auto
     VIRTUALENV_SYSTEM_SITE_PACKAGES: 1
-    CSCS_NEEDED_DATA: icon4py
-    TEST_DATA_PATH: "/project/d121/icon4py/ci/testdata"
+    TEST_DATA_PATH: "/icon4py/ci/testdata"
     ICON_GRID_LOC: "${TEST_DATA_PATH}/grids/mch_ch_r04b09_dsl"
     PY2F_GPU_TESTS: 1
-    HPC_SDK_PATH: "/opt/nvidia/hpc_sdk/Linux_x86_64/22.11"
     CUDACXX: "${HPC_SDK_PATH}/compilers/bin/nvcc"
     NVFORTRAN_COMPILER: "${HPC_SDK_PATH}/compilers/bin/nvfortran"
-    <<: *py310
+.test_template_x86_64:
+  extends: [.container-runner-daint-gpu-f7t, .test_template]
+  needs: [build_image_x86_64]
+  variables:
+    CSCS_ADDITIONAL_MOUNTS: '["/project/d121/icon4py/ci/testdata:$TEST_DATA_PATH"]'
+    HPC_SDK_PATH: "/opt/nvidia/hpc_sdk/Linux_${ARCH}/22.11"
+.test_template_aarch64:
+  extends: [.container-runner-todi-gh200, .test_template]
+  needs: [build_image_aarch64]
+  variables:
+    CSCS_ADDITIONAL_MOUNTS: '["/store/migration/project/d121/icon4py/ci/testdata:$TEST_DATA_PATH"]'
+    HPC_SDK_PATH: "/opt/nvidia/hpc_sdk/Linux_${ARCH}/24.5"
+    # The Grace-Hopper GPU architecture is not enabled by default in the CUDA build
+    CUDAARCHS: "90"
+    # Limit test parallelism to avoid "OSError: too many open files" in the gt4py build stage.
+    # A second problem, observed in the test stage, is that GPU tests hang when high test
+    # parallelism is combined with CUDA MPS.
+    NUM_PROCESSES: 16
diff --git a/ci/benchmark.yml b/ci/benchmark.yml
@@ -1,8 +1,7 @@
 include:
   - local: 'ci/base.yml'
 
-benchmark_model_stencils:
-  extends: .test_template
+.benchmark_model_stencils:
   stage: benchmark
   script:
     # force execution of tests where validation is expected to fail, because the reason for failure is wrong numpy reference
@@ -11,3 +10,7 @@ benchmark_model_stencils:
     matrix:
       - BACKEND: [gtfn_cpu, gtfn_gpu]
         GRID: [icon_grid, icon_grid_global]
+benchmark_model_stencils_x86_64:
+  extends: [.benchmark_model_stencils, .test_template_x86_64]
+benchmark_model_stencils_aarch64:
+  extends: [.benchmark_model_stencils, .test_template_aarch64]
diff --git a/ci/dace.yml b/ci/dace.yml
@@ -4,8 +4,7 @@ include:
 variables:
   DACE_VERSION: "0.16.1"
 
-test_model_stencils:
-  extends: .test_template
+.test_model_stencils:
   stage: test
   script:
     - pip install dace==$DACE_VERSION
@@ -14,9 +13,12 @@ test_model_stencils:
     matrix:
     - BACKEND: [dace_cpu, dace_gpu]
       GRID: [simple_grid, icon_grid]
+test_model_stencils_x86_64:
+  extends: [.test_model_stencils, .test_template_x86_64]
+test_model_stencils_aarch64:
+  extends: [.test_model_stencils, .test_template_aarch64]
 
-benchmark_model_stencils:
-  extends: .test_template
+.benchmark_model_stencils:
   stage: benchmark
   script:
     - pip install dace==$DACE_VERSION
@@ -26,3 +28,7 @@ benchmark_model_stencils:
     matrix:
     - BACKEND: [dace_cpu, dace_gpu]
       GRID: [icon_grid, icon_grid_global]
+benchmark_model_stencils_x86_64:
+  extends: [.benchmark_model_stencils, .test_template_x86_64]
+benchmark_model_stencils_aarch64:
+  extends: [.benchmark_model_stencils, .test_template_aarch64]
diff --git a/ci/default.yml b/ci/default.yml
@@ -1,8 +1,7 @@
 include:
   - local: 'ci/base.yml'
 
-test_model_stencils:
-  extends: .test_template
+.test_model_stencils:
   stage: test
   script:
     - tox -r -e run_stencil_tests -c model/ -- --backend=$BACKEND --grid=$GRID --verbose
@@ -15,19 +14,29 @@ test_model_stencils:
     - if: $BACKEND == "roundtrip" && $GRID == "icon_grid"
       when: never
     - when: on_success
+test_model_stencils_x86_64:
+  extends: [.test_model_stencils, .test_template_x86_64]
+test_model_stencils_aarch64:
+  extends: [.test_model_stencils, .test_template_aarch64]
 
-test_tools:
-  extends: .test_template
+.test_tools:
   stage: test
   script:
     - tox -r -c tools/ --verbose
+test_tools_x86_64:
+  extends: [.test_tools, .test_template_x86_64]
+test_tools_aarch64:
+  extends: [.test_tools, .test_template_aarch64]
 
-test_model_datatests:
-  extends: .test_template
+.test_model_datatests:
   stage: test
   script:
     - tox -r -e run_model_tests -c model/ --verbose -- --backend=$BACKEND $COMPONENT
   parallel:
     matrix:
     - COMPONENT: [atmosphere/diffusion/tests/diffusion_tests, atmosphere/dycore/tests/dycore_tests, atmosphere/subgrid_scale_physics/microphysics/tests, common/tests, driver/tests]
       BACKEND: [gtfn_cpu]
+test_model_datatests_x86_64:
+  extends: [.test_model_datatests, .test_template_x86_64]
+test_model_datatests_aarch64:
+  extends: [.test_model_datatests, .test_template_aarch64]
diff --git a/ci/docker/base.Dockerfile b/ci/docker/base.Dockerfile
@@ -1,4 +1,5 @@
-FROM ubuntu:20.04
+ARG BASE_IMAGE=ubuntu:20.04
+FROM ${BASE_IMAGE}
 
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8
@@ -22,6 +23,7 @@ RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
     xz-utils \
     tk-dev \
     libffi-dev \
+    libhdf5-dev \
     liblzma-dev \
     python-openssl \
     libreadline-dev \
@@ -33,23 +35,25 @@ RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
 # Install NVIDIA HPC SDK for nvfortran
 ARG HPC_SDK_VERSION=22.11
 ARG HPC_SDK_NAME=nvhpc_2022_2211_Linux_x86_64_cuda_11.8
-ARG HPC_SDK_URL=https://developer.download.nvidia.com/hpc-sdk/22.11/${HPC_SDK_NAME}.tar.gz
+ENV HPC_SDK_URL=https://developer.download.nvidia.com/hpc-sdk/${HPC_SDK_VERSION}/${HPC_SDK_NAME}.tar.gz
 
 RUN wget -q ${HPC_SDK_URL} -O /tmp/nvhpc.tar.gz && \
     mkdir -p /opt/nvidia && \
     tar -xzf /tmp/nvhpc.tar.gz -C /opt/nvidia && \
     rm /tmp/nvhpc.tar.gz
 
-ENV NVHPC_DEFAULT_CUDA=11.8
 ENV NVHPC_SILENT=1
 RUN cd /opt/nvidia/${HPC_SDK_NAME} && ./install
 
 # Set environment variables
-ENV HPC_SDK_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/${HPC_SDK_VERSION}
+ARG ARCH=x86_64
+ENV HPC_SDK_PATH=/opt/nvidia/hpc_sdk/Linux_${ARCH}/${HPC_SDK_VERSION}
+# The variable CUDA_PATH is used by cupy to find the cuda toolchain
+ENV CUDA_PATH=${HPC_SDK_PATH}/cuda
 
 ENV PATH=${HPC_SDK_PATH}/compilers/bin:${HPC_SDK_PATH}/comm_libs/mpi/bin:${PATH} \
     MANPATH=${HPC_SDK_PATH}/compilers/man:${MANPATH} \
-    LD_LIBRARY_PATH=${HPC_SDK_PATH}/cuda/lib64:${HPC_SDK_PATH}/math_libs/lib64:${LD_LIBRARY_PATH}
+    LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${HPC_SDK_PATH}/math_libs/lib64:${LD_LIBRARY_PATH}
 
 # Install Boost
 RUN wget --quiet https://archives.boost.io/release/1.85.0/source/boost_1_85_0.tar.gz && \
@@ -76,4 +80,5 @@ RUN pyenv update && \
 
 ENV PATH="/root/.pyenv/shims:${PATH}"
 
-RUN pip install --upgrade pip setuptools wheel tox clang-format cupy-cuda11x
+ARG CUPY_PACKAGE=cupy-cuda11x
+RUN pip install --upgrade pip setuptools wheel tox clang-format ${CUPY_PACKAGE}