Skip to content

Commit

Permalink
Merge branch 'main' into microphysics_graupel
Browse files Browse the repository at this point in the history
  • Loading branch information
OngChia committed Sep 9, 2024
2 parents 3f1c149 + c33682e commit d8b4e2f
Show file tree
Hide file tree
Showing 11 changed files with 456 additions and 446 deletions.
78 changes: 57 additions & 21 deletions ci/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,68 @@ include:

stages:
- baseimage
- build
- image
- test
- benchmark

.py310: &py310
variables:
PYVERSION_PREFIX: py310
PYVERSION: 3.10.9

# Base image build step with SHA256 checksum for caching
build_baseimage:
extends: .container-builder-cscs-zen2
.build_baseimage:
stage: baseimage
before_script:
- DOCKER_TAG=`sha256sum $DOCKERFILE | head -c 16`
- export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/base/icon4py:$DOCKER_TAG-$PYVERSION
# include the build arguments in the hash, since we use a parameterized Dockerfile
- DOCKER_TAG=`echo "$(cat $DOCKERFILE) $DOCKER_BUILD_ARGS" | sha256sum | head -c 16`
- export PERSIST_IMAGE_NAME=$CSCS_REGISTRY_PATH/public/$ARCH/base/icon4py:$DOCKER_TAG-$PYVERSION
- echo "BASE_IMAGE_${PYVERSION_PREFIX}=$PERSIST_IMAGE_NAME" >> build.env
artifacts:
reports:
dotenv: build.env
variables:
DOCKERFILE: ci/docker/base.Dockerfile
# change to 'always' if you want to rebuild, even if target tag exists already (if-not-exists is the default, i.e. we could also skip the variable)
CSCS_REBUILD_POLICY: if-not-exists
DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "CI_PROJECT_DIR=$CI_PROJECT_DIR"]'
<<: *py310
DOCKER_BUILD_ARGS: '["ARCH=$ARCH", "BASE_IMAGE=$BASE_IMAGE", "HPC_SDK_VERSION=$HPC_SDK_VERSION", "HPC_SDK_NAME=$HPC_SDK_NAME", "CUPY_PACKAGE=$CUPY_PACKAGE", "PYVERSION=$PYVERSION", "CI_PROJECT_DIR=$CI_PROJECT_DIR"]'
build_baseimage_x86_64:
extends: [.container-builder-cscs-zen2, .build_baseimage]
variables:
# x86_64 test target is Daint-gpu through Sarus:
# the base image does not need to provide the cuda runtime
BASE_IMAGE: "ubuntu:20.04"
HPC_SDK_VERSION: 22.11
HPC_SDK_NAME: "nvhpc_2022_2211_Linux_${ARCH}_cuda_11.8"
CUPY_PACKAGE: cupy-cuda11x
build_baseimage_aarch64:
extends: [.container-builder-cscs-gh200, .build_baseimage]
variables:
# aarch64 test target is Todi through Container Engine:
# the base image should provide the cuda runtime, therefore we use the cuda base image
BASE_IMAGE: "docker.io/nvidia/cuda:12.4.1-base-ubuntu20.04"
HPC_SDK_VERSION: 24.5
HPC_SDK_NAME: "nvhpc_2024_245_Linux_${ARCH}_cuda_12.4"
CUPY_PACKAGE: cupy-cuda12x
# TODO: re-enable CI job when Todi is back in operational state
when: manual

build_image:
stage: build
extends: .container-builder-cscs-zen2
needs: ["build_baseimage"]
.build_image:
stage: image
variables:
# Unique image name based on commit SHA
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
DOCKERFILE: ci/docker/checkout.Dockerfile
DOCKER_BUILD_ARGS: '["PYVERSION=$PYVERSION", "BASE_IMAGE=${BASE_IMAGE_${PYVERSION_PREFIX}}"]'
<<: *py310
build_image_x86_64:
extends: [.container-builder-cscs-zen2, .build_image]
needs: [build_baseimage_x86_64]
build_image_aarch64:
extends: [.container-builder-cscs-gh200, .build_image]
needs: [build_baseimage_aarch64]

.test_template:
extends: .container-runner-daint-gpu
needs: ["build_image"]
timeout: 8h
image: $CSCS_REGISTRY_PATH/public/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
image: $CSCS_REGISTRY_PATH/public/$ARCH/icon4py/icon4py-ci:$CI_COMMIT_SHA-$PYVERSION
before_script:
- python -c "import cupy"
- cd /icon4py
Expand All @@ -55,11 +76,26 @@ build_image:
CRAY_CUDA_MPS: 1
NUM_PROCESSES: auto
VIRTUALENV_SYSTEM_SITE_PACKAGES: 1
CSCS_NEEDED_DATA: icon4py
TEST_DATA_PATH: "/project/d121/icon4py/ci/testdata"
TEST_DATA_PATH: "/icon4py/ci/testdata"
ICON_GRID_LOC: "${TEST_DATA_PATH}/grids/mch_ch_r04b09_dsl"
PY2F_GPU_TESTS: 1
HPC_SDK_PATH: "/opt/nvidia/hpc_sdk/Linux_x86_64/22.11"
CUDACXX: "${HPC_SDK_PATH}/compilers/bin/nvcc"
NVFORTRAN_COMPILER: "${HPC_SDK_PATH}/compilers/bin/nvfortran"
<<: *py310
.test_template_x86_64:
extends: [.container-runner-daint-gpu-f7t, .test_template]
needs: [build_image_x86_64]
variables:
CSCS_ADDITIONAL_MOUNTS: '["/project/d121/icon4py/ci/testdata:$TEST_DATA_PATH"]'
HPC_SDK_PATH: "/opt/nvidia/hpc_sdk/Linux_${ARCH}/22.11"
.test_template_aarch64:
extends: [.container-runner-todi-gh200, .test_template]
needs: [build_image_aarch64]
variables:
CSCS_ADDITIONAL_MOUNTS: '["/store/migration/project/d121/icon4py/ci/testdata:$TEST_DATA_PATH"]'
HPC_SDK_PATH: "/opt/nvidia/hpc_sdk/Linux_${ARCH}/24.5"
# The Grace-Hopper GPU architecture is not enabled by default in the CUDA build
CUDAARCHS: "90"
# Limit test parallelism to avoid "OSError: too many open files" in the gt4py build stage.
# Another problem, observed in test stage, is that gpu tests hang in combination with CUDA MPS,
# when high test parallelism is used.
NUM_PROCESSES: 16
7 changes: 5 additions & 2 deletions ci/benchmark.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
include:
- local: 'ci/base.yml'

benchmark_model_stencils:
extends: .test_template
.benchmark_model_stencils:
stage: benchmark
script:
# force execution of tests where validation is expected to fail, because the failure is caused by a wrong numpy reference
Expand All @@ -11,3 +10,7 @@ benchmark_model_stencils:
matrix:
- BACKEND: [gtfn_cpu, gtfn_gpu]
GRID: [icon_grid, icon_grid_global]
benchmark_model_stencils_x86_64:
extends: [.benchmark_model_stencils, .test_template_x86_64]
benchmark_model_stencils_aarch64:
extends: [.benchmark_model_stencils, .test_template_aarch64]
14 changes: 10 additions & 4 deletions ci/dace.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ include:
variables:
DACE_VERSION: "0.16.1"

test_model_stencils:
extends: .test_template
.test_model_stencils:
stage: test
script:
- pip install dace==$DACE_VERSION
Expand All @@ -14,9 +13,12 @@ test_model_stencils:
matrix:
- BACKEND: [dace_cpu, dace_gpu]
GRID: [simple_grid, icon_grid]
test_model_stencils_x86_64:
extends: [.test_model_stencils, .test_template_x86_64]
test_model_stencils_aarch64:
extends: [.test_model_stencils, .test_template_aarch64]

benchmark_model_stencils:
extends: .test_template
.benchmark_model_stencils:
stage: benchmark
script:
- pip install dace==$DACE_VERSION
Expand All @@ -26,3 +28,7 @@ benchmark_model_stencils:
matrix:
- BACKEND: [dace_cpu, dace_gpu]
GRID: [icon_grid, icon_grid_global]
benchmark_model_stencils_x86_64:
extends: [.benchmark_model_stencils, .test_template_x86_64]
benchmark_model_stencils_aarch64:
extends: [.benchmark_model_stencils, .test_template_aarch64]
21 changes: 15 additions & 6 deletions ci/default.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
include:
- local: 'ci/base.yml'

test_model_stencils:
extends: .test_template
.test_model_stencils:
stage: test
script:
- tox -r -e run_stencil_tests -c model/ -- --backend=$BACKEND --grid=$GRID --verbose
Expand All @@ -15,19 +14,29 @@ test_model_stencils:
- if: $BACKEND == "roundtrip" && $GRID == "icon_grid"
when: never
- when: on_success
test_model_stencils_x86_64:
extends: [.test_model_stencils, .test_template_x86_64]
test_model_stencils_aarch64:
extends: [.test_model_stencils, .test_template_aarch64]

test_tools:
extends: .test_template
.test_tools:
stage: test
script:
- tox -r -c tools/ --verbose
test_tools_x86_64:
extends: [.test_tools, .test_template_x86_64]
test_tools_aarch64:
extends: [.test_tools, .test_template_aarch64]

test_model_datatests:
extends: .test_template
.test_model_datatests:
stage: test
script:
- tox -r -e run_model_tests -c model/ --verbose -- --backend=$BACKEND $COMPONENT
parallel:
matrix:
- COMPONENT: [atmosphere/diffusion/tests/diffusion_tests, atmosphere/dycore/tests/dycore_tests, atmosphere/subgrid_scale_physics/microphysics/tests, common/tests, driver/tests]
BACKEND: [gtfn_cpu]
test_model_datatests_x86_64:
extends: [.test_model_datatests, .test_template_x86_64]
test_model_datatests_aarch64:
extends: [.test_model_datatests, .test_template_aarch64]
17 changes: 11 additions & 6 deletions ci/docker/base.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
FROM ubuntu:20.04
ARG BASE_IMAGE=ubuntu:20.04
FROM ${BASE_IMAGE}

ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
Expand All @@ -22,6 +23,7 @@ RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
xz-utils \
tk-dev \
libffi-dev \
libhdf5-dev \
liblzma-dev \
python-openssl \
libreadline-dev \
Expand All @@ -33,23 +35,25 @@ RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
# Install NVIDIA HPC SDK for nvfortran
ARG HPC_SDK_VERSION=22.11
ARG HPC_SDK_NAME=nvhpc_2022_2211_Linux_x86_64_cuda_11.8
ARG HPC_SDK_URL=https://developer.download.nvidia.com/hpc-sdk/22.11/${HPC_SDK_NAME}.tar.gz
ENV HPC_SDK_URL=https://developer.download.nvidia.com/hpc-sdk/${HPC_SDK_VERSION}/${HPC_SDK_NAME}.tar.gz

RUN wget -q ${HPC_SDK_URL} -O /tmp/nvhpc.tar.gz && \
mkdir -p /opt/nvidia && \
tar -xzf /tmp/nvhpc.tar.gz -C /opt/nvidia && \
rm /tmp/nvhpc.tar.gz

ENV NVHPC_DEFAULT_CUDA=11.8
ENV NVHPC_SILENT=1
RUN cd /opt/nvidia/${HPC_SDK_NAME} && ./install

# Set environment variables
ENV HPC_SDK_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/${HPC_SDK_VERSION}
ARG ARCH=x86_64
ENV HPC_SDK_PATH=/opt/nvidia/hpc_sdk/Linux_${ARCH}/${HPC_SDK_VERSION}
# The variable CUDA_PATH is used by cupy to find the cuda toolchain
ENV CUDA_PATH=${HPC_SDK_PATH}/cuda

ENV PATH=${HPC_SDK_PATH}/compilers/bin:${HPC_SDK_PATH}/comm_libs/mpi/bin:${PATH} \
MANPATH=${HPC_SDK_PATH}/compilers/man:${MANPATH} \
LD_LIBRARY_PATH=${HPC_SDK_PATH}/cuda/lib64:${HPC_SDK_PATH}/math_libs/lib64:${LD_LIBRARY_PATH}
LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${HPC_SDK_PATH}/math_libs/lib64:${LD_LIBRARY_PATH}

# Install Boost
RUN wget --quiet https://archives.boost.io/release/1.85.0/source/boost_1_85_0.tar.gz && \
Expand All @@ -76,4 +80,5 @@ RUN pyenv update && \

ENV PATH="/root/.pyenv/shims:${PATH}"

RUN pip install --upgrade pip setuptools wheel tox clang-format cupy-cuda11x
ARG CUPY_PACKAGE=cupy-cuda11x
RUN pip install --upgrade pip setuptools wheel tox clang-format ${CUPY_PACKAGE}
Loading

0 comments on commit d8b4e2f

Please sign in to comment.