From a6992cdd61556f62032130a2891575ef1fa56d8a Mon Sep 17 00:00:00 2001 From: eap Date: Wed, 24 Jul 2024 17:42:22 -0600 Subject: [PATCH 1/5] Pcluster update --- configs/sites/tier1/aws-pcluster/README.md | 74 ++++++++++++------- .../sites/tier1/aws-pcluster/packages.yaml | 10 +-- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/configs/sites/tier1/aws-pcluster/README.md b/configs/sites/tier1/aws-pcluster/README.md index e5ff59c2f..d97a73c05 100644 --- a/configs/sites/tier1/aws-pcluster/README.md +++ b/configs/sites/tier1/aws-pcluster/README.md @@ -9,8 +9,8 @@ from the Community AMIs tab that matches your desired OS and PCluster version. Select an instance type of the same family that you are planning to use for the head and the compute nodes, and specify enough enough storage for a swap file and a spack-stack installation. -- AMI Name: aws-parallelcluster-3.7.1-ubuntu-2204-lts-hvm-x86_64 -- AMI ID: ami-0906e8b928cde5ccc +- AMI Name: aws-parallelcluster-3.9.3-ubuntu-2204-lts-hvm-x86_64 +- AMI ID: ami-08e88679103d6ab5f - Instance r7a.4xlarge (uses same processor architecture as hpc7a instances) - Use 300GB of gp3 storage as / - Attach security groups allowing ssh inbound traffic and NFS outbound traffic @@ -21,7 +21,7 @@ and NFS access. ``` aws ec2 run-instances \ - --image-id ami-0906e8b928cde5ccc \ + --image-id ami-08e88679103d6ab5f \ --count 1 \ --instance-type r7a.4xlarge \ --key-name YOUR-KEY-NAME \ @@ -84,6 +84,7 @@ apt install -y \ tcl-dev \ nano sudo -u ubuntu git lfs install +sudo -u ubuntu git config --global credential.helper 'cache --timeout=3600' # Install compilers and various dev dependencies. 
@@ -209,22 +210,20 @@ apt install -y docker-ce \ docker-buildx-plugin \ docker-compose-plugin docker run hello-world +sudo usermod -aG docker $USER # DH* TODO 2023/02/21: Add users to group docker so that non-root users can run it # See https://docs.docker.com/engine/install/linux-postinstall/ # Exit root session exit - -# Basic git config -git config --global credential.helper cache ``` 4. Build ecflow outside of spack to be able to link against OS boost ``` mkdir -p /home/ubuntu/jedi/ecflow-5.8.4/src cd /home/ubuntu/jedi/ecflow-5.8.4/src -wget https://confluence.ecmwf.int/download/attachments/8650755/ecFlow-5.8.4-Source.tar.gz?api=v2 -mv ecFlow-5.8.4-Source.tar.gz\?api\=v2 ecFlow-5.8.4-Source.tar.gz +wget -O ecFlow-5.8.4-Source.tar.gz \ + https://confluence.ecmwf.int/download/attachments/8650755/ecFlow-5.8.4-Source.tar.gz?api=v2 tar -xvzf ecFlow-5.8.4-Source.tar.gz export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecFlow-5.8.4-Source export BOOST_ROOT=/usr @@ -308,21 +307,22 @@ ln -sf /opt/lmod/lmod/init/profile.fish /etc/profile.d/z00_lmod.fish # Update the Intel MPI module to load AWS libfabric. cat << 'EOF' >> /opt/intel/oneapi/mpi/2021.10.0/modulefiles/mpi conflict openmpi -if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.18.2amzn1.0 ] } { - module load libfabric-aws/1.18.2amzn1.0 +if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } { + module load libfabric-aws/1.19.0amzn4.0 } EOF # Update the AWS OpenMPI module to load AWS libfabric. -cat << 'EOF' >> /usr/share/modules/modulefiles/openmpi/4.1.5 +cat << 'EOF' >> /opt/amazon/modules/modulefiles/openmpi/4.1.6 conflict mpi -if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.18.2amzn1.0 ] } { - module load libfabric-aws/1.18.2amzn1.0 +if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } { + module load libfabric-aws/1.19.0amzn4.0 } EOF # Add a number of default module locations to the lmod startup script. 
cat << 'EOF' >> /etc/profile.d/z01_lmod.sh +module use /opt/amazon/modules/modulefiles module use /usr/share/modules/modulefiles module use /opt/intel/oneapi/mpi/2021.10.0/modulefiles module use /home/ubuntu/jedi/modulefiles @@ -335,10 +335,10 @@ exit ssh ... # Now user ubuntu module av -module load libfabric-aws/1.18.2amzn1.0 -module load openmpi/4.1.5 +module load libfabric-aws/1.19.0amzn4.0 +module load openmpi/4.1.6 module list -module unload openmpi/4.1.5 +module unload openmpi/4.1.6 module load mpi module list module purge @@ -383,7 +383,8 @@ git clone --recurse-submodules -b release/1.7.0 https://github.com/JCSDA/spack-s cd spack-stack-1.7/ . setup.sh spack stack create env --site aws-pcluster --template=unified-dev --name=unified-intel -spack env activate -p envs/unified-env +cd unified-intel +spack env activate -p . # Edit envs/unified-intel/spack.yaml. # 1) Find this line: @@ -392,9 +393,9 @@ spack env activate -p envs/unified-env # the line should look like this: # compilers: [%intel'] -spack concretize 2>&1 | tee log.concretize.unified-env.001 -./util/show_duplicate_packages.py -d log.concretize.unified-env.001 -spack install --verbose 2>&1 | tee log.install.unified-env.001 +spack concretize 2>&1 | tee log.concretize.001 +${SPACK_STACK_DIR}/util/show_duplicate_packages.py -d log.concretize.001 +spack install -j 12 --verbose 2>&1 | tee log.install.001 spack module lmod refresh spack stack setup-meta-modules ``` @@ -425,7 +426,7 @@ cat << 'EOF' >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml - spec: intel-oneapi-mpi@2021.10.0%intel@2022.1.0 prefix: /opt/intel modules: - - libfabric-aws/1.18.2amzn1.0 + - libfabric-aws/1.19.0amzn4.0 - intelmpi EOF @@ -433,12 +434,12 @@ EOF cat << 'EOF' >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml openmpi: externals: - - spec: openmpi@4.1.5%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath + - spec: openmpi@4.1.6%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath fabrics=ofi 
schedulers=slurm prefix: /opt/amazon/openmpi modules: - - libfabric-aws/1.18.2amzn1.0 - - openmpi/4.1.5 + - libfabric-aws/1.19.0amzn4.0 + - openmpi/4.1.6 EOF # Can't find qt5 because qtpluginfo is broken, @@ -467,7 +468,7 @@ export -n SPACK_SYSTEM_CONFIG_PATH spack config add "packages:mpi:buildable:False" spack config add "packages:python:buildable:False" spack config add "packages:openssl:buildable:False" -spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.10.0, openmpi@4.1.5]" +spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.10.0, openmpi@4.1.6]" spack config add "packages:all:compiler:[intel@2021.10.0, gcc@11.4.0]" # edit envs/unified-env/site/compilers.yaml and replace the following line in the **Intel** compiler section: @@ -500,7 +501,7 @@ spack stack setup-meta-modules # Example given for building jedi-bundle module use /mnt/experiments-efs/spack-stack-1.7/envs/unified-intel/install/modulefiles/Core module load stack-gcc/11.4.0 -module load stack-openmpi/4.1.5 +module load stack-openmpi/4.1.6 module load base-env module load jedi-mpas-env module load jedi-fv3-env @@ -530,6 +531,7 @@ source /opt/intel/oneapi/mpi/2021.10.0/env/vars.sh # Activate spack stack modules. module use /mnt/experiments-efs/spack-stack-1.7/envs/unified-intel/install/modulefiles/Core +module use $HOME/spack-stack-1.7/envs/unified-intel/install/modulefiles/Core module load stack-intel/2021.10.0 module load stack-intel-oneapi-mpi/2021.10.0 module load base-env @@ -550,3 +552,23 @@ make update make -j10 ctest ``` + +15. Once the parallel cluster head node image is fully configured and tested you +can create an AMI snapshot based on the configured instance using the +[instructions](https://docs.aws.amazon.com/parallelcluster/latest/ug/building-custom-ami-v3.html) +published by AWS. Included here is a short summary of those instructions. 
+ +``` +# 1) In an SSH session on the instance delete any unwanted files and directories +# in your home directory and in the /root directory. + +# 2) Use the AMI cleanup script to prepare the instance for imaging. +sudo /usr/local/sbin/ami_cleanup.sh +sudo apt clean + +# 3) From the AWS console navigate to the build instance and choose +# "Instance state" and select "Stop instance" + +# 4) Create a new AMI from the instance. From the instance view in the prior +# step select "Actions" and choose "Image" and then "Create image". +``` diff --git a/configs/sites/tier1/aws-pcluster/packages.yaml b/configs/sites/tier1/aws-pcluster/packages.yaml index ffcf5cebd..6b7387393 100644 --- a/configs/sites/tier1/aws-pcluster/packages.yaml +++ b/configs/sites/tier1/aws-pcluster/packages.yaml @@ -11,7 +11,7 @@ packages: compiler: [intel@2021.10.0, gcc@11.4.0] #compiler: [oneapi@2024.0.2] providers: - mpi: [intel-oneapi-mpi@2021.10.0, openmpi@4.1.5] + mpi: [intel-oneapi-mpi@2021.10.0, openmpi@4.1.6] #mpi: [intel-oneapi-mpi@2021.11] intel-oneapi-mpi: @@ -19,7 +19,7 @@ packages: - spec: intel-oneapi-mpi@2021.10.0%intel@2021.10.0 prefix: /opt/intel/oneapi modules: - - libfabric-aws/1.18.2amzn1.0 + - libfabric-aws/1.19.0amzn4.0 - mpi #externals: #- spec: intel-oneapi-mpi@2021.11%oneapi@2024.0.2 @@ -31,12 +31,12 @@ packages: buildable: false openmpi: externals: - - spec: openmpi@4.1.5%gcc@11.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath + - spec: openmpi@4.1.6%gcc@11.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath fabrics=ofi schedulers=slurm prefix: /opt/amazon/openmpi modules: - - libfabric-aws/1.18.2amzn1.0 - - openmpi/4.1.5 + - libfabric-aws/1.19.0amzn4.0 + - openmpi/4.1.6 binutils: From d8c7b2e6ade717ae4e1354ec8865a8580926101f Mon Sep 17 00:00:00 2001 From: eap Date: Mon, 29 Jul 2024 17:02:06 -0600 Subject: [PATCH 2/5] review --- configs/sites/tier1/aws-pcluster/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff
--git a/configs/sites/tier1/aws-pcluster/README.md b/configs/sites/tier1/aws-pcluster/README.md index d97a73c05..0d25cb67e 100644 --- a/configs/sites/tier1/aws-pcluster/README.md +++ b/configs/sites/tier1/aws-pcluster/README.md @@ -124,7 +124,6 @@ apt install -y \ libqt5svg5-dev \ qt5dxcb-plugin - # This is because boost doesn't work with the Intel compiler apt install -y \ libboost1.74-dev \ @@ -223,7 +222,7 @@ exit mkdir -p /home/ubuntu/jedi/ecflow-5.8.4/src cd /home/ubuntu/jedi/ecflow-5.8.4/src wget -O ecFlow-5.8.4-Source.tar.gz \ - https://confluence.ecmwf.int/download/attachments/8650755/ecFlow-5.8.4-Source.tar.gz?api=v2 + https://github.com/ecmwf/ecflow/releases/download/-5.8.4/ecFlow-5.8.4-Source.tar.gz tar -xvzf ecFlow-5.8.4-Source.tar.gz export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecFlow-5.8.4-Source export BOOST_ROOT=/usr @@ -383,7 +382,7 @@ git clone --recurse-submodules -b release/1.7.0 https://github.com/JCSDA/spack-s cd spack-stack-1.7/ . setup.sh spack stack create env --site aws-pcluster --template=unified-dev --name=unified-intel -cd unified-intel +cd envs/unified-intel spack env activate -p . # Edit envs/unified-intel/spack.yaml. 
From 5769cae1c87653ea1d0f64ecbeeae3e1fbc452b9 Mon Sep 17 00:00:00 2001 From: eap Date: Mon, 29 Jul 2024 17:03:49 -0600 Subject: [PATCH 3/5] remove paths_ignore directive --- .github/workflows/macos-ci-aarch64.yaml | 7 ------- .github/workflows/ubuntu-ci-x86_64-gnu.yaml | 7 ------- .github/workflows/ubuntu-ci-x86_64-intel.yaml | 7 ------- 3 files changed, 21 deletions(-) diff --git a/.github/workflows/macos-ci-aarch64.yaml b/.github/workflows/macos-ci-aarch64.yaml index 49a4db694..e1045c9b2 100644 --- a/.github/workflows/macos-ci-aarch64.yaml +++ b/.github/workflows/macos-ci-aarch64.yaml @@ -1,13 +1,6 @@ name: macos-ci-aarch64-build on: pull_request: - paths-ignore: - - 'configs/sites/**' - - 'doc/**' - - '**.md' - - '.github/ISSUE_TEMPLATE/*' - - '.github/workflows/macos-ci-aarch64-notest.yaml' - - '.gitignore' workflow_dispatch: concurrency: diff --git a/.github/workflows/ubuntu-ci-x86_64-gnu.yaml b/.github/workflows/ubuntu-ci-x86_64-gnu.yaml index 02e6f0a03..c24fc1d82 100644 --- a/.github/workflows/ubuntu-ci-x86_64-gnu.yaml +++ b/.github/workflows/ubuntu-ci-x86_64-gnu.yaml @@ -1,13 +1,6 @@ name: ubuntu-ci-c6a-x86_64-gnu-build on: pull_request: - paths-ignore: - - 'configs/sites/**' - - 'doc/**' - - '**.md' - - '.github/ISSUE_TEMPLATE/*' - - '.github/workflows/ubuntu-ci-x86_64-gnu-notest.yaml' - - '.gitignore' workflow_dispatch: concurrency: diff --git a/.github/workflows/ubuntu-ci-x86_64-intel.yaml b/.github/workflows/ubuntu-ci-x86_64-intel.yaml index 9808bef0d..8387b5403 100644 --- a/.github/workflows/ubuntu-ci-x86_64-intel.yaml +++ b/.github/workflows/ubuntu-ci-x86_64-intel.yaml @@ -1,13 +1,6 @@ name: ubuntu-ci-c6a-x86_64-intel-build on: pull_request: - paths-ignore: - - 'configs/sites/**' - - 'doc/**' - - '**.md' - - '.github/ISSUE_TEMPLATE/*' - - '.github/workflows/ubuntu-ci-x86_64-intel-notest.yaml' - - '.gitignore' workflow_dispatch: concurrency: From 8ce259a921cc8b2a2eeef2599a2542a87da98b31 Mon Sep 17 00:00:00 2001 From: eap Date: Tue, 30 Jul 2024 10:10:44 
-0600 Subject: [PATCH 4/5] update ecflow download location --- configs/sites/tier1/aws-pcluster/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/configs/sites/tier1/aws-pcluster/README.md b/configs/sites/tier1/aws-pcluster/README.md index 0d25cb67e..5c3c0d86e 100644 --- a/configs/sites/tier1/aws-pcluster/README.md +++ b/configs/sites/tier1/aws-pcluster/README.md @@ -221,10 +221,9 @@ exit ``` mkdir -p /home/ubuntu/jedi/ecflow-5.8.4/src cd /home/ubuntu/jedi/ecflow-5.8.4/src -wget -O ecFlow-5.8.4-Source.tar.gz \ - https://github.com/ecmwf/ecflow/releases/download/-5.8.4/ecFlow-5.8.4-Source.tar.gz -tar -xvzf ecFlow-5.8.4-Source.tar.gz -export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecFlow-5.8.4-Source +wget https://github.com/ecmwf/ecflow/archive/refs/tags/5.8.4.tar.gz +tar -xvzf 5.8.4.tar.gz +export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecflow-5.8.4 export BOOST_ROOT=/usr # Build ecFlow From 4805e7a18fe0a0246c6a099c6caf89e6acf826e2 Mon Sep 17 00:00:00 2001 From: eap Date: Tue, 30 Jul 2024 14:45:59 -0600 Subject: [PATCH 5/5] update +intersphinx_mapping --- doc/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 2462bbb56..82920fd02 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -200,7 +200,7 @@ # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} # -- Options for todo extension ----------------------------------------------