Skip to content

Commit

Permalink
Merge pull request #1210 from eap/feature/aws-pcluster-3.9.3
Browse files Browse the repository at this point in the history
Update the parallel cluster site config from 3.7.1 to 3.9.3. This change also updates the github test actions as detailed in #1208 ensuring that required tests run for all pull requests without consideration of file exclusion filters.
  • Loading branch information
eap authored Jul 30, 2024
2 parents 2c9cb91 + 4805e7a commit 289721c
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 59 deletions.
6 changes: 0 additions & 6 deletions .github/workflows/macos-ci-aarch64.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
name: macos-ci-aarch64-build
on:
pull_request:
paths-ignore:
- 'configs/sites/**'
- 'doc/**'
- '**.md'
- '.github/ISSUE_TEMPLATE/*'
- '.gitignore'
workflow_dispatch:

concurrency:
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/ubuntu-ci-x86_64-gnu.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
name: ubuntu-ci-c6a-x86_64-gnu-build
on:
pull_request:
paths-ignore:
- 'configs/sites/**'
- 'doc/**'
- '**.md'
- '.github/ISSUE_TEMPLATE/*'
- '.gitignore'
workflow_dispatch:

concurrency:
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/ubuntu-ci-x86_64-intel.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
name: ubuntu-ci-c6a-x86_64-intel-build
on:
pull_request:
paths-ignore:
- 'configs/sites/**'
- 'doc/**'
- '**.md'
- '.github/ISSUE_TEMPLATE/*'
- '.gitignore'
workflow_dispatch:

concurrency:
Expand Down
6 changes: 0 additions & 6 deletions .github/workflows/ubuntu-ci-x86_64-oneapi.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
name: ubuntu-ci-c6a-x86_64-oneapi-build
on:
pull_request:
paths-ignore:
- 'configs/sites/**'
- 'doc/**'
- '**.md'
- '.github/ISSUE_TEMPLATE/*'
- '.gitignore'
workflow_dispatch:

concurrency:
Expand Down
78 changes: 49 additions & 29 deletions configs/sites/tier1/aws-pcluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ from the Community AMIs tab that matches your desired OS and PCluster version.
Select an instance type of the same family that you are planning to use for the
head and the compute nodes, and specify enough storage for a swap file
and a spack-stack installation.
- AMI Name: aws-parallelcluster-3.7.1-ubuntu-2204-lts-hvm-x86_64
- AMI ID: ami-0906e8b928cde5ccc
- AMI Name: aws-parallelcluster-3.9.3-ubuntu-2204-lts-hvm-x86_64
- AMI ID: ami-08e88679103d6ab5f
- Instance r7a.4xlarge (uses same processor architecture as hpc7a instances)
- Use 300GB of gp3 storage as /
- Attach security groups allowing ssh inbound traffic and NFS outbound traffic
Expand All @@ -21,7 +21,7 @@ and NFS access.

```
aws ec2 run-instances \
--image-id ami-0906e8b928cde5ccc \
--image-id ami-08e88679103d6ab5f \
--count 1 \
--instance-type r7a.4xlarge \
--key-name YOUR-KEY-NAME \
Expand Down Expand Up @@ -84,6 +84,7 @@ apt install -y \
tcl-dev \
nano
sudo -u ubuntu git lfs install
sudo -u ubuntu git config --global credential.helper 'cache --timeout=3600'
# Install compilers and various dev dependencies.
Expand Down Expand Up @@ -123,7 +124,6 @@ apt install -y \
libqt5svg5-dev \
qt5dxcb-plugin
# This is because boost doesn't work with the Intel compiler
apt install -y \
libboost1.74-dev \
Expand Down Expand Up @@ -209,24 +209,21 @@ apt install -y docker-ce \
docker-buildx-plugin \
docker-compose-plugin
docker run hello-world
sudo usermod -aG docker $USER
# DH* TODO 2023/02/21: Add users to group docker so that non-root users can run it
# See https://docs.docker.com/engine/install/linux-postinstall/
# Exit root session
exit
# Basic git config
git config --global credential.helper cache
```

4. Build ecflow outside of spack to be able to link against OS boost
```
mkdir -p /home/ubuntu/jedi/ecflow-5.8.4/src
cd /home/ubuntu/jedi/ecflow-5.8.4/src
wget https://confluence.ecmwf.int/download/attachments/8650755/ecFlow-5.8.4-Source.tar.gz?api=v2
mv ecFlow-5.8.4-Source.tar.gz\?api\=v2 ecFlow-5.8.4-Source.tar.gz
tar -xvzf ecFlow-5.8.4-Source.tar.gz
export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecFlow-5.8.4-Source
wget https://github.com/ecmwf/ecflow/archive/refs/tags/5.8.4.tar.gz
tar -xvzf 5.8.4.tar.gz
export WK=/home/ubuntu/jedi/ecflow-5.8.4/src/ecflow-5.8.4
export BOOST_ROOT=/usr
# Build ecFlow
Expand Down Expand Up @@ -308,21 +305,22 @@ ln -sf /opt/lmod/lmod/init/profile.fish /etc/profile.d/z00_lmod.fish
# Update the Intel MPI module to load AWS libfabric.
cat << 'EOF' >> /opt/intel/oneapi/mpi/2021.10.0/modulefiles/mpi
conflict openmpi
if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.18.2amzn1.0 ] } {
module load libfabric-aws/1.18.2amzn1.0
if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } {
module load libfabric-aws/1.19.0amzn4.0
}
EOF
# Update the AWS OpenMPI module to load AWS libfabric.
cat << 'EOF' >> /usr/share/modules/modulefiles/openmpi/4.1.5
cat << 'EOF' >> /opt/amazon/modules/modulefiles/openmpi/4.1.6
conflict mpi
if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.18.2amzn1.0 ] } {
module load libfabric-aws/1.18.2amzn1.0
if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.19.0amzn4.0 ] } {
module load libfabric-aws/1.19.0amzn4.0
}
EOF
# Add a number of default module locations to the lmod startup script.
cat << 'EOF' >> /etc/profile.d/z01_lmod.sh
module use /opt/amazon/modules/modulefiles
module use /usr/share/modules/modulefiles
module use /opt/intel/oneapi/mpi/2021.10.0/modulefiles
module use /home/ubuntu/jedi/modulefiles
Expand All @@ -335,10 +333,10 @@ exit
ssh ...
# Now user ubuntu
module av
module load libfabric-aws/1.18.2amzn1.0
module load openmpi/4.1.5
module load libfabric-aws/1.19.0amzn4.0
module load openmpi/4.1.6
module list
module unload openmpi/4.1.5
module unload openmpi/4.1.6
module load mpi
module list
module purge
Expand Down Expand Up @@ -383,7 +381,8 @@ git clone --recurse-submodules -b release/1.7.0 https://github.com/JCSDA/spack-s
cd spack-stack-1.7/
. setup.sh
spack stack create env --site aws-pcluster --template=unified-dev --name=unified-intel
spack env activate -p envs/unified-env
cd envs/unified-intel
spack env activate -p .
# Edit envs/unified-intel/spack.yaml.
# 1) Find this line:
Expand All @@ -392,9 +391,9 @@ spack env activate -p envs/unified-env
# the line should look like this:
# compilers: ['%intel']
spack concretize 2>&1 | tee log.concretize.unified-env.001
./util/show_duplicate_packages.py -d log.concretize.unified-env.001
spack install --verbose 2>&1 | tee log.install.unified-env.001
spack concretize 2>&1 | tee log.concretize.001
${SPACK_STACK_DIR}/util/show_duplicate_packages.py -d log.concretize.001
spack install -j 12 --verbose 2>&1 | tee log.install.001
spack module lmod refresh
spack stack setup-meta-modules
```
Expand Down Expand Up @@ -425,20 +424,20 @@ cat << 'EOF' >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
- spec: intel-oneapi-mpi@2021.10.0%intel@2021.10.0
prefix: /opt/intel
modules:
- libfabric-aws/1.18.2amzn1.0
- libfabric-aws/1.19.0amzn4.0
- intelmpi
EOF
# Add external openmpi
cat << 'EOF' >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
openmpi:
externals:
- spec: openmpi@4.1.5%gcc@11.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath
- spec: openmpi@4.1.6%gcc@11.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath
fabrics=ofi schedulers=slurm
prefix: /opt/amazon/openmpi
modules:
- libfabric-aws/1.18.2amzn1.0
- openmpi/4.1.5
- libfabric-aws/1.19.0amzn4.0
- openmpi/4.1.6
EOF
# Can't find qt5 because qtpluginfo is broken,
Expand Down Expand Up @@ -467,7 +466,7 @@ export -n SPACK_SYSTEM_CONFIG_PATH
spack config add "packages:mpi:buildable:False"
spack config add "packages:python:buildable:False"
spack config add "packages:openssl:buildable:False"
spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.10.0, openmpi@4.1.5]"
spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.10.0, openmpi@4.1.6]"
spack config add "packages:all:compiler:[intel@2021.10.0, gcc@11.4.0]"
# edit envs/unified-intel/site/compilers.yaml and replace the following line in the **Intel** compiler section:
Expand Down Expand Up @@ -500,7 +499,7 @@ spack stack setup-meta-modules
# Example given for building jedi-bundle
module use /mnt/experiments-efs/spack-stack-1.7/envs/unified-intel/install/modulefiles/Core
module load stack-gcc/11.4.0
module load stack-openmpi/4.1.5
module load stack-openmpi/4.1.6
module load base-env
module load jedi-mpas-env
module load jedi-fv3-env
Expand Down Expand Up @@ -530,6 +529,7 @@ source /opt/intel/oneapi/mpi/2021.10.0/env/vars.sh
# Activate spack stack modules.
module use /mnt/experiments-efs/spack-stack-1.7/envs/unified-intel/install/modulefiles/Core
module use $HOME/spack-stack-1.7/envs/unified-intel/install/modulefiles/Core
module load stack-intel/2021.10.0
module load stack-intel-oneapi-mpi/2021.10.0
module load base-env
Expand All @@ -550,3 +550,23 @@ make update
make -j10
ctest
```

15. Once the parallel cluster head node image is fully configured and tested you
can create an AMI snapshot based on the configured instance using the
[instructions](https://docs.aws.amazon.com/parallelcluster/latest/ug/building-custom-ami-v3.html)
published by AWS. Included here is a short summary of those instructions.

```
# 1) In an SSH session on the instance delete any unwanted files and directories
# in your home directory and in the /root directory.
# 2) Use the AMI cleanup script to prepare the instance for imaging.
sudo /usr/local/sbin/ami_cleanup.sh
sudo apt clean
# 3) From the AWS console navigate to the build instance and choose
# "Instance state" and select "Stop instance"
# 4) Create a new AMI from the instance. From the instance view in the prior
# step select "Actions" and choose "Image" and then "Create image".
```
10 changes: 5 additions & 5 deletions configs/sites/tier1/aws-pcluster/packages.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ packages:
compiler: [intel@2021.10.0, gcc@11.4.0]
#compiler: [[email protected]]
providers:
mpi: [intel-oneapi-mpi@2021.10.0, openmpi@4.1.5]
mpi: [intel-oneapi-mpi@2021.10.0, openmpi@4.1.6]
#mpi: [[email protected]]

intel-oneapi-mpi:
externals:
- spec: intel-oneapi-mpi@2021.10.0%intel@2021.10.0
prefix: /opt/intel/oneapi
modules:
- libfabric-aws/1.18.2amzn1.0
- libfabric-aws/1.19.0amzn4.0
- mpi
#externals:
#- spec: [email protected]%[email protected]
Expand All @@ -31,12 +31,12 @@ packages:
buildable: false
openmpi:
externals:
- spec: openmpi@4.1.5%gcc@11.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath
- spec: openmpi@4.1.6%gcc@11.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath
fabrics=ofi schedulers=slurm
prefix: /opt/amazon/openmpi
modules:
- libfabric-aws/1.18.2amzn1.0
- openmpi/4.1.5
- libfabric-aws/1.19.0amzn4.0
- openmpi/4.1.6


binutils:
Expand Down
2 changes: 1 addition & 1 deletion doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@
# -- Options for intersphinx extension ---------------------------------------

# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {'https://docs.python.org/': None}
intersphinx_mapping = {'python': ('https://docs.python.org/3', None)}

# -- Options for todo extension ----------------------------------------------

Expand Down

0 comments on commit 289721c

Please sign in to comment.