Auto PR - develop → MAPL-v3 - Use CMake to determine MPI Stack #271

Merged 5 commits from develop into MAPL-v3 on Feb 21, 2024

Changes from all commits
159 changes: 94 additions & 65 deletions scripts/fv3_setup
@@ -162,26 +162,8 @@ endif
# Test for Compiler and MPI Setup
#######################################################################

-setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc`
-
-if ( `echo $BASEDIR | grep -i mvapich2` != '') then
-   set MPI = mvapich2
-else if ( `echo $BASEDIR | grep -i mpich` != '') then
-   set MPI = mpich
-else if ( `echo $BASEDIR | grep -i openmpi` != '') then
-   set MPI = openmpi
-else if ( `echo $BASEDIR | grep -i hpcx` != '') then
-   set MPI = openmpi
-else if ( `echo $BASEDIR | grep -i impi` != '') then
-   set MPI = intelmpi
-else if ( `echo $BASEDIR | grep -i intelmpi` != '') then
-   set MPI = intelmpi
-else if ( `echo $BASEDIR | grep -i mpt` != '') then
-   set MPI = mpt
-else
-   # Assume default is Intel MPI in case of older baselibs
-   set MPI = intelmpi
-endif
+# Get MPI stack from CMake
+set MPI_STACK = @MPI_STACK@
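Here `@MPI_STACK@` is a CMake configure-time placeholder: the build system detects the MPI stack and substitutes its name when it generates fv3_setup, replacing the old `$BASEDIR` string-matching above. As a minimal sketch, the script could fail fast if the substitution ever produced an unexpected value; this guard is hypothetical and not part of the PR, and it assumes the stack names the branches further down dispatch on:

switch ($MPI_STACK)
case openmpi:
case mvapich:
case mpt:
case intelmpi:
   breaksw
default:
   # Hypothetical guard: @MPI_STACK@ was not substituted, or names an
   # MPI stack this script has no settings for.
   echo "ERROR: unsupported MPI stack: $MPI_STACK"
   exit 1
endsw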

#######################################################################
# Enter Experiment Specific Run Parameters
@@ -310,7 +292,6 @@ if ( $SITE == 'NCCS' ) then
set BUILT_ON_SLES15 = @BUILT_ON_SLES15@

if ("$BUILT_ON_SLES15" == "TRUE") then
-   set DEFAULT_MODEL = 'mil'
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}mil (Milan)${CN} (default)"
echo " "
@@ -329,7 +310,7 @@ if ( $SITE == 'NCCS' ) then
else
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}sky (Skylake)${CN}"
echo " ${C2}cas (Cascade Lake)${CN} (default)"
echo " ${C2}cas (Cascade Lake) (default)${CN}"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
@@ -358,20 +339,17 @@ else if ( $SITE == 'NAS' ) then
echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
echo " ${C2}has (Haswell)${CN}"
echo " ${C2}bro (Broadwell)${CN}"
echo " ${C2}sky (Skylake)${CN}"
echo " ${C2}cas (Cascade Lake)${CN} (default)"
echo " ${C2}sky (Skylake)${CN} (default)"
echo " ${C2}cas (Cascade Lake)${CN}"
echo " ${C2}rom (AMD Rome)${CN}"
echo " "
echo " NOTE 1: Due to how FV3 is compiled by default, Sandy Bridge"
echo " and Ivy Bridge are not supported by current GEOS"
echo " "
echo " NOTE 2: GEOS is non-zero-diff when running on AMD Rome"
echo " compared to the other Intel nodes."
echo " NOTE Due to how FV3 is compiled by default, Sandy Bridge"
echo " and Ivy Bridge are not supported by current GEOS"
echo " "
set MODEL = `echo $<`
set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
if ( .$MODEL == .) then
-   set MODEL = 'cas'
+   set MODEL = 'sky'
endif

if( $MODEL != 'has' & \
@@ -715,60 +693,91 @@ echo $GROUP > $HOME/.GROUProot
# Set Recommended MPI Stack Settings
#######################################################################

-# By default do not write restarts by oserver
-set RESTART_BY_OSERVER = NO

/bin/rm -f $EXPDIR/SETENV.commands

-if( $MPI == openmpi ) then
+if( $MPI_STACK == openmpi ) then

-   # Open MPI and GEOS has issues with restart writing. Having the
-   # oserver write them can be orders of magnitude faster
-
-   set RESTART_BY_OSERVER = YES
-
-   # This turns off an annoying warning when running
-   # Open MPI on a system where TMPDIRs are on a networked
-   # file system
+   # Testing by Bill Putman determined some useful
+   # Open MPI parameters; testing shows these work
+   # on both OSs at NCCS and on macOS

cat > $EXPDIR/SETENV.commands << EOF
-setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0
+# Turn off warning about TMPDIR on NFS
+setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0
+# Pre-connect MPI processes at MPI_Init
+setenv OMPI_MCA_mpi_preconnect_all 1
+setenv OMPI_MCA_coll_tuned_bcast_algorithm 7
+setenv OMPI_MCA_coll_tuned_scatter_algorithm 2
+setenv OMPI_MCA_coll_tuned_reduce_scatter_algorithm 3
+setenv OMPI_MCA_coll_tuned_allreduce_algorithm 3
+setenv OMPI_MCA_coll_tuned_allgather_algorithm 4
+setenv OMPI_MCA_coll_tuned_allgatherv_algorithm 3
+setenv OMPI_MCA_coll_tuned_gather_algorithm 1
+setenv OMPI_MCA_coll_tuned_barrier_algorithm 0
+# Required for the tuned algorithm flags above to take effect
+setenv OMPI_MCA_coll_tuned_use_dynamic_rules 1
+# Disable file locks
+setenv OMPI_MCA_sharedfp "^lockedfile,individual"
EOF
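The numeric values above select fixed algorithms inside Open MPI's "tuned" collective component, and they only apply because coll_tuned_use_dynamic_rules is enabled. To see which algorithm each integer maps to on a given installation, something like the following works, assuming the ompi_info binary from the same Open MPI build is on PATH:

# Show the tuned collective parameters, including the list of
# algorithms each *_algorithm integer selects.
ompi_info --param coll tuned --level 9 | grep algorithm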

# The below settings seem to be recommended for hybrid
-# systems using MVAPICH2 but could change
+# systems using MVAPICH but could change

-else if( $MPI == mvapich ) then
+else if( $MPI_STACK == mvapich ) then

-   # MVAPICH and GEOS has issues with restart writing. Having the
-   # oserver write them seems to...work
-   set RESTART_BY_OSERVER = YES

cat > $EXPDIR/SETENV.commands << EOF
-   setenv MV2_ENABLE_AFFINITY 0
-   setenv SLURM_DISTRIBUTION block
-   setenv MV2_MPIRUN_TIMEOUT 100
-   setenv MV2_GATHERV_SSEND_THRESHOLD 256
+setenv MV2_ENABLE_AFFINITY 0
+setenv SLURM_DISTRIBUTION block
+setenv MV2_MPIRUN_TIMEOUT 100
+setenv MV2_GATHERV_SSEND_THRESHOLD 256
EOF
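Each branch only writes $EXPDIR/SETENV.commands; the generated run scripts are expected to read it back before launching the model. A minimal sketch of that consumer side, assuming the conventional csh source mechanism (the actual run scripts may wire this up differently):

# Pick up the per-stack MPI environment written by fv3_setup.
if ( -e $EXPDIR/SETENV.commands ) then
   source $EXPDIR/SETENV.commands
endif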

-else if( $MPI == mpt ) then
+else if( $MPI_STACK == mpt ) then

cat > $EXPDIR/SETENV.commands << EOF

-   setenv MPI_COLL_REPRODUCIBLE
-   setenv SLURM_DISTRIBUTION block
+setenv MPI_COLL_REPRODUCIBLE
+setenv SLURM_DISTRIBUTION block

-   #setenv MPI_DISPLAY_SETTINGS 1
-   #setenv MPI_VERBOSE 1

+#setenv MPI_DISPLAY_SETTINGS 1
+#setenv MPI_VERBOSE 1
-   setenv MPI_MEMMAP_OFF
-   unsetenv MPI_NUM_MEMORY_REGIONS
-   setenv MPI_XPMEM_ENABLED yes
-   unsetenv SUPPRESS_XPMEM_TRIM_THRESH

+unsetenv MPI_MEMMAP_OFF
+unsetenv MPI_NUM_MEMORY_REGIONS
+setenv MPI_XPMEM_ENABLED yes
+unsetenv SUPPRESS_XPMEM_TRIM_THRESH
-   setenv MPI_LAUNCH_TIMEOUT 40

+setenv MPI_LAUNCH_TIMEOUT 40
setenv MPI_COMM_MAX 1024
setenv MPI_GROUP_MAX 1024
setenv MPI_BUFS_PER_PROC 256

-   # For some reason, PMI_RANK is randomly set and interferes
-   # with binarytile.x and other executables.
-   unsetenv PMI_RANK
+# For some reason, PMI_RANK is randomly set and interferes
+# with binarytile.x and other executables.
+unsetenv PMI_RANK

-   # Often when debugging on MPT, the traceback from Intel Fortran
-   # is "absorbed" and only MPT's errors are displayed. To allow the
-   # compiler's traceback to be displayed, uncomment this environment
-   # variable
-   #setenv FOR_IGNORE_EXCEPTIONS false
+# Often when debugging on MPT, the traceback from Intel Fortran
+# is "absorbed" and only MPT's errors are displayed. To allow the
+# compiler's traceback to be displayed, uncomment this environment
+# variable
+#setenv FOR_IGNORE_EXCEPTIONS false

EOF
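As the comment in the heredoc notes, FOR_IGNORE_EXCEPTIONS ships commented out. One way to flip it for a single experiment without editing this script would be to append to the generated file afterwards — a hypothetical sketch, not something this PR adds:

# Let the Intel Fortran runtime print its own traceback instead of
# having MPT absorb it.
cat >> $EXPDIR/SETENV.commands << EOF
setenv FOR_IGNORE_EXCEPTIONS false
EOF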

-else if( $MPI == intelmpi ) then
+else if( $MPI_STACK == intelmpi ) then

cat > $EXPDIR/SETENV.commands << EOF
#setenv MPS_STAT_LEVEL 4
Expand Down Expand Up @@ -800,13 +809,32 @@ EOF

endif # if NOT Singularity

-# Testing on SLES15 showed that the mlx provider did not seem
-# to work at scale. So we move to use the verbs provider. Note:
-# still seems to have issues at c720
+# Testing by Bill Putman found these to be
+# useful flags with Intel MPI on SLES15 on the
+# Milan nodes.
+# Note 1: Testing by NCCS shows the PSM3 provider
+# runs on the InfiniBand fabric. Tests show it runs
+# up to C720.
+# Note 2: When the Cascade Lakes are moved to
+# SLES15, these will need to be Milan-only flags
+# as Intel MPI will probably work just fine with
+# Intel chips.
if ("$BUILT_ON_SLES15" == "TRUE") then
cat >> $EXPDIR/SETENV.commands << EOF
-setenv I_MPI_OFI_PROVIDER verbs
-setenv I_MPI_COLL_EXTERNAL 0
+setenv I_MPI_FALLBACK 0
+setenv I_MPI_FABRICS ofi
+setenv I_MPI_OFI_PROVIDER psm3
+setenv I_MPI_ADJUST_SCATTER 2
+setenv I_MPI_ADJUST_SCATTERV 2
+setenv I_MPI_ADJUST_GATHER 2
+setenv I_MPI_ADJUST_GATHERV 3
+setenv I_MPI_ADJUST_ALLGATHER 3
+setenv I_MPI_ADJUST_ALLGATHERV 3
+setenv I_MPI_ADJUST_ALLREDUCE 12
+setenv I_MPI_ADJUST_REDUCE 10
+setenv I_MPI_ADJUST_BCAST 11
+setenv I_MPI_ADJUST_REDUCE_SCATTER 4
+setenv I_MPI_ADJUST_BARRIER 9
EOF

endif # if SLES15
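The I_MPI_ADJUST_* integers pin Intel MPI's collective operations to specific numbered algorithms (the mapping is listed in the Intel MPI Library reference), and I_MPI_OFI_PROVIDER forces the PSM3 libfabric provider described in the note above. If libfabric's fi_info utility is available on the node, the provider can be sanity-checked before it is forced — an optional check, not part of this PR:

# Verify the psm3 provider is present in this libfabric build;
# a non-zero exit or empty output means the forced provider would fail.
fi_info -p psm3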
@@ -815,6 +843,7 @@ endif # if NCCS

endif # if mpi


#######################################################################
# Create Local Scripts and Resource Files
#######################################################################