From 608e47d5d87a02f0a2f62a9478b79614bb4264c1 Mon Sep 17 00:00:00 2001
From: Matthew Thompson
Date: Wed, 21 Feb 2024 07:53:02 -0500
Subject: [PATCH 1/2] Use CMake to determine MPI Stack

---
 scripts/fv3_setup | 30 ++++++------------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/scripts/fv3_setup b/scripts/fv3_setup
index 6ee2507..ee11ddb 100755
--- a/scripts/fv3_setup
+++ b/scripts/fv3_setup
@@ -162,26 +162,8 @@ endif
 # Test for Compiler and MPI Setup
 #######################################################################
 
-setenv BASEDIR `awk '{print $2}' $ETCDIR/BASEDIR.rc`
-
-if ( `echo $BASEDIR | grep -i mvapich2` != '') then
-    set MPI = mvapich2
-else if ( `echo $BASEDIR | grep -i mpich` != '') then
-    set MPI = mpich
-else if ( `echo $BASEDIR | grep -i openmpi` != '') then
-    set MPI = openmpi
-else if ( `echo $BASEDIR | grep -i hpcx` != '') then
-    set MPI = openmpi
-else if ( `echo $BASEDIR | grep -i impi` != '') then
-    set MPI = intelmpi
-else if ( `echo $BASEDIR | grep -i intelmpi` != '') then
-    set MPI = intelmpi
-else if ( `echo $BASEDIR | grep -i mpt` != '') then
-    set MPI = mpt
-else
-    # Assume default is Intel MPI in case of older baselibs
-    set MPI = intelmpi
-endif
+# Get MPI stack from CMake
+set MPI_STACK = @MPI_STACK@
 
 #######################################################################
 # Enter Experiment Specific Run Parameters
@@ -717,7 +699,7 @@ echo $GROUP > $HOME/.GROUProot
 
 /bin/rm -f $EXPDIR/SETENV.commands
 
-if( $MPI == openmpi ) then
+if( $MPI_STACK == openmpi ) then
 
 # This turns off an annoying warning when running
 # Open MPI on a system where TMPDIRs are on a networked
@@ -730,7 +712,7 @@ EOF
 # The below settings seem to be recommended for hybrid
 # systems using MVAPICH2 but could change
 
-else if( $MPI == mvapich ) then
+else if( $MPI_STACK == mvapich ) then
 
 cat > $EXPDIR/SETENV.commands << EOF
    setenv MV2_ENABLE_AFFINITY 0
@@ -739,7 +721,7 @@ cat > $EXPDIR/SETENV.commands << EOF
    setenv MV2_GATHERV_SSEND_THRESHOLD 256
 EOF
 
-else if( $MPI == mpt ) then
+else if( $MPI_STACK == mpt ) then
 
 cat > $EXPDIR/SETENV.commands << EOF
 
@@ -768,7 +750,7 @@ cat > $EXPDIR/SETENV.commands << EOF
 
 EOF
 
-else if( $MPI == intelmpi ) then
+else if( $MPI_STACK == intelmpi ) then
 
 cat > $EXPDIR/SETENV.commands << EOF
 #setenv MPS_STAT_LEVEL 4

From 68c4a5cee0b5d5bd234d94297171700d276b7d64 Mon Sep 17 00:00:00 2001
From: Matthew Thompson
Date: Wed, 21 Feb 2024 09:14:19 -0500
Subject: [PATCH 2/2] Update MPI settings to match gcm_setup

---
 scripts/fv3_setup | 129 +++++++++++++++++++++++++++++++---------------
 1 file changed, 88 insertions(+), 41 deletions(-)

diff --git a/scripts/fv3_setup b/scripts/fv3_setup
index ee11ddb..4895f13 100755
--- a/scripts/fv3_setup
+++ b/scripts/fv3_setup
@@ -292,7 +292,6 @@ if ( $SITE == 'NCCS' ) then
 
    set BUILT_ON_SLES15 = @BUILT_ON_SLES15@
    if ("$BUILT_ON_SLES15" == "TRUE") then
-      set DEFAULT_MODEL = 'mil'
       echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
       echo "   ${C2}mil (Milan)${CN} (default)"
       echo "  "
@@ -311,7 +310,7 @@ if ( $SITE == 'NCCS' ) then
    else
       echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
       echo "   ${C2}sky (Skylake)${CN}"
-      echo "   ${C2}cas (Cascade Lake)${CN} (default)"
+      echo "   ${C2}cas (Cascade Lake) (default)${CN}"
       echo "  "
       set MODEL = `echo $<`
       set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
@@ -340,20 +339,17 @@ else if ( $SITE == 'NAS' ) then
     echo "Enter the ${C1}Processor Type${CN} you wish to run on:"
     echo "   ${C2}has (Haswell)${CN}"
     echo "   ${C2}bro (Broadwell)${CN}"
-    echo "   ${C2}sky (Skylake)${CN}"
-    echo "   ${C2}cas (Cascade Lake)${CN} (default)"
+    echo "   ${C2}sky (Skylake)${CN} (default)"
+    echo "   ${C2}cas (Cascade Lake)${CN}"
     echo "   ${C2}rom (AMD Rome)${CN}"
     echo "  "
-    echo "   NOTE 1: Due to how FV3 is compiled by default, Sandy Bridge"
-    echo "           and Ivy Bridge are not supported by current GEOS"
-    echo "  "
-    echo "   NOTE 2: GEOS is non-zero-diff when running on AMD Rome"
-    echo "           compared to the other Intel nodes."
+    echo "   NOTE: Due to how FV3 is compiled by default, Sandy Bridge"
+    echo "         and Ivy Bridge are not supported by current GEOS"
     echo "  "
     set MODEL = `echo $<`
     set MODEL = `echo $MODEL | tr "[:upper:]" "[:lower:]"`
     if ( .$MODEL == .) then
-       set MODEL = 'cas'
+       set MODEL = 'sky'
     endif
 
     if( $MODEL != 'has' & \
@@ -697,56 +693,87 @@ echo $GROUP > $HOME/.GROUProot
 # Set Recommended MPI Stack Settings
 #######################################################################
 
+# By default do not write restarts by oserver
+set RESTART_BY_OSERVER = NO
+
 /bin/rm -f $EXPDIR/SETENV.commands
 
 if( $MPI_STACK == openmpi ) then
 
-# This turns off an annoying warning when running
-# Open MPI on a system where TMPDIRs are on a networked
-# file system
+# Open MPI and GEOS have issues with restart writing. Having the
+# oserver write them can be orders of magnitude faster
+
+set RESTART_BY_OSERVER = YES
+
+# Testing by Bill Putman determined some useful
+# Open MPI parameters. Testing shows these work
+# on both OSs at NCCS and on macOS
 
 cat > $EXPDIR/SETENV.commands << EOF
-   setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0
+# Turn off warning about TMPDIR on NFS
+setenv OMPI_MCA_shmem_mmap_enable_nfs_warning 0
+# pre-connect MPI procs on mpi_init
+setenv OMPI_MCA_mpi_preconnect_all 1
+setenv OMPI_MCA_coll_tuned_bcast_algorithm 7
+setenv OMPI_MCA_coll_tuned_scatter_algorithm 2
+setenv OMPI_MCA_coll_tuned_reduce_scatter_algorithm 3
+setenv OMPI_MCA_coll_tuned_allreduce_algorithm 3
+setenv OMPI_MCA_coll_tuned_allgather_algorithm 4
+setenv OMPI_MCA_coll_tuned_allgatherv_algorithm 3
+setenv OMPI_MCA_coll_tuned_gather_algorithm 1
+setenv OMPI_MCA_coll_tuned_barrier_algorithm 0
+# required for a tuned flag to be effective
+setenv OMPI_MCA_coll_tuned_use_dynamic_rules 1
+# disable file locks
+setenv OMPI_MCA_sharedfp "^lockedfile,individual"
 EOF
 
 # The below settings seem to be recommended for hybrid
-# systems using MVAPICH2 but could change
+# systems using MVAPICH but could change
 
 else if( $MPI_STACK == mvapich ) then
 
+# MVAPICH and GEOS have issues with restart writing. Having the
+# oserver write them seems to work
+set RESTART_BY_OSERVER = YES
+
 cat > $EXPDIR/SETENV.commands << EOF
-   setenv MV2_ENABLE_AFFINITY 0
-   setenv SLURM_DISTRIBUTION block
-   setenv MV2_MPIRUN_TIMEOUT 100
-   setenv MV2_GATHERV_SSEND_THRESHOLD 256
+setenv MV2_ENABLE_AFFINITY 0
+setenv SLURM_DISTRIBUTION block
+setenv MV2_MPIRUN_TIMEOUT 100
+setenv MV2_GATHERV_SSEND_THRESHOLD 256
 EOF
 
 else if( $MPI_STACK == mpt ) then
 
 cat > $EXPDIR/SETENV.commands << EOF
 
-   setenv MPI_COLL_REPRODUCIBLE
-   setenv SLURM_DISTRIBUTION block
+setenv MPI_COLL_REPRODUCIBLE
+setenv SLURM_DISTRIBUTION block
 
-   #setenv MPI_DISPLAY_SETTINGS 1
-   #setenv MPI_VERBOSE 1
+#setenv MPI_DISPLAY_SETTINGS 1
+#setenv MPI_VERBOSE 1
 
-   unsetenv MPI_MEMMAP_OFF
-   unsetenv MPI_NUM_MEMORY_REGIONS
-   setenv MPI_XPMEM_ENABLED yes
-   unsetenv SUPPRESS_XPMEM_TRIM_THRESH
+setenv MPI_MEMMAP_OFF
+unsetenv MPI_NUM_MEMORY_REGIONS
+setenv MPI_XPMEM_ENABLED yes
+unsetenv SUPPRESS_XPMEM_TRIM_THRESH
 
-   setenv MPI_LAUNCH_TIMEOUT 40
+setenv MPI_LAUNCH_TIMEOUT 40
 
-   # For some reason, PMI_RANK is randomly set and interferes
-   # with binarytile.x and other executables.
-   unsetenv PMI_RANK
+setenv MPI_COMM_MAX 1024
+setenv MPI_GROUP_MAX 1024
+setenv MPI_BUFS_PER_PROC 256
 
-   # Often when debugging on MPT, the traceback from Intel Fortran
-   # is "absorbed" and only MPT's errors are displayed. To allow the
-   # compiler's traceback to be displayed, uncomment this environment
-   # variable
-   #setenv FOR_IGNORE_EXCEPTIONS false
+# For some reason, PMI_RANK is randomly set and interferes
+# with binarytile.x and other executables.
+unsetenv PMI_RANK
+
+# Often when debugging on MPT, the traceback from Intel Fortran
+# is "absorbed" and only MPT's errors are displayed. To allow the
+# compiler's traceback to be displayed, uncomment this environment
+# variable
+#setenv FOR_IGNORE_EXCEPTIONS false
 
 EOF
 
@@ -782,13 +809,32 @@
 
 EOF
 
 endif # if NOT Singularity
 
-# Testing on SLES15 showed that the mlx provider did not seem
-# to work at scale. So we move to use the verbs provider. Note:
-# still seems to have issues at c720
+# Testing by Bill Putman found these to be
+# useful flags with Intel MPI on SLES15 on the
+# Milan nodes.
+# Note 1: Testing by NCCS shows the PSM3 provider
+# runs on the Infiniband fabric. Tests show it runs
+# up to C720.
+# Note 2: When the Cascade Lakes are moved to
+# SLES15, these will need to be Milan-only flags
+# as Intel MPI will probably work just fine with
+# Intel chips.
 if ("$BUILT_ON_SLES15" == "TRUE") then
 cat >> $EXPDIR/SETENV.commands << EOF
-setenv I_MPI_OFI_PROVIDER verbs
-setenv I_MPI_COLL_EXTERNAL 0
+setenv I_MPI_FALLBACK 0
+setenv I_MPI_FABRICS ofi
+setenv I_MPI_OFI_PROVIDER psm3
+setenv I_MPI_ADJUST_SCATTER 2
+setenv I_MPI_ADJUST_SCATTERV 2
+setenv I_MPI_ADJUST_GATHER 2
+setenv I_MPI_ADJUST_GATHERV 3
+setenv I_MPI_ADJUST_ALLGATHER 3
+setenv I_MPI_ADJUST_ALLGATHERV 3
+setenv I_MPI_ADJUST_ALLREDUCE 12
+setenv I_MPI_ADJUST_REDUCE 10
+setenv I_MPI_ADJUST_BCAST 11
+setenv I_MPI_ADJUST_REDUCE_SCATTER 4
+setenv I_MPI_ADJUST_BARRIER 9
 EOF
 
 endif # if SLES15
 
@@ -797,6 +843,7 @@ endif # if NCCS
 
 endif # if mpi
 
+
 #######################################################################
 # Create Local Scripts and Resource Files
 #######################################################################
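
A note on the @MPI_STACK@ token introduced by patch 1: MPI-stack detection moves out of the csh script and into the build, so @MPI_STACK@ (like the existing @BUILT_ON_SLES15@) is the kind of placeholder that CMake's configure_file() fills in when the script is configured. The sketch below only illustrates that substitution mechanism; the detection logic, variable handling, and paths are assumptions for illustration, not the project's actual CMake code, which may obtain MPI_STACK from its own build helpers.

    # Hypothetical CMake sketch of how @MPI_STACK@ could be supplied.
    find_package(MPI REQUIRED)

    if (NOT DEFINED MPI_STACK)
      # Guess the stack from the MPI library path (illustrative only).
      if (MPI_C_LIBRARIES MATCHES "impi")
        set(MPI_STACK intelmpi)
      elseif (MPI_C_LIBRARIES MATCHES "mvapich")
        set(MPI_STACK mvapich)
      elseif (MPI_C_LIBRARIES MATCHES "mpt")
        set(MPI_STACK mpt)
      elseif (MPI_C_LIBRARIES MATCHES "openmpi|open-mpi|hpcx")
        set(MPI_STACK openmpi)
      else ()
        set(MPI_STACK intelmpi)  # same fallback the removed csh logic used
      endif ()
    endif ()

    # @ONLY replaces @MPI_STACK@ (and @BUILT_ON_SLES15@, if defined) while
    # leaving the script's ${...} csh syntax untouched.
    configure_file(fv3_setup ${CMAKE_CURRENT_BINARY_DIR}/fv3_setup @ONLY)
    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/fv3_setup DESTINATION bin)

With something like that in place, the configured script's "set MPI_STACK = @MPI_STACK@" line becomes, for example, "set MPI_STACK = openmpi", which selects the matching SETENV.commands branch above.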