diff --git a/tests/error-test.conf b/tests/error-test.conf new file mode 100644 index 0000000000..34e8e52364 --- /dev/null +++ b/tests/error-test.conf @@ -0,0 +1,26 @@ + + +# FIXME: THIS FILE SHOULD NOT BE MERGED TO DEVELOP + + +# This should succeed +COMPILE | rrfs | intel | -DAPP=ATM -DCCPP_SUITES=FV3_RAP,FV3_RAP_sfcdiff,FV3_HRRR,FV3_RRFS_v1beta,FV3_RRFS_v1nssl -D32BIT=ON | | fv3 | + +# This should succeed +RUN | rrfs_v1nssl_nohailnoccn | | baseline | + +# These variants of rrfs_v1beta should always fail, and prevent the workflow from completing. +RUN | rrfs_v1beta_fail_to_copy | | baseline | +RUN | rrfs_v1beta_fail_to_run | | baseline | + +# Removing -DFASTER=ON here ensures results change, but the test runs. The workflow jobs should complete +# for all three of these tests, but the results should change. +COMPILE | atm_faster_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v17_p8,FV3_GFS_v15_thompson_mynn_lam3km -D32BIT=ON | | fv3 | +RUN | regional_control_faster | | baseline | + +# The --invalid-argument ensures the compile job will fail. The workflow should not submit the test jobs for this compile job. 
+COMPILE | hafsw | intel | -DAPP=HAFSW --invalid-argument -DMOVING_NEST=ON -DCCPP_SUITES=FV3_HAFS_v1_gfdlmp_tedmf,FV3_HAFS_v1_gfdlmp_tedmf_nonsst,FV3_HAFS_v1_thompson_tedmf_gfdlsf,FV3_global_nest_v1 -D32BIT=ON | | fv3 | +RUN | hafs_regional_atm | | baseline | +RUN | hafs_regional_atm_thompson_gfdlsf | | baseline | +RUN | hafs_regional_atm_ocn | | baseline | +RUN | hafs_regional_atm_wav | | baseline | diff --git a/tests/fv3_conf/fv3_qsub.IN_acorn b/tests/fv3_conf/fv3_qsub.IN_acorn index db6b3f1d12..6c852534b6 100644 --- a/tests/fv3_conf/fv3_qsub.IN_acorn +++ b/tests/fv3_conf/fv3_qsub.IN_acorn @@ -30,6 +30,13 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4 export ESMF_RUNTIME_PROFILE=ON export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY" +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_qsub.IN_derecho b/tests/fv3_conf/fv3_qsub.IN_derecho index 8793d7edb5..d1208c656a 100644 --- a/tests/fv3_conf/fv3_qsub.IN_derecho +++ b/tests/fv3_conf/fv3_qsub.IN_derecho @@ -35,6 +35,13 @@ export MPICH_COLL_OPT_OFF=1 # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 
1>&2 + false +fi + mpiexec -n @[UFS_TASKS] -ppn @[PPN] --hostfile $PBS_NODEFILE ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_qsub.IN_wcoss2 b/tests/fv3_conf/fv3_qsub.IN_wcoss2 index db6b3f1d12..46f33bb8f3 100644 --- a/tests/fv3_conf/fv3_qsub.IN_wcoss2 +++ b/tests/fv3_conf/fv3_qsub.IN_wcoss2 @@ -30,6 +30,13 @@ export ESMF_RUNTIME_COMPLIANCECHECK=OFF:depth=4 export ESMF_RUNTIME_PROFILE=ON export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY" +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + mpiexec -n @[TASKS] -ppn @[TPN] -depth @[THRD] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_expanse b/tests/fv3_conf/fv3_slurm.IN_expanse index 1dbc6bcbb5..7fbea592e9 100644 --- a/tests/fv3_conf/fv3_slurm.IN_expanse +++ b/tests/fv3_conf/fv3_slurm.IN_expanse @@ -26,6 +26,14 @@ echo "Model started: "`date` export OMP_STACK_SIZE=512M export OMP_NUM_THREADS=@[THRD] export I_MPI_PMI_LIBRARY=/cm/shared/apps/slurm/current/lib64/libpmi.so + +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 
1>&2 + false +fi + srun -n @[TASKS] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_gaea b/tests/fv3_conf/fv3_slurm.IN_gaea index 8545e689e6..325ff6d69c 100644 --- a/tests/fv3_conf/fv3_slurm.IN_gaea +++ b/tests/fv3_conf/fv3_slurm.IN_gaea @@ -32,6 +32,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY" # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + srun --label -n @[TASKS] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_hera b/tests/fv3_conf/fv3_slurm.IN_hera index 288b0ec78b..6d82ee67dd 100644 --- a/tests/fv3_conf/fv3_slurm.IN_hera +++ b/tests/fv3_conf/fv3_slurm.IN_hera @@ -38,6 +38,13 @@ export PSM_SHAREDCONTEXTS=1 # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + # shellcheck disable=SC2102 srun --label -n @[TASKS] ./fv3.exe diff --git a/tests/fv3_conf/fv3_slurm.IN_hercules b/tests/fv3_conf/fv3_slurm.IN_hercules index c4853fb585..47178d3310 100644 --- a/tests/fv3_conf/fv3_slurm.IN_hercules +++ b/tests/fv3_conf/fv3_slurm.IN_hercules @@ -46,6 +46,13 @@ fi # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 
1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + srun --label -n @[TASKS] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_jet b/tests/fv3_conf/fv3_slurm.IN_jet index 21effeb05e..0cfe947c70 100644 --- a/tests/fv3_conf/fv3_slurm.IN_jet +++ b/tests/fv3_conf/fv3_slurm.IN_jet @@ -36,6 +36,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY" # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + srun --label -n @[TASKS] --cpus-per-task=@[THRD] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_noaacloud b/tests/fv3_conf/fv3_slurm.IN_noaacloud index 519e29b96a..3f9b8e6222 100644 --- a/tests/fv3_conf/fv3_slurm.IN_noaacloud +++ b/tests/fv3_conf/fv3_slurm.IN_noaacloud @@ -39,6 +39,13 @@ export OMP_NUM_THREADS=1 # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 
1>&2 + false +fi + srun --mpi=pmi2 --label -n @[TASKS] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_orion b/tests/fv3_conf/fv3_slurm.IN_orion index 50e8cf5655..a1c736cca7 100644 --- a/tests/fv3_conf/fv3_slurm.IN_orion +++ b/tests/fv3_conf/fv3_slurm.IN_orion @@ -39,6 +39,13 @@ export ESMF_RUNTIME_PROFILE_OUTPUT="SUMMARY" # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" == WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + srun --label -n @[TASKS] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_s4 b/tests/fv3_conf/fv3_slurm.IN_s4 index f3f9730604..31b8d4c308 100644 --- a/tests/fv3_conf/fv3_slurm.IN_s4 +++ b/tests/fv3_conf/fv3_slurm.IN_s4 @@ -35,6 +35,13 @@ export PSM_SHAREDCONTEXTS=1 # Avoid job errors because of filesystem synchronization delays sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + srun --label -n @[TASKS] ./fv3.exe echo "Model ended: " `date` diff --git a/tests/fv3_conf/fv3_slurm.IN_stampede b/tests/fv3_conf/fv3_slurm.IN_stampede index 384cc778f0..83ff2cef06 100644 --- a/tests/fv3_conf/fv3_slurm.IN_stampede +++ b/tests/fv3_conf/fv3_slurm.IN_stampede @@ -29,6 +29,13 @@ export LD_BIND_NOW=1 # Avoid job errors because of filesystem synchronization delays #sync && sleep 1 +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [ "${JOB_SHOULD_FAIL:-NO}" = WHEN_RUNNING ] ; then + echo "The job should abort now, with exit status 1." 
1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + #mpirun -prepend-rank -np $SBATCH_NP ./fv3.exe ibrun -n @[TASKS] ./fv3.exe diff --git a/tests/rt.conf b/tests/rt.conf index 46e1e77e78..0ad22fe033 100644 --- a/tests/rt.conf +++ b/tests/rt.conf @@ -152,8 +152,9 @@ RUN | rrfs_v1beta | RUN | rrfs_v1nssl | | baseline | RUN | rrfs_v1nssl_nohailnoccn | | baseline | -# This variant of rrfs_v1beta should always fail. -RUN | rrfs_v1beta_failing | | baseline | +# These variants of rrfs_v1beta should always fail. +RUN | rrfs_v1beta_fail_to_copy | | baseline | +RUN | rrfs_v1beta_fail_to_run | | baseline | COMPILE | csawmg | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16_csawmg,FV3_GFS_v16_ras | - noaacloud | fv3 | RUN | control_csawmg | - noaacloud | baseline | diff --git a/tests/run_test.sh b/tests/run_test.sh index a9091ce15e..f387bfea8d 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -365,19 +365,18 @@ elif [[ ${SCHEDULER} = 'slurm' ]]; then fi fi +# FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP +if [[ "${JOB_SHOULD_FAIL:-NO}" == WHEN_COPYING ]] ; then + echo "The job should abort now, with exit status 1." 1>&2 + echo "If error checking is working, the metascheduler should mark the job as failed." 1>&2 + false +fi + ################################################################################ # Submit test job ################################################################################ export OMP_ENV=${OMP_ENV:-""} if [[ ${SCHEDULER} = 'none' ]]; then - - # FIXME: THIS NEW "IF" BLOCK SHOULD NOT BE MERGED TO DEVELOP - if [[ "${JOB_SHOULD_FAIL:-NO}" == YES ]] ; then - echo "The job should abort now, with exit status 1." 1>&2 - echo "If error checking is working, the metascheduler should mark the job as failed." 
1>&2 - false - fi - ulimit -s unlimited if [[ ${CI_TEST} = 'true' ]]; then ( eval "${OMP_ENV}" ; diff --git a/tests/tests/rrfs_v1beta_fail_to_copy b/tests/tests/rrfs_v1beta_fail_to_copy new file mode 100644 index 0000000000..0e108c4414 --- /dev/null +++ b/tests/tests/rrfs_v1beta_fail_to_copy @@ -0,0 +1,70 @@ + + +# FIXME: THIS FILE SHOULD NOT BE MERGED TO DEVELOP + + +############################################################################### +# +# RRFS v1beta variant that is expected to fail in run_test.sh before the test job is submitted +# +############################################################################### + +export TEST_DESCR="Variant of RRFS_v1beta that always fails at runtime" + +export CNTL_DIR=rrfs_v1beta_fail_to_copy + +export LIST_FILES="sfcf000.nc \ + sfcf009.nc \ + sfcf012.nc \ + atmf000.nc \ + atmf009.nc \ + atmf012.nc \ + GFSFLX.GrbF00 \ + GFSFLX.GrbF09 \ + GFSFLX.GrbF12 \ + GFSPRS.GrbF00 \ + GFSPRS.GrbF09 \ + GFSPRS.GrbF12 \ + RESTART/20210323.060000.coupler.res \ + RESTART/20210323.060000.fv_core.res.nc \ + RESTART/20210323.060000.fv_core.res.tile1.nc \ + RESTART/20210323.060000.fv_core.res.tile2.nc \ + RESTART/20210323.060000.fv_core.res.tile3.nc \ + RESTART/20210323.060000.fv_core.res.tile4.nc \ + RESTART/20210323.060000.fv_core.res.tile5.nc \ + RESTART/20210323.060000.fv_core.res.tile6.nc \ + RESTART/20210323.060000.fv_srf_wnd.res.tile1.nc \ + RESTART/20210323.060000.fv_srf_wnd.res.tile2.nc \ + RESTART/20210323.060000.fv_srf_wnd.res.tile3.nc \ + RESTART/20210323.060000.fv_srf_wnd.res.tile4.nc \ + RESTART/20210323.060000.fv_srf_wnd.res.tile5.nc \ + RESTART/20210323.060000.fv_srf_wnd.res.tile6.nc \ + RESTART/20210323.060000.fv_tracer.res.tile1.nc \ + RESTART/20210323.060000.fv_tracer.res.tile2.nc \ + RESTART/20210323.060000.fv_tracer.res.tile3.nc \ + RESTART/20210323.060000.fv_tracer.res.tile4.nc \ + RESTART/20210323.060000.fv_tracer.res.tile5.nc \ + RESTART/20210323.060000.fv_tracer.res.tile6.nc \ + RESTART/20210323.060000.phy_data.tile1.nc \ + 
RESTART/20210323.060000.phy_data.tile2.nc \ + RESTART/20210323.060000.phy_data.tile3.nc \ + RESTART/20210323.060000.phy_data.tile4.nc \ + RESTART/20210323.060000.phy_data.tile5.nc \ + RESTART/20210323.060000.phy_data.tile6.nc \ + RESTART/20210323.060000.sfc_data.tile1.nc \ + RESTART/20210323.060000.sfc_data.tile2.nc \ + RESTART/20210323.060000.sfc_data.tile3.nc \ + RESTART/20210323.060000.sfc_data.tile4.nc \ + RESTART/20210323.060000.sfc_data.tile5.nc \ + RESTART/20210323.060000.sfc_data.tile6.nc" + +export_rrfs_v1 +export RESTART_INTERVAL="6 -1" +export OUTPUT_FH='0 09 12' + +# A special flag that tells run_test.sh to fail before the test job is submitted. +export JOB_SHOULD_FAIL=WHEN_COPYING + +if [[ " hera orion hercules jet " =~ " ${MACHINE_ID} " ]] ; then + ZSTANDARD_LEVEL=5 +fi diff --git a/tests/tests/rrfs_v1beta_failing b/tests/tests/rrfs_v1beta_fail_to_run similarity index 97% rename from tests/tests/rrfs_v1beta_failing rename to tests/tests/rrfs_v1beta_fail_to_run index b1a9a10d34..29b2babdfd 100644 --- a/tests/tests/rrfs_v1beta_failing +++ b/tests/tests/rrfs_v1beta_fail_to_run @@ -11,7 +11,7 @@ export TEST_DESCR="Variant of RRFS_v1beta that always fails at runtime" -export CNTL_DIR=rrfs_v1beta_failing +export CNTL_DIR=rrfs_v1beta_fail_to_run export LIST_FILES="sfcf000.nc \ sfcf009.nc \ @@ -63,7 +63,7 @@ export RESTART_INTERVAL="6 -1" export OUTPUT_FH='0 09 12' # A special flag that tells the job to fail at runtime. -export JOB_SHOULD_FAIL=YES +export JOB_SHOULD_FAIL=WHEN_RUNNING if [[ " hera orion hercules jet " =~ " ${MACHINE_ID} " ]] ; then ZSTANDARD_LEVEL=5