From 6040be731659ce0bc9dc5049b74c30a01ecbe70e Mon Sep 17 00:00:00 2001 From: "Eric T. Johnson" Date: Wed, 31 Jan 2024 15:33:42 -0500 Subject: [PATCH 1/8] Consolidate copies of process.xrb into one location --- job_scripts/{perlmutter => hpss}/process.xrb | 0 job_scripts/summit/process.xrb | 246 ------------------- sphinx_docs/source/nersc-hpss.rst | 6 +- sphinx_docs/source/olcf-workflow.rst | 15 +- 4 files changed, 10 insertions(+), 257 deletions(-) rename job_scripts/{perlmutter => hpss}/process.xrb (100%) delete mode 100755 job_scripts/summit/process.xrb diff --git a/job_scripts/perlmutter/process.xrb b/job_scripts/hpss/process.xrb similarity index 100% rename from job_scripts/perlmutter/process.xrb rename to job_scripts/hpss/process.xrb diff --git a/job_scripts/summit/process.xrb b/job_scripts/summit/process.xrb deleted file mode 100755 index 25972d9..0000000 --- a/job_scripts/summit/process.xrb +++ /dev/null @@ -1,246 +0,0 @@ -#!/bin/ksh -p - -#---------------------------------------------------------------------------- -# user modifiable variables: - -# pidfile is a lock file that is used to make sure that only one instance -# of this script is working on the current directory -pidfile=process.pid - - -# set the prefix of the plotfiles and checkpoint files -plt_prefix=*plt -chk_prefix=*chk - -# directory to archive to on HPSS -- set this to the working directory -work_dir=`pwd` -HPSS_DIR=`basename $work_dir` - -# set HTAR command -HTAR=htar - -# path to the ftime executable -- used for making a simple ftime.out file -# listing the name of the plotfile and its simulation time -FTIME_EXE=ftime.Linux.gfortran.exe - - -#---------------------------------------------------------------------------- -# initialization stuff - -# check to make sure that the lock file does not already exist. 
-if [ -f $pidfile ]; then - echo 2>&1 "process lock file " $pidfile " already exists" - exit -1 -fi - -# create the lock file -echo $$ > $pidfile - -# if our process if killed, remove the lock file first -trap '/bin/rm -f $pidfile' EXIT HUP TERM XCPU KILL - -# Number of seconds to sleep before checking again. -N=60 - - -#---------------------------------------------------------------------------- -# make storage directories - -# once we process a file, we will move the plotfiles into the plotfiles/ -# directory. This then hides them from the script, so if the system -# later purges the files in the pltXXXXX directory and the .processed -# file, we don't overwrite our archived data with a tarred empty -# directory structure. We do the same with the checkpoint files (using -# checkfiles/) - -if [ ! -d plotfiles ]; then - mkdir plotfiles -fi - -if [ ! -d checkfiles ]; then - mkdir checkfiles -fi - - -#---------------------------------------------------------------------------- -# the processing function - -# Process Files. Once a plotfile is successfully processed, we will output -# a file pltXXXXX.processed (checkpoint files are only archived, with a -# chkXXXXX.processed file appearing once the archiving is successful). -# Subsequent invocations of this routine will skip over any plotfiles or -# checkpoint files that have a corresponding .processed file. - - -function process_files -{ - if [ ! -f $pidfile ]; then - echo "process: $pidfile has been removed, exiting" - exit - fi - - - # plotfiles - - # Take all but the final plt file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as pltXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the plotfiles/ - # directory - pltlist5=$(find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort) - pltlist6=$(find . -maxdepth 1 -type d -name "${plt_prefix}??????" 
-print | sort) - pltlist7=$(find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort) - - pltlist="$pltlist5 $pltlist6 $pltlist7" - - if [ "$pltlist" ]; then - nl=$(echo "$pltlist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - pltlist="" - else - pltlist=$(echo "$pltlist" | head -$nl) - fi - fi - - - for dir in ${pltlist} - do - if [ -d ${dir} ]; then - - # only work on the file if there is not a .processed file in the - # main directory or the plotfiles/ directory - if [ ! -f ${dir}.processed ] && [ ! -f plotfiles/${dir}.processed ]; then - - # do processing - - # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # output the plotfile name and simulation time to ftime.out - if [ `command -v ${FTIME_EXE}` ] ; then - ${FTIME_EXE} ${dir} >> ftime.out - fi - - # remove the htar temporary file - rm ${dir}.htar - - # move the plotfile into the plotfiles directory - mv ${dir} plotfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed plotfiles/ - - # and visualize it - #runtimevis.py plotfiles/${dir} - - fi - - fi # end test of whether plotfile already processed - - fi # end test of whether plotfile is a directory (as it should be) - - done - - - # checkpoint files - - # Take all but the final chk file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as chkXXXXX.processed files. 
We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the checkfiles/ - # directory - chklist5=$(find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort) - chklist6=$(find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort) - chklist7=$(find . -maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort) - - chklist="$chklist5 $chklist6 $chklist7" - - if [ "$chklist" ]; then - nl=$(echo "$chklist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - chklist="" - else - chklist=$(echo "$chklist" | head -$nl) - fi - fi - - - for dir in ${chklist} - do - if [ -d ${dir} ]; then - - if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then - - # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - - # mark this file as processed so we skip it next time - date > ${dir}.processed - - # remove the htar temporary file - rm ${dir}.htar - - # move the checkpoint file into the checkfiles directory - mv ${dir} checkfiles/ - - # ..and the corresponding .processed file too. - mv ${dir}.processed checkfiles/ - - fi - - fi - - fi - done - -} - - -#---------------------------------------------------------------------------- -# the main function - -# archive any diagnostic files first -- give them a unique name, appending -# the date string, to make sure that we don't overwrite anything -datestr=$(date +"%Y%m%d_%H%M_%S") -ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) -inputs_files=$(find . 
-maxdepth 1 -name "inputs*" -print) -diag_files=$(find . -maxdepth 1 -name "*diag.out" -print) -model_files=$(find . -maxdepth 1 -name "*.hse.*" -print) -job_files=$(find . -maxdepth 1 -name "*.slurm" -print) $(find . -maxdepth 1 -name "*.submit" -print) -process_files=$(find . -maxdepth 1 -name "process*" -print) - -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${probin_files} ${job_files} ${process_files} >> /dev/null - - -# Loop, waiting for plt and chk directories to appear. - -while true -do - process_files - sleep $N -done diff --git a/sphinx_docs/source/nersc-hpss.rst b/sphinx_docs/source/nersc-hpss.rst index 4be1df7..0273fbf 100644 --- a/sphinx_docs/source/nersc-hpss.rst +++ b/sphinx_docs/source/nersc-hpss.rst @@ -9,15 +9,15 @@ frequently, since the scratch filesystems fill up and NERSC will purge data periodically. -The script ``nersc.xfer.slurm``: +The script ``nersc.xfer.slurm`` in ``job_scripts/perlmutter/``: :download:`nersc.xfer.slurm <../../job_scripts/perlmutter/nersc.xfer.slurm>` can be used to archive data to HPSS automatically. This is submitted to the xfer queue and runs the -script ``process.xrb``: +script ``process.xrb`` in ``job_scripts/hpss/``: -:download:`process.xrb <../../job_scripts/perlmutter/process.xrb>` +:download:`process.xrb <../../job_scripts/hpss/process.xrb>` which continually looks for output and stores it to HPSS. diff --git a/sphinx_docs/source/olcf-workflow.rst b/sphinx_docs/source/olcf-workflow.rst index 3b78ff6..93284c7 100644 --- a/sphinx_docs/source/olcf-workflow.rst +++ b/sphinx_docs/source/olcf-workflow.rst @@ -383,14 +383,13 @@ where ``test_hpss.sh`` is a SLURM script that contains the ``htar`` commands needed to archive your data. This uses ``slurm`` as the job manager. -An example is provided by the ``process.xrb`` archiving script and -associated ``summit_hpss.submit`` submission script in -``jobs_scripts/summit/``. 
Together these will detect new plotfiles as -they are generated, tar them up (using ``htar``) and archive them onto -HPSS. They will also store the inputs, probin, and other runtime -generated files. If ``ftime`` is found in your path, it will also -create a file called ``ftime.out`` that lists the simulation time -corresponding to each plotfile. +An example is provided by the ``process.xrb`` archiving script in +``job_scripts/hpss/`` and associated ``summit_hpss.submit`` submission script +in ``job_scripts/summit/``. Together these will detect new plotfiles as they +are generated, tar them up (using ``htar``) and archive them onto HPSS. They +will also store the inputs, probin, and other runtime generated files. If +``ftime`` is found in your path, it will also create a file called +``ftime.out`` that lists the simulation time corresponding to each plotfile. Once the plotfiles are archived they are moved to a subdirectory under your run directory called ``plotfiles/``. From 160e60492f266515c67d038530ece2960dd030ee Mon Sep 17 00:00:00 2001 From: "Eric T. Johnson" Date: Wed, 31 Jan 2024 15:46:53 -0500 Subject: [PATCH 2/8] Clean up whitespace --- job_scripts/hpss/process.xrb | 82 ++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb index 25972d9..b4b5825 100755 --- a/job_scripts/hpss/process.xrb +++ b/job_scripts/hpss/process.xrb @@ -3,7 +3,7 @@ #---------------------------------------------------------------------------- # user modifiable variables: -# pidfile is a lock file that is used to make sure that only one instance +# pidfile is a lock file that is used to make sure that only one instance # of this script is working on the current directory pidfile=process.pid @@ -67,7 +67,7 @@ fi # Process Files. 
Once a plotfile is successfully processed, we will output # a file pltXXXXX.processed (checkpoint files are only archived, with a -# chkXXXXX.processed file appearing once the archiving is successful). +# chkXXXXX.processed file appearing once the archiving is successful). # Subsequent invocations of this routine will skip over any plotfiles or # checkpoint files that have a corresponding .processed file. @@ -82,9 +82,9 @@ function process_files # plotfiles - # Take all but the final plt file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well - # as pltXXXXX.processed files. We restrict the find command to a depth of + # Take all but the final plt file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well + # as pltXXXXX.processed files. We restrict the find command to a depth of # 1 to avoid catching any already-processed files in the plotfiles/ # directory pltlist5=$(find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort) @@ -117,35 +117,35 @@ function process_files # store the file on HPSS ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep - grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null + # Ordinarily, we'd check htar's exit status (0 = successful), but + # on some machines (like Atlas) htar doesn't return a valid exit + # status. Instead we'll grep for the success line at the end of + # htar's output (which we piped into a file) and check the output + # status of grep + grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null + + # The variable $? holds the exit status of the previous command + if [ $? 
-eq 0 ]; then - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then - # mark this file as processed so we skip it next time date > ${dir}.processed - # output the plotfile name and simulation time to ftime.out - if [ `command -v ${FTIME_EXE}` ] ; then - ${FTIME_EXE} ${dir} >> ftime.out - fi + # output the plotfile name and simulation time to ftime.out + if [ `command -v ${FTIME_EXE}` ] ; then + ${FTIME_EXE} ${dir} >> ftime.out + fi - # remove the htar temporary file - rm ${dir}.htar + # remove the htar temporary file + rm ${dir}.htar - # move the plotfile into the plotfiles directory - mv ${dir} plotfiles/ + # move the plotfile into the plotfiles directory + mv ${dir} plotfiles/ - # ..and the corresponding .processed file too. - mv ${dir}.processed plotfiles/ + # ..and the corresponding .processed file too. + mv ${dir}.processed plotfiles/ - # and visualize it - #runtimevis.py plotfiles/${dir} + # and visualize it + #runtimevis.py plotfiles/${dir} fi @@ -158,10 +158,10 @@ function process_files # checkpoint files - # Take all but the final chk file -- we want to ensure they're completely - # written to disk. Strip out any tar files that are lying around as well + # Take all but the final chk file -- we want to ensure they're completely + # written to disk. Strip out any tar files that are lying around as well # as chkXXXXX.processed files. We restrict the find command to a depth of - # 1 to avoid catching any already-processed files in the checkfiles/ + # 1 to avoid catching any already-processed files in the checkfiles/ # directory chklist5=$(find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort) chklist6=$(find . 
-maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort) @@ -189,15 +189,15 @@ function process_files # store the file on HPSS ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar - # Ordinarily, we'd check htar's exit status (0 = successful), but - # on some machines (like Atlas) htar doesn't return a valid exit - # status. Instead we'll grep for the success line at the end of - # htar's output (which we piped into a file) and check the output - # status of grep + # Ordinarily, we'd check htar's exit status (0 = successful), but + # on some machines (like Atlas) htar doesn't return a valid exit + # status. Instead we'll grep for the success line at the end of + # htar's output (which we piped into a file) and check the output + # status of grep grep "HTAR: HTAR SUCCESSFUL" ${dir}.htar >> /dev/null - # The variable $? holds the exit status of the previous command - if [ $? -eq 0 ]; then + # The variable $? holds the exit status of the previous command + if [ $? -eq 0 ]; then # mark this file as processed so we skip it next time date > ${dir}.processed @@ -205,11 +205,11 @@ function process_files # remove the htar temporary file rm ${dir}.htar - # move the checkpoint file into the checkfiles directory - mv ${dir} checkfiles/ + # move the checkpoint file into the checkfiles directory + mv ${dir} checkfiles/ - # ..and the corresponding .processed file too. - mv ${dir}.processed checkfiles/ + # ..and the corresponding .processed file too. + mv ${dir}.processed checkfiles/ fi @@ -224,7 +224,7 @@ function process_files #---------------------------------------------------------------------------- # the main function -# archive any diagnostic files first -- give them a unique name, appending +# archive any diagnostic files first -- give them a unique name, appending # the date string, to make sure that we don't overwrite anything datestr=$(date +"%Y%m%d_%H%M_%S") ftime_files=$(find . 
-maxdepth 1 -name "ftime.out" -print) From cf7888ce201061b5443281527f8209b0b75b3fb8 Mon Sep 17 00:00:00 2001 From: "Eric T. Johnson" Date: Wed, 31 Jan 2024 15:52:16 -0500 Subject: [PATCH 3/8] Fix diag and job files --- job_scripts/hpss/process.xrb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb index b4b5825..0a7f6fb 100755 --- a/job_scripts/hpss/process.xrb +++ b/job_scripts/hpss/process.xrb @@ -231,10 +231,10 @@ ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) inputs_files=$(find . -maxdepth 1 -name "inputs*" -print) diag_files=$(find . -maxdepth 1 -name "*diag.out" -print) model_files=$(find . -maxdepth 1 -name "*.hse.*" -print) -job_files=$(find . -maxdepth 1 -name "*.slurm" -print) $(find . -maxdepth 1 -name "*.submit" -print) +job_files=$(find . -maxdepth 1 -name "*.slurm" -print; find . -maxdepth 1 -name "*.submit" -print) process_files=$(find . -maxdepth 1 -name "process*" -print) -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${probin_files} ${job_files} ${process_files} >> /dev/null +${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${diag_files} ${job_files} ${process_files} >> /dev/null # Loop, waiting for plt and chk directories to appear. From 13a04dfdad0347c12cc668e123e93c2e2856fc02 Mon Sep 17 00:00:00 2001 From: "Eric T. Johnson" Date: Wed, 31 Jan 2024 17:03:59 -0500 Subject: [PATCH 4/8] Use arrays for lists of files This simplifies skipping the last plt and chk files, and should help diagnostic files getting accidentally skipped. 
--- job_scripts/hpss/process.xrb | 66 ++++++++++++++---------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb index 0a7f6fb..82e75bc 100755 --- a/job_scripts/hpss/process.xrb +++ b/job_scripts/hpss/process.xrb @@ -87,24 +87,16 @@ function process_files # as pltXXXXX.processed files. We restrict the find command to a depth of # 1 to avoid catching any already-processed files in the plotfiles/ # directory - pltlist5=$(find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort) - pltlist6=$(find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort) - pltlist7=$(find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort) - - pltlist="$pltlist5 $pltlist6 $pltlist7" - - if [ "$pltlist" ]; then - nl=$(echo "$pltlist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - pltlist="" - else - pltlist=$(echo "$pltlist" | head -$nl) - fi - fi + pltlist=($( + find . -maxdepth 1 -type d -name "${plt_prefix}?????" -print | sort + find . -maxdepth 1 -type d -name "${plt_prefix}??????" -print | sort + find . -maxdepth 1 -type d -name "${plt_prefix}???????" -print | sort + )) + # Don't process the final plt file + unset "pltlist[-1]" - for dir in ${pltlist} + for dir in "${pltlist[@]}" do if [ -d ${dir} ]; then @@ -163,24 +155,16 @@ function process_files # as chkXXXXX.processed files. We restrict the find command to a depth of # 1 to avoid catching any already-processed files in the checkfiles/ # directory - chklist5=$(find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort) - chklist6=$(find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort) - chklist7=$(find . 
-maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort) - - chklist="$chklist5 $chklist6 $chklist7" - - if [ "$chklist" ]; then - nl=$(echo "$chklist" | wc -l) - nl=$(expr $nl - 1) - if [ $nl -eq 0 ]; then - chklist="" - else - chklist=$(echo "$chklist" | head -$nl) - fi - fi + chklist=($( + find . -maxdepth 1 -type d -name "${chk_prefix}?[05]000" -print | sort + find . -maxdepth 1 -type d -name "${chk_prefix}??[05]000" -print | sort + find . -maxdepth 1 -type d -name "${chk_prefix}???[05]000" -print | sort + )) + # Don't process the final chk file + unset "chklist[-1]" - for dir in ${chklist} + for dir in "${chklist[@]}" do if [ -d ${dir} ]; then @@ -227,14 +211,16 @@ function process_files # archive any diagnostic files first -- give them a unique name, appending # the date string, to make sure that we don't overwrite anything datestr=$(date +"%Y%m%d_%H%M_%S") -ftime_files=$(find . -maxdepth 1 -name "ftime.out" -print) -inputs_files=$(find . -maxdepth 1 -name "inputs*" -print) -diag_files=$(find . -maxdepth 1 -name "*diag.out" -print) -model_files=$(find . -maxdepth 1 -name "*.hse.*" -print) -job_files=$(find . -maxdepth 1 -name "*.slurm" -print; find . -maxdepth 1 -name "*.submit" -print) -process_files=$(find . -maxdepth 1 -name "process*" -print) - -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar ${model_files} ${ftime_files} ${inputs_files} ${diag_files} ${job_files} ${process_files} >> /dev/null +all_files=($( + find . -maxdepth 1 -name "ftime.out" -print + find . -maxdepth 1 -name "inputs*" -print + find . -maxdepth 1 -name "*diag.out" -print + find . -maxdepth 1 -name "*.hse.*" -print + find . -maxdepth 1 -name "*.slurm" -print; find . -maxdepth 1 -name "*.submit" -print + find . -maxdepth 1 -name "process*" -print +)) + +${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar "${all_files[@]}" >> /dev/null # Loop, waiting for plt and chk directories to appear. From fbc86ae2a9ec2e3254a6795adef397ae2b5224cd Mon Sep 17 00:00:00 2001 From: "Eric T. 
Johnson" Date: Wed, 31 Jan 2024 17:17:13 -0500 Subject: [PATCH 5/8] Create directories as needed on HPSS --- job_scripts/hpss/process.xrb | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb index 82e75bc..e8ec49d 100755 --- a/job_scripts/hpss/process.xrb +++ b/job_scripts/hpss/process.xrb @@ -19,6 +19,10 @@ HPSS_DIR=`basename $work_dir` # set HTAR command HTAR=htar +# extra arguments to HTAR +# -P will create intermediate directories on HPSS (i.e. mkdir -p) +HTAR_ARGS=(-H copies=2 -P) + # path to the ftime executable -- used for making a simple ftime.out file # listing the name of the plotfile and its simulation time FTIME_EXE=ftime.Linux.gfortran.exe @@ -107,7 +111,7 @@ function process_files # do processing # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar + ${HTAR} "${HTAR_ARGS[@]}" -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar # Ordinarily, we'd check htar's exit status (0 = successful), but # on some machines (like Atlas) htar doesn't return a valid exit @@ -171,7 +175,7 @@ function process_files if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then # store the file on HPSS - ${HTAR} -H copies=2 -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar + ${HTAR} "${HTAR_ARGS[@]}" -cvf ${HPSS_DIR}/${dir}.tar ${dir} > ${dir}.htar # Ordinarily, we'd check htar's exit status (0 = successful), but # on some machines (like Atlas) htar doesn't return a valid exit @@ -220,7 +224,7 @@ all_files=($( find . -maxdepth 1 -name "process*" -print )) -${HTAR} -cvf ${HPSS_DIR}/diag_files_${datestr}.tar "${all_files[@]}" >> /dev/null +${HTAR} -P -cvf ${HPSS_DIR}/diag_files_${datestr}.tar "${all_files[@]}" >> /dev/null # Loop, waiting for plt and chk directories to appear. From 2c5031dc1456b23481cd42bf9ec1bf73d510bc18 Mon Sep 17 00:00:00 2001 From: "Eric T. 
Johnson" Date: Wed, 31 Jan 2024 17:50:23 -0500 Subject: [PATCH 6/8] Store slurm job ID in process.xrb's lock file This lets a new job check whether the old one is actually still running or was just killed uncleanly. --- job_scripts/hpss/process.xrb | 45 +++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb index e8ec49d..634f467 100755 --- a/job_scripts/hpss/process.xrb +++ b/job_scripts/hpss/process.xrb @@ -3,9 +3,9 @@ #---------------------------------------------------------------------------- # user modifiable variables: -# pidfile is a lock file that is used to make sure that only one instance +# jobidfile is a lock file that is used to make sure that only one instance # of this script is working on the current directory -pidfile=process.pid +jobidfile=process.jobid # set the prefix of the plotfiles and checkpoint files @@ -32,16 +32,31 @@ FTIME_EXE=ftime.Linux.gfortran.exe # initialization stuff # check to make sure that the lock file does not already exist. 
-if [ -f $pidfile ]; then - echo 2>&1 "process lock file " $pidfile " already exists" - exit -1 +if [ -f "$jobidfile" ]; then + # check if job is still running + existing_job=$(<"$jobidfile") + if [ "$(sacct -X -P -n -o State -j "$existing_job")" != RUNNING ]; then + echo "process: removing stale lock file for job $existing_job" + rm "$jobidfile" + else + echo "process job $existing_job is still running" + exit 2 + fi fi # create the lock file -echo $$ > $pidfile - -# if our process if killed, remove the lock file first -trap '/bin/rm -f $pidfile' EXIT HUP TERM XCPU KILL +echo "$SLURM_JOB_ID" > "$jobidfile" + +# if our process is killed, remove the lock file first +function cleanup { + echo "process: received signal; removing $jobidfile" + command rm -f "$jobidfile" + # remove the EXIT handler, since we only want to do this once + trap - EXIT + # don't exit, so we can finish the current operation: + # $jobidfile is checked at the start of each loop iteration in process_files() +} +trap cleanup EXIT HUP INT QUIT TERM XCPU # Number of seconds to sleep before checking again. N=60 @@ -78,8 +93,8 @@ fi function process_files { - if [ ! -f $pidfile ]; then - echo "process: $pidfile has been removed, exiting" + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" exit fi @@ -102,6 +117,10 @@ function process_files for dir in "${pltlist[@]}" do + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi if [ -d ${dir} ]; then # only work on the file if there is not a .processed file in the @@ -170,6 +189,10 @@ function process_files for dir in "${chklist[@]}" do + if [ ! -f $jobidfile ]; then + echo "process: $jobidfile has been removed, exiting" + exit + fi if [ -d ${dir} ]; then if [ ! -f ${dir}.processed ] && [ ! -f checkfiles/${dir}.processed ]; then From 89fbb79110e3562a81f5270805efc21e856ae980 Mon Sep 17 00:00:00 2001 From: "Eric T. 
Johnson" Date: Wed, 31 Jan 2024 17:59:04 -0500 Subject: [PATCH 7/8] Use srun to run process.xrb for better signal handling --- job_scripts/hpss/process.xrb | 4 +++- job_scripts/perlmutter/nersc.xfer.slurm | 14 ++++---------- job_scripts/summit/summit_hpss.submit | 12 +++--------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/job_scripts/hpss/process.xrb b/job_scripts/hpss/process.xrb index 634f467..4b5f3c5 100755 --- a/job_scripts/hpss/process.xrb +++ b/job_scripts/hpss/process.xrb @@ -255,5 +255,7 @@ ${HTAR} -P -cvf ${HPSS_DIR}/diag_files_${datestr}.tar "${all_files[@]}" >> /dev/ while true do process_files - sleep $N + # put sleep in the background so the shell can handle signals + sleep $N & + wait done diff --git a/job_scripts/perlmutter/nersc.xfer.slurm b/job_scripts/perlmutter/nersc.xfer.slurm index 5e2879f..41af123 100644 --- a/job_scripts/perlmutter/nersc.xfer.slurm +++ b/job_scripts/perlmutter/nersc.xfer.slurm @@ -1,17 +1,11 @@ -#!/bin/ksh +#!/bin/bash #SBATCH --qos=xfer #SBATCH -J xrb-hpss-xfer #SBATCH -t 12:00:00 #SBATCH --licenses=SCRATCH -cd $SLURM_SUBMIT_DIR - # do our archiving -pidfile=process.pid - -./process.xrb - -PID=$! -trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL +cd "$SLURM_SUBMIT_DIR" || exit -rm -f process.pid +# use srun so any control signals get sent to the child too +srun ./process.xrb diff --git a/job_scripts/summit/summit_hpss.submit b/job_scripts/summit/summit_hpss.submit index 8e366ef..d212aa8 100644 --- a/job_scripts/summit/summit_hpss.submit +++ b/job_scripts/summit/summit_hpss.submit @@ -5,13 +5,7 @@ #SBATCH -N 1 # do our archiving -pidfile=process.pid +cd "$SLURM_SUBMIT_DIR" || exit -cd $SLURM_SUBMIT_DIR - -./process.xrb - -PID=$! -trap 'kill -s TERM $PID' EXIT TERM HUP XCPU KILL - -rm -f process.pid +# use srun so any control signals get sent to the child too +srun ./process.xrb From 581e93272a47b0d8a0c26f44c60e475e1281a5a6 Mon Sep 17 00:00:00 2001 From: "Eric T. 
Johnson" Date: Wed, 31 Jan 2024 18:41:33 -0500 Subject: [PATCH 8/8] Update docs to reflect process.xrb changes --- sphinx_docs/source/nersc-hpss.rst | 34 +++++++++------------------- sphinx_docs/source/olcf-workflow.rst | 14 ++++++------ 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/sphinx_docs/source/nersc-hpss.rst b/sphinx_docs/source/nersc-hpss.rst index 0273fbf..2784a67 100644 --- a/sphinx_docs/source/nersc-hpss.rst +++ b/sphinx_docs/source/nersc-hpss.rst @@ -19,31 +19,17 @@ script ``process.xrb`` in ``job_scripts/hpss/``: :download:`process.xrb <../../job_scripts/hpss/process.xrb>` -which continually looks for output and stores -it to HPSS. +which continually looks for output and stores it to HPSS. +By default, the destination directory on HPSS will have the same name +as the directory your plotfiles are located in. This can be changed by +editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``. The following describes how to use the scripts: -1. Create a directory in HPSS that has the same - name as the directory your plotfiles are located in - (just the directory name, not the full path). e.g. if you are running in a directory call - ``/pscratch/sd/z/zingale/wdconvect/`` run, then do: - - .. prompt:: bash - - hsi - mkdir wdconvect - - .. note:: - - If the ``hsi`` command prompts you for your password, you will need - to talk to the NERSC help desk to ask for password-less access to - HPSS. - -2. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm`` +#. Copy the ``process.xrb`` script and the slurm script ``nersc.xfer.slurm`` into the directory with the plotfiles. -3. Submit the archive job: +#. Submit the archive job: .. prompt:: bash @@ -80,14 +66,16 @@ Some additional notes: the date-string to allow multiple archives to co-exist. 
* When ``process.xrb`` is running, it creates a lockfile (called - ``process.pid``) that ensures that only one instance of the script + ``process.jobid``) that ensures that only one instance of the script is running at any one time. .. warning:: Sometimes if the job is not terminated normally, the - ``process.pid`` file will be left behind, in which case, the script - aborts. Just delete that if you know the script is not running. + ``process.jobid`` file will be left behind. Later jobs should be + able to detect this and clean up the stale lockfile, but if this + doesn't work, you can delete the file if you know the script is not + running. Jobs in the xfer queue start up quickly. The best approach is to start one as you start your main job (or make it dependent on the main diff --git a/sphinx_docs/source/olcf-workflow.rst b/sphinx_docs/source/olcf-workflow.rst index 93284c7..ab114ab 100644 --- a/sphinx_docs/source/olcf-workflow.rst +++ b/sphinx_docs/source/olcf-workflow.rst @@ -394,15 +394,15 @@ will also store the inputs, probin, and other runtime generated files. If Once the plotfiles are archived they are moved to a subdirectory under your run directory called ``plotfiles/``. +By default, the files will be archived to a directory in HPSS with the same +name as the directory your plotfiles are located in. This can be changed +by editing the ``$HPSS_DIR`` variable at the top of ``process.xrb``. -To use this, we do the following: - -#. Enter the HPSS system via ``hsi`` -#. Create the output directory -- this should have the same name as the directory - you are running in on summit +To use this, we do the following: -#. Exit HPSS +#. Copy the ``process.xrb`` and ``summit_hpss.submit`` scripts into the + directory with the plotfiles. #. 
Launch the script via: @@ -410,7 +410,7 @@ To use this, we do the following: sbatch summit_hpss.submit - It will for the full time you asked, searching for plotfiles as + It will run for the full time you asked, searching for plotfiles as they are created and moving them to HPSS as they are produced (it will always leave the very last plotfile alone, since it can't tell if it is still being written).