Skip to content

Commit

Permalink
Merge pull request #438 from NREL/ccaradon/kestrel_tmp_dir_fix
Browse files Browse the repository at this point in the history
Fixes temp directory and bigmem issues with Kestrel runs
  • Loading branch information
nmerket authored Apr 16, 2024
2 parents 92993de + 1748ec9 commit 670ad58
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
8 changes: 7 additions & 1 deletion buildstockbatch/hpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,7 @@ def queue_sampling(
"--time={}".format(cfg[cls.HPC_NAME].get("sampling", {}).get("time", 60)),
"--account={}".format(cfg[cls.HPC_NAME]["account"]),
"--nodes=1",
"--mem={}".format(cls.DEFAULT_NODE_MEMORY_MB),
"--export={}".format(",".join(env.keys())),
"--output=sampling.out",
hpc_sh,
Expand Down Expand Up @@ -536,6 +537,7 @@ def queue_jobs(self, array_ids=None, hipri=False):
"sbatch",
"--account={}".format(account),
"--time={}".format(walltime),
"--mem={}".format(self.DEFAULT_NODE_MEMORY_MB),
"--export={}".format(",".join(export_vars)),
"--array={}".format(array_spec),
"--output=job.out-%a",
Expand Down Expand Up @@ -617,13 +619,15 @@ def queue_post_processing(self, after_jobids=[], upload_only=False, hipri=False)

args = [
"sbatch",
"--tmp=1000000",
"--account={}".format(account),
"--time={}".format(walltime),
"--export={}".format(",".join(env_export.keys())),
"--job-name=bstkpost",
"--output=postprocessing.out",
"--nodes=1",
":",
"--tmp=1000000",
"--mem={}".format(memory),
"--output=dask_workers.out",
"--nodes={}".format(n_workers),
Expand Down Expand Up @@ -743,6 +747,7 @@ class EagleBatch(SlurmBatch):
CORES_PER_NODE = 36
MIN_SIMS_PER_JOB = 36 * 2
DEFAULT_POSTPROCESSING_NODE_MEMORY_MB = 85248
DEFAULT_NODE_MEMORY_MB = 85248 # standard node on Eagle
DEFAULT_POSTPROCESSING_N_PROCS = 18
DEFAULT_POSTPROCESSING_N_WORKERS = 2

Expand Down Expand Up @@ -773,7 +778,8 @@ class KestrelBatch(SlurmBatch):
HPC_NAME = "kestrel"
CORES_PER_NODE = 104
MIN_SIMS_PER_JOB = 104 * 2
DEFAULT_POSTPROCESSING_NODE_MEMORY_MB = 250000 # Standard node
DEFAULT_POSTPROCESSING_NODE_MEMORY_MB = 247000 # Standard node
DEFAULT_NODE_MEMORY_MB = 247000 # standard node on Kestrel
DEFAULT_POSTPROCESSING_N_PROCS = 52
DEFAULT_POSTPROCESSING_N_WORKERS = 2

Expand Down
4 changes: 4 additions & 0 deletions buildstockbatch/kestrel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ df -h

module load python apptainer
source "$MY_PYTHON_ENV/bin/activate"
# LOCAL_SCRATCH defaults to /tmp/scratch.
# Point it at a user-specific directory instead, to avoid
# issues with deleting debris left behind by previous buildstock runs.
export LOCAL_SCRATCH=/tmp/scratch/$USER
source /kfs2/shared-projects/buildstock/aws_credentials.sh

time python -u -m buildstockbatch.hpc kestrel "$PROJECTFILE"
6 changes: 5 additions & 1 deletion buildstockbatch/kestrel_postprocessing.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ df -h

module load python apptainer
source "$MY_PYTHON_ENV/bin/activate"
# LOCAL_SCRATCH defaults to /tmp/scratch.
# Point it at a user-specific directory instead, to avoid
# issues with deleting debris left behind by previous buildstock runs.
export LOCAL_SCRATCH=/tmp/scratch/$USER
source /kfs2/shared-projects/buildstock/aws_credentials.sh

export POSTPROCESS=1
Expand All @@ -30,6 +34,6 @@ pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "free -h"
pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "df -i; df -h"

$MY_PYTHON_ENV/bin/dask scheduler --scheduler-file $SCHEDULER_FILE &> $OUT_DIR/dask_scheduler.out &
pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "source /kfs2/shared-projects/buildstock/aws_credentials.sh; $MY_PYTHON_ENV/bin/dask worker --scheduler-file $SCHEDULER_FILE --local-directory /tmp/scratch/dask --nworkers ${NPROCS} --nthreads 1 --memory-limit ${MEMORY}MB" &> $OUT_DIR/dask_workers.out &
pdsh -w $SLURM_JOB_NODELIST_PACK_GROUP_1 "source /kfs2/shared-projects/buildstock/aws_credentials.sh; $MY_PYTHON_ENV/bin/dask worker --scheduler-file $SCHEDULER_FILE --local-directory $LOCAL_SCRATCH/dask --nworkers ${NPROCS} --nthreads 1 --memory-limit ${MEMORY}MB" &> $OUT_DIR/dask_workers.out &

time python -u -m buildstockbatch.hpc kestrel "$PROJECTFILE"

0 comments on commit 670ad58

Please sign in to comment.