50_hpc_cluster_slurm_prolog
#!/usr/bin/bash
# Runs when the first job or job step is initiated on this node.
# With PrologFlags=Alloc, the script is instead executed at job allocation.
# Invoked by the slurmd daemon as SlurmdUser (i.e. root).
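# Example slurm.conf wiring for this prolog (illustrative; the path below is an
# assumption, adjust it for your site):
#   Prolog=/etc/slurm/prolog.d/50_hpc_cluster_slurm_prolog
#   PrologFlags=Alloc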
export PATH="/usr/share/Modules/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin"
SLURM_BIN=/usr/bin
SLURM_SCONTROL=${SLURM_BIN}/scontrol
SLURM_SINFO=${SLURM_BIN}/sinfo
HOST_=$(/bin/hostname -s)
# Nothing to do when not invoked in a job context
if [[ -z "$SLURM_JOB_UID" || -z "$SLURM_JOB_ID" ]]; then exit 0; fi
UNHEALTHY=0
REASON=""
# While dedicated nodes should never have a REASON when they enter the PROLOG,
# it is possible for a shared node to fail in the EPILOG of one job WHILE
# the PROLOG for another job is running. Until we create a barrier that prevents
# this, we need to be more careful here...
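# Usage: check_status EXPECTED ACTUAL "MESSAGE"
# (flags the node unhealthy and accumulates MESSAGE in REASON when EXPECTED != ACTUAL)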
function check_status(){
    # Mark node unhealthy if any check fails and append message to reason
    if [[ $1 != "$2" ]]; then
        if [[ $UNHEALTHY -eq 0 ]]; then
            UNHEALTHY=1
            PREV_REASON=$(${SLURM_SINFO} -o '%E' -hn ${HOST_} | /bin/sed -e '/^none$/d')
            if [[ ${#PREV_REASON} -gt 0 ]]; then REASON="$PREV_REASON"; fi
        fi
        NEW_REASON="$3;"
        # Append the new reason only if it is not already part of REASON
        if [[ "$REASON" != *$NEW_REASON* ]]; then REASON="$REASON$NEW_REASON"; fi
    fi
}
# Create local scratch directory
/bin/mkdir -p -m 0700 /scratch/${SLURM_JOB_USER}/job_${SLURM_JOB_ID}
check_status 0 $? "PROLOG: Cannot create local scratch"
# Set local scratch ownership and permissions
/bin/chmod 0700 /scratch/${SLURM_JOB_USER}
check_status 0 $? "PROLOG: Cannot chmod user local scratch dir"
/bin/chown ${SLURM_JOB_USER} /scratch/${SLURM_JOB_USER}
check_status 0 $? "PROLOG: Cannot chown user local scratch dir"
/bin/chown -R ${SLURM_JOB_USER} /scratch/${SLURM_JOB_USER}/job_${SLURM_JOB_ID}
check_status 0 $? "PROLOG: Cannot chown job local scratch dir"
# Set NVIDIA driver modes
#/usr/bin/nvidia-smi --compute-mode=0 >> /dev/null 2>&1
#/usr/bin/nvidia-smi --persistence-mode=1 >> /dev/null 2>&1
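# (Left disabled here; if enabled, these would set the GPU compute and persistence modes on GPU nodes.)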
# Clean up caches on non-shared nodes (i.e. only this job's scratch directory exists)
if [[ $(ls -d /scratch/*/* | wc -l) -eq 1 ]]; then
/bin/sync
/bin/echo 3 > /proc/sys/vm/drop_caches
fi
# Offline the node with REASON if any prolog check failed
if [[ $UNHEALTHY -ne 0 ]]; then
${SLURM_SCONTROL} update nodename=${HOST_} state=fail reason="${REASON}"
exit 1
fi
exit 0