-
Notifications
You must be signed in to change notification settings - Fork 0
/
50_hpc_cluster_slurm_epilog
102 lines (83 loc) · 3.97 KB
/
50_hpc_cluster_slurm_epilog
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/bash
# Slurm epilog: runs at job termination.
# Invoked by the slurmd daemon as SlurmdUser (i.e. root).
#
# Globals defined here and used by the rest of the script:
#   SLURM_BIN       directory holding the Slurm client binaries
#   SLURM_SCONTROL  path to scontrol
#   SLURM_SINFO     path to sinfo
#   HOST_           short host name of this node
#   UNHEALTHY       0 while all checks pass, 1 once any check has failed
#   REASON          ';'-terminated failure messages collected so far
SLURM_BIN=/usr/bin
SLURM_SCONTROL="${SLURM_BIN}/scontrol"
SLURM_SINFO="${SLURM_BIN}/sinfo"
HOST_="$(/bin/hostname -s)"

# Without a job owner and a job id there is nothing to clean up.
# Quoted [[ -z ]] tests are used so that an unexpected value containing
# whitespace or glob characters cannot break the test expression.
if [[ -z "${SLURM_JOB_UID}" || -z "${SLURM_JOB_ID}" ]]; then
  exit 0
fi

UNHEALTHY=0
REASON=""
function check_status(){
  # Record a failed health check.
  #
  # Arguments:
  #   $1  expected value
  #   $2  actual value (compared to $1 as a string)
  #   $3  message to record when the two differ
  # Globals:
  #   UNHEALTHY    (read/written) set to 1 on the first failed check
  #   REASON       (read/written) ';'-terminated list of failure messages
  #   SLURM_SINFO  (read) sinfo command used to fetch the node's current reason
  #   HOST_        (read) node name passed to sinfo
  # Returns: always 0.
  local expected=$1 actual=$2 message=$3
  local prev_reason new_reason

  if [[ "${expected}" == "${actual}" ]]; then return 0; fi

  if [[ ${UNHEALTHY} -eq 0 ]]; then
    UNHEALTHY=1
    # Start from the reason already set on the node, if any; sinfo prints
    # "none" when no reason is set, and that line is dropped.
    prev_reason=$("${SLURM_SINFO}" -o '%E' -hn "${HOST_}" | /bin/sed -e '/^none$/d')
    if [[ -n "${prev_reason}" ]]; then
      REASON="${prev_reason}"
      # Keep the list ';'-terminated so the next message does not get glued
      # onto the end of the previous reason.
      if [[ "${REASON}" != *';' ]]; then REASON="${REASON};"; fi
    fi
  fi

  new_reason="${message};"
  # Append the new reason only if it is not already present. The pattern is
  # quoted so that glob characters in the message ([, *, ?) are matched
  # literally; unquoted, such a message would never match itself and would
  # be appended again on every call.
  if [[ "${REASON}" != *"${new_reason}"* ]]; then REASON="${REASON}${new_reason}"; fi
  return 0
}
function user_cleanup(){
  # Called at end of last job by user: terminate the user's remaining
  # processes and remove their SysV IPC objects and temporary files.
  #
  # Globals:
  #   SLURM_JOB_USER  (read) login name of the job owner
  #   SLURM_JOB_ID    (read) used in the failure message only
  # Calls check_status to flag the node when processes survive SIGKILL.
  # Returns: 1 without doing anything when SLURM_JOB_USER is empty;
  #          otherwise the status of the last file-removal pipeline.
  local user="${SLURM_JOB_USER}"
  local ipc_kind ipc_id pids

  # With an empty user name the filters below would match every owner,
  # so refuse to run at all.
  if [[ -z "${user}" ]]; then
    printf '%s\n' "EPILOG: SLURM_JOB_USER is empty; skipping user cleanup" >&2
    return 1
  fi

  # Give stuff a chance to cleanup (killall sends SIGTERM by default)
  /usr/bin/killall -u "${user}"

  # Remove leftover semaphores (-s), shared memory (-m) and message queues (-q).
  # ipcs has no per-user filter, so select rows by the owner column ($3) and
  # require a numeric id ($2) to skip header lines. The owner must match
  # exactly, so user "bob" does not also match "bobby".
  # NOTE(review): some ipcs versions appear to truncate the owner column to
  # 10 characters, hence the additional substr() match - confirm on the
  # target systems.
  for ipc_kind in s m q; do
    for ipc_id in $(/usr/bin/ipcs "-${ipc_kind}" \
      | /bin/awk -v owner="${user}" \
          '$2 ~ /^[0-9]+$/ && ($3 == owner || $3 == substr(owner, 1, 10)) {print $2}'); do
      /usr/bin/ipcrm "-${ipc_kind}" "${ipc_id}"
    done
  done

  # Processes that ignored SIGTERM: wait, then SIGKILL and wait again.
  pids=$(/bin/ps -u "${user}" -o "pid=")
  if [[ -n "${pids}" ]]; then
    /bin/sleep 20
    /usr/bin/killall -9 -u "${user}"
    /bin/sleep 10
  fi

  # Anything still alive after SIGKILL marks the node unhealthy.
  pids=$(/bin/ps -u "${user}" -o "pid=")
  if [[ -n "${pids}" ]]; then
    check_status 0 1 "EPILOG: Leftover process for ${user} from ${SLURM_JOB_ID}"
  fi

  # Remove any files the user created in /tmp or /dev/shm.
  # Only regular files are removed; directories are left in place.
  /bin/find /tmp -user "${user}" -type f -print0 | /usr/bin/xargs -0 -r /bin/rm -f
  /bin/find /dev/shm -user "${user}" -type f -print0 | /usr/bin/xargs -0 -r /bin/rm -f
}
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
SLURM_EPILOG_LOCK_FILE=/dev/shm/slurm.epilog.lock

function epilog_locked_cleanup(){
  # Epilog body that must not run concurrently with another epilog on this
  # node. Expects file descriptor 200 to be open on the lock file.
  #
  # This is a function called with a redirection rather than a ( ... )
  # subshell: a subshell would discard the UNHEALTHY/REASON values written
  # by check_status, so the node could never be taken offline afterwards.
  #
  # Globals read: SLURM_BIN, SLURM_JOB_ID, SLURM_JOB_UID, SLURM_JOB_USER
  # Globals written (via check_status): UNHEALTHY, REASON
  # Returns: the flock status if the lock cannot be taken, otherwise 0.
  local rval job_list item job_id user_id
  local has_other_job=0       # another job (any user) is on this node
  local user_has_other_job=0  # another job of the same user is on this node

  /usr/bin/flock -x 200
  rval=$?
  if [[ ${rval} -ne 0 ]]; then return "${rval}"; fi

  # don't try to kill user root or system daemon jobs
  if [ "${SLURM_JOB_UID}" -lt 100 ]; then return 0; fi

  # Check if there are any other users or jobs (race condition possible)
  if job_list=$("${SLURM_BIN}/squeue" --noheader --format="%A|%U" --node=localhost); then
    # Intentional word splitting: one "jobid|uid" token per job.
    for item in ${job_list}; do
      job_id=${item%%|*}
      user_id=${item#*|}
      if [[ "${job_id}" != "${SLURM_JOB_ID}" ]]; then
        has_other_job=1
        if [[ "${user_id}" == "${SLURM_JOB_UID}" ]]; then user_has_other_job=1; fi
      fi
    done
  else
    # The job list is unknown: assume the node is shared rather than risk
    # killing processes that belong to a job that is still running.
    has_other_job=1
    user_has_other_job=1
  fi

  # Cleanup based on presence of other jobs or users on the node.
  # user_cleanup kills every process of the user, so it may only run when
  # this was the user's last job here; caches are dropped only when the
  # node is completely idle.
  if [[ ${user_has_other_job} -eq 0 ]]; then
    user_cleanup
  fi
  if [[ ${has_other_job} -eq 0 ]]; then
    /bin/sync
    /bin/echo 3 > /proc/sys/vm/drop_caches
    #/usr/bin/nvidia-smi --persistence-mode=0 >> /dev/null 2>&1;
  fi

  # Remove local per job scratch space
  /bin/rm -rf "/scratch/${SLURM_JOB_USER}/job_${SLURM_JOB_ID}"
  check_status 0 $? "Cannot remove local scratch space"

  # Clean up /tmp and /dev/shm
  # Remove regular files neither accessed nor modified for more than 8 days
  # (find counts whole 24h periods, so +8 matches from 9 days on).
  /usr/bin/find /tmp -xdev -atime +8 -mtime +8 -type f -print0 | /usr/bin/xargs -0 -r /bin/rm -f
  /usr/bin/find /dev/shm -xdev -atime +8 -mtime +8 -type f -print0 | /usr/bin/xargs -0 -r /bin/rm -f

  # Return GPU driver settings to default state
  #/usr/bin/nvidia-smi --compute-mode=0 >> /dev/null 2>&1
  return 0
}

# fd 200 stays open on the lock file for the duration of the call only, so
# the lock is released as soon as the function returns.
epilog_locked_cleanup 200>"${SLURM_EPILOG_LOCK_FILE}"
# Final verdict: when UNHEALTHY is still zero the epilog succeeds; otherwise
# the node is set to state "fail" with the collected REASON and the epilog
# exits non-zero.
if (( UNHEALTHY == 0 )); then
  exit 0
fi
${SLURM_SCONTROL} update nodename=${HOST_} state=fail reason="${REASON}"
exit 1