From 00a06ea573dd5b8f58097fe0356c897f804ec287 Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Fri, 23 Oct 2020 18:18:24 +0100 Subject: [PATCH 01/10] added tasksetting --- README.md | 3 ++ cpu_affinities.json | 66 ++++++++++++++++++++++++++++++++++++++++++++ gpu_monitor.py | 67 ++++++++++++++++++++++++++++++++++++++++----- servers.txt | 5 ++++ 4 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 cpu_affinities.json diff --git a/README.md b/README.md index 0b83306..545e54f 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ The script works by using your account to SSH into the servers and running `nvid - Show all current users of all GPUs (-l or --list) - Show all GPUs used by yourself (-m or --me) - Resolve usernames to real names (-f or --finger) +- Taskset jobs running on GPUs (-t or --taskset) ## Requirements @@ -35,6 +36,8 @@ Server myserver.com: If you have some set of servers that you regularily check, specify them in the file `servers.txt`, one address per line. Once you did that, running just `./gpu_monitor.py` checks all servers specified in this file by default. +If you wish to automatically set the CPU-GPU affinities, specify the `cpu_affinities.json` file as shown in the example. 
+ If you want to list all GPUs and who currently uses them, you can use the `-l` flag: ``` > ./gpu_monitor.py -l myserver.com diff --git a/cpu_affinities.json b/cpu_affinities.json new file mode 100644 index 0000000..f069158 --- /dev/null +++ b/cpu_affinities.json @@ -0,0 +1,66 @@ +{ + "monal04.doc.ic.ac.uk": { + "affinities": { + "0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "1": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], + "2": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], + "3": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51] + } + }, + "monal03.doc.ic.ac.uk": { + "affinities": { + "0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + "1": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], + "2": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], + "3": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51] + } + }, + "monal05.doc.ic.ac.uk": { + "affinities": { + "0": [0,1,2], + "1": [3,4,5], + "2": [6,7,8], + "3": [9,10,11], + "4": [12,13,28], + "5": [29,30,31], + "6": [32,33,34], + "7": [35,36,37], + "8": [38,39,46,47,48,49], + "9": [40,41,42,43,44,45] + } + }, + "monal06.doc.ic.ac.uk": { + "affinities": { + "0": [0,1,2], + "1": [3,4,5], + "2": [6,7,8], + "3": [9,10,11], + "4": [12,13,28], + "5": [29,30,31], + "6": [32,33,34], + "7": [35,36,37], + "8": [38,39,46,47,48,49], + "9": [40,41,42,43,44,45] + } + }, + "lory.doc.ic.ac.uk": { + "affinities": { + "0": [0,1,2,3,4], + "1": [5,6,7,8,9], + "2": [10,11,12,13,14], + "3": [15,16,17,18,19], + "4": [40,41,42,43,44], + "5": [45,46,47,48,49], + "6": [50,51,52,53,54], + "7": [55,56,57,58,59], + "8": [20,21,22,23,24], + "9": [25,26,27,28,29], + "10": [30,31,32,33,34], + "11": [35,36,37,38,39], + "12": [60,61,62,63,64], + "13": [65,66,67,68, 69], + "14": [70,71,72,73,74], + "15": [75,76,77,78,79] + } + } +} \ No newline at end of file diff --git a/gpu_monitor.py b/gpu_monitor.py index 11df2f0..401e238 100755 --- a/gpu_monitor.py +++ b/gpu_monitor.py @@ -10,6 +10,7 @@ import pwd import 
subprocess import sys +import json import xml.etree.ElementTree as ET from collections import defaultdict from functools import partial @@ -26,10 +27,18 @@ SERVER_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), DEFAULT_SERVER_FILE) +# Default cpu affinities file for tasksetting +DEFAULT_TASKSET_FILE = 'cpu_affinities.json' +SERVER_TASKSET_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), + DEFAULT_TASKSET_FILE) + +NULL_FUNCTION = lambda *args, **kwargs : None + parser = argparse.ArgumentParser(description='Check state of GPU servers') parser.add_argument('-v', '--verbose', action='store_true', help='Be verbose') parser.add_argument('-l', '--list', action='store_true', help='Show used GPUs') +parser.add_argument('-t', '--taskset', action='store_true', help='Use Taskset to set CPU-GPU Affinities') parser.add_argument('-f', '--finger', action='store_true', help='Attempt to resolve user names to real names') parser.add_argument('-m', '--me', action='store_true', @@ -44,6 +53,8 @@ 'is interrupted')) parser.add_argument('--server-file', default=SERVER_FILE_PATH, help='File with addresses of servers to check') +parser.add_argument('--taskset-file', default=SERVER_TASKSET_PATH, + help='File with cpu affinities information if using tasksetting functionality') parser.add_argument('servers', nargs='*', default=[], help='Servers to probe') @@ -60,9 +71,15 @@ # Command for running ps locally PS_CMD = 'ps -o pid= -o ruser= -p {pids}' +# Command for tasksetting locally +TASKSET_CMD = 'taskset -cp {cpus} {pid}' + # Command for running ps remotely REMOTE_PS_CMD = '{} {}'.format(SSH_CMD, PS_CMD) +# Command for tasksetting remotely +REMOTE_TASKSET_CMD = '{} {}'.format(SSH_CMD, TASKSET_CMD) + # Command for getting real names remotely # See https://stackoverflow.com/a/38235661 REAL_NAMES_CMD = """<<-"EOF" @@ -114,6 +131,10 @@ def run_ps_local(pids): res = run_command(cmd) return res.decode('ascii') if res is not None else None +def 
run_taskset_local(cpus, pid): + cmd = TASKSET_CMD.format(cpus=','.join(cpus), pid=pid) + res = run_command(cmd) + return res.decode('ascii') if res is not None else None def run_ps_remote(server, pids, ssh_timeout, cmd_timeout): cmd = REMOTE_PS_CMD.format(server=server, @@ -124,6 +145,17 @@ def run_ps_remote(server, pids, ssh_timeout, cmd_timeout): return res.decode('ascii') if res is not None else None +def run_taskset_remote(server, cpus, pid, ssh_timeout, cmd_timeout): + cmd = REMOTE_TASKSET_CMD.format(server=server, + pid=pid, + cpus=','.join(cpus), + ssh_timeout=ssh_timeout, + cmd_timeout=cmd_timeout) + res = run_command(cmd) + return res.decode('ascii') if res is not None else None + + + def get_real_names_local(users): real_names_by_users = {} for user in users: @@ -183,9 +215,9 @@ def print_free_gpus(server, gpu_infos): info('\tGPU {}, {}'.format(gpu_info['idx'], gpu_info['model'])) -def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, - filter_by_user=None, - translate_to_real_names=False): +def print_gpu_infos(server, gpu_infos, run_ps, run_taskset, + run_get_real_names, filter_by_user=None, + translate_to_real_names=False, cpu_affinities={}): pids = [pid for gpu_info in gpu_infos for pid in gpu_info['pids']] if len(pids) > 0: ps = run_ps(pids=pids) @@ -197,6 +229,11 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, else: users_by_pid = {} + if server in cpu_affinities.keys(): + for pid in pids: + taskset = run_taskset(cpus=cpu_affinities[server]["affinities"], + pid=pid) + if translate_to_real_names: all_users = set((users_by_pid[pid] for gpu_info in gpu_infos for pid in gpu_info['pids'])) @@ -238,6 +275,14 @@ def main(argv): error('Could not open server file {}'.format(args.server_file)) return + try: + debug('Using taskset file {}'.format(args.taskset_file)) + with open(args.taskset_file, 'r') as f: + cpu_affinities = json.load(f) + except OSError as e: + error('Could not open taskset file {}'.format(args.taskset_file)) 
+ cpu_affinities = {} + if len(args.servers) == 0: error(('No GPU servers to connect to specified.\nPut addresses in ' 'the server file or specify them manually as an argument')) @@ -258,6 +303,7 @@ def main(argv): if server == '.' or server == 'localhost' or server == '127.0.0.1': run_nvidiasmi = run_nvidiasmi_local run_ps = run_ps_local + run_taskset = run_taskset_local run_get_real_names = get_real_names_local else: run_nvidiasmi = partial(run_nvidiasmi_remote, @@ -268,11 +314,17 @@ def main(argv): server=server, ssh_timeout=args.ssh_timeout, cmd_timeout=args.cmd_timeout) + run_taskset = partial(run_taskset_remote, + server=server, + ssh_timeout=args.ssh_timeout, + cmd_timeout=args.cmd_timeout) run_get_real_names = partial(get_real_names_remote, server=server, ssh_timeout=args.ssh_timeout, cmd_timeout=args.cmd_timeout) + run_taskset = NULL_FUNCTION if args.taskset is False else run_taskset + nvidiasmi = run_nvidiasmi() if nvidiasmi is None: error(('Could not reach {} or ' @@ -281,10 +333,11 @@ def main(argv): gpu_infos = get_gpu_infos(nvidiasmi) - if args.list: - print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, - filter_by_user=args.user, - translate_to_real_names=args.finger) + if args.list or args.taskset: + print_gpu_infos(server, gpu_infos, run_ps, run_taskset, + run_get_real_names, filter_by_user=args.user, + translate_to_real_names=args.finger, + cpu_affinities=cpu_affinities) else: print_free_gpus(server, gpu_infos) diff --git a/servers.txt b/servers.txt index e69de29..ebc579a 100644 --- a/servers.txt +++ b/servers.txt @@ -0,0 +1,5 @@ +monal03.doc.ic.ac.uk +monal04.doc.ic.ac.uk +monal05.doc.ic.ac.uk +monal06.doc.ic.ac.uk +lory.doc.ic.ac.uk \ No newline at end of file From f44103576f1aa64010c575fdb35ac715e500f037 Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Fri, 23 Oct 2020 19:06:26 +0100 Subject: [PATCH 02/10] added taskset daemon --- gpu_monitor.py | 9 ++++++--- taskset_daemon.sh | 6 
++++++ 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100755 taskset_daemon.sh diff --git a/gpu_monitor.py b/gpu_monitor.py index 401e238..7975c6a 100755 --- a/gpu_monitor.py +++ b/gpu_monitor.py @@ -132,6 +132,7 @@ def run_ps_local(pids): return res.decode('ascii') if res is not None else None def run_taskset_local(cpus, pid): + cpus = [str(x) for x in cpus] cmd = TASKSET_CMD.format(cpus=','.join(cpus), pid=pid) res = run_command(cmd) return res.decode('ascii') if res is not None else None @@ -146,6 +147,7 @@ def run_ps_remote(server, pids, ssh_timeout, cmd_timeout): def run_taskset_remote(server, cpus, pid, ssh_timeout, cmd_timeout): + cpus = [str(x) for x in cpus] cmd = REMOTE_TASKSET_CMD.format(server=server, pid=pid, cpus=','.join(cpus), @@ -230,9 +232,10 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_taskset, users_by_pid = {} if server in cpu_affinities.keys(): - for pid in pids: - taskset = run_taskset(cpus=cpu_affinities[server]["affinities"], - pid=pid) + for gpu_info in gpu_infos: + for pid in gpu_info["pids"]: + gpu_cpus=cpu_affinities[server]["affinities"][str(gpu_info["idx"])] + taskset = run_taskset(cpus=gpu_cpus, pid=pid) if translate_to_real_names: all_users = set((users_by_pid[pid] for gpu_info in gpu_infos diff --git a/taskset_daemon.sh b/taskset_daemon.sh new file mode 100755 index 0000000..825fa08 --- /dev/null +++ b/taskset_daemon.sh @@ -0,0 +1,6 @@ +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +script=python $DIR/gpu_monitor.py -v -t + +cmd="watch -n 300 " +$cmd $script + From d5963dc6edbf34d07cb284fbf5ae033a7ea979f7 Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Fri, 23 Oct 2020 19:12:45 +0100 Subject: [PATCH 03/10] typo --- taskset_daemon.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taskset_daemon.sh b/taskset_daemon.sh index 825fa08..655496f 100755 --- a/taskset_daemon.sh +++ b/taskset_daemon.sh @@ -2,5 +2,5 @@ 
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" script=python $DIR/gpu_monitor.py -v -t cmd="watch -n 300 " -$cmd $script +$cmd "$script" From e6681d9f791c5e03d4ed9f6dd782c2822bea3cf3 Mon Sep 17 00:00:00 2001 From: "Huaqi (Harvey) Qiu" Date: Fri, 23 Oct 2020 23:16:28 +0100 Subject: [PATCH 04/10] add hyper-threading cores to monal05 & 06 --- cpu_affinities.json | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cpu_affinities.json b/cpu_affinities.json index f069158..0d21fd0 100644 --- a/cpu_affinities.json +++ b/cpu_affinities.json @@ -17,30 +17,30 @@ }, "monal05.doc.ic.ac.uk": { "affinities": { - "0": [0,1,2], - "1": [3,4,5], - "2": [6,7,8], - "3": [9,10,11], - "4": [12,13,28], - "5": [29,30,31], - "6": [32,33,34], - "7": [35,36,37], - "8": [38,39,46,47,48,49], - "9": [40,41,42,43,44,45] + "0": [0,1,2,14,15,16], + "1": [3,4,5,17,18,19], + "2": [6,7,8,20,21,22], + "3": [9,10,11,23,24,25], + "4": [12,13,28,26,27], + "5": [29,30,31,43,44], + "6": [32,33,34,46,47], + "7": [35,36,37,49,50], + "8": [38,39,42,45,52,53], + "9": [40,41,48,51,54,55] } }, "monal06.doc.ic.ac.uk": { "affinities": { - "0": [0,1,2], - "1": [3,4,5], - "2": [6,7,8], - "3": [9,10,11], - "4": [12,13,28], - "5": [29,30,31], - "6": [32,33,34], - "7": [35,36,37], - "8": [38,39,46,47,48,49], - "9": [40,41,42,43,44,45] + "0": [0,1,2,14,15,16], + "1": [3,4,5,17,18,19], + "2": [6,7,8,20,21,22], + "3": [9,10,11,23,24,25], + "4": [12,13,28,26,27], + "5": [29,30,31,43,44], + "6": [32,33,34,46,47], + "7": [35,36,37,49,50], + "8": [38,39,42,45,52,53], + "9": [40,41,48,51,54,55] } }, "lory.doc.ic.ac.uk": { @@ -63,4 +63,4 @@ "15": [75,76,77,78,79] } } -} \ No newline at end of file +} From 609f44321a4da3d2577676d11dc0cc02f80d9211 Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Thu, 3 Dec 2020 12:41:43 +0000 Subject: [PATCH 05/10] [minor] mode --- LICENSE | 0 README.md | 0 
servers.txt | 0 3 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 servers.txt diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/servers.txt b/servers.txt old mode 100644 new mode 100755 From 2afccd472cc3cb8b94bce9e1a75b8e76b877dcd6 Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Thu, 3 Dec 2020 12:41:57 +0000 Subject: [PATCH 06/10] increased timeout --- gpu_monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu_monitor.py b/gpu_monitor.py index 7975c6a..3f1fa98 100755 --- a/gpu_monitor.py +++ b/gpu_monitor.py @@ -17,10 +17,10 @@ from logging import debug, info, error # Default timeout in seconds after which SSH stops trying to connect -DEFAULT_SSH_TIMEOUT = 3 +DEFAULT_SSH_TIMEOUT = 30 # Default timeout in seconds after which remote commands are interrupted -DEFAULT_CMD_TIMEOUT = 10 +DEFAULT_CMD_TIMEOUT = 50 # Default server file DEFAULT_SERVER_FILE = 'servers.txt' From 691b3d104bcbbf5a31539a7cefc31a06f91b2842 Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Thu, 3 Dec 2020 12:42:24 +0000 Subject: [PATCH 07/10] new affinities --- cpu_affinities.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 cpu_affinities.json diff --git a/cpu_affinities.json b/cpu_affinities.json old mode 100644 new mode 100755 From a61536c628a9f5f43b12850620dcbcdc690dbd2d Mon Sep 17 00:00:00 2001 From: Huaqi Qiu Date: Thu, 1 Jul 2021 17:29:13 +0100 Subject: [PATCH 08/10] daemon --- gpu_monitor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gpu_monitor.py b/gpu_monitor.py index 3f1fa98..d51c48e 100755 --- a/gpu_monitor.py +++ b/gpu_monitor.py @@ -346,4 +346,8 @@ def main(argv): if __name__ == '__main__': - 
main(sys.argv[1:]) + import time + while True: + main(sys.argv[1:]) + time.sleep(15) + From 0ab040d692264dcb62b94d43ebc3449839ba58c8 Mon Sep 17 00:00:00 2001 From: gavinlive <22893238+gavinlive@users.noreply.github.com> Date: Thu, 1 Jul 2021 17:33:51 +0100 Subject: [PATCH 09/10] main daemonised --- taskset_daemon.sh | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 taskset_daemon.sh diff --git a/taskset_daemon.sh b/taskset_daemon.sh deleted file mode 100755 index 655496f..0000000 --- a/taskset_daemon.sh +++ /dev/null @@ -1,6 +0,0 @@ -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -script=python $DIR/gpu_monitor.py -v -t - -cmd="watch -n 300 " -$cmd "$script" - From d386e84a5a44c3d65b40b3d0855491de7380ac9f Mon Sep 17 00:00:00 2001 From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com> Date: Thu, 1 Jul 2021 17:46:21 +0100 Subject: [PATCH 10/10] Added daemon --- README.md | 1 + gpu_monitor.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 545e54f..de33135 100755 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ The script works by using your account to SSH into the servers and running `nvid - Show all GPUs used by yourself (-m or --me) - Resolve usernames to real names (-f or --finger) - Taskset jobs running on GPUs (-t or --taskset) +- Run on a loop (-d or --daemon) ## Requirements diff --git a/gpu_monitor.py b/gpu_monitor.py index d51c48e..b125a90 100755 --- a/gpu_monitor.py +++ b/gpu_monitor.py @@ -10,6 +10,7 @@ import pwd import subprocess import sys +import time import json import xml.etree.ElementTree as ET from collections import defaultdict @@ -38,6 +39,8 @@ parser.add_argument('-v', '--verbose', action='store_true', help='Be verbose') parser.add_argument('-l', '--list', action='store_true', help='Show used GPUs') +parser.add_argument('-d', '--daemon', action='store_true', + help='Loop') parser.add_argument('-t', '--taskset', 
action='store_true', help='Use Taskset to set CPU-GPU Affinities') parser.add_argument('-f', '--finger', action='store_true', help='Attempt to resolve user names to real names') @@ -262,9 +265,8 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_taskset, status)) -def main(argv): - args = parser.parse_args(argv) - +def main(args): + logging.basicConfig(format='%(message)s', level=logging.DEBUG if args.verbose else logging.INFO) @@ -346,8 +348,11 @@ def main(argv): if __name__ == '__main__': - import time - while True: - main(sys.argv[1:]) - time.sleep(15) + args = parser.parse_args(sys.argv[1:]) + if args.daemon: + while True: + main(args) + time.sleep(15) + else: + main(args)