-
Notifications
You must be signed in to change notification settings - Fork 29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added an option to retrospectively set CPU affinities for each process running on GPUs #1
base: master
Are you sure you want to change the base?
Changes from all commits
00a06ea
f441035
d5963dc
e6681d9
0f75f63
609f443
2afccd4
691b3d1
11ac8d0
a61536c
7d56430
0ab040d
d386e84
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
{ | ||
"monal04.doc.ic.ac.uk": { | ||
"affinities": { | ||
"0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], | ||
"1": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], | ||
"2": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], | ||
"3": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51] | ||
} | ||
}, | ||
"monal03.doc.ic.ac.uk": { | ||
"affinities": { | ||
"0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], | ||
"1": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], | ||
"2": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], | ||
"3": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51] | ||
} | ||
}, | ||
"monal05.doc.ic.ac.uk": { | ||
"affinities": { | ||
"0": [0,1,2,14,15,16], | ||
"1": [3,4,5,17,18,19], | ||
"2": [6,7,8,20,21,22], | ||
"3": [9,10,11,23,24,25], | ||
"4": [12,13,28,26,27], | ||
"5": [29,30,31,43,44], | ||
"6": [32,33,34,46,47], | ||
"7": [35,36,37,49,50], | ||
"8": [38,39,42,45,52,53], | ||
"9": [40,41,48,51,54,55] | ||
} | ||
}, | ||
"monal06.doc.ic.ac.uk": { | ||
"affinities": { | ||
"0": [0,1,2,14,15,16], | ||
"1": [3,4,5,17,18,19], | ||
"2": [6,7,8,20,21,22], | ||
"3": [9,10,11,23,24,25], | ||
"4": [12,13,28,26,27], | ||
"5": [29,30,31,43,44], | ||
"6": [32,33,34,46,47], | ||
"7": [35,36,37,49,50], | ||
"8": [38,39,42,45,52,53], | ||
"9": [40,41,48,51,54,55] | ||
} | ||
}, | ||
"lory.doc.ic.ac.uk": { | ||
"affinities": { | ||
"0": [0,1,2,3,4], | ||
"1": [5,6,7,8,9], | ||
"2": [10,11,12,13,14], | ||
"3": [15,16,17,18,19], | ||
"4": [40,41,42,43,44], | ||
"5": [45,46,47,48,49], | ||
"6": [50,51,52,53,54], | ||
"7": [55,56,57,58,59], | ||
"8": [20,21,22,23,24], | ||
"9": [25,26,27,28,29], | ||
"10": [30,31,32,33,34], | ||
"11": [35,36,37,38,39], | ||
"12": [60,61,62,63,64], | ||
"13": [65,66,67,68, 69], | ||
"14": [70,71,72,73,74], | ||
"15": [75,76,77,78,79] | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,26 +10,38 @@ | |
import pwd | ||
import subprocess | ||
import sys | ||
import time | ||
import json | ||
import xml.etree.ElementTree as ET | ||
from collections import defaultdict | ||
from functools import partial | ||
from logging import debug, info, error | ||
|
||
# Default timeout in seconds after which SSH stops trying to connect | ||
DEFAULT_SSH_TIMEOUT = 3 | ||
DEFAULT_SSH_TIMEOUT = 30 | ||
|
||
# Default timeout in seconds after which remote commands are interrupted | ||
DEFAULT_CMD_TIMEOUT = 10 | ||
DEFAULT_CMD_TIMEOUT = 50 | ||
|
||
# Default server file | ||
DEFAULT_SERVER_FILE = 'servers.txt' | ||
SERVER_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | ||
DEFAULT_SERVER_FILE) | ||
|
||
# Default cpu affinities file for tasksetting | ||
DEFAULT_TASKSET_FILE = 'cpu_affinities.json' | ||
SERVER_TASKSET_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), | ||
DEFAULT_TASKSET_FILE) | ||
|
||
NULL_FUNCTION = lambda *args, **kwargs : None | ||
|
||
parser = argparse.ArgumentParser(description='Check state of GPU servers') | ||
parser.add_argument('-v', '--verbose', action='store_true', | ||
help='Be verbose') | ||
parser.add_argument('-l', '--list', action='store_true', help='Show used GPUs') | ||
parser.add_argument('-d', '--daemon', action='store_true', | ||
help='Loop') | ||
parser.add_argument('-t', '--taskset', action='store_true', help='Use Taskset to set CPU-GPU Affinities') | ||
parser.add_argument('-f', '--finger', action='store_true', | ||
help='Attempt to resolve user names to real names') | ||
parser.add_argument('-m', '--me', action='store_true', | ||
|
@@ -44,6 +56,8 @@ | |
'is interrupted')) | ||
parser.add_argument('--server-file', default=SERVER_FILE_PATH, | ||
help='File with addresses of servers to check') | ||
parser.add_argument('--taskset-file', default=SERVER_TASKSET_PATH, | ||
help='File with cpu affinities information if using tasksetting functionality') | ||
parser.add_argument('servers', nargs='*', default=[], | ||
help='Servers to probe') | ||
|
||
|
@@ -60,9 +74,15 @@ | |
# Command for running ps locally | ||
PS_CMD = 'ps -o pid= -o ruser= -p {pids}' | ||
|
||
# Command for tasksetting locally | ||
TASKSET_CMD = 'taskset -cp {cpus} {pid}' | ||
|
||
# Command for running ps remotely | ||
REMOTE_PS_CMD = '{} {}'.format(SSH_CMD, PS_CMD) | ||
|
||
# Command for tasksetting remotely | ||
REMOTE_TASKSET_CMD = '{} {}'.format(SSH_CMD, TASKSET_CMD) | ||
|
||
# Command for getting real names remotely | ||
# See https://stackoverflow.com/a/38235661 | ||
REAL_NAMES_CMD = """<<-"EOF" | ||
|
@@ -114,6 +134,11 @@ def run_ps_local(pids): | |
res = run_command(cmd) | ||
return res.decode('ascii') if res is not None else None | ||
|
||
def run_taskset_local(cpus, pid):
    """Run the local taskset command for one process.

    Fills TASKSET_CMD with `pid` and a comma-separated rendering of
    `cpus`, then hands the resulting command line to run_command.

    Args:
        cpus: iterable of CPU indices; each item is converted with str().
        pid: process id substituted into the command.

    Returns:
        The output of run_command decoded as ASCII, or None when
        run_command returns None.
    """
    cpu_list = ','.join(map(str, cpus))
    output = run_command(TASKSET_CMD.format(cpus=cpu_list, pid=pid))
    if output is None:
        return None
    return output.decode('ascii')
|
||
def run_ps_remote(server, pids, ssh_timeout, cmd_timeout): | ||
cmd = REMOTE_PS_CMD.format(server=server, | ||
|
@@ -124,6 +149,18 @@ def run_ps_remote(server, pids, ssh_timeout, cmd_timeout): | |
return res.decode('ascii') if res is not None else None | ||
|
||
|
||
def run_taskset_remote(server, cpus, pid, ssh_timeout, cmd_timeout): | ||
cpus = [str(x) for x in cpus] | ||
cmd = REMOTE_TASKSET_CMD.format(server=server, | ||
pid=pid, | ||
cpus=','.join(cpus), | ||
ssh_timeout=ssh_timeout, | ||
cmd_timeout=cmd_timeout) | ||
Comment on lines
+155
to
+158
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please align indentation to bracket |
||
res = run_command(cmd) | ||
return res.decode('ascii') if res is not None else None | ||
|
||
|
||
|
||
def get_real_names_local(users): | ||
real_names_by_users = {} | ||
for user in users: | ||
|
@@ -183,9 +220,9 @@ def print_free_gpus(server, gpu_infos): | |
info('\tGPU {}, {}'.format(gpu_info['idx'], gpu_info['model'])) | ||
|
||
|
||
def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, | ||
filter_by_user=None, | ||
translate_to_real_names=False): | ||
def print_gpu_infos(server, gpu_infos, run_ps, run_taskset, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of including the taskset functionality in |
||
run_get_real_names, filter_by_user=None, | ||
translate_to_real_names=False, cpu_affinities={}): | ||
pids = [pid for gpu_info in gpu_infos for pid in gpu_info['pids']] | ||
if len(pids) > 0: | ||
ps = run_ps(pids=pids) | ||
|
@@ -197,6 +234,12 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, | |
else: | ||
users_by_pid = {} | ||
|
||
if server in cpu_affinities.keys(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can be just |
||
for gpu_info in gpu_infos: | ||
for pid in gpu_info["pids"]: | ||
gpu_cpus=cpu_affinities[server]["affinities"][str(gpu_info["idx"])] | ||
Comment on lines
+239
to
+240
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please use single quotes to match with the rest of the script :) |
||
taskset = run_taskset(cpus=gpu_cpus, pid=pid) | ||
|
||
if translate_to_real_names: | ||
all_users = set((users_by_pid[pid] for gpu_info in gpu_infos | ||
for pid in gpu_info['pids'])) | ||
|
@@ -222,9 +265,8 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, | |
status)) | ||
|
||
|
||
def main(argv): | ||
args = parser.parse_args(argv) | ||
|
||
def main(args): | ||
|
||
logging.basicConfig(format='%(message)s', | ||
level=logging.DEBUG if args.verbose else logging.INFO) | ||
|
||
|
@@ -238,6 +280,14 @@ def main(argv): | |
error('Could not open server file {}'.format(args.server_file)) | ||
return | ||
|
||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be under |
||
debug('Using taskset file {}'.format(args.taskset_file)) | ||
with open(args.taskset_file, 'r') as f: | ||
cpu_affinities = json.load(f) | ||
except OSError as e: | ||
error('Could not open server file {}'.format(args.server_file)) | ||
cpu_affinities = {} | ||
|
||
if len(args.servers) == 0: | ||
error(('No GPU servers to connect to specified.\nPut addresses in ' | ||
'the server file or specify them manually as an argument')) | ||
|
@@ -258,6 +308,7 @@ def main(argv): | |
if server == '.' or server == 'localhost' or server == '127.0.0.1': | ||
run_nvidiasmi = run_nvidiasmi_local | ||
run_ps = run_ps_local | ||
run_taskset = run_taskset_local | ||
run_get_real_names = get_real_names_local | ||
else: | ||
run_nvidiasmi = partial(run_nvidiasmi_remote, | ||
|
@@ -268,11 +319,17 @@ def main(argv): | |
server=server, | ||
ssh_timeout=args.ssh_timeout, | ||
cmd_timeout=args.cmd_timeout) | ||
run_taskset = partial(run_taskset_remote, | ||
server=server, | ||
ssh_timeout=args.ssh_timeout, | ||
cmd_timeout=args.cmd_timeout) | ||
run_get_real_names = partial(get_real_names_remote, | ||
server=server, | ||
ssh_timeout=args.ssh_timeout, | ||
cmd_timeout=args.cmd_timeout) | ||
|
||
run_taskset = NULL_FUNCTION if args.taskset is False else run_taskset | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not needed anymore with the changes below |
||
|
||
nvidiasmi = run_nvidiasmi() | ||
if nvidiasmi is None: | ||
error(('Could not reach {} or ' | ||
|
@@ -281,13 +338,21 @@ def main(argv): | |
|
||
gpu_infos = get_gpu_infos(nvidiasmi) | ||
|
||
if args.list: | ||
print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names, | ||
filter_by_user=args.user, | ||
translate_to_real_names=args.finger) | ||
if args.list or args.taskset: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Once taskset is factored into its own method, there should be |
||
print_gpu_infos(server, gpu_infos, run_ps, run_taskset, | ||
run_get_real_names, filter_by_user=args.user, | ||
translate_to_real_names=args.finger, | ||
cpu_affinities=cpu_affinities) | ||
else: | ||
print_free_gpus(server, gpu_infos) | ||
|
||
|
||
if __name__ == '__main__': | ||
main(sys.argv[1:]) | ||
args = parser.parse_args(sys.argv[1:]) | ||
if args.daemon: | ||
while True: | ||
main(args) | ||
time.sleep(15) | ||
else: | ||
main(args) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
monal03.doc.ic.ac.uk | ||
monal04.doc.ic.ac.uk | ||
monal05.doc.ic.ac.uk | ||
monal06.doc.ic.ac.uk | ||
lory.doc.ic.ac.uk | ||
Comment on lines
+1
to
+5
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This file should stay empty. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not needed anymore with my changes requested below