From 00a06ea573dd5b8f58097fe0356c897f804ec287 Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Fri, 23 Oct 2020 18:18:24 +0100
Subject: [PATCH 01/10] added tasksetting

---
 README.md           |  3 ++
 cpu_affinities.json | 66 ++++++++++++++++++++++++++++++++++++++++++++
 gpu_monitor.py      | 67 ++++++++++++++++++++++++++++++++++++++++-----
 servers.txt         |  5 ++++
 4 files changed, 134 insertions(+), 7 deletions(-)
 create mode 100644 cpu_affinities.json

diff --git a/README.md b/README.md
index 0b83306..545e54f 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ The script works by using your account to SSH into the servers and running `nvid
 - Show all current users of all GPUs (-l or --list)
 - Show all GPUs used by yourself (-m or --me)
 - Resolve usernames to real names (-f or --finger)
+- Taskset jobs running on GPUs (-t or --taskset)
 
 ## Requirements
 
@@ -35,6 +36,8 @@ Server myserver.com:
 If you have some set of servers that you regularily check, specify them in the file `servers.txt`, one address per line.
 Once you did that, running just `./gpu_monitor.py` checks all servers specified in this file by default.
 
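+If you wish to automatically set the CPU-GPU affinities (`-t`), list the CPU cores that belong to each GPU of each server in `cpu_affinities.json`, following the bundled example; use `--taskset-file` to point to a different file.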
+
 If you want to list all GPUs and who currently uses them, you can use the `-l` flag:
 ```
 > ./gpu_monitor.py -l myserver.com
diff --git a/cpu_affinities.json b/cpu_affinities.json
new file mode 100644
index 0000000..f069158
--- /dev/null
+++ b/cpu_affinities.json
@@ -0,0 +1,66 @@
+{
+    "monal04.doc.ic.ac.uk": {
+        "affinities": {
+            "0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+            "1": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38],
+            "2": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
+            "3": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]
+        }
+    },
+    "monal03.doc.ic.ac.uk": {
+        "affinities": {
+            "0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+            "1": [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38],
+            "2": [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
+            "3": [39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]
+        }
+    },
+    "monal05.doc.ic.ac.uk": {
+        "affinities": {
+            "0": [0,1,2],
+            "1": [3,4,5],
+            "2": [6,7,8],
+            "3": [9,10,11],
+            "4": [12,13,28],
+            "5": [29,30,31],
+            "6": [32,33,34],
+            "7": [35,36,37],
+            "8": [38,39,46,47,48,49],
+            "9": [40,41,42,43,44,45]
+        }
+    },
+    "monal06.doc.ic.ac.uk": {
+        "affinities": {
+            "0": [0,1,2],
+            "1": [3,4,5],
+            "2": [6,7,8],
+            "3": [9,10,11],
+            "4": [12,13,28],
+            "5": [29,30,31],
+            "6": [32,33,34],
+            "7": [35,36,37],
+            "8": [38,39,46,47,48,49],
+            "9": [40,41,42,43,44,45]
+        }
+    },
+    "lory.doc.ic.ac.uk": {
+        "affinities": {
+            "0": [0,1,2,3,4],
+            "1": [5,6,7,8,9],
+            "2": [10,11,12,13,14],
+            "3": [15,16,17,18,19],
+            "4": [40,41,42,43,44],
+            "5": [45,46,47,48,49],
+            "6": [50,51,52,53,54],
+            "7": [55,56,57,58,59],
+            "8": [20,21,22,23,24],
+            "9": [25,26,27,28,29],
+            "10": [30,31,32,33,34],
+            "11": [35,36,37,38,39],
+            "12": [60,61,62,63,64],
+            "13": [65,66,67,68, 69],
+            "14": [70,71,72,73,74],
+            "15": [75,76,77,78,79]
+        }
+    }
+}
\ No newline at end of file
diff --git a/gpu_monitor.py b/gpu_monitor.py
index 11df2f0..401e238 100755
--- a/gpu_monitor.py
+++ b/gpu_monitor.py
@@ -10,6 +10,7 @@
 import pwd
 import subprocess
 import sys
+import json
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from functools import partial
@@ -26,10 +27,18 @@
 SERVER_FILE_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
                                 DEFAULT_SERVER_FILE)
 
+# Default cpu affinities file for tasksetting
+DEFAULT_TASKSET_FILE = 'cpu_affinities.json'
+SERVER_TASKSET_PATH = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])),
+                                DEFAULT_TASKSET_FILE)
+
+NULL_FUNCTION = lambda *args, **kwargs : None
+
 parser = argparse.ArgumentParser(description='Check state of GPU servers')
 parser.add_argument('-v', '--verbose', action='store_true',
                     help='Be verbose')
 parser.add_argument('-l', '--list', action='store_true', help='Show used GPUs')
+parser.add_argument('-t', '--taskset', action='store_true', help='Use Taskset to set CPU-GPU Affinities')
 parser.add_argument('-f', '--finger', action='store_true',
                     help='Attempt to resolve user names to real names')
 parser.add_argument('-m', '--me', action='store_true',
@@ -44,6 +53,8 @@
                           'is interrupted'))
 parser.add_argument('--server-file', default=SERVER_FILE_PATH,
                     help='File with addresses of servers to check')
+parser.add_argument('--taskset-file', default=SERVER_TASKSET_PATH,
+                    help='File with cpu affinities information if using tasksetting functionality')
 parser.add_argument('servers', nargs='*', default=[],
                     help='Servers to probe')
 
@@ -60,9 +71,15 @@
 # Command for running ps locally
 PS_CMD = 'ps -o pid= -o ruser= -p {pids}'
 
+# Command for tasksetting locally
+TASKSET_CMD = 'taskset -cp {cpus} {pid}'
+
 # Command for running ps remotely
 REMOTE_PS_CMD = '{} {}'.format(SSH_CMD, PS_CMD)
 
+# Command for tasksetting remotely
+REMOTE_TASKSET_CMD = '{} {}'.format(SSH_CMD, TASKSET_CMD)
+
 # Command for getting real names remotely
 # See https://stackoverflow.com/a/38235661
 REAL_NAMES_CMD = """<<-"EOF"
@@ -114,6 +131,10 @@ def run_ps_local(pids):
     res = run_command(cmd)
     return res.decode('ascii') if res is not None else None
 
+def run_taskset_local(cpus, pid):
+    cmd = TASKSET_CMD.format(cpus=','.join(cpus), pid=pid)
+    res = run_command(cmd)
+    return res.decode('ascii') if res is not None else None
 
 def run_ps_remote(server, pids, ssh_timeout, cmd_timeout):
     cmd = REMOTE_PS_CMD.format(server=server,
@@ -124,6 +145,17 @@ def run_ps_remote(server, pids, ssh_timeout, cmd_timeout):
     return res.decode('ascii') if res is not None else None
 
 
+def run_taskset_remote(server, cpus, pid, ssh_timeout, cmd_timeout):
+    cmd = REMOTE_TASKSET_CMD.format(server=server,
+                               pid=pid,
+                               cpus=','.join(cpus),
+                               ssh_timeout=ssh_timeout,
+                               cmd_timeout=cmd_timeout)
+    res = run_command(cmd)
+    return res.decode('ascii') if res is not None else None
+
+
+
 def get_real_names_local(users):
     real_names_by_users = {}
     for user in users:
@@ -183,9 +215,9 @@ def print_free_gpus(server, gpu_infos):
             info('\tGPU {}, {}'.format(gpu_info['idx'], gpu_info['model']))
 
 
-def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names,
-                    filter_by_user=None,
-                    translate_to_real_names=False):
+def print_gpu_infos(server, gpu_infos, run_ps, run_taskset,
+                    run_get_real_names, filter_by_user=None,
+                    translate_to_real_names=False, cpu_affinities={}):
     pids = [pid for gpu_info in gpu_infos for pid in gpu_info['pids']]
     if len(pids) > 0:
         ps = run_ps(pids=pids)
@@ -197,6 +229,11 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names,
     else:
         users_by_pid = {}
 
+    if server in cpu_affinities.keys():
+        for pid in pids:
+            taskset = run_taskset(cpus=cpu_affinities[server]["affinities"],
+                                 pid=pid)
+
     if translate_to_real_names:
         all_users = set((users_by_pid[pid] for gpu_info in gpu_infos
                          for pid in gpu_info['pids']))
@@ -238,6 +275,14 @@ def main(argv):
             error('Could not open server file {}'.format(args.server_file))
             return
 
+    try:
+        debug('Using taskset file {}'.format(args.taskset_file))
+        with open(args.taskset_file, 'r') as f:
+            cpu_affinities = json.load(f)
+    except OSError as e:  # unreadable affinities file: report it and carry on
+        error('Could not open taskset file {}'.format(args.taskset_file))
+        cpu_affinities = {}  # no server will match, so no taskset is attempted
+
     if len(args.servers) == 0:
         error(('No GPU servers to connect to specified.\nPut addresses in '
                'the server file or specify them manually as an argument'))
@@ -258,6 +303,7 @@ def main(argv):
         if server == '.' or server == 'localhost' or server == '127.0.0.1':
             run_nvidiasmi = run_nvidiasmi_local
             run_ps = run_ps_local
+            run_taskset = run_taskset_local
             run_get_real_names = get_real_names_local
         else:
             run_nvidiasmi = partial(run_nvidiasmi_remote,
@@ -268,11 +314,17 @@ def main(argv):
                              server=server,
                              ssh_timeout=args.ssh_timeout,
                              cmd_timeout=args.cmd_timeout)
+            run_taskset = partial(run_taskset_remote,
+                             server=server,
+                             ssh_timeout=args.ssh_timeout,
+                             cmd_timeout=args.cmd_timeout)
             run_get_real_names = partial(get_real_names_remote,
                                          server=server,
                                          ssh_timeout=args.ssh_timeout,
                                          cmd_timeout=args.cmd_timeout)
 
+        run_taskset = NULL_FUNCTION if args.taskset is False else run_taskset
+
         nvidiasmi = run_nvidiasmi()
         if nvidiasmi is None:
             error(('Could not reach {} or '
@@ -281,10 +333,11 @@ def main(argv):
 
         gpu_infos = get_gpu_infos(nvidiasmi)
 
-        if args.list:
-            print_gpu_infos(server, gpu_infos, run_ps, run_get_real_names,
-                            filter_by_user=args.user,
-                            translate_to_real_names=args.finger)
+        if args.list or args.taskset:
+            print_gpu_infos(server, gpu_infos, run_ps, run_taskset,
+                            run_get_real_names, filter_by_user=args.user,
+                            translate_to_real_names=args.finger,
+                            cpu_affinities=cpu_affinities)
         else:
             print_free_gpus(server, gpu_infos)
 
diff --git a/servers.txt b/servers.txt
index e69de29..ebc579a 100644
--- a/servers.txt
+++ b/servers.txt
@@ -0,0 +1,5 @@
+monal03.doc.ic.ac.uk
+monal04.doc.ic.ac.uk
+monal05.doc.ic.ac.uk
+monal06.doc.ic.ac.uk
+lory.doc.ic.ac.uk
\ No newline at end of file

From f44103576f1aa64010c575fdb35ac715e500f037 Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Fri, 23 Oct 2020 19:06:26 +0100
Subject: [PATCH 02/10] added taskset daemon

---
 gpu_monitor.py    | 9 ++++++---
 taskset_daemon.sh | 6 ++++++
 2 files changed, 12 insertions(+), 3 deletions(-)
 create mode 100755 taskset_daemon.sh

diff --git a/gpu_monitor.py b/gpu_monitor.py
index 401e238..7975c6a 100755
--- a/gpu_monitor.py
+++ b/gpu_monitor.py
@@ -132,6 +132,7 @@ def run_ps_local(pids):
     return res.decode('ascii') if res is not None else None
 
 def run_taskset_local(cpus, pid):
+    cpus = [str(x) for x in cpus]
     cmd = TASKSET_CMD.format(cpus=','.join(cpus), pid=pid)
     res = run_command(cmd)
     return res.decode('ascii') if res is not None else None
@@ -146,6 +147,7 @@ def run_ps_remote(server, pids, ssh_timeout, cmd_timeout):
 
 
 def run_taskset_remote(server, cpus, pid, ssh_timeout, cmd_timeout):
+    cpus = [str(x) for x in cpus]
     cmd = REMOTE_TASKSET_CMD.format(server=server,
                                pid=pid,
                                cpus=','.join(cpus),
@@ -230,9 +232,10 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_taskset,
         users_by_pid = {}
 
     if server in cpu_affinities.keys():
-        for pid in pids:
-            taskset = run_taskset(cpus=cpu_affinities[server]["affinities"],
-                                 pid=pid)
+        for gpu_info in gpu_infos:
+            for pid in gpu_info["pids"]:
+                gpu_cpus=cpu_affinities[server]["affinities"][str(gpu_info["idx"])]
+                taskset = run_taskset(cpus=gpu_cpus, pid=pid)
 
     if translate_to_real_names:
         all_users = set((users_by_pid[pid] for gpu_info in gpu_infos
diff --git a/taskset_daemon.sh b/taskset_daemon.sh
new file mode 100755
index 0000000..825fa08
--- /dev/null
+++ b/taskset_daemon.sh
@@ -0,0 +1,6 @@
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+script=python $DIR/gpu_monitor.py -v -t
+
+cmd="watch -n 300 "
+$cmd $script
+

From d5963dc6edbf34d07cb284fbf5ae033a7ea979f7 Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Fri, 23 Oct 2020 19:12:45 +0100
Subject: [PATCH 03/10] typo

---
 taskset_daemon.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taskset_daemon.sh b/taskset_daemon.sh
index 825fa08..655496f 100755
--- a/taskset_daemon.sh
+++ b/taskset_daemon.sh
@@ -2,5 +2,5 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 script=python $DIR/gpu_monitor.py -v -t
 
 cmd="watch -n 300 "
-$cmd $script
+$cmd "$script"
 

From e6681d9f791c5e03d4ed9f6dd782c2822bea3cf3 Mon Sep 17 00:00:00 2001
From: "Huaqi (Harvey) Qiu" <qiuhuaqi@hotmail.com>
Date: Fri, 23 Oct 2020 23:16:28 +0100
Subject: [PATCH 04/10] add hyper-threading cores to monal05 & 06

---
 cpu_affinities.json | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/cpu_affinities.json b/cpu_affinities.json
index f069158..0d21fd0 100644
--- a/cpu_affinities.json
+++ b/cpu_affinities.json
@@ -17,30 +17,30 @@
     },
     "monal05.doc.ic.ac.uk": {
         "affinities": {
-            "0": [0,1,2],
-            "1": [3,4,5],
-            "2": [6,7,8],
-            "3": [9,10,11],
-            "4": [12,13,28],
-            "5": [29,30,31],
-            "6": [32,33,34],
-            "7": [35,36,37],
-            "8": [38,39,46,47,48,49],
-            "9": [40,41,42,43,44,45]
+            "0": [0,1,2,14,15,16],
+            "1": [3,4,5,17,18,19],
+            "2": [6,7,8,20,21,22],
+            "3": [9,10,11,23,24,25],
+            "4": [12,13,28,26,27],
+            "5": [29,30,31,43,44],
+            "6": [32,33,34,46,47],
+            "7": [35,36,37,49,50],
+            "8": [38,39,42,45,52,53],
+            "9": [40,41,48,51,54,55]
         }
     },
     "monal06.doc.ic.ac.uk": {
         "affinities": {
-            "0": [0,1,2],
-            "1": [3,4,5],
-            "2": [6,7,8],
-            "3": [9,10,11],
-            "4": [12,13,28],
-            "5": [29,30,31],
-            "6": [32,33,34],
-            "7": [35,36,37],
-            "8": [38,39,46,47,48,49],
-            "9": [40,41,42,43,44,45]
+            "0": [0,1,2,14,15,16],
+            "1": [3,4,5,17,18,19],
+            "2": [6,7,8,20,21,22],
+            "3": [9,10,11,23,24,25],
+            "4": [12,13,28,26,27],
+            "5": [29,30,31,43,44],
+            "6": [32,33,34,46,47],
+            "7": [35,36,37,49,50],
+            "8": [38,39,42,45,52,53],
+            "9": [40,41,48,51,54,55]
         }
     },
     "lory.doc.ic.ac.uk": {
@@ -63,4 +63,4 @@
             "15": [75,76,77,78,79]
         }
     }
-}
\ No newline at end of file
+}

From 609f44321a4da3d2577676d11dc0cc02f80d9211 Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Thu, 3 Dec 2020 12:41:43 +0000
Subject: [PATCH 05/10] [minor] mode

---
 LICENSE     | 0
 README.md   | 0
 servers.txt | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 LICENSE
 mode change 100644 => 100755 README.md
 mode change 100644 => 100755 servers.txt

diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
diff --git a/servers.txt b/servers.txt
old mode 100644
new mode 100755

From 2afccd472cc3cb8b94bce9e1a75b8e76b877dcd6 Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Thu, 3 Dec 2020 12:41:57 +0000
Subject: [PATCH 06/10] increased timeout

---
 gpu_monitor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gpu_monitor.py b/gpu_monitor.py
index 7975c6a..3f1fa98 100755
--- a/gpu_monitor.py
+++ b/gpu_monitor.py
@@ -17,10 +17,10 @@
 from logging import debug, info, error
 
 # Default timeout in seconds after which SSH stops trying to connect
-DEFAULT_SSH_TIMEOUT = 3
+DEFAULT_SSH_TIMEOUT = 30
 
 # Default timeout in seconds after which remote commands are interrupted
-DEFAULT_CMD_TIMEOUT = 10
+DEFAULT_CMD_TIMEOUT = 50
 
 # Default server file
 DEFAULT_SERVER_FILE = 'servers.txt'

From 691b3d104bcbbf5a31539a7cefc31a06f91b2842 Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Thu, 3 Dec 2020 12:42:24 +0000
Subject: [PATCH 07/10] new affinities

---
 cpu_affinities.json | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 cpu_affinities.json

diff --git a/cpu_affinities.json b/cpu_affinities.json
old mode 100644
new mode 100755

From a61536c628a9f5f43b12850620dcbcdc690dbd2d Mon Sep 17 00:00:00 2001
From: Huaqi Qiu <huaqi.qiu@outlook.com>
Date: Thu, 1 Jul 2021 17:29:13 +0100
Subject: [PATCH 08/10] daemon

---
 gpu_monitor.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gpu_monitor.py b/gpu_monitor.py
index 3f1fa98..d51c48e 100755
--- a/gpu_monitor.py
+++ b/gpu_monitor.py
@@ -346,4 +346,8 @@ def main(argv):
 
 
 if __name__ == '__main__':
-    main(sys.argv[1:])
+    import time
+    while True:
+        main(sys.argv[1:])
+        time.sleep(15)
+

From 0ab040d692264dcb62b94d43ebc3449839ba58c8 Mon Sep 17 00:00:00 2001
From: gavinlive <22893238+gavinlive@users.noreply.github.com>
Date: Thu, 1 Jul 2021 17:33:51 +0100
Subject: [PATCH 09/10] main daemonised

---
 taskset_daemon.sh | 6 ------
 1 file changed, 6 deletions(-)
 delete mode 100755 taskset_daemon.sh

diff --git a/taskset_daemon.sh b/taskset_daemon.sh
deleted file mode 100755
index 655496f..0000000
--- a/taskset_daemon.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-script=python $DIR/gpu_monitor.py -v -t
-
-cmd="watch -n 300 "
-$cmd "$script"
-

From d386e84a5a44c3d65b40b3d0855491de7380ac9f Mon Sep 17 00:00:00 2001
From: Gavin Seegoolam <22893238+gavinlive@users.noreply.github.com>
Date: Thu, 1 Jul 2021 17:46:21 +0100
Subject: [PATCH 10/10] Added daemon

---
 README.md      |  1 +
 gpu_monitor.py | 19 ++++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 545e54f..de33135 100755
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ The script works by using your account to SSH into the servers and running `nvid
 - Show all GPUs used by yourself (-m or --me)
 - Resolve usernames to real names (-f or --finger)
 - Taskset jobs running on GPUs (-t or --taskset)
+- Run in a loop, re-checking every 15 seconds (-d or --daemon)
 
 ## Requirements
 
diff --git a/gpu_monitor.py b/gpu_monitor.py
index d51c48e..b125a90 100755
--- a/gpu_monitor.py
+++ b/gpu_monitor.py
@@ -10,6 +10,7 @@
 import pwd
 import subprocess
 import sys
+import time
 import json
 import xml.etree.ElementTree as ET
 from collections import defaultdict
@@ -38,6 +39,8 @@
 parser.add_argument('-v', '--verbose', action='store_true',
                     help='Be verbose')
 parser.add_argument('-l', '--list', action='store_true', help='Show used GPUs')
+parser.add_argument('-d', '--daemon', action='store_true',
+                    help='Loop')
 parser.add_argument('-t', '--taskset', action='store_true', help='Use Taskset to set CPU-GPU Affinities')
 parser.add_argument('-f', '--finger', action='store_true',
                     help='Attempt to resolve user names to real names')
@@ -262,9 +265,8 @@ def print_gpu_infos(server, gpu_infos, run_ps, run_taskset,
                                         status))
 
 
-def main(argv):
-    args = parser.parse_args(argv)
-
+def main(args):
+    
     logging.basicConfig(format='%(message)s',
                         level=logging.DEBUG if args.verbose else logging.INFO)
 
@@ -346,8 +348,11 @@ def main(argv):
 
 
 if __name__ == '__main__':
-    import time
-    while True:
-        main(sys.argv[1:])
-        time.sleep(15)
+    args = parser.parse_args(sys.argv[1:])
+    if args.daemon:
+        while True:  # daemon mode: probe all servers again every 15 seconds
+            main(args)
+            time.sleep(15)
+    else:
+        main(args)  # single pass, then exit