Commit: add systemd and proc info

PatWie committed Jan 17, 2018
1 parent 42959d5 commit 9157212
Showing 40 changed files with 10,064 additions and 299 deletions.
12 changes: 8 additions & 4 deletions Godeps/Godeps.json

38 changes: 29 additions & 9 deletions README.md
@@ -1,9 +1,6 @@
# CLUSTER-SMI

The same as `nvidia-smi` but for multiple machines.

Run `cluster-smi` and the output should be something like

@@ -30,18 +27,22 @@ Run `cluster-smi` and the output should be something like

Additional information is available when using `cluster-smi -p -t`.

<p align="center"> <img src="./cluster-smi.jpg" width="100%"> </p>

Each machine you want to monitor needs to run *cluster-smi-node* (e.g. via systemd). The nodes send information from the NVIDIA driver to a *cluster-smi-server*, which distributes it to the clients (*cluster-smi*). Only the machines running *cluster-smi-node* require CUDA dependencies.
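
For illustration, here is a minimal sketch of that node-to-server hop: the node serializes its state with MsgPack and pushes it to the server over ZMQ. The Go bindings (`github.com/pebbe/zmq4`, `github.com/vmihailenco/msgpack`), the `NodeInfo` struct, the PUSH socket pattern and the address below are assumptions for the example; the real message layout, socket pattern and ports are defined in `config.go` and the `cluster` package.

```go
package main

import (
	"log"
	"time"

	"github.com/pebbe/zmq4"
	"github.com/vmihailenco/msgpack"
)

// NodeInfo is a stand-in for the node state that cluster-smi-node reports.
type NodeInfo struct {
	Hostname string
	Time     time.Time
}

func main() {
	// PUSH socket towards the cluster-smi-server; address and port are placeholders.
	sock, err := zmq4.NewSocket(zmq4.PUSH)
	if err != nil {
		log.Fatal(err)
	}
	defer sock.Close()
	if err := sock.Connect("tcp://cluster-smi-server.example:9080"); err != nil {
		log.Fatal(err)
	}

	// serialize the node state with MsgPack and ship it as a single message
	payload, err := msgpack.Marshal(&NodeInfo{Hostname: "gpu-node-01", Time: time.Now()})
	if err != nil {
		log.Fatal(err)
	}
	if _, err := sock.SendBytes(payload, 0); err != nil {
		log.Fatal(err)
	}
}
```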

You might also be interested in [cluster-top](https://github.com/PatWie/cluster-top) for CPUs.

## Install

### Requirements + Dependencies

I assume you can compile a CUDA program, as `cluster-smi-node` depends on the NVIDIA driver to get its metrics.
- CUDA (just for `cluster-smi-node.go`)
- ZMQ (4.0.1)

Dependencies are *MsgPack* for serialization and *ZMQ* (tested with 4.0.1) for messaging. Unfortunately, *ZMQ* can only be linked dynamically (`libzmq.so`), so you need to build it separately:
```bash
# compile ZMQ library for c++
cd /path/to/your_lib_folder
```

@@ -65,7 +66,7 @@ Edit the CFLAGS, LDFLAGS in file `nvml/nvml.go` to match your setup.
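
Those flags are cgo directives in the comment block above `import "C"` in `nvml/nvml.go`. A minimal sketch of what they typically look like (the include and library paths below are placeholders that depend on your CUDA and driver installation):

```go
package nvml

// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L/usr/lib/nvidia-384 -lnvidia-ml
// #include <nvml.h>
import "C"
```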

### Compiling

You need to copy one config file:

```console
user@host $ cp config.example.go config.go
```

@@ -99,3 +100,22 @@ make all
3. use `cluster-smi` like `nvidia-smi`

Make sure the machines can communicate over the specified ports (e.g., `ufw allow 9080` and `ufw allow 9081`).

## Use systemd

To ease the use of this app, I suggest running *cluster-smi-node* as a systemd service. An example unit file can be found <a href="./docs/cluster-smi-node.example.service">here</a>. The steps would be:

```bash
# add new entry to systemd
sudo cp docs/cluster-smi-node.example.service /etc/systemd/system/cluster-smi-node.service
# edit the path to cluster-smi-node
sudo nano /etc/systemd/system/cluster-smi-node.service
# make sure you can start and stop the service (have a look at your cluster-smi client)
sudo service cluster-smi-node start
sudo service cluster-smi-node stop
# register cluster-smi-node to start on reboots
sudo systemctl enable cluster-smi-node.service

# last, start the service
sudo service cluster-smi-node start
```
28 changes: 24 additions & 4 deletions cluster.go
@@ -3,7 +3,10 @@ package main
import (
"github.com/patwie/cluster-smi/cluster"
"github.com/patwie/cluster-smi/nvml"
"github.com/patwie/cluster-smi/proc"
"os"
"os/user"
"strconv"
"time"
)

@@ -36,6 +39,7 @@ func FetchNode(n *cluster.Node) {
n.Time = time.Now()

for idx, device := range devices {

meminfo, _ := device.GetMemoryInfo()
gpuPercent, _, _ := device.GetUtilization()
// used GPU memory as a percentage of total memory
memPercent := int(meminfo.Used * 100 / meminfo.Total)
@@ -46,23 +50,39 @@
panic(err)
}

// collect all process information
var processes []cluster.Process

for i := 0; i < len(deviceProcs); i++ {

if int(deviceProcs[i].Pid) == 0 {
continue
}

// resolve the command name and the owning user for this GPU process
PID := deviceProcs[i].Pid
_, name := proc.TimeAndNameFromPID(PID)

UID := proc.UIDFromPID(PID)
user, err := user.LookupId(strconv.Itoa(UID))

username := "unknown"
if err == nil {
username = user.Username
}

processes = append(processes, cluster.Process{
Pid: PID,
UsedGpuMemory: deviceProcs[i].UsedGpuMemory,
Name: name,
Username: username,
})
}

n.Devices[idx].Id = idx
n.Devices[idx].Name = device.DeviceName
n.Devices[idx].Utilization = gpuPercent
n.Devices[idx].MemoryUtilization = cluster.Memory{meminfo.Used, meminfo.Free, meminfo.Total, memPercent}
n.Devices[idx].Processes = processes

}
}
6 changes: 4 additions & 2 deletions cluster/data.go
@@ -25,6 +25,8 @@ type Memory struct {
type Process struct {
Pid int
UsedGpuMemory int64
Name string
Username string
}

type Device struct {
@@ -139,7 +141,7 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
device_name,
device_MemoryInfo,
device_utilization,
fmt.Sprintf("%s (%d, %s) %3d MiB", p.Name, p.Pid, p.Username, p.UsedGpuMemory/1024/1024),
}

if show_time {
@@ -186,6 +188,6 @@
}
}
fmt.Printf("\033[2J")
fmt.Println(time.Now().Format("Mon Jan 2 15:04:05 2006"))
fmt.Println(table.Render())
}
13 changes: 13 additions & 0 deletions docs/cluster-smi-node.example.service
@@ -0,0 +1,13 @@
[Unit]
Description=Cluster-Smi-Node

After=network.target local-fs.target multi-user.target
Requires=network.target local-fs.target multi-user.target


[Service]
Type=simple
StandardOutput=journal+console
ExecStart=/path/to/cluster-smi-node

## Add me by: sudo systemctl enable cluster-smi-node.service
4 changes: 3 additions & 1 deletion nvml/info.txt
@@ -1,2 +1,4 @@
The original code in this directory is borrowed from:
https://github.com/tankbusta/nvidia_exporter

I modified some parts.
133 changes: 133 additions & 0 deletions proc/proc.c
@@ -0,0 +1,133 @@
// Author: Patrick Wieschollek, 2018
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>

#define MAX_NAME 128

void get_mem(unsigned long *mem_total, unsigned long *mem_free, unsigned long *mem_available) {
char line[100];
FILE* statusf;


statusf = fopen("/proc/meminfo", "r");
if (!statusf)
return;


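// the first three lines of /proc/meminfo are MemTotal, MemFree and
// MemAvailable (values in kB); MemAvailable requires Linux >= 3.14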
fgets(line, 100, statusf);
sscanf(line, "%*s %lu %*s", mem_total);
fgets(line, 100, statusf);
sscanf(line, "%*s %lu %*s", mem_free);
fgets(line, 100, statusf);
sscanf(line, "%*s %lu %*s", mem_available);


fclose(statusf);
}

// read total cpu time
unsigned long long int read_cpu_tick() {
unsigned long long int usertime, nicetime, systemtime, idletime;
unsigned long long int ioWait, irq, softIrq, steal, guest, guestnice;
usertime = nicetime = systemtime = idletime = 0;
ioWait = irq = softIrq = steal = guest = guestnice = 0;

FILE *fp;
fp = fopen("/proc/stat", "r");
if (fp != NULL) {
if (fscanf(fp, "cpu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu",
&usertime, &nicetime, &systemtime, &idletime,
&ioWait, &irq, &softIrq, &steal, &guest, &guestnice) == EOF) {
fclose(fp);
return 0;
} else {
fclose(fp);
return usertime + nicetime + systemtime + idletime + ioWait + irq + softIrq + steal + guest + guestnice;
}
} else {
return 0;
}
}

void get_uid_from_pid(unsigned long pid, unsigned long *uid) {
char path[40], line[100];
FILE* statusf;

snprintf(path, 40, "/proc/%ld/status", pid);

statusf = fopen(path, "r");
if (!statusf)
return;

while (fgets(line, 100, statusf)) {
if (strncmp(line, "Uid:", 4) != 0)
continue;
// Uid: 1000 1000 1000 1000
sscanf(line, "%*s %lu %*s", uid);
break;
}
fclose(statusf);
}

// read cpu tick for a specific process
void read_time_and_name_from_pid(unsigned long pid, unsigned long *time, char *name) {

char fn[MAX_NAME + 1];
snprintf(fn, sizeof fn, "/proc/%ld/stat", pid);

unsigned long utime = 0;
unsigned long stime = 0;

*time = 0;

FILE * fp;
fp = fopen(fn, "r");
if (fp != NULL) {
/*
(1) pid %d The process ID
(2) comm %s The filename of the executable, in parentheses.
(3) state %c
(4) ppid %d The PID of the parent of this process.
(5) pgrp %d The process group ID of the process.
(6) session %d The session ID of the process.
(7) tty_nr %d The controlling terminal of the process.
(8) tpgid %d The ID of the foreground process group
(9) flags %u The kernel flags word of the process.
(10) minflt %lu The number of minor faults the process has made
(11) cminflt %lu The number of minor faults that the process's waited-for children have made.
(12) majflt %lu The number of major faults the process has made
(13) cmajflt %lu The number of major faults that the process's
(14) utime %lu Amount of time that this process has been scheduled in user mode
(15) stime %lu Amount of time that this process has been scheduled in kernel mode
...
*/

// extract
bool success = fscanf(fp, "%*d (%s %*c %*d %*d %*d %*d %*d %*u %*lu %*lu %*lu %*lu %lu"
"%lu %*ld %*ld %*d %*d %*d %*d %*u %*lu %*ld",
name, &utime, &stime) != EOF;
fclose(fp);

if (!success) {
// something went wrong
return;
}

// remove ")" suffix
if (strlen(name) > 2) {
name[strlen(name) - 1] = 0;
}

*time = utime + stime;

}
}

// return number of cores
unsigned int num_cores() {
return sysconf(_SC_NPROCESSORS_ONLN);
}

50 changes: 50 additions & 0 deletions proc/proc.go
@@ -0,0 +1,50 @@
package proc

// #include "proc.h"
// #include <stdlib.h>
import "C"
import "unsafe"

func mallocCStringBuffer(size uint) *C.char {
buf := make([]byte, size)
return C.CString(string(buf))
}

// CpuTick returns the total number of Jiffies
func CpuTick() (t int64) {
return int64(C.read_cpu_tick())
}

// TimeAndNameFromPID returns the consumed CPU time (in clock ticks) and the command name for a given PID
func TimeAndNameFromPID(pid int) (int64, string) {
time := C.ulong(0)

var c_dst *C.char = mallocCStringBuffer(128 + 1)
defer C.free(unsafe.Pointer(c_dst))

C.read_time_and_name_from_pid(C.ulong(pid), &time, c_dst)
return int64(time), C.GoString(c_dst)
}

// NumberCPUCores returns number of CPU cores
func NumberCPUCores() (n int) {
return int(C.num_cores())
}

// UIDFromPID returns the user id (UID) for a given process id (PID)
func UIDFromPID(pid int) (uid int) {
c_uid := C.ulong(0)
C.get_uid_from_pid(C.ulong(pid), &c_uid)
return int(c_uid)
}

// GetRAMMemoryInfo returns total, free and available RAM in kB (read from /proc/meminfo)
func GetRAMMemoryInfo() (total int64, free int64, available int64) {
c_total := C.ulong(0)
c_free := C.ulong(0)
c_available := C.ulong(0)

C.get_mem(&c_total, &c_free, &c_available)

return int64(c_total), int64(c_free), int64(c_available)
}
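
A quick sketch of how these wrappers could be used together, here inspecting the current process; the chosen PID and the output format are just for illustration:

```go
package main

import (
	"fmt"
	"os"
	"os/user"
	"strconv"

	"github.com/patwie/cluster-smi/proc"
)

func main() {
	// machine-wide information
	fmt.Println("cores:", proc.NumberCPUCores())
	total, free, available := proc.GetRAMMemoryInfo()
	fmt.Printf("RAM (kB): total=%d free=%d available=%d\n", total, free, available)
	fmt.Println("total cpu ticks:", proc.CpuTick())

	// per-process information, as used by cluster-smi-node for GPU processes
	pid := os.Getpid()
	ticks, name := proc.TimeAndNameFromPID(pid)
	uid := proc.UIDFromPID(pid)

	username := "unknown"
	if u, err := user.LookupId(strconv.Itoa(uid)); err == nil {
		username = u.Username
	}
	fmt.Printf("pid=%d name=%s user=%s cpu-ticks=%d\n", pid, name, username, ticks)
}
```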