Commit: add systemd and proc info

PatWie committed Jan 17, 2018
1 parent 42959d5 commit 9157212
Showing 40 changed files with 10,064 additions and 299 deletions.
12 changes: 8 additions & 4 deletions Godeps/Godeps.json

38 changes: 29 additions & 9 deletions README.md
@@ -1,9 +1,6 @@
# CLUSTER-SMI

The same as `nvidia-smi` but for multiple machines.

Run `cluster-smi` and the output should be something like

@@ -30,18 +27,22 @@ Run `cluster-smi` and the output should be something like

Additional information is available when using `cluster-smi -p -t`.

<p align="center"> <img src="./cluster-smi.jpg" width="100%"> </p>

Each machine you want to monitor needs to run *cluster-smi-node* (e.g. via systemd). The nodes send information from the NVIDIA driver to a *cluster-smi-server*, which distributes it to the clients (*cluster-smi*). Only the machines running *cluster-smi-node* require CUDA dependencies.
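
For illustration, here is a minimal sketch of that node-to-server hop: the node serializes its state with MsgPack and pushes it to the server over ZMQ. The Go bindings (`github.com/pebbe/zmq4`, `github.com/vmihailenco/msgpack`), the `NodeInfo` struct, the PUSH socket pattern and the address below are assumptions for the example; the real message layout, socket pattern and ports are defined in `config.go` and the `cluster` package.

```go
package main

import (
	"log"
	"time"

	"github.com/pebbe/zmq4"
	"github.com/vmihailenco/msgpack"
)

// NodeInfo is a stand-in for the node state that cluster-smi-node reports.
type NodeInfo struct {
	Hostname string
	Time     time.Time
}

func main() {
	// PUSH socket towards the cluster-smi-server; address and port are placeholders.
	sock, err := zmq4.NewSocket(zmq4.PUSH)
	if err != nil {
		log.Fatal(err)
	}
	defer sock.Close()
	if err := sock.Connect("tcp://cluster-smi-server.example:9080"); err != nil {
		log.Fatal(err)
	}

	// serialize the node state with MsgPack and ship it as a single message
	payload, err := msgpack.Marshal(&NodeInfo{Hostname: "gpu-node-01", Time: time.Now()})
	if err != nil {
		log.Fatal(err)
	}
	if _, err := sock.SendBytes(payload, 0); err != nil {
		log.Fatal(err)
	}
}
```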

You might also be interested in [cluster-top](https://github.com/PatWie/cluster-top) for CPUs.

## Install

### Requirements + Dependencies

I assume you can compile a CUDA program, as `cluster-smi-node` depends on the NVIDIA driver to get its metrics.
- CUDA (just for `cluster-smi-node.go`)
- ZMQ (4.0.1)

Dependencies are *MsgPack* for serialization and *ZMQ* (tested with 4.0.1) for messaging. Unfortunately, *ZMQ* can only be linked dynamically (`libzmq.so`), so you need to build it separately:
```bash
# compile ZMQ library for c++
cd /path/to/your_lib_folder
```

@@ -65,7 +66,7 @@ Edit the CFLAGS, LDFLAGS in file `nvml/nvml.go` to match your setup.
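
Those flags are cgo directives in the comment block above `import "C"` in `nvml/nvml.go`. A minimal sketch of what they typically look like (the include and library paths below are placeholders that depend on your CUDA and driver installation):

```go
package nvml

// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo LDFLAGS: -L/usr/lib/nvidia-384 -lnvidia-ml
// #include <nvml.h>
import "C"
```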

### Compiling

You need to copy one config file:

```console
user@host $ cp config.example.go config.go
```

@@ -99,3 +100,22 @@ make all
3. use `cluster-smi` like `nvidia-smi`

Make sure the machines can communicate over the specified ports (e.g., `ufw allow 9080` and `ufw allow 9081`).

## Use systemd

To ease the use of this app, I suggest running *cluster-smi-node* as a systemd service. An example unit file can be found <a href="./docs/cluster-smi-node.example.service">here</a>. The steps would be:

```bash
# add new entry to systemd
sudo cp docs/cluster-smi-node.example.service /etc/systemd/system/cluster-smi-node.service
# edit the path to cluster-smi-node
sudo nano /etc/systemd/system/cluster-smi-node.service
# make sure you can start and stop the service (have a look at your cluster-smi client)
sudo service cluster-smi-node start
sudo service cluster-smi-node stop
# register cluster-smi-node to start on reboots
sudo systemctl enable cluster-smi-node.service

# last, start the service
sudo service cluster-smi-node start
```
28 changes: 24 additions & 4 deletions cluster.go
@@ -3,7 +3,10 @@ package main
import (
"github.com/patwie/cluster-smi/cluster"
"github.com/patwie/cluster-smi/nvml"
"github.com/patwie/cluster-smi/proc"
"os"
"os/user"
"strconv"
"time"
)

@@ -36,6 +39,7 @@ func FetchNode(n *cluster.Node) {
n.Time = time.Now()

for idx, device := range devices {

meminfo, _ := device.GetMemoryInfo()
gpuPercent, _, _ := device.GetUtilization()
// used GPU memory as a percentage of total memory
memPercent := int(meminfo.Used * 100 / meminfo.Total)
@@ -46,23 +50,39 @@
panic(err)
}

// collect all process information
var processes []cluster.Process

for i := 0; i < len(deviceProcs); i++ {

if int(deviceProcs[i].Pid) == 0 {
continue
}

// resolve the command name and the owning user for this GPU process
PID := deviceProcs[i].Pid
_, name := proc.TimeAndNameFromPID(PID)

UID := proc.UIDFromPID(PID)
user, err := user.LookupId(strconv.Itoa(UID))

username := "unknown"
if err == nil {
username = user.Username
}

processes = append(processes, cluster.Process{
Pid: PID,
UsedGpuMemory: deviceProcs[i].UsedGpuMemory,
Name: name,
Username: username,
})
}

n.Devices[idx].Id = idx
n.Devices[idx].Name = device.DeviceName
n.Devices[idx].Utilization = gpuPercent
n.Devices[idx].MemoryUtilization = cluster.Memory{meminfo.Used, meminfo.Free, meminfo.Total, memPercent}
n.Devices[idx].Processes = processes

}
}
6 changes: 4 additions & 2 deletions cluster/data.go
@@ -25,6 +25,8 @@ type Memory struct {
type Process struct {
Pid int
UsedGpuMemory int64
Name string
Username string
}

type Device struct {
@@ -139,7 +141,7 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
device_name,
device_MemoryInfo,
device_utilization,
fmt.Sprintf("%s (%d, %s) %3d MiB", p.Name, p.Pid, p.Username, p.UsedGpuMemory/1024/1024),
}

if show_time {
@@ -186,6 +188,6 @@
}
}
fmt.Printf("\033[2J")
fmt.Println(time.Now().Format("Mon Jan 2 15:04:05 2006"))
fmt.Println(table.Render())
}
13 changes: 13 additions & 0 deletions docs/cluster-smi-node.example.service
@@ -0,0 +1,13 @@
[Unit]
Description=Cluster-Smi-Node

After=network.target local-fs.target multi-user.target
Requires=network.target local-fs.target multi-user.target


[Service]
Type=simple
StandardOutput=journal+console
ExecStart=/path/to/cluster-smi-node

## Add me by: sudo systemctl enable cluster-smi-node.service
4 changes: 3 additions & 1 deletion nvml/info.txt
@@ -1,2 +1,4 @@
The original code in this directory is borrowed from:
https://github.com/tankbusta/nvidia_exporter

I modified some parts.
133 changes: 133 additions & 0 deletions proc/proc.c
@@ -0,0 +1,133 @@
// Author: Patrick Wieschollek, 2018
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>

#define MAX_NAME 128

void get_mem(unsigned long *mem_total, unsigned long *mem_free, unsigned long *mem_available) {
char line[100];
FILE* statusf;


statusf = fopen("/proc/meminfo", "r");
if (!statusf)
return;


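// the first three lines of /proc/meminfo are MemTotal, MemFree and
// MemAvailable (values in kB); MemAvailable requires Linux >= 3.14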
fgets(line, 100, statusf);
sscanf(line, "%*s %lu %*s", mem_total);
fgets(line, 100, statusf);
sscanf(line, "%*s %lu %*s", mem_free);
fgets(line, 100, statusf);
sscanf(line, "%*s %lu %*s", mem_available);


fclose(statusf);
}

// read total cpu time
unsigned long long int read_cpu_tick() {
unsigned long long int usertime, nicetime, systemtime, idletime;
unsigned long long int ioWait, irq, softIrq, steal, guest, guestnice;
usertime = nicetime = systemtime = idletime = 0;
ioWait = irq = softIrq = steal = guest = guestnice = 0;

FILE *fp;
fp = fopen("/proc/stat", "r");
if (fp != NULL) {
if (fscanf(fp, "cpu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu",
&usertime, &nicetime, &systemtime, &idletime,
&ioWait, &irq, &softIrq, &steal, &guest, &guestnice) == EOF) {
fclose(fp);
return 0;
} else {
fclose(fp);
return usertime + nicetime + systemtime + idletime + ioWait + irq + softIrq + steal + guest + guestnice;
}
} else {
return 0;
}
}

void get_uid_from_pid(unsigned long pid, unsigned long *uid) {
char path[40], line[100];
FILE* statusf;

snprintf(path, 40, "/proc/%ld/status", pid);

statusf = fopen(path, "r");
if (!statusf)
return;

while (fgets(line, 100, statusf)) {
if (strncmp(line, "Uid:", 4) != 0)
continue;
// Uid: 1000 1000 1000 1000
sscanf(line, "%*s %lu %*s", uid);
break;
}
fclose(statusf);
}

// read cpu tick for a specific process
void read_time_and_name_from_pid(unsigned long pid, unsigned long *time, char *name) {

char fn[MAX_NAME + 1];
snprintf(fn, sizeof fn, "/proc/%ld/stat", pid);

unsigned long utime = 0;
unsigned long stime = 0;

*time = 0;

FILE * fp;
fp = fopen(fn, "r");
if (fp != NULL) {
/*
(1) pid %d The process ID
(2) comm %s The filename of the executable, in parentheses.
(3) state %c
(4) ppid %d The PID of the parent of this process.
(5) pgrp %d The process group ID of the process.
(6) session %d The session ID of the process.
(7) tty_nr %d The controlling terminal of the process.
(8) tpgid %d The ID of the foreground process group
(9) flags %u The kernel flags word of the process.
(10) minflt %lu The number of minor faults the process has made
(11) cminflt %lu The number of minor faults that the process's waited-for children have made.
(12) majflt %lu The number of major faults the process has made
(13) cmajflt %lu The number of major faults that the process's
(14) utime %lu Amount of time that this process has been scheduled in user mode
(15) stime %lu Amount of time that this process has been scheduled in kernel mode
...
*/

// extract
bool success = fscanf(fp, "%*d (%s %*c %*d %*d %*d %*d %*d %*u %*lu %*lu %*lu %*lu %lu"
"%lu %*ld %*ld %*d %*d %*d %*d %*u %*lu %*ld",
name, &utime, &stime) != EOF;
fclose(fp);

if (!success) {
// something went wrong
return;
}

// remove ")" suffix
if (strlen(name) > 2) {
name[strlen(name) - 1] = 0;
}

*time = utime + stime;

}
}

// return number of cores
unsigned int num_cores() {
return sysconf(_SC_NPROCESSORS_ONLN);
}

50 changes: 50 additions & 0 deletions proc/proc.go
@@ -0,0 +1,50 @@
package proc

// #include "proc.h"
// #include <stdlib.h>
import "C"
import "unsafe"

func mallocCStringBuffer(size uint) *C.char {
buf := make([]byte, size)
return C.CString(string(buf))
}

// CpuTick returns the total number of Jiffies
func CpuTick() (t int64) {
return int64(C.read_cpu_tick())
}

// TimeAndNameFromPID returns the consumed CPU time (in clock ticks) and the command name for a given PID
func TimeAndNameFromPID(pid int) (int64, string) {
time := C.ulong(0)

var c_dst *C.char = mallocCStringBuffer(128 + 1)
defer C.free(unsafe.Pointer(c_dst))

C.read_time_and_name_from_pid(C.ulong(pid), &time, c_dst)
return int64(time), C.GoString(c_dst)
}

// NumberCPUCores returns number of CPU cores
func NumberCPUCores() (n int) {
return int(C.num_cores())
}

// UIDFromPID returns the user id (UID) for a given process id (PID)
func UIDFromPID(pid int) (uid int) {
c_uid := C.ulong(0)
C.get_uid_from_pid(C.ulong(pid), &c_uid)
return int(c_uid)
}

// GetRAMMemoryInfo returns total, free and available RAM in kB (read from /proc/meminfo)
func GetRAMMemoryInfo() (total int64, free int64, available int64) {
c_total := C.ulong(0)
c_free := C.ulong(0)
c_available := C.ulong(0)

C.get_mem(&c_total, &c_free, &c_available)

return int64(c_total), int64(c_free), int64(c_available)
}
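
A quick sketch of how these wrappers could be used together, here inspecting the current process; the chosen PID and the output format are just for illustration:

```go
package main

import (
	"fmt"
	"os"
	"os/user"
	"strconv"

	"github.com/patwie/cluster-smi/proc"
)

func main() {
	// machine-wide information
	fmt.Println("cores:", proc.NumberCPUCores())
	total, free, available := proc.GetRAMMemoryInfo()
	fmt.Printf("RAM (kB): total=%d free=%d available=%d\n", total, free, available)
	fmt.Println("total cpu ticks:", proc.CpuTick())

	// per-process information, as used by cluster-smi-node for GPU processes
	pid := os.Getpid()
	ticks, name := proc.TimeAndNameFromPID(pid)
	uid := proc.UIDFromPID(pid)

	username := "unknown"
	if u, err := user.LookupId(strconv.Itoa(uid)); err == nil {
		username = u.Username
	}
	fmt.Printf("pid=%d name=%s user=%s cpu-ticks=%d\n", pid, name, username, ticks)
}
```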