Skip to content

Commit

Permalink
Add -d flag to show fan speed, temperature, and power usage.
Browse files: browse the repository at this point in the history
  • Loading branch information
Toru Tamaki committed Nov 30, 2018
1 parent 9415a01 commit 2853c21
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 11 deletions.
3 changes: 2 additions & 1 deletion cluster-smi-local.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ func main() {
showTimePtr := flag.Bool("t", false, "show time of events")
showExtendedPtr := flag.Bool("e", false, "extended view")
showProcessesPtr := flag.Bool("p", false, "verbose process information")
showDetailPtr := flag.Bool("d", false, "detail view with fan, temp, and power info")
nodeRegex := flag.String("n", ".", "match node-names with regex for display information "+
"(if not specified, all nodes will be shown)")
usernameFilter := flag.String("u", "", "show all information only for specific user")
Expand All @@ -48,7 +49,7 @@ func main() {
}

clus.FilterNodes(*nodeRegex)
clus.Print(*showProcessesPtr, *showTimePtr, cfg.Timeout, *useColor, *showExtendedPtr)
clus.Print(*showProcessesPtr, *showTimePtr, cfg.Timeout, *useColor, *showExtendedPtr, *showDetailPtr)
time.Sleep(time.Duration(cfg.Tick) * time.Second)
}

Expand Down
3 changes: 2 additions & 1 deletion cluster-smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func main() {
showTimePtr := flag.Bool("t", false, "show time of events")
showExtendedPtr := flag.Bool("e", false, "extended view")
showProcessesPtr := flag.Bool("p", false, "verbose process information")
showDetailPtr := flag.Bool("d", false, "detail view with fan, temp, and power info")
nodeRegex := flag.String("n", ".", "match node-names with regex for display information "+
"(if not specified, all nodes will be shown)")
usernameFilter := flag.String("u", "", "show all information only for specific user")
Expand Down Expand Up @@ -85,7 +86,7 @@ func main() {

clus.Sort()
clus.FilterNodes(*nodeRegex)
clus.Print(*showProcessesPtr, *showTimePtr, cfg.Timeout, *useColor, *showExtendedPtr)
clus.Print(*showProcessesPtr, *showTimePtr, cfg.Timeout, *useColor, *showExtendedPtr, *showDetailPtr)
time.Sleep(time.Duration(cfg.Tick) * time.Second)
}

Expand Down
8 changes: 7 additions & 1 deletion cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func InitNode(n *cluster.Node) {
devices, _ := nvml.GetDevices()

for i := 0; i < len(devices); i++ {
n.Devices = append(n.Devices, cluster.Device{0, "", 0, cluster.Memory{0, 0, 0, 0}, nil})
n.Devices = append(n.Devices, cluster.Device{0, "", 0, cluster.Memory{0, 0, 0, 0}, 0, 0, 0, nil})
}
}

Expand All @@ -52,6 +52,9 @@ func FetchNode(n *cluster.Node) {
meminfo, _ := device.GetMemoryInfo()
gpuPercent, _, _ := device.GetUtilization()
memPercent := int(meminfo.Used / meminfo.Total)
powerUsage, _ := device.GetPowerUsage()
fanSpeed, _ := device.GetFanSpeed()
tempc, _, _ := device.GetTemperature()

// read processes
deviceProcs, err := device.GetProcessInfo()
Expand Down Expand Up @@ -95,6 +98,9 @@ func FetchNode(n *cluster.Node) {
n.Devices[idx].Name = device.DeviceName
n.Devices[idx].Utilization = gpuPercent
n.Devices[idx].MemoryUtilization = cluster.Memory{meminfo.Used, meminfo.Free, meminfo.Total, memPercent}
n.Devices[idx].FanSpeed = fanSpeed
n.Devices[idx].PowerUsage = powerUsage
n.Devices[idx].Temperature = tempc
n.Devices[idx].Processes = processes

}
Expand Down
66 changes: 58 additions & 8 deletions cluster/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ type Device struct {
Name string `json:"name"`
Utilization int `json:"utilization"`
MemoryUtilization Memory `json:"memory"`
FanSpeed int `json:"fan_speed"`
Temperature int `json:"temperature"`
PowerUsage int `json:"power_usage"`
Processes []Process
}

Expand Down Expand Up @@ -77,7 +80,8 @@ func FilterByUser(c Cluster, username string) Cluster {

if len(Processes) > 0 {
current_device := Device{
d.Id, d.Name, d.Utilization, d.MemoryUtilization, Processes,
d.Id, d.Name, d.Utilization, d.MemoryUtilization,
d.FanSpeed, d.Temperature, d.PowerUsage, Processes,
}
Devices = append(Devices, current_device)
}
Expand Down Expand Up @@ -174,12 +178,17 @@ func highlight(s string) string {
return fmt.Sprintf("\033[0;33m%s\033[0m", s)
}

func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold int, useColor bool, extended bool) {
func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold int, useColor bool, extended bool, show_detail bool) {

table := termtables.CreateTable()

tableHeader := []interface{}{"Node", "Gpu", "Memory-Usage", "GPU-Util"}

if show_detail {
tableHeader = append(tableHeader, "Fan")
tableHeader = append(tableHeader, "Temp")
tableHeader = append(tableHeader, "Power")
}
if show_processes {
tableHeader = append(tableHeader, "PID")
tableHeader = append(tableHeader, "User")
Expand Down Expand Up @@ -211,6 +220,10 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
"",
}

if show_detail {
tableRow = append(tableRow, []interface{}{"", "", ""}...)
}

if show_processes {
tableRow = append(tableRow, []interface{}{"", "", "", "", ""}...)
}
Expand All @@ -235,7 +248,16 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
d.MemoryUtilization.Total/1024/1024,
int(d.MemoryUtilization.Used*100/d.MemoryUtilization.Total))
device_utilization := fmt.Sprintf("%3d %%", d.Utilization)


device_FanSpeed := ""
device_Temp := ""
device_PowerUtil := ""
if show_detail {
device_FanSpeed = fmt.Sprintf("%3d %%", d.FanSpeed)
device_Temp = fmt.Sprintf("%3d C", d.Temperature)
device_PowerUtil = fmt.Sprintf("%3dW", d.PowerUsage)
}

if timeout {
device_MemoryInfo = "TimeOut"
device_utilization = "TimeOut"
Expand All @@ -256,6 +278,11 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
device_name = ""
device_MemoryInfo = ""
device_utilization = ""
if show_detail {
device_FanSpeed = ""
device_Temp = ""
device_PowerUtil = ""
}
}

processName := p.Name
Expand All @@ -272,6 +299,11 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
device_name = highlight(device_name)
device_MemoryInfo = highlight(device_MemoryInfo)
device_utilization = highlight(device_utilization)
if show_detail {
device_FanSpeed = highlight(device_FanSpeed)
device_Temp = highlight(device_Temp)
device_PowerUtil = highlight(device_PowerUtil)
}
processPID = highlight(fmt.Sprintf("%v", p.Pid))
processUsername = highlight(processUsername)
processName = highlight(processName)
Expand All @@ -284,12 +316,22 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
device_name,
device_MemoryInfo,
device_utilization,
processPID,
processUsername,
processName,
processUseGPUMemory,
processRuntime,
}
if show_detail {
tableRow = append(tableRow, []interface{}{
device_FanSpeed,
device_Temp,
device_PowerUtil,
}...)
}
tableRow = append(tableRow, []interface{}{
processPID,
processUsername,
processName,
processUseGPUMemory,
processRuntime,
}...)

// fmt.Sprintf("%s (%d, %s) %3d MiB %v", p.Name, p.Pid, p.Username, p.UsedGpuMemory/1024/1024, p.RunTime),

if show_time {
Expand Down Expand Up @@ -326,6 +368,14 @@ func (c *Cluster) Print(show_processes bool, show_time bool, timeout_threshold i
device_utilization,
}

if show_detail {
tableRow = append(tableRow, []interface{}{
device_FanSpeed,
device_Temp,
device_PowerUtil,
}...)
}

if show_processes {
tableRow = append(tableRow, []interface{}{"", "", "", "", ""}...)
}
Expand Down

0 comments on commit 2853c21

Please sign in to comment.