Skip to content

Commit

Permalink
Merge pull request #174 from MalloZup/sbd-timeouts
Browse files Browse the repository at this point in the history
Implement SBD watchdog and msgwait metrics
  • Loading branch information
stefanotorresi authored Sep 3, 2020
2 parents 7a7dbdc + 428cb9f commit 8912739
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 3 deletions.
48 changes: 48 additions & 0 deletions collector/sbd/sbd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"os"
"os/exec"
"regexp"
"strconv"
"strings"

"github.com/pkg/errors"
Expand All @@ -20,6 +21,7 @@ const subsystem = "sbd"
const SBD_STATUS_UNHEALTHY = "unhealthy"
const SBD_STATUS_HEALTHY = "healthy"

// NewCollector create a new sbd collector
func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
err := checkArguments(sbdPath, sbdConfigPath)
if err != nil {
Expand All @@ -33,6 +35,7 @@ func NewCollector(sbdPath string, sbdConfigPath string) (*sbdCollector, error) {
}

c.SetDescriptor("devices", "SBD devices; one line per device", []string{"device", "status"})
c.SetDescriptor("timeouts", "SBD timeouts for each device and type", []string{"device", "type"})

return c, nil
}
Expand Down Expand Up @@ -68,6 +71,15 @@ func (c *sbdCollector) CollectWithError(ch chan<- prometheus.Metric) error {
ch <- c.MakeGaugeMetric("devices", 1, sbdDev, sbdStatus)
}

sbdWatchdogs, sbdMsgWaits := c.getSbdTimeouts(sbdDevices)
for sbdDev, sbdWatchdog := range sbdWatchdogs {
ch <- c.MakeGaugeMetric("timeouts", sbdWatchdog, sbdDev, "watchdog")
}

for sbdDev, sbdMsgWait := range sbdMsgWaits {
ch <- c.MakeGaugeMetric("timeouts", sbdMsgWait, sbdDev, "msgwait")
}

return nil
}

Expand Down Expand Up @@ -132,3 +144,39 @@ func (c *sbdCollector) getSbdDeviceStatuses(sbdDevices []string) map[string]stri

return sbdStatuses
}

// Patterns matching the watchdog and msgwait timeout lines in the output of
// "sbd -d <device> dump". The single capture group holds the numeric value.
// They are compiled once at package initialisation rather than for every
// device on every scrape.
var (
	sbdWatchdogTimeoutRe = regexp.MustCompile(`Timeout \(watchdog\) *: (\d+)`)
	sbdMsgWaitTimeoutRe  = regexp.MustCompile(`Timeout \(msgwait\) *: (\d+)`)
)

// getSbdTimeouts runs "<sbdPath> -d <device> dump" for each entry of
// sbdDevices and extracts the watchdog and msgwait timeouts from the output.
//
// It returns two maps keyed by device: the first holds the watchdog timeouts,
// the second the msgwait timeouts. A device whose dump does not contain both
// timeout lines is absent from both maps; a value that cannot be parsed as a
// float64 is omitted from its own map only.
func (c *sbdCollector) getSbdTimeouts(sbdDevices []string) (map[string]float64, map[string]float64) {
	sbdWatchdogs := make(map[string]float64, len(sbdDevices))
	sbdMsgWaits := make(map[string]float64, len(sbdDevices))

	for _, sbdDev := range sbdDevices {
		// The command error is deliberately ignored: whatever was captured on
		// stdout is still parsed, and a dump without the expected timeout
		// lines simply produces no entry for this device.
		sbdDump, _ := exec.Command(c.sbdPath, "-d", sbdDev, "dump").Output()

		watchdog, msgWait, hasWatchdog, hasMsgWait := parseSbdTimeouts(sbdDump)
		if hasWatchdog {
			sbdWatchdogs[sbdDev] = watchdog
		}
		if hasMsgWait {
			sbdMsgWaits[sbdDev] = msgWait
		}
	}

	return sbdWatchdogs, sbdMsgWaits
}

// parseSbdTimeouts extracts the watchdog and msgwait timeouts from the raw
// output of an sbd dump. Both timeout lines must be present for anything to be
// reported; after that, each has* flag tells whether the corresponding value
// was parsed successfully.
func parseSbdTimeouts(dump []byte) (watchdog, msgWait float64, hasWatchdog, hasMsgWait bool) {
	watchdogMatch := sbdWatchdogTimeoutRe.FindSubmatch(dump)
	msgWaitMatch := sbdMsgWaitTimeoutRe.FindSubmatch(dump)
	if watchdogMatch == nil || msgWaitMatch == nil {
		return 0, 0, false, false
	}

	// Index 1 is the capture group, i.e. the digits after the colon.
	if v, err := strconv.ParseFloat(string(watchdogMatch[1]), 64); err == nil {
		watchdog, hasWatchdog = v, true
	}
	if v, err := strconv.ParseFloat(string(msgWaitMatch[1]), 64); err == nil {
		msgWait, hasMsgWait = v, true
	}

	return watchdog, msgWait, hasWatchdog, hasMsgWait
}
9 changes: 8 additions & 1 deletion collector/sbd/sbd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,13 @@ func TestNewSbdCollectorChecksSbdExecutableBits(t *testing.T) {
}

// TestSBDCollector builds a collector from the fake sbd dump executable and
// the fake sbd config, then hands it to assertcustom.Metrics together with the
// "sbd.metrics" fixture name.
func TestSBDCollector(t *testing.T) {
	collector, err := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")

	// Fail clearly if the collector cannot be built rather than passing a
	// possibly nil collector on.
	assert.Nil(t, err)
	assertcustom.Metrics(t, collector, "sbd.metrics")
}

// TestWatchdog creates a collector backed by the fake sbd dump script, checks
// that construction succeeded, and passes the collector to
// assertcustom.Metrics with the "sbd.metrics" fixture name.
func TestWatchdog(t *testing.T) {
	sbd, err := NewCollector("../../test/fake_sbd_dump.sh", "../../test/fake_sbdconfig")
	assert.Nil(t, err)

	assertcustom.Metrics(t, sbd, "sbd.metrics")
}
15 changes: 14 additions & 1 deletion doc/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,8 @@ The status of each Corosync ring; `1` means healthy, `0` means faulty.
The SBD subsystem collects device stats by parsing its configuration and the output of `sbd -d <device> dump`.

0. [Sample](../test/sbd.metrics)
2. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
1. [`ha_cluster_sbd_devices`](#ha_cluster_sbd_devices)
2. [`ha_cluster_sbd_timeouts`](#ha_cluster_sbd_timeouts)

### `ha_cluster_sbd_devices`

Expand All @@ -214,6 +215,18 @@ Either the value is `1`, or the line is absent altogether.

The total number of lines for this metric will be the cardinality of `device`.

### `ha_cluster_sbd_timeouts`

#### Description

The SBD timeouts for each SBD device.
The value is an integer expressing the timeout.

#### Labels

- `device`: the path of the SBD device
- `type`: either `watchdog` or `msgwait`


## DRBD

Expand Down
14 changes: 14 additions & 0 deletions test/fake_sbd_dump.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Fake sbd executable used by the SBD collector tests.
# It never reads its arguments: every invocation prints the same canned
# header dump for /dev/vdc, including the watchdog and msgwait timeout lines.

cat <<EOF
==Dumping header on disk /dev/vdc
Header version : 2.1
UUID : 1ed3171d-066d-47ca-8f76-aec25d9efed4
Number of slots : 255
Sector size : 512
Timeout (watchdog) : 9
Timeout (allocate) : 2
Timeout (loop) : 1
Timeout (msgwait) : 10
==Header on disk /dev/vdc is dumped
EOF
8 changes: 7 additions & 1 deletion test/sbd.metrics
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# HELP ha_cluster_sbd_devices SBD devices; one line per device
# TYPE ha_cluster_sbd_devices gauge
ha_cluster_sbd_devices{device="/dev/vdc",status="healthy"} 1
ha_cluster_sbd_devices{device="/dev/vdd",status="unhealthy"} 1
ha_cluster_sbd_devices{device="/dev/vdd",status="healthy"} 1
# HELP ha_cluster_sbd_timeouts SBD timeouts for each device and type
# TYPE ha_cluster_sbd_timeouts gauge
ha_cluster_sbd_timeouts{device="/dev/vdc",type="msgwait"} 10
ha_cluster_sbd_timeouts{device="/dev/vdc",type="watchdog"} 9
ha_cluster_sbd_timeouts{device="/dev/vdd",type="msgwait"} 10
ha_cluster_sbd_timeouts{device="/dev/vdd",type="watchdog"} 9

0 comments on commit 8912739

Please sign in to comment.