Skip to content

Commit

Permalink
refactor: update checker interfaces to support multiple hosts and to …
Browse files Browse the repository at this point in the history
…add to prom stats
  • Loading branch information
whiskeyjimbo committed Jan 8, 2025
1 parent 4f29eaf commit 9169602
Show file tree
Hide file tree
Showing 9 changed files with 176 additions and 122 deletions.
14 changes: 13 additions & 1 deletion internal/checkers/checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ type CheckResult struct {
Success bool
}

// HostCheckResult pairs a CheckResult with the host it was produced for.
type HostCheckResult struct {
	// Host is the entry from the checked host list that this result belongs to.
	Host string
	// CheckResult is embedded, so its fields (for example Success) are
	// promoted and can be read directly on a HostCheckResult.
	CheckResult
}

type Checker interface {
Check(ctx context.Context, address string) CheckResult
Check(ctx context.Context, hosts []string, port string) []HostCheckResult
Protocol() Protocol
}

Expand Down Expand Up @@ -60,3 +65,10 @@ func newSuccessResult(duration time.Duration) CheckResult {
Error: nil,
}
}

// newHostResult tags result with the host it was obtained from.
func newHostResult(host string, result CheckResult) HostCheckResult {
	var tagged HostCheckResult
	tagged.Host = host
	tagged.CheckResult = result
	return tagged
}
21 changes: 12 additions & 9 deletions internal/checkers/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package checkers

import (
"context"
"fmt"
"net"
"time"
)
Expand All @@ -17,16 +16,20 @@ func (c *DNSChecker) Protocol() Protocol {
return DNS
}

func (c *DNSChecker) Check(ctx context.Context, address string) CheckResult {
start := time.Now()
func (c *DNSChecker) Check(ctx context.Context, hosts []string, port string) []HostCheckResult {
results := make([]HostCheckResult, 0, len(hosts))

resolver := net.Resolver{}
_, err := resolver.LookupHost(ctx, address)
elapsed := time.Since(start)
for _, host := range hosts {
start := time.Now()

if err != nil {
return newFailedResult(elapsed, fmt.Errorf("DNS lookup failed: %w", err))
_, err := net.DefaultResolver.LookupHost(ctx, host)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}

results = append(results, newHostResult(host, newSuccessResult(time.Since(start))))
}

return newSuccessResult(elapsed)
return results
}
41 changes: 23 additions & 18 deletions internal/checkers/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,29 @@ func (c *HTTPChecker) Protocol() Protocol {
return HTTP
}

func (c *HTTPChecker) Check(ctx context.Context, address string) CheckResult {
start := time.Now()

req, err := http.NewRequestWithContext(ctx, "GET", fmt.Sprintf("http://%s", address), nil)
if err != nil {
return newFailedResult(time.Since(start), err)
}

resp, err := c.client.Do(req)
elapsed := time.Since(start)
if err != nil {
return newFailedResult(elapsed, err)
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
return newFailedResult(elapsed, fmt.Errorf("HTTP status code: %d", resp.StatusCode))
func (c *HTTPChecker) Check(ctx context.Context, hosts []string, port string) []HostCheckResult {
results := make([]HostCheckResult, 0, len(hosts))

for _, host := range hosts {
address := fmt.Sprintf("%s:%s", host, port)
url := fmt.Sprintf("http://%s", address)
start := time.Now()

req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}

resp, err := c.client.Do(req)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}
resp.Body.Close()

results = append(results, newHostResult(host, newSuccessResult(time.Since(start))))
}

return newSuccessResult(elapsed)
return results
}
54 changes: 28 additions & 26 deletions internal/checkers/https.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,34 +39,36 @@ func NewHTTPSChecker() *HTTPSChecker {
return &HTTPSChecker{client: client}
}

func (c *HTTPSChecker) Check(ctx context.Context, address string) CheckResult {
start := time.Now()
url := fmt.Sprintf("https://%s", address)

req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return CheckResult{Success: false, Error: err}
}

resp, err := c.client.Do(req)
if err != nil {
return CheckResult{Success: false, Error: err}
}
defer resp.Body.Close()

// Get certificate information
certInfo := c.getCertificateInfo(resp.TLS)

result := CheckResult{
Success: resp.StatusCode < 400,
ResponseTime: time.Since(start),
Error: nil,
Metadata: map[string]interface{}{
"certificate": certInfo,
},
func (c *HTTPSChecker) Check(ctx context.Context, hosts []string, port string) []HostCheckResult {
results := make([]HostCheckResult, 0, len(hosts))

for _, host := range hosts {
address := fmt.Sprintf("%s:%s", host, port)
url := fmt.Sprintf("https://%s", address)
start := time.Now()

req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}

resp, err := c.client.Do(req)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}

// Get certificate information
certInfo := c.getCertificateInfo(resp.TLS)
result := newSuccessResult(time.Since(start))
result.Metadata = map[string]interface{}{"certificate": certInfo}

resp.Body.Close()
results = append(results, newHostResult(host, result))
}

return result
return results
}

func (c *HTTPSChecker) getCertificateInfo(tlsState *tls.ConnectionState) *CertInfo {
Expand Down
36 changes: 26 additions & 10 deletions internal/checkers/smtp.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package checkers

import (
"context"
"fmt"
"net"
"net/smtp"
"time"
)
Expand All @@ -16,16 +18,30 @@ func (c *SMTPChecker) Protocol() Protocol {
return SMTP
}

func (c *SMTPChecker) Check(ctx context.Context, address string) CheckResult {
start := time.Now()

client, err := smtp.Dial(address)
elapsed := time.Since(start)

if err != nil {
return newFailedResult(elapsed, err)
func (c *SMTPChecker) Check(ctx context.Context, hosts []string, port string) []HostCheckResult {
results := make([]HostCheckResult, 0, len(hosts))

for _, host := range hosts {
address := fmt.Sprintf("%s:%s", host, port)
start := time.Now()

d := net.Dialer{Timeout: 10 * time.Second}
conn, err := d.DialContext(ctx, "tcp", address)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}

client, err := smtp.NewClient(conn, host)
if err != nil {
conn.Close()
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}
client.Close()

results = append(results, newHostResult(host, newSuccessResult(time.Since(start))))
}
defer client.Close()

return newSuccessResult(elapsed)
return results
}
31 changes: 17 additions & 14 deletions internal/checkers/tcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,23 @@ func (c *TCPChecker) Protocol() Protocol {
return TCP
}

func (c *TCPChecker) Check(ctx context.Context, address string) CheckResult {
start := time.Now()

dialer := net.Dialer{
Timeout: c.timeout,
}

conn, err := dialer.DialContext(ctx, "tcp", address)
elapsed := time.Since(start)

if err != nil {
return newFailedResult(elapsed, fmt.Errorf("TCP connection failed: %w", err))
func (c *TCPChecker) Check(ctx context.Context, hosts []string, port string) []HostCheckResult {
results := make([]HostCheckResult, 0, len(hosts))

for _, host := range hosts {
address := fmt.Sprintf("%s:%s", host, port)
start := time.Now()

d := net.Dialer{}
conn, err := d.DialContext(ctx, "tcp", address)
if err != nil {
results = append(results, newHostResult(host, newFailedResult(time.Since(start), err)))
continue
}
conn.Close()

results = append(results, newHostResult(host, newSuccessResult(time.Since(start))))
}
defer conn.Close()

return newSuccessResult(elapsed)
return results
}
56 changes: 33 additions & 23 deletions internal/metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,21 @@ type PrometheusMetrics struct {
monitorSite string
}

// HostResult is the outcome of a check against a single host: whether it
// succeeded, how long it took, and the error reported for it, if any.
type HostResult struct {
	Success      bool
	ResponseTime time.Duration
	Error        error
}

// GroupMetrics carries everything UpdateGroup needs to publish metrics for
// one group: the identifying labels (Site, Group, Port, Protocol, Tags), the
// per-host outcomes in HostResults keyed by host, and the group-level
// HostsUp / HostsTotal counts.
type GroupMetrics struct {
	Site        string
	Group       string
	Port        string
	Protocol    string
	Tags        []string
	HostResults map[string]HostResult
	HostsUp     int
	HostsTotal  int
}

func NewPrometheusMetrics(logger *zap.SugaredLogger, monitorSite string) *PrometheusMetrics {
Expand Down Expand Up @@ -100,31 +105,36 @@ func StartMetricsServer(logger *zap.SugaredLogger) {
}

func (p *PrometheusMetrics) UpdateGroup(metrics GroupMetrics) {
labels := MetricLabels{
Site: metrics.Site,
Group: metrics.Group,
Port: metrics.Port,
Protocol: metrics.Protocol,
for host, result := range metrics.HostResults {
labels := MetricLabels{
Site: metrics.Site,
Group: metrics.Group,
Host: host,
Port: metrics.Port,
Protocol: metrics.Protocol,
}
p.updateMetrics(labels, metrics.Tags, result.Success, result.ResponseTime)
}
p.updateMetrics(labels, metrics.Tags, metrics.Success, metrics.ResponseTime)
p.updateGroupCounts(metrics.Site, metrics.Group, metrics.Port, metrics.Protocol, metrics.HostsUp, metrics.HostsTotal)
}

func (p *PrometheusMetrics) updateMetrics(labels MetricLabels, tags []string, success bool, elapsed time.Duration) {
tagString := normalizeTagString(tags)
labelValues := []string{labels.Site, labels.Group, labels.Host, labels.Port, labels.Protocol, tagString}

fullLabels := []string{labels.Site, labels.Group, labels.Host, labels.Port, labels.Protocol, tagString}

histLabels := []string{labels.Site, labels.Group, labels.Port, labels.Protocol, tagString}

statusValue := 0.0
if success {
statusValue = 1.0
}
p.checkStatus.WithLabelValues(labelValues...).Set(statusValue)
p.checkStatus.WithLabelValues(fullLabels...).Set(statusValue)

// Update latency metrics only for successful checks
if success {
latencyMs := float64(elapsed.Milliseconds())
p.checkLatency.WithLabelValues(labelValues...).Set(latencyMs)
p.latencyHist.WithLabelValues(labelValues...).Observe(latencyMs)
p.checkLatency.WithLabelValues(fullLabels...).Set(float64(elapsed.Milliseconds()))
p.latencyHist.WithLabelValues(histLabels...).Observe(float64(elapsed.Seconds()))
}

p.updateGraphMetrics(labels, tagString, success, elapsed)
Expand Down Expand Up @@ -266,8 +276,8 @@ func createCheckStatusMetric() *prometheus.GaugeVec {
return promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "check_status",
Help: "Status of the check (1 for up, 0 for down)",
Name: "host_check_status",
Help: "Status of the host check (1 for up, 0 for down)",
},
[]string{"site", "group", "host", "port", "protocol", "tags"},
)
Expand All @@ -277,8 +287,8 @@ func createCheckLatencyMetric() *prometheus.GaugeVec {
return promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "check_latency_seconds",
Help: "Latency of the check in seconds",
Name: "host_check_latency_milliseconds",
Help: "Latency of the host check in milliseconds",
},
[]string{"site", "group", "host", "port", "protocol", "tags"},
)
Expand Down
Loading

0 comments on commit 9169602

Please sign in to comment.