From 8baf15750b1001ef11b22d85bd217c4872fb4f6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geyslan=20Greg=C3=B3rio?= Date: Tue, 15 Oct 2024 10:38:59 -0300 Subject: [PATCH] chore: add perfbuf metric per event (METRICS=1) Enabled only when built with METRICS=1. BPFPerfEventSubmitAttemptsCount and BPFPerfEventSubmitFailuresCount count the number of events processed by the eBPF programs and written to or attempted to be written to the perf buffer. They are incremented right after the attempt to write the event to the perf buffer, making it possible to measure whether that event was successfully written to the perf buffer or not. These metrics can be used to monitor the performance of individual eBPF events and to detect potential bottlenecks. --- Makefile | 6 +++ pkg/ebpf/c/common/buffer.h | 33 ++++++++++++- pkg/ebpf/perf_count.go | 79 +++++++++++++++++++++++++++++++ pkg/ebpf/tracee.go | 14 +++++- pkg/metrics/collector.go | 83 ++++++++++++++++++++++++++++++++ pkg/metrics/event_collector.go | 69 +++++++++++++++++++++++++++ pkg/metrics/stats.go | 86 +++++++++++++++++++++++++--------- pkg/version/version.go | 9 +++- 8 files changed, 354 insertions(+), 25 deletions(-) create mode 100644 pkg/ebpf/perf_count.go create mode 100644 pkg/metrics/collector.go create mode 100644 pkg/metrics/event_collector.go diff --git a/Makefile b/Makefile index 66c47ba22ca4..715af62e6697 100644 --- a/Makefile +++ b/Makefile @@ -167,6 +167,10 @@ else GO_DEBUG_FLAG = -w endif +ifeq ($(METRICS),1) + BPF_DEBUG_FLAG += -DMETRICS +endif + ifeq ($(UNAME_M),x86_64) ARCH = x86_64 LINUX_ARCH = x86 @@ -423,6 +427,7 @@ $(OUTPUT_DIR)/tracee.bpf.o: \ $(TRACEE_EBPF_OBJ_HEADERS) # $(CMD_CLANG) \ + $(BPF_DEBUG_FLAG) \ -D__TARGET_ARCH_$(LINUX_ARCH) \ -D__BPF_TRACING__ \ -DCORE \ @@ -501,6 +506,7 @@ $(OUTPUT_DIR)/tracee: \ -ldflags="$(GO_DEBUG_FLAG) \ -extldflags \"$(CGO_EXT_LDFLAGS_EBPF)\" \ -X github.com/aquasecurity/tracee/pkg/version.version=$(VERSION) \ + -X 
github.com/aquasecurity/tracee/pkg/version.metrics=$(METRICS) \ " \ -v -o $@ \ ./cmd/tracee diff --git a/pkg/ebpf/c/common/buffer.h b/pkg/ebpf/c/common/buffer.h index 742a277082e5..74cf46cbe6dc 100644 --- a/pkg/ebpf/c/common/buffer.h +++ b/pkg/ebpf/c/common/buffer.h @@ -458,6 +458,24 @@ statfunc int save_args_to_submit_buf(event_data_t *event, args_t *args) return arg_num; } +#ifdef METRICS +struct event_stats_values { + u64 attempts; + u64 failures; +}; + +typedef struct event_stats_values event_stats_values_t; + +struct events_stats { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, MAX_EVENT_ID); + __type(key, u32); // eventid + __type(value, event_stats_values_t); +} events_stats SEC(".maps"); + +typedef struct events_stats events_stats_t; +#endif + statfunc int events_perf_submit(program_data_t *p, long ret) { p->event->context.retval = ret; @@ -484,7 +502,20 @@ statfunc int events_perf_submit(program_data_t *p, long ret) : : [size] "r"(size), [max_size] "i"(MAX_EVENT_SIZE)); - return bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size); + long perf_ret = bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size); + +#ifdef METRICS + // update event stats (perf_ret must stay signed: the helper returns a negative error on failure) + event_stats_values_t *evt_stat = bpf_map_lookup_elem(&events_stats, &p->event->context.eventid); + if (unlikely(evt_stat == NULL)) + return perf_ret; + + __sync_fetch_and_add(&evt_stat->attempts, 1); + if (perf_ret < 0) + __sync_fetch_and_add(&evt_stat->failures, 1); +#endif + + return perf_ret; } statfunc int signal_perf_submit(void *ctx, controlplane_signal_t *sig) diff --git a/pkg/ebpf/perf_count.go b/pkg/ebpf/perf_count.go new file mode 100644 index 000000000000..f9512135f2f3 --- /dev/null +++ b/pkg/ebpf/perf_count.go @@ -0,0 +1,79 @@ +package ebpf + +import ( + "context" + "encoding/binary" + "time" + "unsafe" + + "github.com/aquasecurity/tracee/pkg/events" + "github.com/aquasecurity/tracee/pkg/logger" +) + +// eventStatsValues mirrors the C struct 
event_stats_values (event_stats_values_t). +type eventStatsValues struct { + submitAttempts uint64 + submitFailures uint64 +} + +// countPerfEventSubmissions is a goroutine that periodically counts the +// number of attempts and failures to submit events to the perf buffer +func (t *Tracee) countPerfEventSubmissions(ctx context.Context) { + logger.Debugw("Starting countPerfEventSubmissions goroutine") + defer logger.Debugw("Stopped countPerfEventSubmissions goroutine") + + evtsCountsBPFMap, err := t.bpfModule.GetMap("events_stats") + if err != nil { + logger.Errorw("Failed to get events_stats map", "error", err) + return + } + + evtStatZero := eventStatsValues{} + for _, id := range t.policyManager.EventsSelected() { + key := uint32(id) + err := evtsCountsBPFMap.Update(unsafe.Pointer(&key), unsafe.Pointer(&evtStatZero)) + if err != nil { + logger.Errorw("Failed to update events_stats map", "error", err) + } + } + + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + t.stats.BPFPerfEventSubmitAttemptsCount.Reset() + t.stats.BPFPerfEventSubmitFailuresCount.Reset() + + // Get the counts of each event from the BPF map + iter := evtsCountsBPFMap.Iterator() + for iter.Next() { + key := binary.LittleEndian.Uint32(iter.Key()) + value, err := evtsCountsBPFMap.GetValue(unsafe.Pointer(&key)) + if err != nil { + logger.Errorw("Failed to get value from events_stats map", "error", err) + continue + } + + // Get counts + id := events.ID(key) + attempts := binary.LittleEndian.Uint64(value[0:8]) + failures := binary.LittleEndian.Uint64(value[8:16]) + t.stats.BPFPerfEventSubmitAttemptsCount.Set(uint64(id), attempts) + t.stats.BPFPerfEventSubmitFailuresCount.Set(uint64(id), failures) + + // Update Prometheus metrics for current event + evtName := events.Core.GetDefinitionByID(id).GetName() + t.stats.BPFPerfEventSubmitAttemptsCount.GaugeVec().WithLabelValues(evtName).Set(float64(attempts)) + 
t.stats.BPFPerfEventSubmitFailuresCount.GaugeVec().WithLabelValues(evtName).Set(float64(failures)) + } + + // Log the counts + t.stats.BPFPerfEventSubmitAttemptsCount.Log() + t.stats.BPFPerfEventSubmitFailuresCount.Log() + } + } +} diff --git a/pkg/ebpf/tracee.go b/pkg/ebpf/tracee.go index 0945566e034e..7809340c5fb6 100644 --- a/pkg/ebpf/tracee.go +++ b/pkg/ebpf/tracee.go @@ -47,6 +47,7 @@ import ( "github.com/aquasecurity/tracee/pkg/utils/environment" "github.com/aquasecurity/tracee/pkg/utils/proc" "github.com/aquasecurity/tracee/pkg/utils/sharedobjs" + "github.com/aquasecurity/tracee/pkg/version" "github.com/aquasecurity/tracee/types/trace" ) @@ -63,7 +64,9 @@ type Tracee struct { running atomic.Bool done chan struct{} // signal to safely stop end-stage processing OutDir *os.File // use utils.XXX functions to create or write to this file - stats metrics.Stats + // Metrics + stats *metrics.Stats + // Engine sigEngine *engine.Engine // Events eventsSorter *sorting.EventsChronologicalSorter @@ -128,7 +131,7 @@ type Tracee struct { } func (t *Tracee) Stats() *metrics.Stats { - return &t.stats + return t.stats } func (t *Tracee) Engine() *engine.Engine { @@ -224,6 +227,7 @@ func New(cfg config.Config) (*Tracee, error) { t := &Tracee{ config: cfg, done: make(chan struct{}), + stats: metrics.NewStats(), writtenFiles: make(map[string]string), readFiles: make(map[string]string), capturedFiles: make(map[string]int64), @@ -1370,6 +1374,12 @@ func (t *Tracee) Run(ctx gocontext.Context) error { t.controlPlane.Start() go t.controlPlane.Run(ctx) + // Measure event perf buffer write attempts (METRICS build only) + + if version.MetricsBuild() { + go t.countPerfEventSubmissions(ctx) + } + // Main event loop (polling events perf buffer) t.eventsPerfMap.Poll(pollTimeout) diff --git a/pkg/metrics/collector.go b/pkg/metrics/collector.go new file mode 100644 index 000000000000..c7198423d2e4 --- /dev/null +++ b/pkg/metrics/collector.go @@ -0,0 +1,83 @@ +package metrics + +import ( + 
"maps" + "sync" + + "github.com/aquasecurity/tracee/pkg/counter" + "github.com/aquasecurity/tracee/pkg/logger" + "github.com/prometheus/client_golang/prometheus" +) + +type Collector struct { + m sync.RWMutex + description string + values map[uint64]uint64 + gaugeVec *prometheus.GaugeVec +} + +func NewCollector(description string, gv *prometheus.GaugeVec) *Collector { + return &Collector{ + m: sync.RWMutex{}, + description: description, + values: make(map[uint64]uint64), + gaugeVec: gv, + } +} + +func (c *Collector) Get(id uint64) uint64 { + c.m.RLock() + defer c.m.RUnlock() + + return c.values[id] +} + +func (c *Collector) Set(id uint64, v uint64) { + c.m.Lock() + defer c.m.Unlock() + + c.values[id] = v +} + +func (c *Collector) Total() uint64 { + c.m.RLock() + defer c.m.RUnlock() + + total := counter.NewCounter(0) + for _, v := range c.values { + err := total.Increment(v) + if err != nil { + logger.Errorw("Failed to increment total counter", "error", err) + } + } + + return total.Get() +} + +func (c *Collector) Reset() { + c.m.Lock() + defer c.m.Unlock() + + c.values = make(map[uint64]uint64) +} + +func (c *Collector) Description() string { + c.m.RLock() + defer c.m.RUnlock() + + return c.description +} + +func (c *Collector) GaugeVec() *prometheus.GaugeVec { + c.m.RLock() + defer c.m.RUnlock() + + return c.gaugeVec +} + +func (c *Collector) Values() map[uint64]uint64 { + c.m.RLock() + defer c.m.RUnlock() + + return maps.Clone(c.values) +} diff --git a/pkg/metrics/event_collector.go b/pkg/metrics/event_collector.go new file mode 100644 index 000000000000..8a336a0f2069 --- /dev/null +++ b/pkg/metrics/event_collector.go @@ -0,0 +1,69 @@ +package metrics + +import ( + "github.com/aquasecurity/tracee/pkg/counter" + "github.com/aquasecurity/tracee/pkg/events" + "github.com/aquasecurity/tracee/pkg/logger" + "github.com/prometheus/client_golang/prometheus" +) + +type EventCollector struct { + c *Collector +} + +func NewEventCollector(description string, gv 
*prometheus.GaugeVec) *EventCollector { + return &EventCollector{ + c: NewCollector(description, gv), + } +} + +func (ec *EventCollector) Get(id uint64) uint64 { + return ec.c.Get(id) +} + +func (ec *EventCollector) Set(id uint64, v uint64) { + ec.c.Set(id, v) +} + +func (ec *EventCollector) Total() uint64 { + return ec.c.Total() +} + +func (ec *EventCollector) Reset() { + ec.c.Reset() +} + +func (ec *EventCollector) Description() string { + return ec.c.Description() +} + +func (ec *EventCollector) GaugeVec() *prometheus.GaugeVec { + return ec.c.GaugeVec() +} + +func (ec *EventCollector) Values() map[uint64]uint64 { + return ec.c.Values() +} + +func (ec *EventCollector) Log() { + values := ec.c.Values() + description := ec.c.Description() + + keyVals := make([]interface{}, 0, len(values)*2+2) // one name/value pair per event plus the "total" pair + total := counter.NewCounter(0) + for k, v := range values { + keyVals = append(keyVals, + events.Core.GetDefinitionByID(events.ID(k)).GetName(), + v, + ) + + err := total.Increment(v) + if err != nil { + logger.Errorw("Failed to increment total counter", "error", err) + } + } + + // Log the counts + keyVals = append(keyVals, "total", total.Get()) + logger.Infow(description, keyVals...) 
+} diff --git a/pkg/metrics/stats.go b/pkg/metrics/stats.go index 08eccfc5e8eb..bbd726ee139b 100644 --- a/pkg/metrics/stats.go +++ b/pkg/metrics/stats.go @@ -18,6 +18,45 @@ type Stats struct { LostWrCount counter.Counter LostNtCapCount counter.Counter // lost network capture events LostBPFLogsCount counter.Counter + + BPFPerfEventSubmitAttemptsCount *EventCollector + BPFPerfEventSubmitFailuresCount *EventCollector +} + +func NewStats() *Stats { + return &Stats{ + EventCount: counter.NewCounter(0), + EventsFiltered: counter.NewCounter(0), + NetCapCount: counter.NewCounter(0), + BPFLogsCount: counter.NewCounter(0), + ErrorCount: counter.NewCounter(0), + LostEvCount: counter.NewCounter(0), + LostWrCount: counter.NewCounter(0), + LostNtCapCount: counter.NewCounter(0), + LostBPFLogsCount: counter.NewCounter(0), + BPFPerfEventSubmitAttemptsCount: NewEventCollector( + "Event submit attempts", + prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "tracee_ebpf", + Name: "bpf_perf_event_submit_attempts", + Help: "calls to submit to the event perf buffer", + }, + []string{"event_name"}, + ), + ), + BPFPerfEventSubmitFailuresCount: NewEventCollector( + "Event submit failures", + prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "tracee_ebpf", + Name: "bpf_perf_event_submit_failures", + Help: "failed calls to submit to the event perf buffer", + }, + []string{"event_name"}, + ), + ), + } } // Register Stats to prometheus metrics exporter @@ -27,7 +66,6 @@ func (stats *Stats) RegisterPrometheus() error { Name: "events_total", Help: "events collected by tracee-ebpf", }, func() float64 { return float64(stats.EventCount.Get()) })) - if err != nil { return errfmt.WrapError(err) } @@ -37,7 +75,6 @@ func (stats *Stats) RegisterPrometheus() error { Name: "events_filtered", Help: "events filtered by tracee-ebpf in userspace", }, func() float64 { return float64(stats.EventsFiltered.Get()) })) - if err != nil { return errfmt.WrapError(err) } @@ -47,56 +84,63 @@ func 
(stats *Stats) RegisterPrometheus() error { Name: "network_capture_events_total", Help: "network capture events collected by tracee-ebpf", }, func() float64 { return float64(stats.NetCapCount.Get()) })) - if err != nil { return errfmt.WrapError(err) } err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "lostevents_total", - Help: "events lost in the submission buffer", - }, func() float64 { return float64(stats.LostEvCount.Get()) })) + Name: "bpf_logs_total", + Help: "logs collected by tracee-ebpf during ebpf execution", + }, func() float64 { return float64(stats.BPFLogsCount.Get()) })) + if err != nil { + return errfmt.WrapError(err) + } + + // Updated by countPerfEventSubmissions() goroutine + err = prometheus.Register(stats.BPFPerfEventSubmitAttemptsCount.GaugeVec()) + if err != nil { + return errfmt.WrapError(err) + } + // Updated by countPerfEventSubmissions() goroutine + err = prometheus.Register(stats.BPFPerfEventSubmitFailuresCount.GaugeVec()) if err != nil { return errfmt.WrapError(err) } err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "write_lostevents_total", - Help: "events lost in the write buffer", - }, func() float64 { return float64(stats.LostWrCount.Get()) })) - + Name: "errors_total", + Help: "errors accumulated by tracee-ebpf", + }, func() float64 { return float64(stats.ErrorCount.Get()) })) if err != nil { return errfmt.WrapError(err) } err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "network_capture_lostevents_total", - Help: "network capture lost events in network capture buffer", - }, func() float64 { return float64(stats.LostNtCapCount.Get()) })) - + Name: "lostevents_total", + Help: "events lost in the submission buffer", + }, func() float64 { return float64(stats.LostEvCount.Get()) })) if err != nil { return errfmt.WrapError(err) } err = 
prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "bpf_logs_total", - Help: "logs collected by tracee-ebpf during ebpf execution", - }, func() float64 { return float64(stats.BPFLogsCount.Get()) })) - + Name: "write_lostevents_total", + Help: "events lost in the write buffer", + }, func() float64 { return float64(stats.LostWrCount.Get()) })) if err != nil { return errfmt.WrapError(err) } err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{ Namespace: "tracee_ebpf", - Name: "errors_total", - Help: "errors accumulated by tracee-ebpf", - }, func() float64 { return float64(stats.ErrorCount.Get()) })) + Name: "network_capture_lostevents_total", + Help: "network capture lost events in network capture buffer", + }, func() float64 { return float64(stats.LostNtCapCount.Get()) })) return errfmt.WrapError(err) } diff --git a/pkg/version/version.go b/pkg/version/version.go index dd72e954731b..9dd25062c942 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -1,7 +1,14 @@ package version -var version string +var ( + version string + metrics string +) func GetVersion() string { return version } + +func MetricsBuild() bool { + return metrics == "1" +}