diff --git a/Makefile b/Makefile
index 66c47ba22ca4..715af62e6697 100644
--- a/Makefile
+++ b/Makefile
@@ -167,6 +167,10 @@ else
 	GO_DEBUG_FLAG = -w
 endif
 
+ifeq ($(METRICS),1)
+	BPF_DEBUG_FLAG += -DMETRICS
+endif
+
 ifeq ($(UNAME_M),x86_64)
 	ARCH = x86_64
 	LINUX_ARCH = x86
@@ -423,6 +427,7 @@ $(OUTPUT_DIR)/tracee.bpf.o: \
 	$(TRACEE_EBPF_OBJ_HEADERS)
 #
 	$(CMD_CLANG) \
+		$(BPF_DEBUG_FLAG) \
 		-D__TARGET_ARCH_$(LINUX_ARCH) \
 		-D__BPF_TRACING__ \
 		-DCORE \
@@ -501,6 +506,7 @@ $(OUTPUT_DIR)/tracee: \
 		-ldflags="$(GO_DEBUG_FLAG) \
 			-extldflags \"$(CGO_EXT_LDFLAGS_EBPF)\" \
 			-X github.com/aquasecurity/tracee/pkg/version.version=$(VERSION) \
+			-X github.com/aquasecurity/tracee/pkg/version.metrics=$(METRICS) \
 			" \
 		-v -o $@ \
 		./cmd/tracee
diff --git a/pkg/ebpf/c/common/buffer.h b/pkg/ebpf/c/common/buffer.h
index 742a277082e5..74cf46cbe6dc 100644
--- a/pkg/ebpf/c/common/buffer.h
+++ b/pkg/ebpf/c/common/buffer.h
@@ -458,6 +458,26 @@ statfunc int save_args_to_submit_buf(event_data_t *event, args_t *args)
     return arg_num;
 }
 
+#ifdef METRICS
+// Per-event counters of perf buffer submissions (METRICS builds only).
+struct event_stats_values {
+    u64 attempts;
+    u64 failures;
+};
+
+typedef struct event_stats_values event_stats_values_t;
+
+// Event id -> submission counters; this file only looks entries up, never inserts them.
+struct events_stats {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, MAX_EVENT_ID);
+    __type(key, u32); // eventid
+    __type(value, event_stats_values_t);
+} events_stats SEC(".maps");
+
+typedef struct events_stats events_stats_t;
+#endif
+
 statfunc int events_perf_submit(program_data_t *p, long ret)
 {
     p->event->context.retval = ret;
@@ -484,7 +504,22 @@ statfunc int events_perf_submit(program_data_t *p, long ret)
                  :
                  : [size] "r"(size), [max_size] "i"(MAX_EVENT_SIZE));
 
-    return bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size);
+    // bpf_perf_event_output() reports failure as a negative value, so keep the
+    // result signed: with an unsigned type the "< 0" check below is never true.
+    long perf_ret = bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size);
+
+#ifdef METRICS
+    // update event stats (events without an entry in events_stats are not counted)
+    event_stats_values_t *evt_stat = bpf_map_lookup_elem(&events_stats, &p->event->context.eventid);
+    if (unlikely(evt_stat == NULL))
+        return perf_ret;
+
+    __sync_fetch_and_add(&evt_stat->attempts, 1);
+    if (perf_ret < 0)
+        __sync_fetch_and_add(&evt_stat->failures, 1);
+#endif
+
+    return perf_ret;
 }
 
 statfunc int signal_perf_submit(void *ctx, controlplane_signal_t *sig)
diff --git a/pkg/ebpf/perf_count.go b/pkg/ebpf/perf_count.go
new file mode 100644
index 000000000000..f9512135f2f3
--- /dev/null
+++ b/pkg/ebpf/perf_count.go
@@ -0,0 +1,79 @@
+package ebpf
+
+import (
+	"context"
+	"encoding/binary"
+	"time"
+	"unsafe"
+
+	"github.com/aquasecurity/tracee/pkg/events"
+	"github.com/aquasecurity/tracee/pkg/logger"
+)
+
+// eventStatsValues mirrors the C struct event_stats_values (event_stats_values_t).
+type eventStatsValues struct {
+	submitAttempts uint64
+	submitFailures uint64
+}
+
+// countPerfEventSubmissions is a goroutine that, every 10 seconds until ctx is done,
+// copies the per-event submit attempts/failures from the events_stats BPF map into t.stats.
+func (t *Tracee) countPerfEventSubmissions(ctx context.Context) {
+	logger.Debugw("Starting countPerfEventSubmissions goroutine")
+	defer logger.Debugw("Stopped countPerfEventSubmissions goroutine")
+
+	evtsCountsBPFMap, err := t.bpfModule.GetMap("events_stats")
+	if err != nil {
+		logger.Errorw("Failed to get events_stats map", "error", err)
+		return
+	}
+
+	evtStatZero := eventStatsValues{}
+	for _, id := range t.policyManager.EventsSelected() {
+		key := uint32(id)
+		err := evtsCountsBPFMap.Update(unsafe.Pointer(&key), unsafe.Pointer(&evtStatZero))
+		if err != nil {
+			logger.Errorw("Failed to update events_stats map", "error", err)
+		}
+	}
+
+	ticker := time.NewTicker(10 * time.Second)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			t.stats.BPFPerfEventSubmitAttemptsCount.Reset()
+			t.stats.BPFPerfEventSubmitFailuresCount.Reset()
+
+			// Get the counts of each event from the BPF map
+			iter := evtsCountsBPFMap.Iterator()
+			for iter.Next() {
+				key := binary.LittleEndian.Uint32(iter.Key())
+				value, err := evtsCountsBPFMap.GetValue(unsafe.Pointer(&key))
+				if err != nil {
+					logger.Errorw("Failed to get value from events_stats map", "error", err)
+					continue
+				}
+
+				// Get counts
+				id := events.ID(key)
+				attempts := binary.LittleEndian.Uint64(value[0:8])
+				failures := binary.LittleEndian.Uint64(value[8:16])
+				t.stats.BPFPerfEventSubmitAttemptsCount.Set(uint64(id), attempts)
+				t.stats.BPFPerfEventSubmitFailuresCount.Set(uint64(id), failures)
+
+				// Update Prometheus metrics for current event
+				evtName := events.Core.GetDefinitionByID(id).GetName()
+				t.stats.BPFPerfEventSubmitAttemptsCount.GaugeVec().WithLabelValues(evtName).Set(float64(attempts))
+				t.stats.BPFPerfEventSubmitFailuresCount.GaugeVec().WithLabelValues(evtName).Set(float64(failures))
+			}
+
+			// Log the counts
+			t.stats.BPFPerfEventSubmitAttemptsCount.Log()
+			t.stats.BPFPerfEventSubmitFailuresCount.Log()
+		}
+	}
+}
diff --git a/pkg/ebpf/tracee.go b/pkg/ebpf/tracee.go
index 0945566e034e..7809340c5fb6 100644
--- a/pkg/ebpf/tracee.go
+++ b/pkg/ebpf/tracee.go
@@ -47,6 +47,7 @@ import (
 	"github.com/aquasecurity/tracee/pkg/utils/environment"
 	"github.com/aquasecurity/tracee/pkg/utils/proc"
 	"github.com/aquasecurity/tracee/pkg/utils/sharedobjs"
+	"github.com/aquasecurity/tracee/pkg/version"
 	"github.com/aquasecurity/tracee/types/trace"
 )
 
@@ -63,7 +64,9 @@ type Tracee struct {
 	running   atomic.Bool
 	done      chan struct{} // signal to safely stop end-stage processing
 	OutDir    *os.File      // use utils.XXX functions to create or write to this file
-	stats     metrics.Stats
+	// Metrics
+	stats *metrics.Stats
+	// Engine
 	sigEngine *engine.Engine
 	// Events
 	eventsSorter     *sorting.EventsChronologicalSorter
@@ -128,7 +131,7 @@ type Tracee struct {
 }
 
 func (t *Tracee) Stats() *metrics.Stats {
-	return &t.stats
+	return t.stats
 }
 
 func (t *Tracee) Engine() *engine.Engine {
@@ -224,6 +227,7 @@ func New(cfg config.Config) (*Tracee, error) {
 	t := &Tracee{
 		config:          cfg,
 		done:            make(chan struct{}),
+		stats:           metrics.NewStats(),
 		writtenFiles:    make(map[string]string),
 		readFiles:       make(map[string]string),
 		capturedFiles:   make(map[string]int64),
@@ -1370,6 +1374,12 @@ func (t *Tracee) Run(ctx gocontext.Context) error {
 	t.controlPlane.Start()
 	go t.controlPlane.Run(ctx)
 
+	// Measure event perf buffer write attempts (METRICS build only)
+
+	if version.MetricsBuild() {
+		go t.countPerfEventSubmissions(ctx)
+	}
+
 	// Main event loop (polling events perf buffer)
 
 	t.eventsPerfMap.Poll(pollTimeout)
diff --git a/pkg/metrics/collector.go b/pkg/metrics/collector.go
new file mode 100644
index 000000000000..c7198423d2e4
--- /dev/null
+++ b/pkg/metrics/collector.go
@@ -0,0 +1,92 @@
+package metrics
+
+import (
+	"maps"
+	"sync"
+
+	"github.com/aquasecurity/tracee/pkg/counter"
+	"github.com/aquasecurity/tracee/pkg/logger"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// Collector holds uint64 values keyed by id, guarded by an RWMutex, plus a description and a gauge vector.
+type Collector struct {
+	m           sync.RWMutex
+	description string
+	values      map[uint64]uint64
+	gaugeVec    *prometheus.GaugeVec
+}
+
+// NewCollector returns a Collector with no stored values, using the given description and gauge vector.
+func NewCollector(description string, gv *prometheus.GaugeVec) *Collector {
+	return &Collector{
+		m:           sync.RWMutex{},
+		description: description,
+		values:      make(map[uint64]uint64),
+		gaugeVec:    gv,
+	}
+}
+
+// Get returns the value stored for id, or 0 when nothing is stored for it.
+func (c *Collector) Get(id uint64) uint64 {
+	c.m.RLock()
+	defer c.m.RUnlock()
+
+	return c.values[id]
+}
+
+// Set stores v as the value for id, replacing any previous value.
+func (c *Collector) Set(id uint64, v uint64) {
+	c.m.Lock()
+	defer c.m.Unlock()
+
+	c.values[id] = v
+}
+
+// Total returns the sum of all stored values; an Increment error is logged and summing continues.
+func (c *Collector) Total() uint64 {
+	c.m.RLock()
+	defer c.m.RUnlock()
+
+	total := counter.NewCounter(0)
+	for _, v := range c.values {
+		err := total.Increment(v)
+		if err != nil {
+			logger.Errorw("Failed to increment total counter", "error", err)
+		}
+	}
+
+	return total.Get()
+}
+
+// Reset discards all stored values.
+func (c *Collector) Reset() {
+	c.m.Lock()
+	defer c.m.Unlock()
+
+	c.values = make(map[uint64]uint64)
+}
+
+// Description returns the description given to NewCollector.
+func (c *Collector) Description() string {
+	c.m.RLock()
+	defer c.m.RUnlock()
+
+	return c.description
+}
+
+// GaugeVec returns the Prometheus gauge vector given to NewCollector.
+func (c *Collector) GaugeVec() *prometheus.GaugeVec {
+	c.m.RLock()
+	defer c.m.RUnlock()
+
+	return c.gaugeVec
+}
+
+// Values returns a copy of the stored values, so callers may use it without holding the lock.
+func (c *Collector) Values() map[uint64]uint64 {
+	c.m.RLock()
+	defer c.m.RUnlock()
+
+	return maps.Clone(c.values)
+}
diff --git a/pkg/metrics/event_collector.go b/pkg/metrics/event_collector.go
new file mode 100644
index 000000000000..8a336a0f2069
--- /dev/null
+++ b/pkg/metrics/event_collector.go
@@ -0,0 +1,79 @@
+package metrics
+
+import (
+	"github.com/aquasecurity/tracee/pkg/counter"
+	"github.com/aquasecurity/tracee/pkg/events"
+	"github.com/aquasecurity/tracee/pkg/logger"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// EventCollector wraps a Collector whose ids are treated as event ids when logging.
+type EventCollector struct {
+	c *Collector
+}
+
+// NewEventCollector returns an EventCollector backed by a new Collector.
+func NewEventCollector(description string, gv *prometheus.GaugeVec) *EventCollector {
+	return &EventCollector{
+		c: NewCollector(description, gv),
+	}
+}
+
+// Get returns the value stored for id (see Collector.Get).
+func (ec *EventCollector) Get(id uint64) uint64 {
+	return ec.c.Get(id)
+}
+
+// Set stores v as the value for id (see Collector.Set).
+func (ec *EventCollector) Set(id uint64, v uint64) {
+	ec.c.Set(id, v)
+}
+
+// Total returns the sum of all stored values (see Collector.Total).
+func (ec *EventCollector) Total() uint64 {
+	return ec.c.Total()
+}
+
+// Reset discards all stored values (see Collector.Reset).
+func (ec *EventCollector) Reset() {
+	ec.c.Reset()
+}
+
+// Description returns the underlying Collector's description.
+func (ec *EventCollector) Description() string {
+	return ec.c.Description()
+}
+
+// GaugeVec returns the underlying Collector's Prometheus gauge vector.
+func (ec *EventCollector) GaugeVec() *prometheus.GaugeVec {
+	return ec.c.GaugeVec()
+}
+
+// Values returns a copy of the stored values (see Collector.Values).
+func (ec *EventCollector) Values() map[uint64]uint64 {
+	return ec.c.Values()
+}
+
+// Log writes one info entry with an event-name/value pair per stored id, followed by their total.
+func (ec *EventCollector) Log() {
+	values := ec.c.Values()
+	description := ec.c.Description()
+
+	keyVals := make([]interface{}, 0, len(values)*2+2) // one pair per event + the "total" pair
+	total := counter.NewCounter(0)
+	for k, v := range values {
+		keyVals = append(keyVals,
+			events.Core.GetDefinitionByID(events.ID(k)).GetName(),
+			v,
+		)
+
+		err := total.Increment(v)
+		if err != nil {
+			logger.Errorw("Failed to increment total counter", "error", err)
+		}
+	}
+
+	// Log the counts
+	keyVals = append(keyVals, "total", total.Get())
+	logger.Infow(description, keyVals...)
+}
diff --git a/pkg/metrics/stats.go b/pkg/metrics/stats.go
index 08eccfc5e8eb..bbd726ee139b 100644
--- a/pkg/metrics/stats.go
+++ b/pkg/metrics/stats.go
@@ -18,6 +18,46 @@ type Stats struct {
 	LostWrCount      counter.Counter
 	LostNtCapCount   counter.Counter // lost network capture events
 	LostBPFLogsCount counter.Counter
+
+	BPFPerfEventSubmitAttemptsCount *EventCollector
+	BPFPerfEventSubmitFailuresCount *EventCollector
+}
+
+// NewStats returns a Stats whose counters come from counter.NewCounter(0) and whose collectors are created.
+func NewStats() *Stats {
+	return &Stats{
+		EventCount:       counter.NewCounter(0),
+		EventsFiltered:   counter.NewCounter(0),
+		NetCapCount:      counter.NewCounter(0),
+		BPFLogsCount:     counter.NewCounter(0),
+		ErrorCount:       counter.NewCounter(0),
+		LostEvCount:      counter.NewCounter(0),
+		LostWrCount:      counter.NewCounter(0),
+		LostNtCapCount:   counter.NewCounter(0),
+		LostBPFLogsCount: counter.NewCounter(0),
+		BPFPerfEventSubmitAttemptsCount: NewEventCollector(
+			"Event submit attempts",
+			prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Namespace: "tracee_ebpf",
+					Name:      "bpf_perf_event_submit_attempts",
+					Help:      "calls to submit to the event perf buffer",
+				},
+				[]string{"event_name"},
+			),
+		),
+		BPFPerfEventSubmitFailuresCount: NewEventCollector(
+			"Event submit failures",
+			prometheus.NewGaugeVec(
+				prometheus.GaugeOpts{
+					Namespace: "tracee_ebpf",
+					Name:      "bpf_perf_event_submit_failures",
+					Help:      "failed calls to submit to the event perf buffer",
+				},
+				[]string{"event_name"},
+			),
+		),
+	}
 }
 
 // Register Stats to prometheus metrics exporter
@@ -27,7 +67,6 @@ func (stats *Stats) RegisterPrometheus() error {
 		Name:      "events_total",
 		Help:      "events collected by tracee-ebpf",
 	}, func() float64 { return float64(stats.EventCount.Get()) }))
-
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
@@ -37,7 +76,6 @@ func (stats *Stats) RegisterPrometheus() error {
 		Name:      "events_filtered",
 		Help:      "events filtered by tracee-ebpf in userspace",
 	}, func() float64 { return float64(stats.EventsFiltered.Get()) }))
-
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
@@ -47,56 +85,63 @@ func (stats *Stats) RegisterPrometheus() error {
 		Name:      "network_capture_events_total",
 		Help:      "network capture events collected by tracee-ebpf",
 	}, func() float64 { return float64(stats.NetCapCount.Get()) }))
-
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
 
 	err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{
 		Namespace: "tracee_ebpf",
-		Name:      "lostevents_total",
-		Help:      "events lost in the submission buffer",
-	}, func() float64 { return float64(stats.LostEvCount.Get()) }))
+		Name:      "bpf_logs_total",
+		Help:      "logs collected by tracee-ebpf during ebpf execution",
+	}, func() float64 { return float64(stats.BPFLogsCount.Get()) }))
+	if err != nil {
+		return errfmt.WrapError(err)
+	}
+
+	// Updated by countPerfEventSubmissions() goroutine
+	err = prometheus.Register(stats.BPFPerfEventSubmitAttemptsCount.GaugeVec())
+	if err != nil {
+		return errfmt.WrapError(err)
+	}
 
+	// Updated by countPerfEventSubmissions() goroutine
+	err = prometheus.Register(stats.BPFPerfEventSubmitFailuresCount.GaugeVec())
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
 
 	err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{
 		Namespace: "tracee_ebpf",
-		Name:      "write_lostevents_total",
-		Help:      "events lost in the write buffer",
-	}, func() float64 { return float64(stats.LostWrCount.Get()) }))
-
+		Name:      "errors_total",
+		Help:      "errors accumulated by tracee-ebpf",
+	}, func() float64 { return float64(stats.ErrorCount.Get()) }))
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
 
 	err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{
 		Namespace: "tracee_ebpf",
-		Name:      "network_capture_lostevents_total",
-		Help:      "network capture lost events in network capture buffer",
-	}, func() float64 { return float64(stats.LostNtCapCount.Get()) }))
-
+		Name:      "lostevents_total",
+		Help:      "events lost in the submission buffer",
+	}, func() float64 { return float64(stats.LostEvCount.Get()) }))
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
 
 	err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{
 		Namespace: "tracee_ebpf",
-		Name:      "bpf_logs_total",
-		Help:      "logs collected by tracee-ebpf during ebpf execution",
-	}, func() float64 { return float64(stats.BPFLogsCount.Get()) }))
-
+		Name:      "write_lostevents_total",
+		Help:      "events lost in the write buffer",
+	}, func() float64 { return float64(stats.LostWrCount.Get()) }))
 	if err != nil {
 		return errfmt.WrapError(err)
 	}
 
 	err = prometheus.Register(prometheus.NewCounterFunc(prometheus.CounterOpts{
 		Namespace: "tracee_ebpf",
-		Name:      "errors_total",
-		Help:      "errors accumulated by tracee-ebpf",
-	}, func() float64 { return float64(stats.ErrorCount.Get()) }))
+		Name:      "network_capture_lostevents_total",
+		Help:      "network capture lost events in network capture buffer",
+	}, func() float64 { return float64(stats.LostNtCapCount.Get()) }))
 
 	return errfmt.WrapError(err)
 }
diff --git a/pkg/version/version.go b/pkg/version/version.go
index dd72e954731b..9dd25062c942 100644
--- a/pkg/version/version.go
+++ b/pkg/version/version.go
@@ -1,7 +1,15 @@
 package version
 
-var version string
+var (
+	version string
+	metrics string // set through the Makefile's "-X ...version.metrics=$(METRICS)" linker flag
+)
 
 func GetVersion() string {
 	return version
 }
+
+// MetricsBuild reports whether the metrics variable was set to "1" at link time (METRICS=1 build).
+func MetricsBuild() bool {
+	return metrics == "1"
+}