Skip to content

Commit

Permalink
chore: add perfbuf metric per event (METRICS=1)
Browse files Browse the repository at this point in the history
Enabled only when built with METRICS=1.

BPFPerfEventSubmitAttemptsCount and BPFPerfEventSubmitFailuresCount
count the number of events processed by the eBPF programs and written to
or attempted to be written to the perf buffer.

It is incremented right after the attempt to write the event to the
perf buffer, making it possible to measure whether that event was
successfully written to the perf buffer or not.

This metric can be used to monitor the performance of individual eBPF
events and to detect potential bottlenecks.
  • Loading branch information
geyslan committed Oct 15, 2024
1 parent 30b33a4 commit fac164c
Show file tree
Hide file tree
Showing 8 changed files with 354 additions and 25 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ else
GO_DEBUG_FLAG = -w
endif

ifeq ($(METRICS),1)
BPF_DEBUG_FLAG += -DMETRICS
endif

ifeq ($(UNAME_M),x86_64)
ARCH = x86_64
LINUX_ARCH = x86
Expand Down Expand Up @@ -423,6 +427,7 @@ $(OUTPUT_DIR)/tracee.bpf.o: \
$(TRACEE_EBPF_OBJ_HEADERS)
#
$(CMD_CLANG) \
$(BPF_DEBUG_FLAG) \
-D__TARGET_ARCH_$(LINUX_ARCH) \
-D__BPF_TRACING__ \
-DCORE \
Expand Down Expand Up @@ -501,6 +506,7 @@ $(OUTPUT_DIR)/tracee: \
-ldflags="$(GO_DEBUG_FLAG) \
-extldflags \"$(CGO_EXT_LDFLAGS_EBPF)\" \
-X github.com/aquasecurity/tracee/pkg/version.version=$(VERSION) \
-X github.com/aquasecurity/tracee/pkg/version.debug=$(METRICS) \
" \
-v -o $@ \
./cmd/tracee
Expand Down
33 changes: 32 additions & 1 deletion pkg/ebpf/c/common/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,24 @@ statfunc int save_args_to_submit_buf(event_data_t *event, args_t *args)
return arg_num;
}

// Per-event perf buffer submission statistics.
// Everything in this section is compiled only when METRICS is defined.
#ifdef METRICS
// Counters kept for a single event id.
struct event_stats_values {
u64 attempts; // incremented by events_perf_submit() after each perf buffer write attempt
u64 failures; // incremented by events_perf_submit() when the write attempt reports an error
};

typedef struct event_stats_values event_stats_values_t;

// Hash map from event id to its submission counters, sized to hold up to
// MAX_EVENT_ID entries.
struct events_stats {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, MAX_EVENT_ID);
__type(key, u32); // eventid
__type(value, event_stats_values_t);
} events_stats SEC(".maps");

typedef struct events_stats events_stats_t;
#endif

statfunc int events_perf_submit(program_data_t *p, long ret)
{
p->event->context.retval = ret;
Expand All @@ -484,7 +502,20 @@ statfunc int events_perf_submit(program_data_t *p, long ret)
:
: [size] "r"(size), [max_size] "i"(MAX_EVENT_SIZE));

return bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size);
// bpf_perf_event_output() returns 0 on success or a negative error code, so
// the result must be kept in a signed type. With an unsigned u64 the `< 0`
// check below is always false and failures would never be counted.
long perf_ret = bpf_perf_event_output(p->ctx, &events, BPF_F_CURRENT_CPU, p->event, size);

#ifdef METRICS
// update event stats
// Only entries that already exist in events_stats are updated; if there is
// no entry for this event id, nothing is counted.
event_stats_values_t *evt_stat = bpf_map_lookup_elem(&events_stats, &p->event->context.eventid);
if (unlikely(evt_stat == NULL))
    return perf_ret;

// Every submission counts as an attempt; only negative results count as failures.
__sync_fetch_and_add(&evt_stat->attempts, 1);
if (perf_ret < 0)
    __sync_fetch_and_add(&evt_stat->failures, 1);
#endif

return perf_ret;
}

statfunc int signal_perf_submit(void *ctx, controlplane_signal_t *sig)
Expand Down
79 changes: 79 additions & 0 deletions pkg/ebpf/perf_count.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package ebpf

import (
"context"
"encoding/binary"
"time"
"unsafe"

"github.com/aquasecurity/tracee/pkg/events"
"github.com/aquasecurity/tracee/pkg/logger"
)

// eventStatsValues mirrors the C struct event_stats_values (event_stats_values_t).
// It is two consecutive uint64 fields in the same order as the C definition
// (attempts, then failures), and its zero value is written to the events_stats
// map to create the entry for each selected event.
type eventStatsValues struct {
submitAttempts uint64 // corresponds to the C field `attempts`
submitFailures uint64 // corresponds to the C field `failures`
}

// countPerfEventSubmissions periodically reads the events_stats BPF map and
// publishes, per event, the number of attempts and failures to submit events
// to the perf buffer. It is meant to run as a goroutine and returns when ctx
// is done or when the events_stats map cannot be obtained.
//
// On start, a zeroed entry is written for every selected event. Then, every
// 10 seconds, both collectors are reset, refilled from the map, mirrored to
// their Prometheus gauges (labelled by event name) and logged.
func (t *Tracee) countPerfEventSubmissions(ctx context.Context) {
	logger.Debugw("Starting countPerfEventSubmissions goroutine")
	defer logger.Debugw("Stopped countPerfEventSubmissions goroutine")

	evtsCountsBPFMap, err := t.bpfModule.GetMap("events_stats")
	if err != nil {
		logger.Errorw("Failed to get events_stats map", "error", err)
		return
	}

	// Seed one zeroed entry per selected event.
	evtStatZero := eventStatsValues{}
	for _, id := range t.policyManager.EventsSelected() {
		key := uint32(id)
		err := evtsCountsBPFMap.Update(unsafe.Pointer(&key), unsafe.Pointer(&evtStatZero))
		if err != nil {
			logger.Errorw("Failed to update events_stats map", "error", err)
		}
	}

	// Expected raw sizes: a uint32 key and a value holding two uint64 counters.
	const keySize = 4
	valueSize := int(unsafe.Sizeof(evtStatZero))

	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			t.stats.BPFPerfEventSubmitAttemptsCount.Reset()
			t.stats.BPFPerfEventSubmitFailuresCount.Reset()

			// Get the counts of each event from the BPF map
			iter := evtsCountsBPFMap.Iterator()
			for iter.Next() {
				// Guard the slice accesses below: a short key or value would
				// otherwise panic and take the whole process down.
				rawKey := iter.Key()
				if len(rawKey) < keySize {
					logger.Errorw("Unexpected key size in events_stats map", "size", len(rawKey))
					continue
				}
				key := binary.LittleEndian.Uint32(rawKey)

				value, err := evtsCountsBPFMap.GetValue(unsafe.Pointer(&key))
				if err != nil {
					logger.Errorw("Failed to get value from events_stats map", "error", err)
					continue
				}
				if len(value) < valueSize {
					logger.Errorw("Unexpected value size in events_stats map", "size", len(value))
					continue
				}

				// Get counts
				id := events.ID(key)
				attempts := binary.LittleEndian.Uint64(value[0:8])
				failures := binary.LittleEndian.Uint64(value[8:16])
				t.stats.BPFPerfEventSubmitAttemptsCount.Set(uint64(id), attempts)
				t.stats.BPFPerfEventSubmitFailuresCount.Set(uint64(id), failures)

				// Update Prometheus metrics for current event
				evtName := events.Core.GetDefinitionByID(id).GetName()
				t.stats.BPFPerfEventSubmitAttemptsCount.GaugeVec().WithLabelValues(evtName).Set(float64(attempts))
				t.stats.BPFPerfEventSubmitFailuresCount.GaugeVec().WithLabelValues(evtName).Set(float64(failures))
			}
			// Surface iteration errors instead of silently reporting partial counts.
			if err := iter.Err(); err != nil {
				logger.Errorw("Failed to iterate events_stats map", "error", err)
			}

			// Log the counts
			t.stats.BPFPerfEventSubmitAttemptsCount.Log()
			t.stats.BPFPerfEventSubmitFailuresCount.Log()
		}
	}
}
14 changes: 12 additions & 2 deletions pkg/ebpf/tracee.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import (
"github.com/aquasecurity/tracee/pkg/utils/environment"
"github.com/aquasecurity/tracee/pkg/utils/proc"
"github.com/aquasecurity/tracee/pkg/utils/sharedobjs"
"github.com/aquasecurity/tracee/pkg/version"
"github.com/aquasecurity/tracee/types/trace"
)

Expand All @@ -63,7 +64,9 @@ type Tracee struct {
running atomic.Bool
done chan struct{} // signal to safely stop end-stage processing
OutDir *os.File // use utils.XXX functions to create or write to this file
stats metrics.Stats
// Metrics
stats *metrics.Stats
// Engine
sigEngine *engine.Engine
// Events
eventsSorter *sorting.EventsChronologicalSorter
Expand Down Expand Up @@ -128,7 +131,7 @@ type Tracee struct {
}

func (t *Tracee) Stats() *metrics.Stats {
return &t.stats
return t.stats
}

func (t *Tracee) Engine() *engine.Engine {
Expand Down Expand Up @@ -224,6 +227,7 @@ func New(cfg config.Config) (*Tracee, error) {
t := &Tracee{
config: cfg,
done: make(chan struct{}),
stats: metrics.NewStats(),
writtenFiles: make(map[string]string),
readFiles: make(map[string]string),
capturedFiles: make(map[string]int64),
Expand Down Expand Up @@ -1370,6 +1374,12 @@ func (t *Tracee) Run(ctx gocontext.Context) error {
t.controlPlane.Start()
go t.controlPlane.Run(ctx)

// Measure event perf buffer write attempts (METRICS build only)

if version.MetricsBuild() {
go t.countPerfEventSubmissions(ctx)
}

// Main event loop (polling events perf buffer)

t.eventsPerfMap.Poll(pollTimeout)
Expand Down
83 changes: 83 additions & 0 deletions pkg/metrics/collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package metrics

import (
"maps"
"sync"

"github.com/aquasecurity/tracee/pkg/counter"
"github.com/aquasecurity/tracee/pkg/logger"
"github.com/prometheus/client_golang/prometheus"
)

// Collector stores uint64 values keyed by a uint64 id, together with a
// description and the Prometheus gauge vector associated with those values.
// All accessors take the embedded RWMutex.
type Collector struct {
	m           sync.RWMutex
	description string
	values      map[uint64]uint64
	gaugeVec    *prometheus.GaugeVec
}

// NewCollector builds an empty Collector carrying the given description and
// gauge vector.
func NewCollector(description string, gv *prometheus.GaugeVec) *Collector {
	c := new(Collector)
	c.description = description
	c.values = make(map[uint64]uint64)
	c.gaugeVec = gv

	return c
}

// Get returns the value stored for id, or 0 when nothing is stored for it.
func (c *Collector) Get(id uint64) uint64 {
	c.m.RLock()
	stored := c.values[id]
	c.m.RUnlock()

	return stored
}

// Set stores v as the value for id, replacing any previous value.
func (c *Collector) Set(id uint64, v uint64) {
	c.m.Lock()
	defer c.m.Unlock()

	c.values[id] = v
}

// Total returns the sum of all stored values. An error reported by the
// counter while accumulating is logged and accumulation continues.
func (c *Collector) Total() uint64 {
	c.m.RLock()
	defer c.m.RUnlock()

	sum := counter.NewCounter(0)
	for _, stored := range c.values {
		if err := sum.Increment(stored); err != nil {
			logger.Errorw("Failed to increment total counter", "error", err)
		}
	}

	return sum.Get()
}

// Reset discards every stored value by swapping in a fresh, empty map.
func (c *Collector) Reset() {
	empty := make(map[uint64]uint64)

	c.m.Lock()
	defer c.m.Unlock()

	c.values = empty
}

// Description returns the collector's description.
func (c *Collector) Description() string {
	c.m.RLock()
	desc := c.description
	c.m.RUnlock()

	return desc
}

// GaugeVec returns the Prometheus gauge vector attached to the collector.
func (c *Collector) GaugeVec() *prometheus.GaugeVec {
	c.m.RLock()
	gv := c.gaugeVec
	c.m.RUnlock()

	return gv
}

// Values returns a copy of the stored values, so the caller may read or
// modify the result without affecting the collector.
func (c *Collector) Values() map[uint64]uint64 {
	c.m.RLock()
	defer c.m.RUnlock()

	snapshot := maps.Clone(c.values)

	return snapshot
}
69 changes: 69 additions & 0 deletions pkg/metrics/event_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package metrics

import (
"github.com/aquasecurity/tracee/pkg/counter"
"github.com/aquasecurity/tracee/pkg/events"
"github.com/aquasecurity/tracee/pkg/logger"
"github.com/prometheus/client_golang/prometheus"
)

// EventCollector is a Collector whose ids are event ids. It forwards every
// accessor to the wrapped Collector and adds Log, which reports the stored
// values by event name.
type EventCollector struct {
	c *Collector
}

// NewEventCollector returns an EventCollector wrapping a new Collector built
// from the given description and gauge vector.
func NewEventCollector(description string, gv *prometheus.GaugeVec) *EventCollector {
	return &EventCollector{
		c: NewCollector(description, gv),
	}
}

// Get returns the value stored for the given event id.
func (ec *EventCollector) Get(id uint64) uint64 {
	return ec.c.Get(id)
}

// Set stores v as the value for the given event id.
func (ec *EventCollector) Set(id uint64, v uint64) {
	ec.c.Set(id, v)
}

// Total returns the sum of the values stored for all events.
func (ec *EventCollector) Total() uint64 {
	return ec.c.Total()
}

// Reset discards all stored values.
func (ec *EventCollector) Reset() {
	ec.c.Reset()
}

// Description returns the description of the wrapped collector.
func (ec *EventCollector) Description() string {
	return ec.c.Description()
}

// GaugeVec returns the Prometheus gauge vector of the wrapped collector.
func (ec *EventCollector) GaugeVec() *prometheus.GaugeVec {
	return ec.c.GaugeVec()
}

// Values returns a copy of the stored values keyed by event id.
func (ec *EventCollector) Values() map[uint64]uint64 {
	return ec.c.Values()
}

// Log writes one info-level log entry whose message is the collector's
// description and whose fields are "<event name>, <value>" pairs for every
// stored event, followed by a "total" field with their sum.
func (ec *EventCollector) Log() {
	values := ec.c.Values()
	description := ec.c.Description()

	// Two slots per event (name, value) plus two for the trailing
	// "total" pair, so the final append never has to grow the slice.
	keyVals := make([]any, 0, len(values)*2+2)
	total := counter.NewCounter(0)
	for k, v := range values {
		keyVals = append(keyVals,
			events.Core.GetDefinitionByID(events.ID(k)).GetName(),
			v,
		)

		err := total.Increment(v)
		if err != nil {
			logger.Errorw("Failed to increment total counter", "error", err)
		}
	}

	// Log the counts
	keyVals = append(keyVals, "total", total.Get())
	logger.Infow(description, keyVals...)
}
Loading

0 comments on commit fac164c

Please sign in to comment.