Skip to content

Commit

Permalink
feat: add metrics for total_active_file and total_inactive_file memory
Browse files Browse the repository at this point in the history
The goal of this PR is to have additional cAdvisor metrics which
expose total_active_file and total_inactive_file.

Today working_set_bytes subtracts total_inactive_file in its calculation,
but there are situations where exposing these metrics directly is valuable.

For example, when two containers share files in an emptyDir, total_active_file grows over time. Because working_set only subtracts total_inactive_file, this growth is not broken out separately in the working_set memory.

Exposing total_active_file and total_inactive_file to the user
allows them to subtract out total_active_file or total_inactive_file
if they so choose in their alerts.

In the case of Prometheus with a Thanos sidecar, working_set can give
a false sense of high memory usage. The kernel counts Thanos reading Prometheus-written files as "active_file" memory.
In that situation, a user may want to exclude active_file from their ContainerLowOnMemory alert.

Relates to: kubernetes/kubernetes#43916
  • Loading branch information
jrcichra committed Apr 24, 2024
1 parent 54dff2b commit 52368c7
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 18 deletions.
20 changes: 20 additions & 0 deletions cmd/internal/storage/bigquery/bigquery.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ const (
colMemoryUsage string = "memory_usage"
// Working set size
colMemoryWorkingSet string = "memory_working_set"
// Total active file size
colMemoryTotalActiveFile string = "memory_total_active_file"
// Total inactive file size
colMemoryTotalInactiveFile string = "memory_total_inactive_file"
// Container page fault
colMemoryContainerPgfault string = "memory_container_pgfault"
// Container major page fault
Expand Down Expand Up @@ -133,6 +137,16 @@ func (s *bigqueryStorage) GetSchema() *bigquery.TableSchema {
Name: colMemoryWorkingSet,
}
i++
fields[i] = &bigquery.TableFieldSchema{
Type: typeInteger,
Name: colMemoryTotalActiveFile,
}
i++
fields[i] = &bigquery.TableFieldSchema{
Type: typeInteger,
Name: colMemoryTotalInactiveFile,
}
i++
fields[i] = &bigquery.TableFieldSchema{
Type: typeInteger,
Name: colMemoryContainerPgfault,
Expand Down Expand Up @@ -226,6 +240,12 @@ func (s *bigqueryStorage) containerStatsToRows(
// Working set size
row[colMemoryWorkingSet] = stats.Memory.WorkingSet

// Total active file size
row[colMemoryTotalActiveFile] = stats.Memory.TotalActiveFile

// Total inactive file size
row[colMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile

// container page fault
row[colMemoryContainerPgfault] = stats.Memory.ContainerData.Pgfault

Expand Down
8 changes: 8 additions & 0 deletions cmd/internal/storage/influxdb/influxdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ const (
serMemoryMappedFile string = "memory_mapped_file"
// Working set size
serMemoryWorkingSet string = "memory_working_set"
// Total active file size
serMemoryTotalActiveFile string = "memory_total_active_file"
// Total inactive file size
serMemoryTotalInactiveFile string = "memory_total_inactive_file"
// Number of memory usage hits limits
serMemoryFailcnt string = "memory_failcnt"
// Cumulative count of memory allocation failures
Expand Down Expand Up @@ -256,6 +260,10 @@ func (s *influxdbStorage) memoryStatsToPoints(
points = append(points, makePoint(serMemoryMappedFile, stats.Memory.MappedFile))
// Working Set Size
points = append(points, makePoint(serMemoryWorkingSet, stats.Memory.WorkingSet))
// Total Active File Size
points = append(points, makePoint(serMemoryTotalActiveFile, stats.Memory.TotalActiveFile))
// Total Inactive File Size
points = append(points, makePoint(serMemoryTotalInactiveFile, stats.Memory.TotalInactiveFile))
// Number of memory usage hits limits
points = append(points, makePoint(serMemoryFailcnt, stats.Memory.Failcnt))

Expand Down
32 changes: 22 additions & 10 deletions cmd/internal/storage/influxdb/influxdb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ func (self *influxDbTestStorageDriver) StatsEq(a, b *info.ContainerStats) bool {
return false
}

if a.Memory.TotalActiveFile != b.Memory.TotalActiveFile {
return false
}

if a.Memory.TotalInactiveFile != b.Memory.TotalInactiveFile {
return false
}

if !reflect.DeepEqual(a.Network, b.Network) {
return false
}
Expand Down Expand Up @@ -253,6 +261,8 @@ func TestContainerStatsToPoints(t *testing.T) {
assertContainsPointWithValue(t, points, serMemoryMappedFile, stats.Memory.MappedFile)
assertContainsPointWithValue(t, points, serMemoryUsage, stats.Memory.Usage)
assertContainsPointWithValue(t, points, serMemoryWorkingSet, stats.Memory.WorkingSet)
assertContainsPointWithValue(t, points, serMemoryTotalActiveFile, stats.Memory.TotalActiveFile)
assertContainsPointWithValue(t, points, serMemoryTotalInactiveFile, stats.Memory.TotalInactiveFile)
assertContainsPointWithValue(t, points, serMemoryFailcnt, stats.Memory.Failcnt)
assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgfault)
assertContainsPointWithValue(t, points, serMemoryFailure, stats.Memory.ContainerData.Pgmajfault)
Expand Down Expand Up @@ -346,16 +356,18 @@ func createTestStats() (*info.ContainerInfo, *info.ContainerStats) {
LoadAverage: int32(rand.Intn(1000)),
},
Memory: info.MemoryStats{
Usage: 26767396864,
MaxUsage: 30429605888,
Cache: 7837376512,
RSS: 18930020352,
Swap: 1024,
MappedFile: 1025327104,
WorkingSet: 23630012416,
Failcnt: 1,
ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97},
HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96},
Usage: 26767396864,
MaxUsage: 30429605888,
Cache: 7837376512,
RSS: 18930020352,
Swap: 1024,
MappedFile: 1025327104,
WorkingSet: 23630012416,
TotalActiveFile: 29459246253,
TotalInactiveFile: 28364536434,
Failcnt: 1,
ContainerData: info.MemoryStatsMemoryData{Pgfault: 100328455, Pgmajfault: 97},
HierarchicalData: info.MemoryStatsMemoryData{Pgfault: 100328454, Pgmajfault: 96},
},
Hugetlb: map[string]info.HugetlbStats{
"1GB": {Usage: 1234, MaxUsage: 5678, Failcnt: 9},
Expand Down
8 changes: 8 additions & 0 deletions cmd/internal/storage/statsd/statsd.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ const (
serMemoryMappedFile string = "memory_mapped_file"
// Working set size
serMemoryWorkingSet string = "memory_working_set"
// Total active file size
serMemoryTotalActiveFile string = "memory_total_active_file"
// Total inactive file size
serMemoryTotalInactiveFile string = "memory_total_inactive_file"
// Number of memory usage hits limits
serMemoryFailcnt string = "memory_failcnt"
// Cumulative count of memory allocation failures
Expand Down Expand Up @@ -159,6 +163,10 @@ func (s *statsdStorage) memoryStatsToValues(series *map[string]uint64, stats *in
(*series)[serMemoryMappedFile] = stats.Memory.MappedFile
// Working Set Size
(*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet
// Total Active File Size
(*series)[serMemoryTotalActiveFile] = stats.Memory.TotalActiveFile
// Total Inactive File Size
(*series)[serMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile
// Number of memory usage hits limits
(*series)[serMemoryFailcnt] = stats.Memory.Failcnt

Expand Down
8 changes: 8 additions & 0 deletions cmd/internal/storage/stdout/stdout.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ const (
serMemoryMappedFile string = "memory_mapped_file"
// Working set size
serMemoryWorkingSet string = "memory_working_set"
// Total active file
serMemoryTotalActiveFile string = "memory_total_active_file"
// Total inactive file
serMemoryTotalInactiveFile string = "memory_total_inactive_file"
// Number of memory usage hits limits
serMemoryFailcnt string = "memory_failcnt"
// Cumulative count of memory allocation failures
Expand Down Expand Up @@ -164,6 +168,10 @@ func (driver *stdoutStorage) memoryStatsToValues(series *map[string]uint64, stat
(*series)[serMemoryMappedFile] = stats.Memory.MappedFile
// Working Set Size
(*series)[serMemoryWorkingSet] = stats.Memory.WorkingSet
// Total Active File
(*series)[serMemoryTotalActiveFile] = stats.Memory.TotalActiveFile
// Total Inactive File
(*series)[serMemoryTotalInactiveFile] = stats.Memory.TotalInactiveFile
// Number of memory usage hits limits
(*series)[serMemoryFailcnt] = stats.Memory.Failcnt

Expand Down
10 changes: 10 additions & 0 deletions container/libcontainer/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -834,8 +834,18 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
inactiveFileKeyName = "inactive_file"
}

activeFileKeyName := "total_active_file"
if cgroups.IsCgroup2UnifiedMode() {
activeFileKeyName = "active_file"
}

if v, ok := s.MemoryStats.Stats[activeFileKeyName]; ok {
ret.Memory.TotalActiveFile = v
}

workingSet := ret.Memory.Usage
if v, ok := s.MemoryStats.Stats[inactiveFileKeyName]; ok {
ret.Memory.TotalInactiveFile = v
if workingSet < v {
workingSet = 0
} else {
Expand Down
8 changes: 8 additions & 0 deletions info/v1/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,14 @@ type MemoryStats struct {
// Units: Bytes.
WorkingSet uint64 `json:"working_set"`

// The total amount of active file memory.
// Units: Bytes.
TotalActiveFile uint64 `json:"total_active_file"`

// The total amount of inactive file memory.
// Units: Bytes.
TotalInactiveFile uint64 `json:"total_inactive_file"`

Failcnt uint64 `json:"failcnt"`

// Size of kernel memory allocated in bytes.
Expand Down
12 changes: 7 additions & 5 deletions info/v2/conversion_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,13 @@ func TestContainerStatsFromV1(t *testing.T) {
v1Stats := v1.ContainerStats{
Timestamp: timestamp,
Memory: v1.MemoryStats{
Usage: 1,
Cache: 2,
RSS: 3,
WorkingSet: 4,
Failcnt: 5,
Usage: 1,
Cache: 2,
RSS: 3,
WorkingSet: 4,
Failcnt: 5,
TotalActiveFile: 6,
TotalInactiveFile: 7,
ContainerData: v1.MemoryStatsMemoryData{
Pgfault: 1,
Pgmajfault: 2,
Expand Down
2 changes: 2 additions & 0 deletions integration/tests/api/test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ func checkMemoryStats(t *testing.T, stat info.MemoryStats) {

assert.NotEqual(0, stat.Usage, "Memory usage should not be zero")
assert.NotEqual(0, stat.WorkingSet, "Memory working set should not be zero")
assert.NotEqual(0, stat.TotalActiveFile, "Memory total active file should not be zero")
assert.NotEqual(0, stat.TotalInactiveFile, "Memory total inactive file should not be zero")
if stat.WorkingSet > stat.Usage {
t.Errorf("Memory working set (%d) should be at most equal to memory usage (%d)", stat.WorkingSet, stat.Usage)
}
Expand Down
16 changes: 16 additions & 0 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,22 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
return metricValues{{value: float64(s.Memory.WorkingSet), timestamp: s.Timestamp}}
},
},
{
name: "container_memory_total_active_file_bytes",
help: "Current total active file in bytes.",
valueType: prometheus.GaugeValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Memory.TotalActiveFile), timestamp: s.Timestamp}}
},
},
{
name: "container_memory_total_inactive_file_bytes",
help: "Current total inactive file in bytes.",
valueType: prometheus.GaugeValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Memory.TotalInactiveFile), timestamp: s.Timestamp}}
},
},
{
name: "container_memory_failures_total",
help: "Cumulative count of memory allocation failures.",
Expand Down
8 changes: 5 additions & 3 deletions metrics/prometheus_fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,9 +329,11 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
LoadAverage: 2,
},
Memory: info.MemoryStats{
Usage: 8,
MaxUsage: 8,
WorkingSet: 9,
Usage: 8,
MaxUsage: 8,
WorkingSet: 9,
TotalActiveFile: 7,
TotalInactiveFile: 6,
ContainerData: info.MemoryStatsMemoryData{
Pgfault: 10,
Pgmajfault: 11,
Expand Down
6 changes: 6 additions & 0 deletions metrics/testdata/prometheus_metrics
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ container_memory_rss{container_env_foo_env="prod",container_label_foo_label="bar
# HELP container_memory_swap Container swap usage in bytes.
# TYPE container_memory_swap gauge
container_memory_swap{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8192 1395066363000
# HELP container_memory_total_active_file_bytes Current total active file in bytes.
# TYPE container_memory_total_active_file_bytes gauge
container_memory_total_active_file_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7 1395066363000
# HELP container_memory_total_inactive_file_bytes Current total inactive file in bytes.
# TYPE container_memory_total_inactive_file_bytes gauge
container_memory_total_inactive_file_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6 1395066363000
# HELP container_memory_usage_bytes Current memory usage in bytes, including all memory regardless of when it was accessed
# TYPE container_memory_usage_bytes gauge
container_memory_usage_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000
Expand Down
6 changes: 6 additions & 0 deletions metrics/testdata/prometheus_metrics_whitelist_filtered
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ container_memory_rss{container_env_foo_env="prod",id="testcontainer",image="test
# HELP container_memory_swap Container swap usage in bytes.
# TYPE container_memory_swap gauge
container_memory_swap{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8192 1395066363000
# HELP container_memory_total_active_file_bytes Current total active file in bytes.
# TYPE container_memory_total_active_file_bytes gauge
container_memory_total_active_file_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 7 1395066363000
# HELP container_memory_total_inactive_file_bytes Current total inactive file in bytes.
# TYPE container_memory_total_inactive_file_bytes gauge
container_memory_total_inactive_file_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 6 1395066363000
# HELP container_memory_usage_bytes Current memory usage in bytes, including all memory regardless of when it was accessed
# TYPE container_memory_usage_bytes gauge
container_memory_usage_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000
Expand Down

0 comments on commit 52368c7

Please sign in to comment.