Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support cgroups v1 and v2 for memory limits #1286

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ on:
permissions:
contents: read

defaults:
run:
shell: bash

jobs:

unit-test:
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

- Added Yarn version 4.3.1.
- Added Yarn version 3.8.3.
- Added support for reading container memory limits from cgroups (v1 and v2).

## [v255] - 2024-06-21

Expand Down
202 changes: 202 additions & 0 deletions etc/cgroups.sh
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dzuelke this file contains all the "shared" logic between buildpacks, right? if changes occur will we be manually keeping these in sync? is the PHP version the canonical one since it contains most of the underlying tests?

Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
#!/usr/bin/env bash

# Selects the /proc/self/cgroup entry that is in charge of a controller.
# Input:   stdin - text in /proc/self/cgroup format
# Args:    $1 - controller name (e.g. "memory"); it is spliced into an
#               extended regular expression
# Output:  the selected entry, on stdout
# Returns: non-zero if no entry matches (pipefail surfaces grep's status)
cgroup_util_find_controller_from_procfs_cgroup_contents() {
  local usage="Usage (stdin is /proc/self/cgroup format): ${FUNCNAME[0]} CONTROLLER"
  # A v1 controller appears as e.g.
  #   7:memory:/someprefix
  # while the v2 unified hierarchy appears as e.g.
  #   0::/
  # Both shapes are accepted by the pattern. Sorting numerically on the leading
  # hierarchy ID in descending order puts a v1 entry ahead of the "0" entry, so
  # on hybrid setups (some controllers still v1) the v1 controller wins.
  (
    # confined to this subshell so the caller's shell options stay untouched
    set -o pipefail
    grep -E '^[0-9]+:('"${1:?$usage}"')?:/.*' \
      | sort -t ":" -k 1 -n -r \
      | head -n 1
  )
}

# Derives the cgroup version from a single /proc/self/cgroup entry.
# Input:   stdin - one entry, e.g. "7:memory:/foo" or "0::/"
# Output:  "2" if the first field (hierarchy ID) is "0", otherwise "1"
# Returns: 1 if the entry has fewer than three colon-separated fields
cgroup_util_get_controller_version_from_procfs_cgroup_line() {
  # local, so the scratch array does not leak into (or clobber) the caller's scope
  local -a line
  readarray -d':' -t line # -t removes trailing delimiter
  # with e.g. 'docker run --cgroup-parent foo:bar', the third (relative path)
  # section would contain a colon, hence "at least three" rather than "exactly three"
  if (( ${#line[@]} < 3 )); then
    # return rather than exit: this file gets sourced, and exit would terminate
    # the calling shell whenever the function is not run inside a subshell;
    # inside a pipeline or $(...) the resulting status is the same
    return 1
  fi
  if [[ ${line[0]} == "0" ]]; then
    echo "2"
  else
    echo "1"
  fi
}

# Extracts the relative cgroup path from a single /proc/self/cgroup entry.
# Input:   stdin - one entry, e.g. "7:memory:/foo" or "0::/"
# Output:  everything after the second colon, written without modification
#          (a trailing newline from the input is passed through)
# Returns: 1 if the entry has fewer than three colon-separated fields
cgroup_util_get_controller_path_from_procfs_cgroup_line() {
  # local, so the scratch array does not leak into (or clobber) the caller's scope
  local -a line
  readarray -d':' line # no -t, we want any trailing delims for concatenation via printf
  if (( ${#line[@]} < 3 )); then
    # return rather than exit: this file gets sourced, and exit would terminate
    # the calling shell whenever the function is not run inside a subshell;
    # inside a pipeline or $(...) the resulting status is the same
    return 1
  fi
  # with e.g. 'docker run --cgroup-parent foo:bar', the third (relative path)
  # section would contain a colon, so we have to output from 3 until the end
  printf "%s" "${line[@]:2}"
}

# Prints the mount target of the first cgroup (v1) mount that carries the
# given controller.
# Input:   stdin - text in /proc/self/mountinfo format
# Args:    $1 - controller name, which is matched against the mount options
#               using -O (so it could be a comma-separated list, too)
# Output:  whatever findmnt prints for the "target" column
# Returns: findmnt's exit status
cgroup_util_find_v1_mount_from_procfs_mountinfo_contents() {
  # the usage text names mountinfo, since that is the format this function reads
  local usage="Usage (stdin is /proc/self/mountinfo format): ${FUNCNAME[0]} CONTROLLER"
  # must specify --list explicitly or it might output tree parts after all...
  # stdin is handed to findmnt as its table file via process substitution
  findmnt --list --noheadings --first-only -t cgroup -O "${1:?$usage}" -o target -F <(cat)
}

# Prints the mount target of the first cgroup2 mount.
# Input:   stdin - text in /proc/self/mountinfo format
# Output:  whatever findmnt prints for the "target" column
# Returns: findmnt's exit status
cgroup_util_find_v2_mount_from_procfs_mountinfo_contents() {
  local -a findmnt_opts=(
    --list # must be explicit, or tree parts might be output after all...
    --noheadings
    --first-only
    -t cgroup2
    -o target
  )
  # stdin is handed to findmnt as its table file via process substitution
  findmnt "${findmnt_opts[@]}" -F <(cat)
}

# Finds the directory holding the v1 control files for a controller, starting
# at MOUNT + CGROUP and walking up towards MOUNT until a directory containing
# "<controller>.*" files is found.
# Args:    $1 - the controller name
#          $2 - the mount root from /proc/self/mountinfo
#          $3 - the mount relative dir from /proc/self/cgroup
# Output:  the first matching directory, on stdout
# Returns: 0 if a directory was found, 1 otherwise
cgroup_util_find_v1_path() {
  local usage="Usage: ${FUNCNAME[0]} CONTROLLER MOUNT CGROUP"
  local controller=${1:?$usage}
  local mount=${2:?$usage}
  local relpath=${3:?$usage}
  # both are local so the walk does not leak state into the caller's scope
  local cur parent
  # strip trailing slash if present (it would also be if it was just "/")
  relpath=${relpath%/}
  cur="${mount}${relpath}"
  while true; do
    if [[ -d "$cur" ]] && compgen -G "${cur}/${controller}.*" > /dev/null; then
      echo "$cur"
      return 0
    elif [[ "$cur" == "$mount" || "$cur" == "${mount%/}" ]]; then
      # we are at the mount (dirname never yields a trailing slash, so the
      # slash-less spelling is accepted too), and nothing was found
      break
    fi
    parent=$(dirname "$cur")
    # dirname is a fixed point at "/" and "."; without this check a start path
    # that is not below the mount would make the loop spin forever
    if [[ "$parent" == "$cur" ]]; then
      break
    fi
    cur=$parent
  done
  return 1
}

# Resolves the v2 cgroup directory (MOUNT + CGROUP) and confirms that its
# cgroup.controllers file lists the controller as a whole word.
# Args:    $1 - the controller name
#          $2 - the mount root from /proc/self/mountinfo
#          $3 - the mount relative dir from /proc/self/cgroup
# Output:  the directory, on stdout, if the controller is listed there
# Returns: 0 on success, otherwise grep's exit status
cgroup_util_find_v2_path() {
  local usage="Usage: ${FUNCNAME[0]} CONTROLLER MOUNT CGROUP"
  local cgroup_dir=${3:?$usage}
  # strip trailing slash if present (it would also be if it was just "/")
  cgroup_dir=${2:?$usage}${cgroup_dir%/}

  # remember grep's own status so that it is what gets handed to the caller
  local grep_status=0
  grep -Eqs '(^|\s)'"${1:?$usage}"'($|\s)' "${cgroup_dir}/cgroup.controllers" || grep_status=$?
  if (( grep_status != 0 )); then
    return "$grep_status"
  fi

  echo "$cgroup_dir"
  return 0
}

# Prints the contents of memory.limit_in_bytes from a cgroup v1 directory.
# memory.soft_limit_in_bytes is ignored on purpose, for the reasons outlined in
# https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#id1
# Args:    $1 - directory containing the v1 memory control files
# Env:     CGROUP_UTIL_VERBOSE - if non-empty, the file used is named on stderr
# Output:  the file contents, on stdout
# Returns: 9 if the file is not readable, otherwise the exit status of cat
cgroup_util_read_cgroupv1_memory_limit() {
  local usage="Usage: ${FUNCNAME[0]} PATH"
  local limit_file="${1:?$usage}/memory.limit_in_bytes"

  if [[ ! -r "$limit_file" ]]; then
    return 9
  fi

  if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
    echo "Using limit from '${limit_file}'" >&2
  fi
  cat "$limit_file"
}

# Prints the first usable memory limit found in a cgroup v2 directory.
# Files are tried in this order:
#   memory.high - the best limit to read ("This is the main mechanism to control
#                 memory usage of a cgroup.",
#                 https://www.kernel.org/doc/html/v5.15/admin-guide/cgroup-v2.html)
#   memory.max  - the final "safety net" limit
#   memory.low  - best-effort memory protection, e.g. OCI memory.reservation or
#                 Docker --memory-reservation
#   memory.min  - hard guaranteed minimum
# A file is skipped if it is not readable or if its value is "max" or "0".
# Args:    $1 - directory containing the v2 memory control files
# Env:     CGROUP_UTIL_VERBOSE - if non-empty, the file used is named on stderr
# Output:  the chosen value, on stdout
# Returns: 9 if no file yields a usable value
cgroup_util_read_cgroupv2_memory_limit() {
  local usage="Usage: ${FUNCNAME[0]} PATH"
  local cgroup_dir=${1:?$usage}

  local knob
  local knob_file
  local value
  for knob in memory.high memory.max memory.low memory.min; do
    knob_file="${cgroup_dir}/${knob}"
    [[ -r "$knob_file" ]] || continue

    value=$(cat "$knob_file")
    case "$value" in
      max | 0)
        # no effective limit configured in this file, try the next one
        continue
        ;;
    esac

    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Using limit from '${knob_file}'" >&2
    fi
    echo "$value"
    return
  done

  return 9
}

# Reads the memory limit of the current cgroup: memory.limit_in_bytes for v1,
# or for v2 memory.high with fallbacks to memory.max, memory.low and memory.min.
# Env:
#   CGROUP_UTIL_PROCFS_ROOT     - used instead of '/proc' to find
#                                 '/proc/self/cgroup', '/proc/self/mountinfo'
#                                 etc (useful for testing, defaults to '/proc')
#   CGROUP_UTIL_CGROUPFS_PREFIX - prepended to any /sys/fs/cgroup or similar path
#                                 used (useful for testing, defaults to '')
#   CGROUP_UTIL_VERBOSE         - any non-empty value enables verbose mode
#                                 (progress and failure reasons on stderr)
# Output:  the limit, on stdout
# Returns: 0  on success
#          3  controller not found in the procfs cgroup file
#          4  controller version could not be determined
#          5  controller path could not be determined
#          6  mount point not found in the procfs mountinfo file
#          7  no cgroup directory found for the controller
#          99 a limit was read but it exceeds the sanity threshold
#          otherwise the status of the version-specific limit reader
cgroup_util_read_cgroup_memory_limit() {
  local procfs_root=${CGROUP_UTIL_PROCFS_ROOT:-/proc}
  local cgroup_file="${procfs_root}/self/cgroup"
  local mountinfo_file="${procfs_root}/self/mountinfo"

  # threshold for "silly" maximums returned e.g. by Docker on a cgroups v1
  # system: 8 * 1024^4 bytes
  local ceiling=$((8 * 1024 * 1024 * 1024 * 1024))

  local controller=memory

  local entry
  if ! entry=$(cgroup_util_find_controller_from_procfs_cgroup_contents "$controller" < "$cgroup_file"); then
    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Could not find cgroup controller '${controller}' in '${cgroup_file}'" >&2
    fi
    return 3
  fi

  local version
  if ! version=$(echo "$entry" | cgroup_util_get_controller_version_from_procfs_cgroup_line); then
    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Could not determine version for cgroup controller '${controller}' from '${cgroup_file}'" >&2
    fi
    return 4
  fi

  local relative_path
  if ! relative_path=$(echo "$entry" | cgroup_util_get_controller_path_from_procfs_cgroup_line); then
    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Could not determine path for cgroup controller '${controller}' from '${cgroup_file}'" >&2
    fi
    return 5
  fi

  # the helpers come in v1 and v2 flavours; the detected version picks the set
  local find_mount="cgroup_util_find_v${version}_mount_from_procfs_mountinfo_contents"
  local find_path="cgroup_util_find_v${version}_path"
  local read_limit="cgroup_util_read_cgroupv${version}_memory_limit"

  local mount_point
  if ! mount_point=$("$find_mount" "$controller" < "$mountinfo_file"); then
    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Could not determine mount point for cgroup controller '${controller}' from '${mountinfo_file}'" >&2
    fi
    return 6
  fi
  # for testing purposes, a prefix can be passed to "relocate" the
  # /sys/fs/cgroup/... location we are reading from next
  mount_point="${CGROUP_UTIL_CGROUPFS_PREFIX-}${mount_point}"

  local location
  if ! location=$("$find_path" "$controller" "$mount_point" "$relative_path"); then
    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Could not find a location for cgroup controller '${controller}'" >&2
    fi
    return 7
  fi

  if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
    echo "Reading cgroup v${version} limit from '${location}'" >&2
  fi

  local limit
  # a bare return passes the reader's own exit status through
  limit=$("$read_limit" "$location") || return

  if ! (( ceiling > 0 && limit <= ceiling )); then
    if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
      echo "Ignoring cgroup memory limit of ${limit} Bytes (exceeds maximum of ${ceiling} Bytes)" >&2
    fi
    return 99
  fi

  echo "$limit"
}

# Reads the cgroup memory limit via cgroup_util_read_cgroup_memory_limit (v1:
# memory.limit_in_bytes; v2: memory.high, then memory.max, memory.low,
# memory.min) and, if that fails, prints the contents of a fallback file.
# Args:
#   $1 - optional file path to fall back to for reading a default value, useful
#        e.g. on a system that has a "fake" limit info file (defaults to
#        '/sys/fs/cgroup/memory/memory.limit_in_bytes', with
#        CGROUP_UTIL_CGROUPFS_PREFIX prepended)
# Env:
#   CGROUP_UTIL_PROCFS_ROOT     - used instead of '/proc' to find
#                                 '/proc/self/cgroup', '/proc/self/mountinfo'
#                                 etc (useful for testing, defaults to '/proc')
#   CGROUP_UTIL_CGROUPFS_PREFIX - prepended to any /sys/fs/cgroup or similar path
#                                 used (useful for testing, defaults to '')
#   CGROUP_UTIL_VERBOSE         - any non-empty value enables verbose mode
# Output:  the limit, on stdout
# Returns: 0 if the primary read succeeded; the status of cat if the fallback
#          file was used; otherwise the status of the primary read (99 is
#          never answered with the fallback)
cgroup_util_read_cgroup_memory_limit_with_fallback() {
  local fallback=${1-"${CGROUP_UTIL_CGROUPFS_PREFIX-}/sys/fs/cgroup/memory/memory.limit_in_bytes"}

  local status=0
  cgroup_util_read_cgroup_memory_limit || status=$?
  if (( status == 0 )); then
    return 0
  fi

  # 99 means a limit was found but rejected, so it is reported as-is; the same
  # goes for any failure when there is no readable fallback file
  if (( status == 99 )) || [[ ! -r "$fallback" ]]; then
    return "$status"
  fi

  if [[ -n ${CGROUP_UTIL_VERBOSE-} ]]; then
    echo "Reading fallback limit from '${fallback}'" >&2
  fi
  cat "$fallback"
}
9 changes: 8 additions & 1 deletion lib/environment.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,14 @@ write_profile() {
local bp_dir="$1"
local build_dir="$2"
mkdir -p "$build_dir/.profile.d"
cp "$bp_dir"/profile/* "$build_dir/.profile.d/"
cp "$bp_dir"/profile/nodejs.sh "$build_dir/.profile.d/"
write_web_concurrency "$bp_dir" "$build_dir/.profile.d/WEB_CONCURRENCY.sh"
}

# Builds a self-contained WEB_CONCURRENCY script by concatenating the cgroups
# helper library and the WEB_CONCURRENCY logic, in that order.
# Args: $1 - buildpack directory (source of etc/cgroups.sh and
#            profile/WEB_CONCURRENCY.sh)
#       $2 - path of the file to write (overwritten)
write_web_concurrency() {
  local bp_dir="$1"
  local -a parts=(
    "$bp_dir"/etc/cgroups.sh
    "$bp_dir"/profile/WEB_CONCURRENCY.sh
  )
  cat "${parts[@]}" > "$2"
}

write_ci_profile() {
Expand Down
19 changes: 15 additions & 4 deletions profile/WEB_CONCURRENCY.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,28 @@ log_concurrency() {
detect_memory() {
local default=$1

if [ -e /sys/fs/cgroup/memory/memory.limit_in_bytes ]; then
echo $(($(cat /sys/fs/cgroup/memory/memory.limit_in_bytes) / 1048576))
local memory_limit
memory_limit=$(cgroup_util_read_cgroup_memory_limit_with_fallback) && {
echo $(( memory_limit / 1024 / 1024 ))
return
}

if (($? == 99)); then
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a brief comment about exit code 99 would be nice

dne_memory
else
echo "$default"
fi
fi
}

dne_memory() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does dne stand for?

echo "129024"
}

bound_memory() {
local detected=$1
# Memory is bound to the maximum memory of known dyno types: ~126 GB
local max_detected_memory=129024
local max_detected_memory
max_detected_memory=$(dne_memory)
if (( detected > max_detected_memory )); then
echo "$max_detected_memory"
else
Expand Down
42 changes: 33 additions & 9 deletions test/run
Original file line number Diff line number Diff line change
Expand Up @@ -557,35 +557,55 @@ testBuildWithUserCacheDirectoriesCamel() {
}

testConcurrency1X() {
LOG_CONCURRENCY=true MEMORY_AVAILABLE=512 capture "$(pwd)"/profile/WEB_CONCURRENCY.sh
# write_web_concurrency concatenates a vendored library file and our own logic into one file
WEB_CONCURRENCY_SH=$(mktemp)
chmod u+x "$WEB_CONCURRENCY_SH"
write_web_concurrency "$(pwd)" "$WEB_CONCURRENCY_SH"
LOG_CONCURRENCY=true MEMORY_AVAILABLE=512 capture "$WEB_CONCURRENCY_SH"
assertCaptured "Detected 512 MB available memory, 512 MB limit per process (WEB_MEMORY)"
assertCaptured "Recommending WEB_CONCURRENCY=1"
assertCapturedSuccess
}

testConcurrency2X() {
LOG_CONCURRENCY=true MEMORY_AVAILABLE=1024 capture "$(pwd)"/profile/WEB_CONCURRENCY.sh
# write_web_concurrency concatenates a vendored library file and our own logic into one file
WEB_CONCURRENCY_SH=$(mktemp)
chmod u+x "$WEB_CONCURRENCY_SH"
write_web_concurrency "$(pwd)" "$WEB_CONCURRENCY_SH"
LOG_CONCURRENCY=true MEMORY_AVAILABLE=1024 capture "$WEB_CONCURRENCY_SH"
assertCaptured "Detected 1024 MB available memory, 512 MB limit per process (WEB_MEMORY)"
assertCaptured "Recommending WEB_CONCURRENCY=2"
assertCapturedSuccess
}

testConcurrencyPerformanceM() {
LOG_CONCURRENCY=true MEMORY_AVAILABLE=2560 capture "$(pwd)"/profile/WEB_CONCURRENCY.sh
# write_web_concurrency concatenates a vendored library file and our own logic into one file
WEB_CONCURRENCY_SH=$(mktemp)
chmod u+x "$WEB_CONCURRENCY_SH"
write_web_concurrency "$(pwd)" "$WEB_CONCURRENCY_SH"
LOG_CONCURRENCY=true MEMORY_AVAILABLE=2560 capture "$WEB_CONCURRENCY_SH"
assertCaptured "Detected 2560 MB available memory, 512 MB limit per process (WEB_MEMORY)"
assertCaptured "Recommending WEB_CONCURRENCY=5"
assertCapturedSuccess
}

testConcurrencyPerformanceL() {
LOG_CONCURRENCY=true MEMORY_AVAILABLE=14336 capture "$(pwd)"/profile/WEB_CONCURRENCY.sh
assertCaptured "Detected 14336 MB available memory, 512 MB limit per process (WEB_MEMORY)"
assertCaptured "Recommending WEB_CONCURRENCY=28"
assertCapturedSuccess
# write_web_concurrency concatenates a vendored library file and our own logic into one file
WEB_CONCURRENCY_SH=$(mktemp)
chmod u+x "$WEB_CONCURRENCY_SH"
write_web_concurrency "$(pwd)" "$WEB_CONCURRENCY_SH"
LOG_CONCURRENCY=true MEMORY_AVAILABLE=14336 capture "$WEB_CONCURRENCY_SH"
assertCaptured "Detected 14336 MB available memory, 512 MB limit per process (WEB_MEMORY)"
assertCaptured "Recommending WEB_CONCURRENCY=28"
assertCapturedSuccess
}

testConcurrencyCustomLimit() {
LOG_CONCURRENCY=true MEMORY_AVAILABLE=1024 WEB_MEMORY=256 capture "$(pwd)"/profile/WEB_CONCURRENCY.sh
# write_web_concurrency concatenates a vendored library file and our own logic into one file
WEB_CONCURRENCY_SH=$(mktemp)
chmod u+x "$WEB_CONCURRENCY_SH"
write_web_concurrency "$(pwd)" "$WEB_CONCURRENCY_SH"
LOG_CONCURRENCY=true MEMORY_AVAILABLE=1024 WEB_MEMORY=256 capture "$WEB_CONCURRENCY_SH"
assertCaptured "Detected 1024 MB available memory, 256 MB limit per process (WEB_MEMORY)"
assertCaptured "Recommending WEB_CONCURRENCY=4"
assertCapturedSuccess
Expand All @@ -594,7 +614,11 @@ testConcurrencyCustomLimit() {
# When /sys/fs/cgroup/memory/memory.limit_in_bytes lies and gives a ridiculous value
# This happens on Dokku for example
testConcurrencyTooHigh() {
LOG_CONCURRENCY=true MEMORY_AVAILABLE=10000000000 capture "$(pwd)"/profile/WEB_CONCURRENCY.sh
# write_web_concurrency concatenates a vendored library file and our own logic into one file
WEB_CONCURRENCY_SH=$(mktemp)
chmod u+x "$WEB_CONCURRENCY_SH"
write_web_concurrency "$(pwd)" "$WEB_CONCURRENCY_SH"
LOG_CONCURRENCY=true MEMORY_AVAILABLE=10000000000 capture "$WEB_CONCURRENCY_SH"
assertCaptured "Could not determine a reasonable value for WEB_CONCURRENCY"
assertCaptured "Recommending WEB_CONCURRENCY=1"
assertCapturedSuccess
Expand Down