From 05c0b038868e2e72510570896f46c31d74710d32 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Aug 2024 11:02:37 +0200 Subject: [PATCH 1/9] Make things more verbose, so we see where the memory limit comes from --- test_suite.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index 464670b653..be11fba565 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -167,8 +167,10 @@ fi cgroup_v1_mem_limit="/sys/fs/cgroup/memory/$( Date: Tue, 20 Aug 2024 11:50:31 +0200 Subject: [PATCH 2/9] More verbose --- test_suite.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_suite.sh b/test_suite.sh index be11fba565..73ef5d966f 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -166,6 +166,9 @@ else fi cgroup_v1_mem_limit="/sys/fs/cgroup/memory/$( Date: Tue, 20 Aug 2024 11:56:25 +0200 Subject: [PATCH 3/9] Of course, we should use the mounted dir /hostsys. It is entirely possible that inside the container, the /sys directory is different --- test_suite.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index 73ef5d966f..7c3b7bfb0f 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -164,11 +164,11 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." 
fi -cgroup_v1_mem_limit="/sys/fs/cgroup/memory/$( Date: Tue, 20 Aug 2024 11:59:47 +0200 Subject: [PATCH 4/9] get cpuset file from host as well --- bot/test.sh | 2 +- test_suite.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bot/test.sh b/bot/test.sh index d3f3630ea8..4a917af91e 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -206,7 +206,7 @@ else fi # Bind mount /sys/fs/cgroup so that we can determine the amount of memory available in our cgroup for # Reframe configuration -TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") +TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro,/proc/self:/hostproc/self:ro"") # prepare arguments to test_suite.sh (specific to test step) declare -a TEST_SUITE_ARGS=() diff --git a/test_suite.sh b/test_suite.sh index 7c3b7bfb0f..6f42547df0 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -164,8 +164,8 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." 
fi -cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/$( Date: Tue, 20 Aug 2024 12:02:31 +0200 Subject: [PATCH 5/9] Removed duplicate quote --- bot/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/test.sh b/bot/test.sh index 4a917af91e..fe40a7c06c 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -206,7 +206,7 @@ else fi # Bind mount /sys/fs/cgroup so that we can determine the amount of memory available in our cgroup for # Reframe configuration -TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro,/proc/self:/hostproc/self:ro"") +TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro,/proc/self:/hostproc/self:ro") # prepare arguments to test_suite.sh (specific to test step) declare -a TEST_SUITE_ARGS=() From 6d194632ae7c3b73d80e569e46e123cfb91d120d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Aug 2024 12:07:21 +0200 Subject: [PATCH 6/9] Apparently, bind-mounting the host's /proc gives garbage, but reading it from the container's /proc is fine. So let's do that --- test_suite.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index 6f42547df0..7c3b7bfb0f 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -164,8 +164,8 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." 
fi -cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/$( Date: Tue, 20 Aug 2024 12:16:27 +0200 Subject: [PATCH 7/9] Remove the bind-mount for /proc, it's not needed anymore --- bot/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/test.sh b/bot/test.sh index fe40a7c06c..d3f3630ea8 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -206,7 +206,7 @@ else fi # Bind mount /sys/fs/cgroup so that we can determine the amount of memory available in our cgroup for # Reframe configuration -TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro,/proc/self:/hostproc/self:ro") +TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # prepare arguments to test_suite.sh (specific to test step) declare -a TEST_SUITE_ARGS=() From a8602734a213f331ccd77718b96970196b4748fa Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 20 Aug 2024 12:20:28 +0200 Subject: [PATCH 8/9] Cleanup debugging output --- test_suite.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/test_suite.sh b/test_suite.sh index 7c3b7bfb0f..5dae1c1bf3 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -166,9 +166,6 @@ else fi cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/$( Date: Wed, 21 Aug 2024 12:03:51 +0200 Subject: [PATCH 9/9] Add clear description for why we use /hostsys --- test_suite.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_suite.sh b/test_suite.sh index 5dae1c1bf3..584c9dccfc 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -164,6 +164,11 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." fi + +# The /sys inside the container is not the same as the /sys of the host +# We want to extract the memory limit from the cgroup on the host (which is typically set by SLURM). 
+# Thus, bot/test.sh bind-mounts the host's /sys/fs/cgroup into /hostsys/fs/cgroup +# and that's the prefix we use to extract the memory limit from cgroup_v1_mem_limit="/hostsys/fs/cgroup/memory/$(