From a27827007b0132f76181cc6aa98f653900c5aa58 Mon Sep 17 00:00:00 2001 From: Nashwan Azhari Date: Fri, 20 Dec 2024 14:21:26 +0200 Subject: [PATCH] Fix containerd-related path cleanup on failed bootstrap/join and associated test. (#910) --------- Signed-off-by: Nashwan Azhari Co-authored-by: Berkay Tekin Oz --- src/k8s/pkg/k8sd/app/hooks_remove.go | 22 ++++---- src/k8s/pkg/k8sd/setup/containerd.go | 15 ++++++ src/k8s/pkg/k8sd/setup/directories.go | 3 -- .../integration/templates/bootstrap-fail.yaml | 18 +++++++ tests/integration/tests/test_cleanup.py | 52 +++++++------------ tests/integration/tests/test_util/util.py | 34 ------------ 6 files changed, 66 insertions(+), 78 deletions(-) create mode 100644 tests/integration/templates/bootstrap-fail.yaml diff --git a/src/k8s/pkg/k8sd/app/hooks_remove.go b/src/k8s/pkg/k8sd/app/hooks_remove.go index 4e90e6c05..567d6d350 100644 --- a/src/k8s/pkg/k8sd/app/hooks_remove.go +++ b/src/k8s/pkg/k8sd/app/hooks_remove.go @@ -5,6 +5,7 @@ import ( "database/sql" "errors" "fmt" + "io/fs" "net" "os" @@ -177,17 +178,20 @@ func tryCleanupContainerdPaths(log log.Logger, s snap.Snap) { } // Check directory exists before attempting to remove: - if _, err := os.Stat(dirpath); os.IsNotExist(err) { + if stat, err := os.Stat(dirpath); os.IsNotExist(err) { log.Info("Containerd directory doesn't exist; skipping cleanup", "directory", dirpath) } else { - // NOTE(aznashwan): because of the convoluted interfaces-based way the snap - // composes and creates the original lockfiles (see k8sd/setup/containerd.go) - // this check is meant to defend against accidental code/configuration errors which - // might lead to the root FS being deleted: - realPath, err := os.Readlink(dirpath) - if err != nil { - log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath)) - continue + realPath := dirpath + if stat.Mode()&fs.ModeSymlink != 0 { + // NOTE(aznashwan): because of the convoluted interfaces-based way the snap + // composes and creates the original lockfiles (see k8sd/setup/containerd.go) + // this check is meant to defend against accidental code/configuration errors which + // might lead to the root FS being deleted: + realPath, err = os.Readlink(dirpath) + if err != nil { + log.Error(err, fmt.Sprintf("Failed to os.Readlink the directory path for lockfile %q pointing to %q. Skipping cleanup", lockpath, dirpath)) + continue + } } if realPath == "/" { diff --git a/src/k8s/pkg/k8sd/setup/containerd.go b/src/k8s/pkg/k8sd/setup/containerd.go index 2aba6b70c..662999870 100644 --- a/src/k8s/pkg/k8sd/setup/containerd.go +++ b/src/k8s/pkg/k8sd/setup/containerd.go @@ -91,6 +91,21 @@ func defaultContainerdConfig( // Containerd configures configuration and arguments for containerd on the local node. // Optionally, a number of registry mirrors and auths can be configured. func Containerd(snap snap.Snap, extraContainerdConfig map[string]any, extraArgs map[string]*string) error { + // We create the directories here since PreInitCheck is called before this + // This ensures we only create the directories if we are going to configure containerd + for _, dir := range []string{ + snap.ContainerdConfigDir(), + snap.ContainerdExtraConfigDir(), + snap.ContainerdRegistryConfigDir(), + } { + if dir == "" { + continue + } + if err := os.MkdirAll(dir, 0o700); err != nil { + return fmt.Errorf("failed to create required directory: %w", err) + } + } + configToml := defaultContainerdConfig( snap.CNIConfDir(), snap.CNIBinDir(), diff --git a/src/k8s/pkg/k8sd/setup/directories.go b/src/k8s/pkg/k8sd/setup/directories.go index 73a06e0cf..41b289eb3 100644 --- a/src/k8s/pkg/k8sd/setup/directories.go +++ b/src/k8s/pkg/k8sd/setup/directories.go @@ -19,9 +19,6 @@ func EnsureAllDirectories(snap snap.Snap) error { for _, dir := range []string{ snap.CNIConfDir(), - snap.ContainerdConfigDir(), - snap.ContainerdExtraConfigDir(), - snap.ContainerdRegistryConfigDir(), snap.K8sDqliteStateDir(), snap.KubernetesConfigDir(), snap.KubernetesPKIDir(), diff --git a/tests/integration/templates/bootstrap-fail.yaml b/tests/integration/templates/bootstrap-fail.yaml new file mode 100644 index 000000000..62abb6aa5 --- /dev/null +++ b/tests/integration/templates/bootstrap-fail.yaml @@ -0,0 +1,18 @@ +# Contains the bootstrap configuration for the smoke test. +cluster-config: + network: + enabled: true + dns: + enabled: true + ingress: + enabled: true + load-balancer: + enabled: true + local-storage: + enabled: true + gateway: + enabled: true + metrics-server: + enabled: true +extra-node-kube-apiserver-args: + --foo: bar diff --git a/tests/integration/tests/test_cleanup.py b/tests/integration/tests/test_cleanup.py index be32d4719..f82dae1f5 100644 --- a/tests/integration/tests/test_cleanup.py +++ b/tests/integration/tests/test_cleanup.py @@ -11,11 +11,11 @@ LOG = logging.getLogger(__name__) -KUBE_CONTROLLER_MANAGER_SNAP_PORT = 10257 +CONTAINERD_SOCKET_DIRECTORY_CLASSIC = "/run/containerd" CONTAINERD_PATHS = [ "/etc/containerd", - "/run/containerd", + CONTAINERD_SOCKET_DIRECTORY_CLASSIC, "/var/lib/containerd", ] CNI_PATH = "/opt/cni/bin" @@ -112,48 +112,36 @@ def test_node_cleanup_new_containerd_path(instances: List[harness.Instance]): @pytest.mark.node_count(1) -@pytest.mark.no_setup() +@pytest.mark.disable_k8s_bootstrapping() @pytest.mark.tags(tags.NIGHTLY) -@pytest.mark.skip(reason="the test fails when using a harness other than 'local'") -def test_containerd_path_cleanup_on_failed_init( - instances: List[harness.Instance], tmp_path -): +def test_containerd_path_cleanup_on_failed_init(instances: List[harness.Instance]): """Tests that a failed `bootstrap` properly cleans up any containerd-related paths it may have created as part of the failed `bootstrap`. - It induces a bootstrap failure by pre-binding a required k8s service - port (10257 for the kube-controller-manager) before running `k8s bootstrap`. + It introduces a bootstrap failure by supplying an incorrect argument to the kube-apiserver. + + The bootstrap/join-cluster aborting behavior was added in this PR: + https://github.com/canonical/k8s-snap/pull/772 NOTE: a failed `join-cluster` will trigger the exact same cleanup hook, so the test implicitly applies to it as well. """ instance = instances[0] expected_code = 1 - expected_message = ( - "Encountered error(s) while verifying port availability for Kubernetes " - "services: Port 10257 (needed by: kube-controller-manager) is already in use." - ) - with util.open_port(KUBE_CONTROLLER_MANAGER_SNAP_PORT) as _: - util.setup_k8s_snap(instance, tmp_path, config.SNAP, connect_interfaces=False) + fail_bootstrap_config = (config.MANIFESTS_DIR / "bootstrap-fail.yaml").read_text() - proc = instance.exec( - ["k8s", "bootstrap"], capture_output=True, text=True, check=False - ) - - if proc.returncode != expected_code: - raise AssertionError( - f"Expected `k8s bootstrap` to exit with code {expected_code}, " - f"but it exited with {proc.returncode}.\n" - f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}" - ) + proc = instance.exec( + ["k8s", "bootstrap", "--file", "-"], + input=str.encode(fail_bootstrap_config), + check=False, + ) - if expected_message not in proc.stderr: - raise AssertionError( - f"Expected to find port-related warning '{expected_message}' in " - "stderr of the `k8s bootstrap` command.\n" - f"Stdout was: \n{proc.stdout}.\nStderr was: \n{proc.stderr}" - ) + if proc.returncode != expected_code: + raise AssertionError( + f"Expected `k8s bootstrap` to exit with code {expected_code}, " + f"but it exited with {proc.returncode}.\n" + ) - _assert_paths_not_exist(instance, CONTAINERD_PATHS) + _assert_paths_not_exist(instance, CONTAINERD_PATHS) diff --git a/tests/integration/tests/test_util/util.py b/tests/integration/tests/test_util/util.py index 83359407f..a403c41b5 100644 --- a/tests/integration/tests/test_util/util.py +++ b/tests/integration/tests/test_util/util.py @@ -1,13 +1,11 @@ # # Copyright 2024 Canonical, Ltd. # -import contextlib import ipaddress import json import logging import re import shlex -import socket import subprocess import urllib.request from datetime import datetime @@ -542,38 +540,6 @@ def find_suitable_cidr(parent_cidr: str, excluded_ips: List[str]): raise RuntimeError("Could not find a suitable CIDR for LoadBalancer services") -@contextlib.contextmanager -def open_port( - port: int, - host: str = "", - address_family: socket.AddressFamily = socket.AF_INET, - socket_kind: socket.SocketKind = socket.SOCK_STREAM, - max_backlogged_connections: int = 0, -): - """Context manager which opens a socket with the given properties - and binds it to the given port. - - Yields the already setup and listening socket object for use. - - By default, it will only allow one single active connection - and instantly refuse any new ones. Use the `max_backlogged_connections` - argument if you'd like it to accept more connections as `pending`. - """ - sock = socket.socket(family=address_family, type=socket_kind) - if not host: - host = socket.gethostname() - sock.bind((host, port)) - LOG.info(f"Successfully bound new socket on '{host}:{port}'") - - try: - sock.listen(max_backlogged_connections) - LOG.info(f"Successfully started listening on '{host}:{port}'") - yield sock - finally: - sock.close() - LOG.info(f"Closed socket on '{host}:{port}'") - - def check_file_paths_exist( instance: harness.Instance, paths: List[str] ) -> Mapping[str, bool]: