diff --git a/libnetwork/cni/cni_exec.go b/libnetwork/cni/cni_exec.go index 79d7ef120..4b7ed8c6d 100644 --- a/libnetwork/cni/cni_exec.go +++ b/libnetwork/cni/cni_exec.go @@ -26,8 +26,10 @@ import ( "context" "encoding/json" "fmt" + "os" "os/exec" "path/filepath" + "strings" "github.com/containernetworking/cni/pkg/invoke" "github.com/containernetworking/cni/pkg/version" @@ -80,6 +82,16 @@ func (e *cniExec) ExecPlugin(ctx context.Context, pluginPath string, stdinData [ c.Env = append(c.Env, "XDG_RUNTIME_DIR=") } + // The CNI plugins need access to iptables in $PATH. As it turns out debian doesn't put + // /usr/sbin in $PATH for rootless users. This will break rootless networking completely. + // We might break existing users and we cannot expect everyone to change their $PATH so + // let's add /usr/sbin to $PATH ourselves. + path := os.Getenv("PATH") + if !strings.Contains(path, "/usr/sbin") { + path += ":/usr/sbin" + c.Env = append(c.Env, "PATH="+path) + } + err := c.Run() if err != nil { return nil, annotatePluginError(err, pluginPath, stdout.Bytes(), stderr.Bytes()) diff --git a/libnetwork/cni/cni_suite_test.go b/libnetwork/cni/cni_suite_test.go index e45ad0e42..5fbe79832 100644 --- a/libnetwork/cni/cni_suite_test.go +++ b/libnetwork/cni/cni_suite_test.go @@ -8,8 +8,10 @@ import ( "path/filepath" "testing" + "github.com/containers/common/internal/attributedstring" "github.com/containers/common/libnetwork/cni" "github.com/containers/common/libnetwork/types" + "github.com/containers/common/pkg/config" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) @@ -28,8 +30,12 @@ func TestCni(t *testing.T) { func getNetworkInterface(cniConfDir string) (types.ContainerNetwork, error) { return cni.NewCNINetworkInterface(&cni.InitConfig{ - CNIConfigDir: cniConfDir, - CNIPluginDirs: cniPluginDirs, + CNIConfigDir: cniConfDir, + Config: &config.Config{ + Network: config.NetworkConfig{ + CNIPluginDirs: attributedstring.NewSlice(cniPluginDirs), + }, + }, }) } diff --git a/libnetwork/cni/network.go b/libnetwork/cni/network.go index 49d20b915..7d3369af7 100644 --- a/libnetwork/cni/network.go +++ b/libnetwork/cni/network.go @@ -16,6 +16,7 @@ import ( "time" "github.com/containernetworking/cni/libcni" + "github.com/containers/common/libnetwork/internal/rootlessnetns" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/config" "github.com/containers/common/pkg/version" @@ -53,6 +54,9 @@ type cniNetwork struct { // networks is a map with loaded networks, the key is the network name networks map[string]*network + + // rootlessNetns is used for the rootless network setup/teardown + rootlessNetns *rootlessnetns.Netns } type network struct { @@ -65,21 +69,14 @@ type network struct { type InitConfig struct { // CNIConfigDir is directory where the cni config files are stored. CNIConfigDir string - // CNIPluginDirs is a list of directories where cni should look for the plugins. - CNIPluginDirs []string // RunDir is a directory where temporary files can be stored. RunDir string - // DefaultNetwork is the name for the default network. - DefaultNetwork string - // DefaultSubnet is the default subnet for the default network. - DefaultSubnet string - - // DefaultsubnetPools contains the subnets which must be used to allocate a free subnet by network create - DefaultsubnetPools []config.SubnetPool - // IsMachine describes whenever podman runs in a podman machine environment. IsMachine bool + + // Config containers.conf options + Config *config.Config } // NewCNINetworkInterface creates the ContainerNetwork interface for the CNI backend. @@ -96,12 +93,12 @@ func NewCNINetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, err } - defaultNetworkName := conf.DefaultNetwork + defaultNetworkName := conf.Config.Network.DefaultNetwork if defaultNetworkName == "" { defaultNetworkName = types.DefaultNetworkName } - defaultSubnet := conf.DefaultSubnet + defaultSubnet := conf.Config.Network.DefaultSubnet if defaultSubnet == "" { defaultSubnet = types.DefaultSubnet } @@ -110,21 +107,30 @@ func NewCNINetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, fmt.Errorf("failed to parse default subnet: %w", err) } - defaultSubnetPools := conf.DefaultsubnetPools + defaultSubnetPools := conf.Config.Network.DefaultSubnetPools if defaultSubnetPools == nil { defaultSubnetPools = config.DefaultSubnetPools } - cni := libcni.NewCNIConfig(conf.CNIPluginDirs, &cniExec{}) + var netns *rootlessnetns.Netns + if unshare.IsRootless() { + netns, err = rootlessnetns.New(conf.RunDir, rootlessnetns.CNI, conf.Config) + if err != nil { + return nil, err + } + } + + cni := libcni.NewCNIConfig(conf.Config.Network.CNIPluginDirs.Values, &cniExec{}) n := &cniNetwork{ cniConfigDir: conf.CNIConfigDir, - cniPluginDirs: conf.CNIPluginDirs, + cniPluginDirs: conf.Config.Network.CNIPluginDirs.Get(), cniConf: cni, defaultNetwork: defaultNetworkName, defaultSubnet: defaultNet, defaultsubnetPools: defaultSubnetPools, isMachine: conf.IsMachine, lock: lock, + rootlessNetns: netns, } return n, nil diff --git a/libnetwork/cni/run.go b/libnetwork/cni/run.go index 2da8da1ad..829c12704 100644 --- a/libnetwork/cni/run.go +++ b/libnetwork/cni/run.go @@ -39,61 +39,71 @@ func (n *cniNetwork) Setup(namespacePath string, options types.SetupOptions) (ma return nil, fmt.Errorf("failed to set the loopback adapter up: %w", err) } - var retErr error - teardownOpts := options - teardownOpts.Networks = map[string]types.PerNetworkOptions{} - // make sure to teardown the already connected networks on error - defer func() { - if retErr != nil { - if len(teardownOpts.Networks) > 0 { - err := n.teardown(namespacePath, types.TeardownOptions(teardownOpts)) - if err != nil { - logrus.Warn(err) + results := make(map[string]types.StatusBlock, len(options.Networks)) + + setup := func() error { + var retErr error + teardownOpts := options + teardownOpts.Networks = map[string]types.PerNetworkOptions{} + // make sure to teardown the already connected networks on error + defer func() { + if retErr != nil { + if len(teardownOpts.Networks) > 0 { + err := n.teardown(namespacePath, types.TeardownOptions(teardownOpts)) + if err != nil { + logrus.Warn(err) + } } } + }() + + ports, err := convertSpecgenPortsToCNIPorts(options.PortMappings) + if err != nil { + return err } - }() - ports, err := convertSpecgenPortsToCNIPorts(options.PortMappings) - if err != nil { - return nil, err - } + for name, netOpts := range options.Networks { + netOpts := netOpts + network := n.networks[name] + rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) - results := make(map[string]types.StatusBlock, len(options.Networks)) - for name, netOpts := range options.Networks { - netOpts := netOpts - network := n.networks[name] - rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) - - // If we have more than one static ip we need parse the ips via runtime config, - // make sure to add the ips capability to the first plugin otherwise it doesn't get the ips - if len(netOpts.StaticIPs) > 0 && !network.cniNet.Plugins[0].Network.Capabilities["ips"] { - caps := make(map[string]interface{}) - caps["capabilities"] = map[string]bool{"ips": true} - network.cniNet.Plugins[0], retErr = libcni.InjectConf(network.cniNet.Plugins[0], caps) + // If we have more than one static ip we need parse the ips via runtime config, + // make sure to add the ips capability to the first plugin otherwise it doesn't get the ips + if len(netOpts.StaticIPs) > 0 && !network.cniNet.Plugins[0].Network.Capabilities["ips"] { + caps := make(map[string]interface{}) + caps["capabilities"] = map[string]bool{"ips": true} + network.cniNet.Plugins[0], retErr = libcni.InjectConf(network.cniNet.Plugins[0], caps) + if retErr != nil { + return retErr + } + } + + var res cnitypes.Result + res, retErr = n.cniConf.AddNetworkList(context.Background(), network.cniNet, rt) + // Add this network to teardown opts since it is now connected. + // Also add this if an errors was returned since we want to call teardown on this regardless. + teardownOpts.Networks[name] = netOpts if retErr != nil { - return nil, retErr + return retErr } - } - var res cnitypes.Result - res, retErr = n.cniConf.AddNetworkList(context.Background(), network.cniNet, rt) - // Add this network to teardown opts since it is now connected. - // Also add this if an errors was returned since we want to call teardown on this regardless. - teardownOpts.Networks[name] = netOpts - if retErr != nil { - return nil, retErr + logrus.Debugf("cni result for container %s network %s: %v", options.ContainerID, name, res) + var status types.StatusBlock + status, retErr = CNIResultToStatus(res) + if retErr != nil { + return retErr + } + results[name] = status } + return nil + } - logrus.Debugf("cni result for container %s network %s: %v", options.ContainerID, name, res) - var status types.StatusBlock - status, retErr = CNIResultToStatus(res) - if retErr != nil { - return nil, retErr - } - results[name] = status + if n.rootlessNetns != nil { + err = n.rootlessNetns.Setup(len(options.Networks), setup) + } else { + err = setup() } - return results, nil + return results, err } // CNIResultToStatus convert the cni result to status block @@ -225,28 +235,39 @@ func (n *cniNetwork) teardown(namespacePath string, options types.TeardownOption } var multiErr *multierror.Error - for name, netOpts := range options.Networks { - netOpts := netOpts - rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) - - cniConfList, newRt, err := getCachedNetworkConfig(n.cniConf, name, rt) - if err == nil { - rt = newRt - } else { - logrus.Warnf("Failed to load cached network config: %v, falling back to loading network %s from disk", err, name) - network := n.networks[name] - if network == nil { - multiErr = multierror.Append(multiErr, fmt.Errorf("network %s: %w", name, types.ErrNoSuchNetwork)) - continue + teardown := func() error { + for name, netOpts := range options.Networks { + netOpts := netOpts + rt := getRuntimeConfig(namespacePath, options.ContainerName, options.ContainerID, name, ports, &netOpts) + + cniConfList, newRt, err := getCachedNetworkConfig(n.cniConf, name, rt) + if err == nil { + rt = newRt + } else { + logrus.Warnf("Failed to load cached network config: %v, falling back to loading network %s from disk", err, name) + network := n.networks[name] + if network == nil { + multiErr = multierror.Append(multiErr, fmt.Errorf("network %s: %w", name, types.ErrNoSuchNetwork)) + continue + } + cniConfList = network.cniNet } - cniConfList = network.cniNet - } - err = n.cniConf.DelNetworkList(context.Background(), cniConfList, rt) - if err != nil { - multiErr = multierror.Append(multiErr, err) + err = n.cniConf.DelNetworkList(context.Background(), cniConfList, rt) + if err != nil { + multiErr = multierror.Append(multiErr, err) + } } + return nil + } + + if n.rootlessNetns != nil { + err = n.rootlessNetns.Teardown(len(options.Networks), teardown) + } else { + err = teardown() } + multiErr = multierror.Append(multiErr, err) + return multiErr.ErrorOrNil() } @@ -267,3 +288,10 @@ func getCachedNetworkConfig(cniConf *libcni.CNIConfig, name string, rt *libcni.R } return cniConfList, rt, nil } + +func (n *cniNetwork) RunInRootlessNetns(toRun func() error) error { + if n.rootlessNetns == nil { + return types.ErrNotRootlessNetns + } + return n.rootlessNetns.Run(n.lock, toRun) +} diff --git a/libnetwork/internal/rootlessnetns/netns.go b/libnetwork/internal/rootlessnetns/netns.go new file mode 100644 index 000000000..edc29f66f --- /dev/null +++ b/libnetwork/internal/rootlessnetns/netns.go @@ -0,0 +1,8 @@ +package rootlessnetns + +type NetworkBackend int + +const ( + Netavark NetworkBackend = iota + CNI +) diff --git a/libnetwork/internal/rootlessnetns/netns_freebsd.go b/libnetwork/internal/rootlessnetns/netns_freebsd.go new file mode 100644 index 000000000..a176d2d82 --- /dev/null +++ b/libnetwork/internal/rootlessnetns/netns_freebsd.go @@ -0,0 +1,28 @@ +package rootlessnetns + +import ( + "errors" + + "github.com/containers/common/pkg/config" + "github.com/containers/storage/pkg/lockfile" +) + +var ErrNotSupported = errors.New("rootless netns only supported on linux") + +type Netns struct{} + +func New(dir string, backend NetworkBackend, conf *config.Config) (*Netns, error) { + return nil, ErrNotSupported +} + +func (n *Netns) Setup(nets int, toRun func() error) error { + return ErrNotSupported +} + +func (n *Netns) Teardown(nets int, toRun func() error) error { + return ErrNotSupported +} + +func (n *Netns) Run(lock *lockfile.LockFile, toRun func() error) error { + return ErrNotSupported +} diff --git a/libnetwork/internal/rootlessnetns/netns_linux.go b/libnetwork/internal/rootlessnetns/netns_linux.go new file mode 100644 index 000000000..8fbb1f590 --- /dev/null +++ b/libnetwork/internal/rootlessnetns/netns_linux.go @@ -0,0 +1,545 @@ +package rootlessnetns + +import ( + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" + + "github.com/containernetworking/plugins/pkg/ns" + "github.com/containers/common/libnetwork/resolvconf" + "github.com/containers/common/libnetwork/slirp4netns" + "github.com/containers/common/pkg/config" + "github.com/containers/common/pkg/netns" + "github.com/containers/common/pkg/systemd" + "github.com/containers/storage/pkg/homedir" + "github.com/containers/storage/pkg/lockfile" + "github.com/hashicorp/go-multierror" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + // rootlessNetnsDir is the directory name + rootlessNetnsDir = "rootless-netns" + // refCountFile file name for the ref count file + refCountFile = "ref-count" + + // rootlessNetNsSilrp4netnsPidFile is the name of the rootless netns slirp4netns pid file + rootlessNetNsSilrp4netnsPidFile = "rootless-netns-slirp4netns.pid" + + // persistentCNIDir is the directory where the CNI files are stored + persistentCNIDir = "/var/lib/cni" + + tmpfs = "tmpfs" + none = "none" + resolvConfName = "resolv.conf" +) + +type Netns struct { + // dir used for the rootless netns + dir string + // backend used for the network setup/teardown + backend NetworkBackend + + // config contains containers.conf options. + config *config.Config +} + +type rootlessNetnsError struct { + msg string + err error +} + +func (e *rootlessNetnsError) Error() string { + msg := e.msg + ": " + return fmt.Sprintf("rootless netns: %s%v", msg, e.err) +} + +func (e *rootlessNetnsError) Unwrap() error { + return e.err +} + +// wrapError wraps the error with extra context +// It will always include "rootless netns:" so the msg should not mention it again, +// msg can be empty to just include the rootless netns part. +// err must be non nil. +func wrapError(msg string, err error) *rootlessNetnsError { + return &rootlessNetnsError{ + msg: msg, + err: err, + } +} + +func New(dir string, backend NetworkBackend, conf *config.Config) (*Netns, error) { + netnsDir := filepath.Join(dir, rootlessNetnsDir) + if err := os.MkdirAll(netnsDir, 0o700); err != nil { + return nil, wrapError("", err) + } + return &Netns{ + dir: netnsDir, + backend: backend, + config: conf, + }, nil +} + +// getPath is a small wrapper around filepath.Join() to have a bit less code +func (n *Netns) getPath(path string) string { + return filepath.Join(n.dir, path) +} + +// getOrCreateNetns returns the rootless netns, if it created a new one the +// returned bool is set to true. +func (n *Netns) getOrCreateNetns() (ns.NetNS, bool, error) { + nsPath := n.getPath(rootlessNetnsDir) + nsRef, err := ns.GetNS(nsPath) + if err == nil { + // TODO check if slirp4netns is alive + return nsRef, false, nil + } + logrus.Debugf("Creating rootless network namespace at %q", nsPath) + // We have to create the netns dir again here because it is possible + // that cleanup() removed it. + if err := os.MkdirAll(n.dir, 0o700); err != nil { + return nil, false, wrapError("", err) + } + netns, err := netns.NewNSAtPath(nsPath) + if err != nil { + return nil, false, wrapError("create netns", err) + } + err = n.setupSlirp4netns(nsPath) + return netns, true, err +} + +func (n *Netns) cleanup() error { + if _, err := os.Stat(n.dir); err != nil { + if errors.Is(err, fs.ErrNotExist) { + // dir does not exists no need for cleanup + return nil + } + return err + } + + logrus.Debug("Cleaning up rootless network namespace") + + nsPath := n.getPath(rootlessNetnsDir) + var multiErr *multierror.Error + if err := netns.UnmountNS(nsPath); err != nil { + multiErr = multierror.Append(multiErr, err) + } + if err := n.cleanupSlirp4netns(); err != nil { + multiErr = multierror.Append(multiErr, wrapError("kill slirp4netns", err)) + } + if err := os.RemoveAll(n.dir); err != nil { + multiErr = multierror.Append(multiErr, wrapError("remove rootless netns dir", err)) + } + + return multiErr.ErrorOrNil() +} + +func (n *Netns) setupSlirp4netns(nsPath string) error { + res, err := slirp4netns.Setup(&slirp4netns.SetupOptions{ + Config: n.config, + ContainerID: "rootless-netns", + Netns: nsPath, + }) + if err != nil { + return wrapError("start slirp4netns", err) + } + // create pid file for the slirp4netns process + // this is need to kill the process in the cleanup + pid := strconv.Itoa(res.Pid) + err = os.WriteFile(n.getPath(rootlessNetNsSilrp4netnsPidFile), []byte(pid), 0o600) + if err != nil { + return wrapError("write slirp4netns pid file", err) + } + + if systemd.RunsOnSystemd() { + // move to systemd scope to prevent systemd from killing it + err = systemd.MoveRootlessNetnsSlirpProcessToUserSlice(res.Pid) + if err != nil { + // only log this, it is not fatal but can lead to issues when running podman inside systemd units + logrus.Errorf("failed to move the rootless netns slirp4netns process to the systemd user.slice: %v", err) + } + } + + // build a new resolv.conf file which uses the slirp4netns dns server address + resolveIP, err := slirp4netns.GetDNS(res.Subnet) + if err != nil { + return wrapError("determine default slirp4netns DNS address", err) + } + + if err := resolvconf.New(&resolvconf.Params{ + Path: n.getPath(resolvConfName), + // fake the netns since we want to filter localhost + Namespaces: []specs.LinuxNamespace{ + {Type: specs.NetworkNamespace}, + }, + IPv6Enabled: res.IPv6, + KeepHostServers: true, + Nameservers: []string{resolveIP.String()}, + }); err != nil { + return wrapError("create resolv.conf", err) + } + return nil +} + +func (n *Netns) cleanupSlirp4netns() error { + pidFile := n.getPath(rootlessNetNsSilrp4netnsPidFile) + b, err := os.ReadFile(pidFile) + if err == nil { + var i int + i, err = strconv.Atoi(string(b)) + if err == nil { + // kill the slirp process so we do not leak it + err = syscall.Kill(i, syscall.SIGTERM) + } + } + return err +} + +// mountAndMkdirDest convenience wrapper for mount and mkdir +func mountAndMkdirDest(source string, target string, fstype string, flags uintptr) error { + if err := os.MkdirAll(target, 0o700); err != nil { + return wrapError("create mount point", err) + } + if err := unix.Mount(source, target, fstype, flags, ""); err != nil { + return wrapError(fmt.Sprintf("mount %q to %q", source, target), err) + } + return nil +} + +func (n *Netns) setupMounts() error { + // Before we can run the given function, + // we have to set up all mounts correctly. + + // The order of the mounts is IMPORTANT. + // The idea of the extra mount ns is to make /run and /var/lib/cni writeable + // for the cni plugins but not affecting the podman user namespace. + // Because the plugins also need access to XDG_RUNTIME_DIR/netns some special setup is needed. + + // The following bind mounts are needed + // 1. XDG_RUNTIME_DIR -> XDG_RUNTIME_DIR/rootless-netns/XDG_RUNTIME_DIR + // 2. /run/systemd -> XDG_RUNTIME_DIR/rootless-netns/run/systemd (only if it exists) + // 3. XDG_RUNTIME_DIR/rootless-netns/resolv.conf -> /etc/resolv.conf or XDG_RUNTIME_DIR/rootless-netns/run/symlink/target + // 4. XDG_RUNTIME_DIR/rootless-netns/var/lib/cni -> /var/lib/cni (if /var/lib/cni does not exist, use the parent dir) + // 5. XDG_RUNTIME_DIR/rootless-netns/run -> /run + + // Create a new mount namespace, + // this must happen inside the netns thread. + err := unix.Unshare(unix.CLONE_NEWNS) + if err != nil { + return wrapError("create new mount namespace", err) + } + + xdgRuntimeDir, err := homedir.GetRuntimeDir() + if err != nil { + return fmt.Errorf("could not get runtime directory: %w", err) + } + newXDGRuntimeDir := n.getPath(xdgRuntimeDir) + // 1. Mount the netns into the new run to keep them accessible. + // Otherwise cni setup will fail because it cannot access the netns files. + err = mountAndMkdirDest(xdgRuntimeDir, newXDGRuntimeDir, none, unix.MS_BIND|unix.MS_SHARED|unix.MS_REC) + if err != nil { + return err + } + + // 2. Also keep /run/systemd if it exists. + // Many files are symlinked into this dir, for example /dev/log. + runSystemd := "/run/systemd" + _, err = os.Stat(runSystemd) + if err == nil { + newRunSystemd := n.getPath(runSystemd) + err = mountAndMkdirDest(runSystemd, newRunSystemd, none, unix.MS_BIND|unix.MS_REC) + if err != nil { + return err + } + } + + // 3. On some distros /etc/resolv.conf is symlinked to somewhere under /run. + // Because the kernel will follow the symlink before mounting, it is not + // possible to mount a file at /etc/resolv.conf. We have to ensure that + // the link target will be available in the mount ns. + // see: https://github.com/containers/podman/issues/10855 + resolvePath := resolvconf.DefaultResolvConf + linkCount := 0 + for i := 1; i < len(resolvePath); i++ { + // Do not use filepath.EvalSymlinks, we only want the first symlink under /run. + // If /etc/resolv.conf has more than one symlink under /run, e.g. + // -> /run/systemd/resolve/stub-resolv.conf -> /run/systemd/resolve/resolv.conf + // we would put the netns resolv.conf file to the last path. However this will + // break dns because the second link does not exist in the mount ns. + // see https://github.com/containers/podman/issues/11222 + // + // We also need to resolve all path components not just the last file. + // see https://github.com/containers/podman/issues/12461 + + if resolvePath[i] != '/' { + // if we are at the last char we need to inc i by one because there is no final slash + if i == len(resolvePath)-1 { + i++ + } else { + // not the end of path, keep going + continue + } + } + path := resolvePath[:i] + + fi, err := os.Lstat(path) + if err != nil { + return fmt.Errorf("failed to stat resolv.conf path: %w", err) + } + + // no link, just continue + if fi.Mode()&os.ModeSymlink == 0 { + continue + } + + link, err := os.Readlink(path) + if err != nil { + return fmt.Errorf("failed to read resolv.conf symlink: %w", err) + } + linkCount++ + if filepath.IsAbs(link) { + // link is as an absolute path + resolvePath = filepath.Join(link, resolvePath[i:]) + } else { + // link is as a relative, join it with the previous path + base := filepath.Dir(path) + resolvePath = filepath.Join(base, link, resolvePath[i:]) + } + // set i back to zero since we now have a new base path + i = 0 + + // we have to stop at the first path under /run because we will have an empty /run and will create the path anyway + // if we would continue we would need to recreate all links under /run + if strings.HasPrefix(resolvePath, "/run/") { + break + } + // make sure wo do not loop forever + if linkCount == 255 { + return errors.New("too many symlinks while resolving /etc/resolv.conf") + } + } + logrus.Debugf("The path of /etc/resolv.conf in the mount ns is %q", resolvePath) + // When /etc/resolv.conf on the host is a symlink to /run/systemd/resolve/stub-resolv.conf, + // we have to mount an empty filesystem on /run/systemd/resolve in the child namespace, + // so as to isolate the directory from the host mount namespace. + // + // Otherwise our bind-mount for /run/systemd/resolve/stub-resolv.conf is unmounted + // when systemd-resolved unlinks and recreates /run/systemd/resolve/stub-resolv.conf on the host. + // see: https://github.com/containers/podman/issues/10929 + if strings.HasPrefix(resolvePath, "/run/systemd/resolve/") { + rsr := n.getPath("/run/systemd/resolve") + err = mountAndMkdirDest("", rsr, tmpfs, unix.MS_NOEXEC|unix.MS_NOSUID|unix.MS_NODEV) + if err != nil { + return err + } + } + if strings.HasPrefix(resolvePath, "/run/") { + resolvePath = n.getPath(resolvePath) + err = os.MkdirAll(filepath.Dir(resolvePath), 0o700) + if err != nil { + return wrapError("create resolv.conf directory", err) + } + // we want to bind mount on this file so we have to create the file first + _, err = os.OpenFile(resolvePath, os.O_CREATE|os.O_RDONLY, 0o600) + if err != nil { + return wrapError("create resolv.conf file: %w", err) + } + } + // mount resolv.conf to make use of the host dns + err = unix.Mount(n.getPath(resolvConfName), resolvePath, none, unix.MS_BIND, "") + if err != nil { + return wrapError(fmt.Sprintf("mount resolv.conf to %q", resolvePath), err) + } + + // 4. CNI plugins need access to /var/lib/cni + if n.backend == CNI { + if err := n.mountCNIVarDir(); err != nil { + return err + } + } + + // 5. Mount the new prepared run dir to /run, it has to be recursive to keep the other bind mounts. + runDir := n.getPath("run") + // relabel the new run directory to the iptables /run label + // this is important, otherwise the iptables command will fail + err = label.Relabel(runDir, "system_u:object_r:iptables_var_run_t:s0", false) + if err != nil { + if !errors.Is(err, unix.ENOTSUP) { + return wrapError("relabel iptables_var_run_t", err) + } + logrus.Debugf("Labeling not supported on %q", runDir) + } + err = mountAndMkdirDest(runDir, "/run", none, unix.MS_BIND|unix.MS_REC) + if err != nil { + return err + } + return nil +} + +func (n *Netns) mountCNIVarDir() error { + varDir := "" + varTarget := persistentCNIDir + // we can only mount to a target dir which exists, check /var/lib/cni recursively + // while we could always use /var there are cases where a user might store the cni + // configs under /var/custom and this would break + for { + if _, err := os.Stat(varTarget); err == nil { + varDir = n.getPath(varTarget) + break + } + varTarget = filepath.Dir(varTarget) + if varTarget == "/" { + break + } + } + if varDir == "" { + return errors.New("failed to stat /var directory") + } + if err := os.MkdirAll(varDir, 0o700); err != nil { + return wrapError("create var dir", err) + } + // make sure to mount var first + err := unix.Mount(varDir, varTarget, none, unix.MS_BIND, "") + if err != nil { + return wrapError(fmt.Sprintf("mount %q to %q", varDir, varTarget), err) + } + return nil +} + +func (n *Netns) runInner(toRun func() error) (err error) { + nsRef, newNs, err := n.getOrCreateNetns() + if err != nil { + return err + } + defer nsRef.Close() + // If a new netns was created make sure to clean it up again on an error to not leak it. + if newNs { + defer func() { + if err != nil { + if err := n.cleanup(); err != nil { + logrus.Errorf("Rootless netns cleanup error after failed setup: %v", err) + } + } + }() + } + + return nsRef.Do(func(_ ns.NetNS) error { + if err := n.setupMounts(); err != nil { + return err + } + return toRun() + }) +} + +func (n *Netns) Setup(nets int, toRun func() error) error { + err := n.runInner(toRun) + if err != nil { + return err + } + _, err = refCount(n.dir, nets) + return err +} + +func (n *Netns) Teardown(nets int, toRun func() error) error { + var multiErr *multierror.Error + count, countErr := refCount(n.dir, -nets) + if countErr != nil { + multiErr = multierror.Append(multiErr, countErr) + } + err := n.runInner(toRun) + if err != nil { + multiErr = multierror.Append(multiErr, err) + } + + // only cleanup if the ref count did not throw an error + if count == 0 && countErr == nil { + err = n.cleanup() + if err != nil { + multiErr = multierror.Append(multiErr, wrapError("cleanup", err)) + } + } + + return multiErr.ErrorOrNil() +} + +// Run any long running function in the userns. +// We need to ensure that during setup/cleanup we are locked to avoid races. +// However because the given function could be running a long time we must +// unlock in between, i.e. this is used by podman unshare --rootless-nets +// and we do not want to keep it locked for the lifetime of the given command. +func (n *Netns) Run(lock *lockfile.LockFile, toRun func() error) error { + lock.Lock() + defer lock.Unlock() + _, err := refCount(n.dir, 1) + if err != nil { + return err + } + inner := func() error { + lock.Unlock() + err = toRun() + lock.Lock() + return err + } + + inErr := n.runInner(inner) + // make sure to always reset the ref counter afterwards + count, err := refCount(n.dir, -1) + if err != nil { + if inErr == nil { + return err + } + logrus.Errorf("Failed to decrement ref count: %v", err) + return inErr + } + if count == 0 { + err = n.cleanup() + if err != nil { + err = wrapError("cleanup", err) + if inErr == nil { + return err + } + logrus.Errorf("Failed to cleanup rootless netns: %v", err) + return inErr + } + } + + return inErr +} + +func refCount(dir string, inc int) (int, error) { + file := filepath.Join(dir, refCountFile) + content, err := os.ReadFile(file) + if err != nil && !errors.Is(err, fs.ErrNotExist) { + return -1, wrapError("read ref counter", err) + } + + currentCount := 0 + if len(content) > 0 { + currentCount, err = strconv.Atoi(string(content)) + if err != nil { + return -1, wrapError("parse ref counter", err) + } + } + + currentCount += inc + if currentCount < 0 { + logrus.Errorf("rootless netns ref counter out of sync, counter is at %d, resetting it back to 0", currentCount) + currentCount = 0 + } + + newNum := strconv.Itoa(currentCount) + if err = os.WriteFile(file, []byte(newNum), 0o600); err != nil { + return -1, wrapError("write ref counter", err) + } + + return currentCount, nil +} diff --git a/libnetwork/internal/rootlessnetns/netns_linux_test.go b/libnetwork/internal/rootlessnetns/netns_linux_test.go new file mode 100644 index 000000000..3562c70bf --- /dev/null +++ b/libnetwork/internal/rootlessnetns/netns_linux_test.go @@ -0,0 +1,77 @@ +package rootlessnetns + +import ( + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_refCount(t *testing.T) { + tests := []struct { + name string + content string + inc int + want int + }{ + { + name: "init counter", + inc: 1, + want: 1, + }, + { + name: "simple add", + inc: 5, + want: 5, + }, + { + name: "add multiple with content", + content: "0", + inc: 5, + want: 5, + }, + { + name: "add multiple with high number content", + content: "5500", + inc: 2, + want: 5502, + }, + { + name: "simple dec", + content: "5", + inc: -5, + want: 0, + }, + { + name: "dec negative should not go below 0", + content: "0", + inc: -5, + want: 0, + }, + { + name: "dec multiple with high number content", + content: "9800", + inc: -100, + want: 9700, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + file := filepath.Join(dir, refCountFile) + if tt.content != "" { + err := os.WriteFile(file, []byte(tt.content), 0o700) + assert.NoError(t, err, "write file error") + } + + got, err := refCount(dir, tt.inc) + assert.NoError(t, err, "refCount() error") + assert.Equal(t, tt.want, got, "counter is equal") + content, err := os.ReadFile(file) + assert.NoError(t, err, "read file error") + assert.Equal(t, strconv.Itoa(tt.want), string(content), "file content after refCount()") + }) + } +} diff --git a/libnetwork/netavark/config_test.go b/libnetwork/netavark/config_test.go index 2ba438e27..d799a18a2 100644 --- a/libnetwork/netavark/config_test.go +++ b/libnetwork/netavark/config_test.go @@ -13,6 +13,7 @@ import ( "github.com/containers/common/libnetwork/netavark" "github.com/containers/common/libnetwork/types" "github.com/containers/common/libnetwork/util" + "github.com/containers/common/pkg/config" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" gomegaTypes "github.com/onsi/gomega/types" @@ -684,6 +685,7 @@ var _ = Describe("Config", func() { It("update NetworkDNSServers AddDNSServers", func() { libpodNet, err := netavark.NewNetworkInterface(&netavark.InitConfig{ + Config: &config.Config{}, NetworkConfigDir: networkConfDir, NetworkRunDir: networkConfDir, NetavarkBinary: "true", @@ -710,6 +712,7 @@ var _ = Describe("Config", func() { It("update NetworkDNSServers RemoveDNSServers", func() { libpodNet, err := netavark.NewNetworkInterface(&netavark.InitConfig{ + Config: &config.Config{}, NetworkConfigDir: networkConfDir, NetworkRunDir: networkConfDir, NetavarkBinary: "true", @@ -736,6 +739,7 @@ var _ = Describe("Config", func() { It("update NetworkDNSServers Add and Remove DNSServers", func() { libpodNet, err := netavark.NewNetworkInterface(&netavark.InitConfig{ + Config: &config.Config{}, NetworkConfigDir: networkConfDir, NetworkRunDir: networkConfDir, NetavarkBinary: "true", diff --git a/libnetwork/netavark/exec.go b/libnetwork/netavark/exec.go index f2c82359a..e3f904766 100644 --- a/libnetwork/netavark/exec.go +++ b/libnetwork/netavark/exec.go @@ -10,6 +10,7 @@ import ( "os" "os/exec" "strconv" + "strings" "github.com/sirupsen/logrus" ) @@ -79,6 +80,15 @@ func getRustLogEnv() string { func (n *netavarkNetwork) execNetavark(args []string, needPlugin bool, stdin, result interface{}) error { // set the netavark log level to the same as the podman env := append(os.Environ(), getRustLogEnv()) + // Netavark need access to iptables in $PATH. As it turns out debian doesn't put + // /usr/sbin in $PATH for rootless users. This will break rootless networking completely. + // We might break existing users and we cannot expect everyone to change their $PATH so + // let's add /usr/sbin to $PATH ourselves. + path := os.Getenv("PATH") + if !strings.Contains(path, "/usr/sbin") { + path += ":/usr/sbin" + env = append(env, "PATH="+path) + } // if we run with debug log level lets also set RUST_BACKTRACE=1 so we can get the full stack trace in case of panics if logrus.IsLevelEnabled(logrus.DebugLevel) { env = append(env, "RUST_BACKTRACE=1") diff --git a/libnetwork/netavark/ipam_test.go b/libnetwork/netavark/ipam_test.go index 870d03698..0b3fadeb4 100644 --- a/libnetwork/netavark/ipam_test.go +++ b/libnetwork/netavark/ipam_test.go @@ -10,6 +10,7 @@ import ( "os" "github.com/containers/common/libnetwork/types" + "github.com/containers/common/pkg/config" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/sirupsen/logrus" @@ -34,6 +35,7 @@ var _ = Describe("IPAM", func() { JustBeforeEach(func() { libpodNet, err := NewNetworkInterface(&InitConfig{ + Config: &config.Config{}, NetworkConfigDir: networkConfDir, NetworkRunDir: networkConfDir, }) diff --git a/libnetwork/netavark/netavark_suite_test.go b/libnetwork/netavark/netavark_suite_test.go index 548dd8bb7..af4947c18 100644 --- a/libnetwork/netavark/netavark_suite_test.go +++ b/libnetwork/netavark/netavark_suite_test.go @@ -10,9 +10,11 @@ import ( "reflect" "testing" + "github.com/containers/common/internal/attributedstring" "github.com/containers/common/libnetwork/netavark" "github.com/containers/common/libnetwork/types" "github.com/containers/common/libnetwork/util" + "github.com/containers/common/pkg/config" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" gomegaTypes "github.com/onsi/gomega/types" @@ -34,6 +36,7 @@ func init() { func getNetworkInterface(confDir string) (types.ContainerNetwork, error) { return netavark.NewNetworkInterface(&netavark.InitConfig{ + Config: &config.Config{}, NetworkConfigDir: confDir, NetavarkBinary: netavarkBinary, NetworkRunDir: confDir, @@ -45,7 +48,11 @@ func getNetworkInterfaceWithPlugins(confDir string, pluginDirs []string) (types. NetworkConfigDir: confDir, NetavarkBinary: netavarkBinary, NetworkRunDir: confDir, - PluginDirs: pluginDirs, + Config: &config.Config{ + Network: config.NetworkConfig{ + NetavarkPluginDirs: attributedstring.NewSlice(pluginDirs), + }, + }, }) } diff --git a/libnetwork/netavark/network.go b/libnetwork/netavark/network.go index 592116749..aad3cc7bd 100644 --- a/libnetwork/netavark/network.go +++ b/libnetwork/netavark/network.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/containers/common/libnetwork/internal/rootlessnetns" "github.com/containers/common/libnetwork/internal/util" "github.com/containers/common/libnetwork/types" "github.com/containers/common/pkg/config" @@ -68,6 +69,9 @@ type netavarkNetwork struct { // networks is a map with loaded networks, the key is the network name networks map[string]*types.Network + + // rootlessNetns is used for the rootless network setup/teardown + rootlessNetns *rootlessnetns.Netns } type InitConfig struct { @@ -82,26 +86,12 @@ type InitConfig struct { // NetworkRunDir is where temporary files are stored, i.e.the ipam db, aardvark config NetworkRunDir string - // FirewallDriver sets the firewall driver to use - FirewallDriver string - - // DefaultNetwork is the name for the default network. - DefaultNetwork string - // DefaultSubnet is the default subnet for the default network. - DefaultSubnet string - - // DefaultsubnetPools contains the subnets which must be used to allocate a free subnet by network create - DefaultsubnetPools []config.SubnetPool - - // DNSBindPort is set the port to pass to netavark for aardvark - DNSBindPort uint16 - - // PluginDirs list of directories were netavark plugins are located - PluginDirs []string - // Syslog describes whenever the netavark debug output should be log to the syslog as well. // This will use logrus to do so, make sure logrus is set up to log to the syslog. Syslog bool + + // Config containers.conf options + Config *config.Config } // NewNetworkInterface creates the ContainerNetwork interface for the netavark backend. @@ -118,12 +108,12 @@ func NewNetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, err } - defaultNetworkName := conf.DefaultNetwork + defaultNetworkName := conf.Config.Network.DefaultNetwork if defaultNetworkName == "" { defaultNetworkName = types.DefaultNetworkName } - defaultSubnet := conf.DefaultSubnet + defaultSubnet := conf.Config.Network.DefaultSubnet if defaultSubnet == "" { defaultSubnet = types.DefaultSubnet } @@ -140,11 +130,19 @@ func NewNetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { return nil, err } - defaultSubnetPools := conf.DefaultsubnetPools + defaultSubnetPools := conf.Config.Network.DefaultSubnetPools if defaultSubnetPools == nil { defaultSubnetPools = config.DefaultSubnetPools } + var netns *rootlessnetns.Netns + if unshare.IsRootless() { + netns, err = rootlessnetns.New(conf.NetworkRunDir, rootlessnetns.Netavark, conf.Config) + if err != nil { + return nil, err + } + } + n := &netavarkNetwork{ networkConfigDir: conf.NetworkConfigDir, networkRunDir: conf.NetworkRunDir, @@ -152,14 +150,15 @@ func NewNetworkInterface(conf *InitConfig) (types.ContainerNetwork, error) { aardvarkBinary: conf.AardvarkBinary, networkRootless: unshare.IsRootless(), ipamDBPath: filepath.Join(conf.NetworkRunDir, "ipam.db"), - firewallDriver: conf.FirewallDriver, + firewallDriver: conf.Config.Network.FirewallDriver, defaultNetwork: defaultNetworkName, defaultSubnet: defaultNet, defaultsubnetPools: defaultSubnetPools, - dnsBindPort: conf.DNSBindPort, - pluginDirs: conf.PluginDirs, + dnsBindPort: conf.Config.Network.DNSBindPort, + pluginDirs: conf.Config.Network.NetavarkPluginDirs.Get(), lock: lock, syslog: conf.Syslog, + rootlessNetns: netns, } return n, nil diff --git a/libnetwork/netavark/run.go b/libnetwork/netavark/run.go index 3df5ced05..42c76690c 100644 --- a/libnetwork/netavark/run.go +++ b/libnetwork/netavark/run.go @@ -72,12 +72,24 @@ func (n *netavarkNetwork) Setup(namespacePath string, options types.SetupOptions } result := map[string]types.StatusBlock{} - err = n.execNetavark([]string{"setup", namespacePath}, needPlugin, netavarkOpts, &result) - if err != nil { - // lets dealloc ips to prevent leaking - if err := n.deallocIPs(&options.NetworkOptions); err != nil { - logrus.Error(err) + setup := func() error { + err := n.execNetavark([]string{"setup", namespacePath}, needPlugin, netavarkOpts, &result) + if err != nil { + // lets dealloc ips to prevent leaking + if err := n.deallocIPs(&options.NetworkOptions); err != nil { + logrus.Error(err) + } + return err } + return nil + } + + if n.rootlessNetns != nil { + err = n.rootlessNetns.Setup(len(options.Networks), setup) + } else { + err = setup() + } + if err != nil { return nil, err } @@ -112,7 +124,16 @@ func (n *netavarkNetwork) Teardown(namespacePath string, options types.TeardownO return fmt.Errorf("failed to convert net opts: %w", err) } - retErr := n.execNetavark([]string{"teardown", namespacePath}, needPlugin, netavarkOpts, nil) + var retErr error + teardown := func() error { + return n.execNetavark([]string{"teardown", namespacePath}, needPlugin, netavarkOpts, nil) + } + + if n.rootlessNetns != nil { + retErr = n.rootlessNetns.Teardown(len(options.Networks), teardown) + } else { + retErr = teardown() + } // when netavark returned an error we still free the used ips // otherwise we could end up in a state where block the ips forever @@ -160,3 +181,10 @@ func (n *netavarkNetwork) convertNetOpts(opts types.NetworkOptions) (*netavarkOp } return &netavarkOptions, needsPlugin, nil } + +func (n *netavarkNetwork) RunInRootlessNetns(toRun func() error) error { + if n.rootlessNetns == nil { + return types.ErrNotRootlessNetns + } + return n.rootlessNetns.Run(n.lock, toRun) +} diff --git a/libnetwork/network/interface.go b/libnetwork/network/interface.go index b3a5f2aec..4a8290ba7 100644 --- a/libnetwork/network/interface.go +++ b/libnetwork/network/interface.go @@ -77,17 +77,12 @@ func NetworkBackend(store storage.Store, conf *config.Config, syslog bool) (type } netInt, err := netavark.NewNetworkInterface(&netavark.InitConfig{ - NetworkConfigDir: confDir, - NetworkRunDir: runDir, - NetavarkBinary: netavarkBin, - AardvarkBinary: aardvarkBin, - PluginDirs: conf.Network.NetavarkPluginDirs.Get(), - FirewallDriver: conf.Network.FirewallDriver, - DefaultNetwork: conf.Network.DefaultNetwork, - DefaultSubnet: conf.Network.DefaultSubnet, - DefaultsubnetPools: conf.Network.DefaultSubnetPools, - DNSBindPort: conf.Network.DNSBindPort, - Syslog: syslog, + Config: conf, + NetworkConfigDir: confDir, + NetworkRunDir: runDir, + NetavarkBinary: netavarkBin, + AardvarkBinary: aardvarkBin, + Syslog: syslog, }) return types.Netavark, netInt, err case types.CNI: @@ -181,13 +176,10 @@ func getCniInterface(conf *config.Config) (types.ContainerNetwork, error) { } } return cni.NewCNINetworkInterface(&cni.InitConfig{ - CNIConfigDir: confDir, - CNIPluginDirs: conf.Network.CNIPluginDirs.Get(), - RunDir: conf.Engine.TmpDir, - DefaultNetwork: conf.Network.DefaultNetwork, - DefaultSubnet: conf.Network.DefaultSubnet, - DefaultsubnetPools: conf.Network.DefaultSubnetPools, - IsMachine: machine.IsGvProxyBased(), + Config: conf, + CNIConfigDir: confDir, + RunDir: conf.Engine.TmpDir, + IsMachine: machine.IsGvProxyBased(), }) } diff --git a/libnetwork/types/define.go b/libnetwork/types/define.go index 6e91ccda9..193377b1a 100644 --- a/libnetwork/types/define.go +++ b/libnetwork/types/define.go @@ -18,6 +18,9 @@ var ( // exists. ErrNetworkExists = errors.New("network already exists") + // ErrNotRootlessNetns indicates the rootless netns can only be used as root + ErrNotRootlessNetns = errors.New("rootless netns cannot be used as root") + // NameRegex is a regular expression to validate names. // This must NOT be changed. NameRegex = regexp.Delayed("^[a-zA-Z0-9][a-zA-Z0-9_.-]*$") diff --git a/libnetwork/types/network.go b/libnetwork/types/network.go index 94087fd37..9e30975cb 100644 --- a/libnetwork/types/network.go +++ b/libnetwork/types/network.go @@ -27,6 +27,10 @@ type ContainerNetwork interface { // Teardown will teardown the container network namespace. Teardown(namespacePath string, options TeardownOptions) error + // RunInRootlessNetns is used to run the given function in the rootless netns. + // Only used as rootless and should return an error as root. + RunInRootlessNetns(toRun func() error) error + // Drivers will return the list of supported network drivers // for this interface. Drivers() []string diff --git a/pkg/cgroups/testdata/cgroup.empty b/pkg/cgroups/testdata/cgroup.empty new file mode 100644 index 000000000..e69de29bb diff --git a/pkg/cgroups/testdata/cgroup.other b/pkg/cgroups/testdata/cgroup.other new file mode 100644 index 000000000..239a7cded --- /dev/null +++ b/pkg/cgroups/testdata/cgroup.other @@ -0,0 +1 @@ +0::/other diff --git a/pkg/cgroups/testdata/cgroup.root b/pkg/cgroups/testdata/cgroup.root new file mode 100644 index 000000000..1e027b2a3 --- /dev/null +++ b/pkg/cgroups/testdata/cgroup.root @@ -0,0 +1 @@ +0::/ diff --git a/pkg/cgroups/utils_linux.go b/pkg/cgroups/utils_linux.go index ed9f0761d..ffdf10aca 100644 --- a/pkg/cgroups/utils_linux.go +++ b/pkg/cgroups/utils_linux.go @@ -4,6 +4,7 @@ package cgroups import ( + "bufio" "bytes" "errors" "fmt" @@ -11,6 +12,7 @@ import ( "path" "path/filepath" "strings" + "sync" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" @@ -143,3 +145,171 @@ func SetBlkioThrottle(res *configs.Resources, cgroupPath string) error { } return nil } + +// Code below was moved from podman/utils/utils_supported.go and should properly better +// integrated here as some parts may be redundant. + +func getCgroupProcess(procFile string, allowRoot bool) (string, error) { + f, err := os.Open(procFile) + if err != nil { + return "", err + } + defer f.Close() + + scanner := bufio.NewScanner(f) + cgroup := "" + for scanner.Scan() { + line := scanner.Text() + parts := strings.SplitN(line, ":", 3) + if len(parts) != 3 { + return "", fmt.Errorf("cannot parse cgroup line %q", line) + } + if strings.HasPrefix(line, "0::") { + cgroup = line[3:] + break + } + if len(parts[2]) > len(cgroup) { + cgroup = parts[2] + } + } + if len(cgroup) == 0 || (!allowRoot && cgroup == "/") { + return "", fmt.Errorf("could not find cgroup mount in %q", procFile) + } + return cgroup, nil +} + +// GetOwnCgroup returns the cgroup for the current process. +func GetOwnCgroup() (string, error) { + return getCgroupProcess("/proc/self/cgroup", true) +} + +func GetOwnCgroupDisallowRoot() (string, error) { + return getCgroupProcess("/proc/self/cgroup", false) +} + +// GetCgroupProcess returns the cgroup for the specified process process. +func GetCgroupProcess(pid int) (string, error) { + return getCgroupProcess(fmt.Sprintf("/proc/%d/cgroup", pid), true) +} + +// MoveUnderCgroupSubtree moves the PID under a cgroup subtree. +func MoveUnderCgroupSubtree(subtree string) error { + return MoveUnderCgroup("", subtree, nil) +} + +// MoveUnderCgroup moves a group of processes to a new cgroup. +// If cgroup is the empty string, then the current calling process cgroup is used. +// If processes is empty, then the processes from the current cgroup are moved. +func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error { + procFile := "/proc/self/cgroup" + f, err := os.Open(procFile) + if err != nil { + return err + } + defer f.Close() + + unifiedMode, err := IsCgroup2UnifiedMode() + if err != nil { + return err + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.SplitN(line, ":", 3) + if len(parts) != 3 { + return fmt.Errorf("cannot parse cgroup line %q", line) + } + + // root cgroup, skip it + if parts[2] == "/" && !(unifiedMode && parts[1] == "") { + continue + } + + cgroupRoot := "/sys/fs/cgroup" + // Special case the unified mount on hybrid cgroup and named hierarchies. + // This works on Fedora 31, but we should really parse the mounts to see + // where the cgroup hierarchy is mounted. + if parts[1] == "" && !unifiedMode { + // If it is not using unified mode, the cgroup v2 hierarchy is + // usually mounted under /sys/fs/cgroup/unified + cgroupRoot = filepath.Join(cgroupRoot, "unified") + + // Ignore the unified mount if it doesn't exist + if _, err := os.Stat(cgroupRoot); err != nil && os.IsNotExist(err) { + continue + } + } else if parts[1] != "" { + // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER. + controller := strings.TrimPrefix(parts[1], "name=") + cgroupRoot = filepath.Join(cgroupRoot, controller) + } + + parentCgroup := cgroup + if parentCgroup == "" { + parentCgroup = parts[2] + } + newCgroup := filepath.Join(cgroupRoot, parentCgroup, subtree) + if err := os.MkdirAll(newCgroup, 0o755); err != nil && !os.IsExist(err) { + return err + } + + f, err := os.OpenFile(filepath.Join(newCgroup, "cgroup.procs"), os.O_RDWR, 0o755) + if err != nil { + return err + } + defer f.Close() + + if len(processes) > 0 { + for _, pid := range processes { + if _, err := f.WriteString(fmt.Sprintf("%d\n", pid)); err != nil { + logrus.Debugf("Cannot move process %d to cgroup %q: %v", pid, newCgroup, err) + } + } + } else { + processesData, err := os.ReadFile(filepath.Join(cgroupRoot, parts[2], "cgroup.procs")) + if err != nil { + return err + } + for _, pid := range bytes.Split(processesData, []byte("\n")) { + if len(pid) == 0 { + continue + } + if _, err := f.Write(pid); err != nil { + logrus.Debugf("Cannot move process %s to cgroup %q: %v", string(pid), newCgroup, err) + } + } + } + } + return nil +} + +var ( + maybeMoveToSubCgroupSync sync.Once + maybeMoveToSubCgroupSyncErr error +) + +// MaybeMoveToSubCgroup moves the current process in a sub cgroup when +// it is running in the root cgroup on a system that uses cgroupv2. +func MaybeMoveToSubCgroup() error { + maybeMoveToSubCgroupSync.Do(func() { + unifiedMode, err := IsCgroup2UnifiedMode() + if err != nil { + maybeMoveToSubCgroupSyncErr = err + return + } + if !unifiedMode { + maybeMoveToSubCgroupSyncErr = nil + return + } + cgroup, err := GetOwnCgroup() + if err != nil { + maybeMoveToSubCgroupSyncErr = err + return + } + if cgroup == "/" { + maybeMoveToSubCgroupSyncErr = MoveUnderCgroupSubtree("init") + } + }) + return maybeMoveToSubCgroupSyncErr +} diff --git a/pkg/cgroups/utils_linux_test.go b/pkg/cgroups/utils_linux_test.go new file mode 100644 index 000000000..9926aa5c7 --- /dev/null +++ b/pkg/cgroups/utils_linux_test.go @@ -0,0 +1,23 @@ +package cgroups + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestCgroupProcess(t *testing.T) { + val, err := getCgroupProcess("testdata/cgroup.root", true) + assert.Nil(t, err) + assert.Equal(t, "/", val) + + _, err = getCgroupProcess("testdata/cgroup.root", false) + assert.NotNil(t, err) + + val, err = getCgroupProcess("testdata/cgroup.other", true) + assert.Nil(t, err) + assert.Equal(t, "/other", val) + + _, err = getCgroupProcess("testdata/cgroup.empty", true) + assert.NotNil(t, err) +} diff --git a/pkg/netns/netns_linux.go b/pkg/netns/netns_linux.go index 9f0336bc0..5b5e0daeb 100644 --- a/pkg/netns/netns_linux.go +++ b/pkg/netns/netns_linux.go @@ -32,10 +32,12 @@ import ( "github.com/containernetworking/plugins/pkg/ns" "github.com/containers/storage/pkg/homedir" "github.com/containers/storage/pkg/unshare" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) +// threadNsPath is the /proc path to the current netns handle for the current thread +const threadNsPath = "/proc/thread-self/ns/net" + // GetNSRunDir returns the dir of where to create the netNS. When running // rootless, it needs to be at a location writable by user. func GetNSRunDir() (string, error) { @@ -49,6 +51,10 @@ func GetNSRunDir() (string, error) { return "/run/netns", nil } +func NewNSAtPath(nsPath string) (ns.NetNS, error) { + return newNSPath(nsPath) +} + // NewNS creates a new persistent (bind-mounted) network namespace and returns // an object representing that namespace, without switching to it. func NewNS() (ns.NetNS, error) { @@ -111,8 +117,12 @@ func NewNSWithName(name string) (ns.NetNS, error) { } } - // create an empty file at the mount point nsPath := path.Join(nsRunDir, name) + return newNSPath(nsPath) +} + +func newNSPath(nsPath string) (ns.NetNS, error) { + // create an empty file at the mount point mountPointFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0o600) if err != nil { return nil, err @@ -140,24 +150,10 @@ func NewNSWithName(name string) (ns.NetNS, error) { // Don't unlock. By not unlocking, golang will kill the OS thread when the // goroutine is done (for go1.10+) - threadNsPath := getCurrentThreadNetNSPath() - - var origNS ns.NetNS - origNS, err = ns.GetNS(threadNsPath) - if err != nil { - logrus.Warnf("Cannot open current network namespace %s: %q", threadNsPath, err) - return - } - defer func() { - if err := origNS.Close(); err != nil { - logrus.Errorf("Unable to close namespace: %q", err) - } - }() - // create a new netns on the current thread err = unix.Unshare(unix.CLONE_NEWNET) if err != nil { - logrus.Warnf("Cannot create a new network namespace: %q", err) + err = fmt.Errorf("unshare network namespace: %w", err) return } @@ -181,13 +177,8 @@ func NewNSWithName(name string) (ns.NetNS, error) { // UnmountNS unmounts the given netns path func UnmountNS(nsPath string) error { - nsRunDir, err := GetNSRunDir() - if err != nil { - return err - } - // Only unmount if it's been bind-mounted (don't touch namespaces in /proc...) - if strings.HasPrefix(nsPath, nsRunDir) { + if !strings.HasPrefix(nsPath, "/proc/") { if err := unix.Unmount(nsPath, unix.MNT_DETACH); err != nil { return fmt.Errorf("failed to unmount NS: at %s: %v", nsPath, err) } @@ -199,11 +190,3 @@ func UnmountNS(nsPath string) error { return nil } - -// getCurrentThreadNetNSPath copied from pkg/ns -func getCurrentThreadNetNSPath() string { - // /proc/self/ns/net returns the namespace of the main thread, not - // of whatever thread this goroutine is running on. Make sure we - // use the thread's net namespace since the thread is switching around - return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) -} diff --git a/pkg/systemd/systemd_linux.go b/pkg/systemd/systemd_linux.go new file mode 100644 index 000000000..02503618f --- /dev/null +++ b/pkg/systemd/systemd_linux.go @@ -0,0 +1,151 @@ +package systemd + +import ( + "context" + "crypto/rand" + "fmt" + "os" + "strconv" + "sync" + + "github.com/containers/common/pkg/cgroups" + "github.com/containers/storage/pkg/unshare" + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" +) + +var ( + runsOnSystemdOnce sync.Once + runsOnSystemd bool +) + +// RunsOnSystemd returns whether the system is using systemd +func RunsOnSystemd() bool { + runsOnSystemdOnce.Do(func() { + // per sd_booted(3), check for this dir + fd, err := os.Stat("/run/systemd/system") + runsOnSystemd = err == nil && fd.IsDir() + }) + return runsOnSystemd +} + +func moveProcessPIDFileToScope(pidPath, slice, scope string) error { + data, err := os.ReadFile(pidPath) + if err != nil { + // do not raise an error if the file doesn't exist + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("cannot read pid file: %w", err) + } + pid, err := strconv.ParseUint(string(data), 10, 0) + if err != nil { + return fmt.Errorf("cannot parse pid file %s: %w", pidPath, err) + } + + return moveProcessToScope(int(pid), slice, scope) +} + +func moveProcessToScope(pid int, slice, scope string) error { + err := RunUnderSystemdScope(pid, slice, scope) + // If the PID is not valid anymore, do not return an error. + if dbusErr, ok := err.(dbus.Error); ok { + if dbusErr.Name == "org.freedesktop.DBus.Error.UnixProcessIdUnknown" { + return nil + } + } + return err +} + +// MoveRootlessNetnsSlirpProcessToUserSlice moves the slirp4netns process for the rootless netns +// into a different scope so that systemd does not kill it with a container. +func MoveRootlessNetnsSlirpProcessToUserSlice(pid int) error { + randBytes := make([]byte, 4) + _, err := rand.Read(randBytes) + if err != nil { + return err + } + return moveProcessToScope(pid, "user.slice", fmt.Sprintf("rootless-netns-%x.scope", randBytes)) +} + +// MovePauseProcessToScope moves the pause process used for rootless mode to keep the namespaces alive to +// a separate scope. +func MovePauseProcessToScope(pausePidPath string) { + var err error + + for i := 0; i < 10; i++ { + randBytes := make([]byte, 4) + _, err = rand.Read(randBytes) + if err != nil { + logrus.Errorf("failed to read random bytes: %v", err) + continue + } + err = moveProcessPIDFileToScope(pausePidPath, "user.slice", fmt.Sprintf("podman-pause-%x.scope", randBytes)) + if err == nil { + return + } + } + + if err != nil { + unified, err2 := cgroups.IsCgroup2UnifiedMode() + if err2 != nil { + logrus.Warnf("Failed to detect if running with cgroup unified: %v", err) + } + if RunsOnSystemd() && unified { + logrus.Warnf("Failed to add pause process to systemd sandbox cgroup: %v", err) + } else { + logrus.Debugf("Failed to add pause process to systemd sandbox cgroup: %v", err) + } + } +} + +// RunUnderSystemdScope adds the specified pid to a systemd scope +func RunUnderSystemdScope(pid int, slice string, unitName string) error { + var properties []systemdDbus.Property + var conn *systemdDbus.Conn + var err error + + if unshare.GetRootlessUID() != 0 { + conn, err = cgroups.UserConnection(unshare.GetRootlessUID()) + if err != nil { + return err + } + } else { + conn, err = systemdDbus.NewWithContext(context.Background()) + if err != nil { + return err + } + } + defer conn.Close() + properties = append(properties, systemdDbus.PropSlice(slice)) + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + properties = append(properties, newProp("Delegate", true)) + properties = append(properties, newProp("DefaultDependencies", false)) + ch := make(chan string) + _, err = conn.StartTransientUnitContext(context.Background(), unitName, "replace", properties, ch) + if err != nil { + // On errors check if the cgroup already exists, if it does move the process there + if props, err := conn.GetUnitTypePropertiesContext(context.Background(), unitName, "Scope"); err == nil { + if cgroup, ok := props["ControlGroup"].(string); ok && cgroup != "" { + if err := cgroups.MoveUnderCgroup(cgroup, "", []uint32{uint32(pid)}); err == nil { + return nil + } + // On errors return the original error message we got from StartTransientUnit. + } + } + return err + } + + // Block until job is started + <-ch + + return nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} diff --git a/pkg/systemd/systemd_unsupported.go b/pkg/systemd/systemd_unsupported.go new file mode 100644 index 000000000..e4a628527 --- /dev/null +++ b/pkg/systemd/systemd_unsupported.go @@ -0,0 +1,15 @@ +//go:build !linux + +package systemd + +import "errors" + +func RunsOnSystemd() bool { + return false +} + +func MovePauseProcessToScope(pausePidPath string) {} + +func RunUnderSystemdScope(pid int, slice string, unitName string) error { + return errors.New("RunUnderSystemdScope not supported on this OS") +}