Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get rid of docker pause containers with a custom runtime #20017

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/20017.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
docker: Added support for running without pause containers when configuring networking
```
6 changes: 6 additions & 0 deletions drivers/docker/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ var (
// task containers. If true, nomad doesn't start docker_logger/logmon processes
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),

"nomad_native_networking": hclspec.NewAttr("nomad_native_networking", "bool", false),

// windows_allow_insecure_container_admin indicates that on windows,
// docker checks the task.user field or, if unset, the container image
// manifest after pulling the container, to see if it's running as
Expand Down Expand Up @@ -675,6 +677,7 @@ type DriverConfig struct {
infraImagePullTimeoutDuration time.Duration `codec:"-"`
ContainerExistsAttempts uint64 `codec:"container_exists_attempts"`
DisableLogCollection bool `codec:"disable_log_collection"`
NomadNativeNetworking bool `codec:"nomad_native_networking"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
PidsLimit int64 `codec:"pids_limit"`
pullActivityTimeoutDuration time.Duration `codec:"-"`
Expand Down Expand Up @@ -828,5 +831,8 @@ func (d *Driver) TaskConfigSchema() (*hclspec.Spec, error) {
// features this driver supports.
func (d *Driver) Capabilities() (*drivers.Capabilities, error) {
	cfg := d.config

	// Without a driver config, log collection stays enabled and the
	// MustInitiateNetwork capability is left at whatever it currently is.
	if cfg == nil {
		driverCapabilities.DisableLogCollection = false
		return driverCapabilities, nil
	}

	driverCapabilities.DisableLogCollection = cfg.DisableLogCollection
	// The driver only has to initiate the network itself when Nomad native
	// networking is switched off.
	driverCapabilities.MustInitiateNetwork = !cfg.NomadNativeNetworking
	return driverCapabilities, nil
}
23 changes: 19 additions & 4 deletions drivers/docker/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,13 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
if _, ok := d.config.allowRuntimes[containerRuntime]; !ok && containerRuntime != "" {
return c, fmt.Errorf("requested runtime %q is not allowed", containerRuntime)
}
if d.config.NomadNativeNetworking {
if containerRuntime == "" {
containerRuntime = "nomad"
} else {
containerRuntime = fmt.Sprintf("nomad-%s", containerRuntime)
}
}

// Validate isolation modes on windows
if runtime.GOOS != "windows" {
Expand Down Expand Up @@ -1038,6 +1045,8 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T

Runtime: containerRuntime,
GroupAdd: driverConfig.GroupAdd,

Annotations: map[string]string{},
}

hostConfig.Resources = containerapi.Resources{
Expand Down Expand Up @@ -1285,10 +1294,16 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
// shared alloc network
if hostConfig.NetworkMode == "" {
if task.NetworkIsolation != nil && task.NetworkIsolation.Path != "" {
// find the previously created parent container to join networks with
netMode := fmt.Sprintf("container:%s", task.NetworkIsolation.Labels[dockerNetSpecLabelKey])
logger.Debug("configuring network mode for task group", "network_mode", netMode)
hostConfig.NetworkMode = containerapi.NetworkMode(netMode)
if d.config.NomadNativeNetworking {
// "host" is not actually true here, it will cause joining the existing namespace
hostConfig.NetworkMode = "host"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might still be better to use "none" here and test if it still works? But the differences are probably marginal. docker network inspect host/none will show the containers either way, even if they are not really attached.

hostConfig.Annotations["network_ns"] = task.NetworkIsolation.Path
} else {
// find the previously created parent container to join networks with
netMode := fmt.Sprintf("container:%s", task.NetworkIsolation.Labels[dockerNetSpecLabelKey])
logger.Debug("configuring network mode for task group", "network_mode", netMode)
hostConfig.NetworkMode = containerapi.NetworkMode(netMode)
}
} else {
// docker default
logger.Debug("networking mode not specified; using default")
Expand Down
112 changes: 112 additions & 0 deletions drivers/docker/runcshim/z_runcshim_cmd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package runcshim

import (
"encoding/json"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"slices"
"syscall"

"github.com/opencontainers/runtime-spec/specs-go"
)

// ExitError is the exit status the shim terminates with when it fails.
const ExitError = 1

// exitWithMsg writes msg followed by a newline to standard error and then
// terminates the process with the ExitError status. It never returns.
func exitWithMsg(msg string) {
	const code = ExitError

	fmt.Fprintf(os.Stderr, "%s\n", msg)
	os.Exit(code)
}

// This init() must run last among the init functions of the packages required
// by the child plugin process. It's recommended to avoid any other `init()` in
// this package and to inline any necessary calls here. See the eeaa95d commit
// message for more details.
//
// It only acts when the binary is invoked as
//
//	<binary> runcshim <runtime> <runtime args...>
//
// In that case it never returns: for a `create` invocation it first rewrites
// the bundle's config.json so the container joins the network namespace named
// by the `network_ns` annotation, and then it replaces the current process
// with <runtime>, forwarding the remaining arguments unchanged. Any failure is
// reported on stderr and the process exits with ExitError.
func init() {
	if len(os.Args) <= 1 || os.Args[1] != "runcshim" {
		return
	}
	if len(os.Args) <= 3 {
		exitWithMsg("expected path to runc compatible binary")
	}

	if slices.Contains(os.Args, "create") {
		patchNetworkNamespace(filepath.Join(bundleDir(os.Args), "config.json"))
	}

	// Resolve the full path via $PATH. The requested name is kept in its own
	// variable because LookPath returns an empty string on failure, which
	// would otherwise hide the name in the error message.
	runtimeName := os.Args[2]
	runtimePath, err := exec.LookPath(runtimeName)
	if err != nil {
		exitWithMsg(fmt.Sprintf("Could not resolve full path for %q: %v", runtimeName, err))
	}

	args := append([]string{filepath.Base(runtimePath)}, os.Args[3:]...)

	// syscall.Exec only returns when it failed to replace the process. Do not
	// fall through into the regular program start-up in that case.
	err = syscall.Exec(runtimePath, args, os.Environ())
	exitWithMsg(fmt.Sprintf("Could not exec %q: %v", runtimePath, err))
}

// bundleDir returns the bundle directory for the runtime invocation in args:
// the value following `--bundle` when that flag is present, otherwise the
// current working directory. It exits the process when the flag has no value
// or the working directory cannot be determined.
func bundleDir(args []string) string {
	if idx := slices.Index(args, "--bundle"); idx != -1 {
		if idx+1 >= len(args) {
			exitWithMsg("bundle directory not passed")
		}
		return args[idx+1]
	}

	// No explicit bundle given, use cwd.
	wd, err := os.Getwd()
	if err != nil {
		exitWithMsg(fmt.Sprint(err))
	}
	return wd
}

// patchNetworkNamespace rewrites the runtime spec stored at configFile so that
// its network namespace points at the path found in the `network_ns`
// annotation. An existing network namespace entry is updated in place;
// otherwise a new entry is appended. It exits the process on any failure,
// including a missing annotation or a missing `linux` section.
func patchNetworkNamespace(configFile string) {
	f, err := os.Open(configFile)
	if err != nil {
		exitWithMsg(fmt.Sprintf("Could not open %q: %v", configFile, err))
	}
	raw, err := io.ReadAll(f)
	// The file was only read from, so a Close error carries no information.
	f.Close()
	if err != nil {
		exitWithMsg(fmt.Sprintf("Could not read %q: %v", configFile, err))
	}

	var spec specs.Spec
	if err := json.Unmarshal(raw, &spec); err != nil {
		exitWithMsg(fmt.Sprintf("Could not unmarshal config: %v", err))
	}

	nsPath, ok := spec.Annotations["network_ns"]
	if !ok {
		exitWithMsg("Missing `network_ns` annotation. Are we called from Nomad?")
	}
	if spec.Linux == nil {
		exitWithMsg("Missing `linux` configuration, you are using linux are you?")
	}

	// If there is a network namespace, modify it.
	found := false
	for i := range spec.Linux.Namespaces {
		if spec.Linux.Namespaces[i].Type == "network" {
			spec.Linux.Namespaces[i].Path = nsPath
			found = true
			break
		}
	}
	// If not, add one.
	if !found {
		spec.Linux.Namespaces = append(spec.Linux.Namespaces,
			specs.LinuxNamespace{Type: "network", Path: nsPath})
	}

	out, err := json.Marshal(spec)
	if err != nil {
		exitWithMsg(fmt.Sprintf("Could not marshal config: %v", err))
	}
	if err := os.WriteFile(configFile, out, 0600); err != nil {
		exitWithMsg(fmt.Sprintf("Failed writing config.json: %v", err))
	}
}
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
_ "github.com/hashicorp/nomad/client/allocrunner/taskrunner/template/renderer"
_ "github.com/hashicorp/nomad/client/logmon"
_ "github.com/hashicorp/nomad/drivers/docker/docklog"
_ "github.com/hashicorp/nomad/drivers/docker/runcshim"
_ "github.com/hashicorp/nomad/drivers/shared/executor"

// Don't move any other code imports above the import block above!
Expand All @@ -43,6 +44,7 @@ var (
"logmon",
"node-drain",
"node-status",
"runcshim",
"server-force-leave",
"server-join",
"server-members",
Expand Down
33 changes: 32 additions & 1 deletion website/content/docs/drivers/docker.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,7 @@ host system.
container necessary when sharing network namespaces between tasks. Defaults to
`registry.k8s.io/pause-<goarch>:3.3`. The image will only be pulled from the
container registry if its tag is `latest` or the image doesn't yet exist
locally.
locally. Not used when `nomad_native_networking` is set.

- `infra_image_pull_timeout` - A time duration that controls how long Nomad will
wait before cancelling an in-progress pull of the Docker image as specified in
Expand All @@ -1042,6 +1042,37 @@ host system.
pulling the container, to see if it's running as `ContainerAdmin`. If so, exits
with an error unless the task config has `privileged=true`. Defaults to `false`.

- `nomad_native_networking` - Indicates that bridge networking should not use a
pause container but should instead let Nomad set up the network namespace, as it
does for other drivers. This requires configuring a `nomad` runtime in the
Docker daemon configuration file (usually `/etc/docker/daemon.json`) like this:

```json
{
"runtimes": {
"nomad": {
"path": "/usr/bin/nomad",
"runtimeArgs": ["runcshim", "runc"]
}
}
}
```

If you want other runtimes (for instance `nvidia`) to also support native
networking, add further runtimes prefixed with `nomad-` that relay to the
original one:

```json
{
"runtimes": {
"nomad-nvidia": {
"path": "/usr/bin/nomad",
"runtimeArgs": ["runcshim", "/usr/bin/nvidia-container-runtime"]
}
}
}
```

## Client Configuration

~> Note: client configuration options will soon be deprecated. Please use
Expand Down