Skip to content

Commit

Permalink
nodeadm: add retry until daemon is detected running
Browse files Browse the repository at this point in the history
  • Loading branch information
ndbaker1 committed Sep 16, 2024
1 parent fcab352 commit d7cdd22
Show file tree
Hide file tree
Showing 8 changed files with 122 additions and 25 deletions.
7 changes: 6 additions & 1 deletion nodeadm/cmd/nodeadm/init/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package init

import (
"context"
"os"
"os/signal"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config"
Expand Down Expand Up @@ -46,6 +48,9 @@ func (c *initCmd) Flaggy() *flaggy.Subcommand {
}

func (c *initCmd) Run(log *zap.Logger, opts *cli.GlobalOptions) error {
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()

log.Info("Checking user is root..")
root, err := cli.IsRunningAsRoot()
if err != nil {
Expand Down Expand Up @@ -126,7 +131,7 @@ func (c *initCmd) Run(log *zap.Logger, opts *cli.GlobalOptions) error {
nameField := zap.String("name", daemon.Name())

log.Info("Ensuring daemon is running..", nameField)
if err := daemon.EnsureRunning(); err != nil {
if err := daemon.EnsureRunning(ctx); err != nil {
return err
}
log.Info("Daemon is running", nameField)
Expand Down
3 changes: 1 addition & 2 deletions nodeadm/internal/aws/ecr/ecr.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"go.uber.org/zap"
"net"
"strings"
"time"

"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/ecr"
Expand All @@ -23,7 +22,7 @@ func GetAuthorizationToken(awsRegion string) (string, error) {
}
ecrClient := ecr.NewFromConfig(awsConfig)
var token *ecr.GetAuthorizationTokenOutput
err = util.RetryExponentialBackoff(3, 2*time.Second, func() error {
err = util.NewRetrier(util.WithBackoffExponential()).Retry(context.TODO(), func() error {
token, err = ecrClient.GetAuthorizationToken(context.Background(), &ecr.GetAuthorizationTokenInput{})
return err
})
Expand Down
21 changes: 19 additions & 2 deletions nodeadm/internal/containerd/daemon.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
package containerd

import (
"context"
"fmt"
"time"

"github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/daemon"
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/util"
)

const ContainerdDaemonName = "containerd"
Expand All @@ -23,8 +28,20 @@ func (cd *containerd) Configure(c *api.NodeConfig) error {
return writeContainerdConfig(c)
}

func (cd *containerd) EnsureRunning() error {
return cd.daemonManager.StartDaemon(ContainerdDaemonName)
func (cd *containerd) EnsureRunning(ctx context.Context) error {
if err := cd.daemonManager.StartDaemon(ContainerdDaemonName); err != nil {
return err
}
return util.NewRetrier(util.WithRetryAlways(), util.WithBackoffFixed(250*time.Millisecond)).Retry(ctx, func() error {
status, err := cd.daemonManager.GetDaemonStatus(ContainerdDaemonName)
if err != nil {
return err
}
if status != daemon.DaemonStatusRunning {
return fmt.Errorf("%s status is not %q", ContainerdDaemonName, daemon.DaemonStatusRunning)
}
return nil
})
}

func (cd *containerd) PostLaunch(c *api.NodeConfig) error {
Expand Down
3 changes: 2 additions & 1 deletion nodeadm/internal/containerd/sandbox.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package containerd

import (
"context"
"fmt"
"os/exec"
"regexp"
Expand Down Expand Up @@ -44,7 +45,7 @@ func cacheSandboxImage(cfg *api.NodeConfig) error {
imageSpec := &v1.ImageSpec{Image: sandboxImage}
authConfig := &v1.AuthConfig{Auth: ecrUserToken}

return util.RetryExponentialBackoff(3, 2*time.Second, func() error {
return util.NewRetrier(util.WithBackoffExponential()).Retry(context.TODO(), func() error {
zap.L().Info("Pulling sandbox image..", zap.String("image", sandboxImage))
imageRef, err := client.PullImage(imageSpec, authConfig, nil)
if err != nil {
Expand Down
11 changes: 6 additions & 5 deletions nodeadm/internal/daemon/interface.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
package daemon

import "github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
import (
"context"

"github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
)

type Daemon interface {
// Configure configures the daemon.
Configure(*api.NodeConfig) error

// EnsureRunning ensures that the daemon is running.
// If the daemon is not running, it will be started.
// If the daemon is already running, and has been re-configured, it will be restarted.
EnsureRunning() error

EnsureRunning(context.Context) error
// PostLaunch runs any additional step that needs to occur after the service
// daemon as been started
PostLaunch(*api.NodeConfig) error

// Name returns the name of the daemon.
Name() string
}
2 changes: 1 addition & 1 deletion nodeadm/internal/daemon/systemd.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ func (m *systemdDaemonManager) GetDaemonStatus(name string) (DaemonStatus, error
if err != nil {
return DaemonStatusUnknown, err
}
switch status.Value.String() {
switch status.Value.Value().(string) {
case "active":
return DaemonStatusRunning, nil
case "inactive":
Expand Down
21 changes: 19 additions & 2 deletions nodeadm/internal/kubelet/daemon.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
package kubelet

import (
"context"
"fmt"
"time"

"github.com/awslabs/amazon-eks-ami/nodeadm/internal/api"
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/daemon"
"github.com/awslabs/amazon-eks-ami/nodeadm/internal/util"
)

const KubeletDaemonName = "kubelet"
Expand Down Expand Up @@ -44,8 +49,20 @@ func (k *kubelet) Configure(cfg *api.NodeConfig) error {
return nil
}

func (k *kubelet) EnsureRunning() error {
return k.daemonManager.StartDaemon(KubeletDaemonName)
func (k *kubelet) EnsureRunning(ctx context.Context) error {
if err := k.daemonManager.StartDaemon(KubeletDaemonName); err != nil {
return err
}
return util.NewRetrier(util.WithRetryAlways(), util.WithBackoffFixed(250*time.Millisecond)).Retry(ctx, func() error {
status, err := k.daemonManager.GetDaemonStatus(KubeletDaemonName)
if err != nil {
return err
}
if status != daemon.DaemonStatusRunning {
return fmt.Errorf("%s status is not %q", KubeletDaemonName, daemon.DaemonStatusRunning)
}
return nil
})
}

func (k *kubelet) PostLaunch(_ *api.NodeConfig) error {
Expand Down
79 changes: 68 additions & 11 deletions nodeadm/internal/util/retry.go
Original file line number Diff line number Diff line change
@@ -1,16 +1,73 @@
package util

import "time"

func RetryExponentialBackoff(attempts int, initial time.Duration, f func() error) error {
var err error
wait := initial
for i := 0; i < attempts; i++ {
if err = f(); err == nil {
return nil
import (
"context"
"time"
)

type Retrier struct {
ConditionFn func(*Retrier) bool
BackoffFn func(*Retrier) time.Duration

LastErr error
LastWait time.Duration
LastIter int
}

func (r *Retrier) Retry(ctx context.Context, fn func() error) error {
for r.LastIter = 0; r.ConditionFn(r); r.LastIter++ {
if r.LastErr = fn(); r.LastErr == nil {
return r.LastErr
}
select {
case <-ctx.Done():
return ctx.Err()
default:
time.Sleep(r.LastWait)
r.LastWait = r.BackoffFn(r)
}
time.Sleep(wait)
wait *= 2
}
return err
return r.LastErr
}

type fnOpt func(*Retrier)

func NewRetrier(fnOpts ...fnOpt) *Retrier {
retrier := Retrier{
LastErr: nil,
LastIter: 0,
LastWait: time.Second,
}
for _, fn := range append([]fnOpt{
WithRetryCount(5),
WithBackoffExponential(),
}, fnOpts...) {
fn(&retrier)
}
return &retrier
}

func WithRetryCount(maxAttempts int) fnOpt {
return func(r *Retrier) {
r.ConditionFn = func(r *Retrier) bool { return r.LastIter < maxAttempts }
}
}

func WithRetryAlways() fnOpt {
return func(r *Retrier) {
r.ConditionFn = func(r *Retrier) bool { return true }
}
}

func WithBackoffFixed(interval time.Duration) fnOpt {
return func(r *Retrier) {
r.LastWait = interval
r.BackoffFn = func(r *Retrier) time.Duration { return r.LastWait }
}
}

func WithBackoffExponential() fnOpt {
return func(r *Retrier) {
r.BackoffFn = func(r *Retrier) time.Duration { return r.LastWait * 2 }
}
}

0 comments on commit d7cdd22

Please sign in to comment.