From 99b14c8acc63e65c710dca19e2115200209ff314 Mon Sep 17 00:00:00 2001 From: Anderson Queiroz Date: Fri, 6 Oct 2023 14:16:52 +0200 Subject: [PATCH] Revert "fix install/enroll cmd to fail when agent restart fails (#3207)" (#3553) --- ...-Surface-errors-during-Agent's-enroll.yaml | 32 --------- internal/pkg/agent/cmd/enroll.go | 2 +- internal/pkg/agent/cmd/enroll_cmd.go | 62 +++++----------- internal/pkg/agent/cmd/enroll_cmd_test.go | 71 ++++++------------- internal/pkg/agent/cmd/install.go | 4 +- internal/pkg/agent/install/perms_unix.go | 32 ++++----- 6 files changed, 52 insertions(+), 151 deletions(-) delete mode 100644 changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml diff --git a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml b/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml deleted file mode 100644 index f8361f99433..00000000000 --- a/changelog/fragments/1693403216-Surface-errors-during-Agent's-enroll.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Kind can be one of: -# - breaking-change: a change to previously-documented behavior -# - deprecation: functionality that is being removed in a later release -# - bug-fix: fixes a problem in a previous version -# - enhancement: extends functionality but does not break or fix existing behavior -# - feature: new functionality -# - known-issue: problems that we are aware of in a given version -# - security: impacts on the security of a product or a user’s deployment. -# - upgrade: important information for someone upgrading from a prior version -# - other: does not fit into any of the other categories -kind: bug-fix - -# Change summary; a 80ish characters long description of the change. -summary: Surface errors during Agent's enroll process, failing if any happens. - -# Long description; in case the summary is not enough to describe the change -# this field accommodate a description without length limits. -# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. -#description: - -# Affected component; a word indicating the component this changeset affects. -component: install/enroll - -# PR URL; optional; the PR number that added the changeset. -# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. -# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. -# Please provide it if you are adding a fragment for a different PR. -pr: https://github.com/elastic/elastic-agent/pull/3207 - -# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). -# If not present is automatically filled by the tooling with the issue linked to the PR number. -#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/cmd/enroll.go b/internal/pkg/agent/cmd/enroll.go index adaa278f32f..1bce5f7e547 100644 --- a/internal/pkg/agent/cmd/enroll.go +++ b/internal/pkg/agent/cmd/enroll.go @@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error { // Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted // This is because we are fixing permissions twice, once during installation and again during the enrollment step. // When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions. - fixPermissions := fromInstall + var fixPermissions bool = fromInstall if runtime.GOOS == "darwin" { fixPermissions = false } diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go index f43e1483a8a..b5992f10188 100644 --- a/internal/pkg/agent/cmd/enroll_cmd.go +++ b/internal/pkg/agent/cmd/enroll_cmd.go @@ -172,7 +172,7 @@ func newEnrollCmd( ) } -// newEnrollCmdWithStore creates a new enrollment and accept a custom store. +// newEnrollCmdWithStore creates an new enrollment and accept a custom store. func newEnrollCmdWithStore( log *logger.Logger, options *enrollCmdOption, @@ -187,11 +187,10 @@ func newEnrollCmdWithStore( }, nil } -// Execute enrolls the agent into Fleet. +// Execute tries to enroll the agent into Fleet. func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { var err error defer c.stopAgent() // ensure its stopped no matter what - span, ctx := apm.StartSpan(ctx, "enroll", "app.internal") defer func() { apm.CaptureError(ctx, err).Send() @@ -236,7 +235,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { // Ensure that the agent does not use a proxy configuration // when connecting to the local fleet server. // Note that when running fleet-server the enroll request will be sent to :8220, - // however when the agent is running afterward requests will be sent to :8221 + // however when the agent is running afterwards requests will be sent to :8221 c.remoteConfig.Transport.Proxy.Disable = true } @@ -257,7 +256,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { err = c.enrollWithBackoff(ctx, persistentConfig) if err != nil { - return fmt.Errorf("fail to enroll: %w", err) + return errors.New(err, "fail to enroll") } if c.options.FixPermissions { @@ -268,23 +267,17 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error { } defer func() { - if err != nil { - fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err) - } else { - fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") - } + fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.") }() if c.agentProc == nil { - if err = c.daemonReloadWithBackoff(ctx); err != nil { - c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err) - return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err) + if err := c.daemonReload(ctx); err != nil { + c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err) + } else { + c.log.Info("Successfully triggered restart on running Elastic Agent.") } - - c.log.Info("Successfully triggered restart on running Elastic Agent.") return nil } - c.log.Info("Elastic Agent has been enrolled; start Elastic Agent") return nil } @@ -450,11 +443,6 @@ func (c *enrollCmd) prepareFleetTLS() error { func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { err := c.daemonReload(ctx) - if err != nil && - (errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled)) { - return fmt.Errorf("could not reload deamon: %w", err) - } if err == nil { return nil } @@ -462,20 +450,17 @@ func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error { signal := make(chan struct{}) backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute) - var i int - for ; i < 5; i++ { + for i := 5; i >= 0; i-- { backExp.Wait() c.log.Info("Retrying to restart...") err = c.daemonReload(ctx) - if err == nil || - errors.Is(err, context.DeadlineExceeded) || - errors.Is(err, context.Canceled) { + if err == nil { break } } close(signal) - return fmt.Errorf("could not reload deamon after %d retries: %w", i+1, err) + return err } func (c *enrollCmd) daemonReload(ctx context.Context) error { @@ -493,20 +478,8 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ c.log.Infof("Starting enrollment to URL: %s", c.client.URI()) err := c.enroll(ctx, persistentConfig) - if err == nil { - return nil - } - - const deadline = 10 * time.Minute - const frequency = 60 * time.Second - - c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s", - deadline, - frequency, - c.client.URI()) signal := make(chan struct{}) - defer close(signal) - backExp := backoff.NewExpBackoff(signal, frequency, deadline) + backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute) for { retry := false @@ -525,6 +498,7 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[ err = c.enroll(ctx, persistentConfig) } + close(signal) return err } @@ -573,10 +547,8 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte c.options.FleetServer.ElasticsearchInsecure, ) if err != nil { - return fmt.Errorf( - "failed creating fleet-server bootstrap config: %w", err) + return err } - // no longer need bootstrap at this point serverConfig.Server.Bootstrap = false fleetConfig.Server = serverConfig.Server @@ -596,11 +568,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte reader, err := yamlToReader(configToStore) if err != nil { - return fmt.Errorf("yamlToReader failed: %w", err) + return err } if err := safelyStoreAgentInfo(c.configStore, reader); err != nil { - return fmt.Errorf("failed to store agent config: %w", err) + return err } // clear action store diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go index 426924273d1..189ad7b6563 100644 --- a/internal/pkg/agent/cmd/enroll_cmd_test.go +++ b/internal/pkg/agent/cmd/enroll_cmd_test.go @@ -16,11 +16,8 @@ import ( "os" "runtime" "strconv" - "strings" "testing" - "time" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/elastic/elastic-agent/internal/pkg/agent/configuration" @@ -162,24 +159,14 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - err = cmd.Execute(ctx, streams) - - if err != nil && - // There is no agent running, therefore nothing to be restarted. - // However, this will cause the Enroll command to return an error - // which we'll ignore here. - !strings.Contains(err.Error(), - "could not reload agent daemon, unable to trigger restart") { - t.Fatalf("enrrol coms returned and unexpected error: %v", err) - } + err = cmd.Execute(context.Background(), streams) + require.NoError(t, err) config, err := readConfig(store.Content) - require.NoError(t, err) - assert.Equal(t, "my-access-api-key", config.AccessAPIKey) - assert.Equal(t, host, config.Client.Host) + require.NoError(t, err) + require.Equal(t, "my-access-api-key", config.AccessAPIKey) + require.Equal(t, host, config.Client.Host) }, )) @@ -229,24 +216,16 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - err = cmd.Execute(ctx, streams) - if err != nil && - // There is no agent running, therefore nothing to be restarted. - // However, this will cause the Enroll command to return an error - // which we'll ignore here. - !strings.Contains(err.Error(), - "could not reload agent daemon, unable to trigger restart") { - t.Fatalf("enrrol coms returned and unexpected error: %v", err) - } - - assert.True(t, store.Called) + err = cmd.Execute(context.Background(), streams) + require.NoError(t, err) + + require.True(t, store.Called) + config, err := readConfig(store.Content) - assert.NoError(t, err) - assert.Equal(t, "my-access-api-key", config.AccessAPIKey) - assert.Equal(t, host, config.Client.Host) + require.NoError(t, err) + require.Equal(t, "my-access-api-key", config.AccessAPIKey) + require.Equal(t, host, config.Client.Host) }, )) @@ -296,24 +275,16 @@ func TestEnroll(t *testing.T) { require.NoError(t, err) streams, _, _, _ := cli.NewTestingIOStreams() - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) - defer cancel() - err = cmd.Execute(ctx, streams) - - if err != nil && - // There is no agent running, therefore nothing to be restarted. - // However, this will cause the Enroll command to return an error - // which we'll ignore here. - !strings.Contains(err.Error(), - "could not reload agent daemon, unable to trigger restart") { - t.Fatalf("enrrol coms returned and unexpected error: %v", err) - } - - assert.True(t, store.Called) + err = cmd.Execute(context.Background(), streams) + require.NoError(t, err) + + require.True(t, store.Called) + config, err := readConfig(store.Content) + require.NoError(t, err) - assert.Equal(t, "my-access-api-key", config.AccessAPIKey) - assert.Equal(t, host, config.Client.Host) + require.Equal(t, "my-access-api-key", config.AccessAPIKey) + require.Equal(t, host, config.Client.Host) }, )) diff --git a/internal/pkg/agent/cmd/install.go b/internal/pkg/agent/cmd/install.go index 2cb46bd599d..4fbd37f40da 100644 --- a/internal/pkg/agent/cmd/install.go +++ b/internal/pkg/agent/cmd/install.go @@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { return fmt.Errorf("problem reading prompt response") } if url == "" { - fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.") + fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n") return nil } } @@ -224,8 +224,6 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error { } }() } - - fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.") } if enroll { diff --git a/internal/pkg/agent/install/perms_unix.go b/internal/pkg/agent/install/perms_unix.go index fc357fd4fde..e84dcd5039c 100644 --- a/internal/pkg/agent/install/perms_unix.go +++ b/internal/pkg/agent/install/perms_unix.go @@ -8,7 +8,6 @@ package install import ( "errors" - "fmt" "io/fs" "os" "path/filepath" @@ -19,26 +18,19 @@ func fixPermissions(topPath string) error { return recursiveRootPermissions(topPath) } -func recursiveRootPermissions(root string) error { - return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error { - if errors.Is(err, fs.ErrNotExist) { +func recursiveRootPermissions(path string) error { + return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error { + if err == nil { + // all files should be owned by root:root + err = os.Chown(name, 0, 0) + if err != nil { + return err + } + // remove any world permissions from the file + err = os.Chmod(name, info.Mode().Perm()&0770) + } else if errors.Is(err, fs.ErrNotExist) { return nil } - if err != nil { - return fmt.Errorf("walk on %q failed: %w", path, err) - } - - // all files should be owned by root:root - err = os.Chown(path, 0, 0) - if err != nil { - return fmt.Errorf("could not fix ownership of %q: %w", path, err) - } - // remove any world permissions from the file - err = os.Chmod(path, info.Mode().Perm()&0770) - if err != nil { - return fmt.Errorf("could not fix permissions of %q: %w", path, err) - } - - return nil + return err }) }