From c1312757c90a65974e65cfcdecf5f5f067540d47 Mon Sep 17 00:00:00 2001
From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com>
Date: Wed, 3 Jul 2024 16:21:27 +0200
Subject: [PATCH] fix indefinite memory and CPU consumption when waiting fleet
 to be ready (#5034) (#5040)

* exit if timeout is reached while waiting for fleet server to start

* clarify exponential backoff behaviour

(cherry picked from commit 8aa347702a0af550fefc1259903fc0b68fc60b6a)

Co-authored-by: Anderson Queiroz
---
 ...mption-when-waiting-fleet-to-be-ready.yaml | 32 +++++++++++++++++++
 internal/pkg/agent/cmd/enroll_cmd.go          | 10 +++++-
 internal/pkg/agent/cmd/enroll_cmd_test.go     | 22 +++++++++++++
 internal/pkg/core/backoff/exponential.go      |  5 ++-
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml

diff --git a/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml b/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml
new file mode 100644
index 00000000000..c97a91daaa5
--- /dev/null
+++ b/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: bug-fix
+
+# Change summary; a 80ish characters long description of the change.
+summary: Fix indefinite memory and CPU consumption when waiting fleet to be ready
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; a word indicating the component this changeset affects.
+component:
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/5034
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+issue: https://github.com/elastic/elastic-agent/issues/5033
diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go
index 863727d6879..f7484fcbbc3 100644
--- a/internal/pkg/agent/cmd/enroll_cmd.go
+++ b/internal/pkg/agent/cmd/enroll_cmd.go
@@ -792,8 +792,16 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat
 	msg := ""
 	msgCount := 0
 	backExp := expBackoffWithContext(innerCtx, 1*time.Second, maxBackoff)
+
 	for {
-		backExp.Wait()
+		// if the timeout is reached, no response was sent on `res`, therefore
+		// send an error
+		if !backExp.Wait() {
+			resChan <- waitResult{err: fmt.Errorf(
+				"timed out waiting for Fleet Server to start after %s",
+				timeout)}
+		}
+
 		state, err := getDaemonState(innerCtx)
 		if errors.Is(err, context.Canceled) {
 			resChan <- waitResult{err: err}
diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go
index 8b7a88232f8..a9597336760 100644
--- a/internal/pkg/agent/cmd/enroll_cmd_test.go
+++ b/internal/pkg/agent/cmd/enroll_cmd_test.go
@@ -577,6 +577,28 @@ func TestDaemonReloadWithBackoff(t *testing.T) {
 	}
 }
 
+func TestWaitForFleetServer_timeout(t *testing.T) {
+	log, _ := logger.NewTesting("TestWaitForFleetServer_timeout")
+	timeout := 5 * time.Second
+	testTimeout := 2 * timeout
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	var got string
+	var err error
+	require.Eventuallyf(t,
+		func() bool {
+			got, err = waitForFleetServer(ctx, make(chan *os.ProcessState, 1), log, timeout)
+			return true
+		},
+		testTimeout,
+		500*time.Millisecond,
+		"waitForFleetServer never returned")
+
+	assert.Empty(t, got, "waitForFleetServer should have returned and empty enrollmentToken")
+	assert.Error(t, err, "waitForFleetServer should have returned an error")
+}
+
 func withServer(
 	m func(t *testing.T) *http.ServeMux,
 	test func(t *testing.T, host string),
diff --git a/internal/pkg/core/backoff/exponential.go b/internal/pkg/core/backoff/exponential.go
index 51b5b4e0cb5..a959521c457 100644
--- a/internal/pkg/core/backoff/exponential.go
+++ b/internal/pkg/core/backoff/exponential.go
@@ -45,7 +45,10 @@ func (b *ExpBackoff) NextWait() time.Duration {
 	return nextWait
 }
 
-// Wait block until either the timer is completed or channel is done.
+// Wait blocks until either the exponential backoff timer is completed or the
+// done channel is closed.
+// Wait returns true until done is closed. When done is closed, wait returns
+// immediately, therefore callers should always check the return value.
 func (b *ExpBackoff) Wait() bool {
 	b.duration = b.NextWait()