From c1312757c90a65974e65cfcdecf5f5f067540d47 Mon Sep 17 00:00:00 2001
From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com>
Date: Wed, 3 Jul 2024 16:21:27 +0200
Subject: [PATCH] fix indefinite memory and CPU consumption when waiting fleet
 to be ready (#5034) (#5040)

* exit if timeout is reached while waiting for fleet server to start

* clarify exponential backoff behaviour

(cherry picked from commit 8aa347702a0af550fefc1259903fc0b68fc60b6a)

Co-authored-by: Anderson Queiroz
---
 ...mption-when-waiting-fleet-to-be-ready.yaml | 32 +++++++++++++++++++
 internal/pkg/agent/cmd/enroll_cmd.go          | 10 +++++-
 internal/pkg/agent/cmd/enroll_cmd_test.go     | 22 +++++++++++++
 internal/pkg/core/backoff/exponential.go      |  5 ++-
 4 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml

diff --git a/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml b/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml
new file mode 100644
index 00000000000..c97a91daaa5
--- /dev/null
+++ b/changelog/fragments/1719934992-Fix-indefinite-memory-and-CPU-consumption-when-waiting-fleet-to-be-ready.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: bug-fix
+
+# Change summary; a 80ish characters long description of the change.
+summary: Fix indefinite memory and CPU consumption when waiting fleet to be ready
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; a word indicating the component this changeset affects.
+component:
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/5034
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+issue: https://github.com/elastic/elastic-agent/issues/5033
diff --git a/internal/pkg/agent/cmd/enroll_cmd.go b/internal/pkg/agent/cmd/enroll_cmd.go
index 863727d6879..f7484fcbbc3 100644
--- a/internal/pkg/agent/cmd/enroll_cmd.go
+++ b/internal/pkg/agent/cmd/enroll_cmd.go
@@ -792,8 +792,16 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat
 	msg := ""
 	msgCount := 0
 	backExp := expBackoffWithContext(innerCtx, 1*time.Second, maxBackoff)
+
 	for {
-		backExp.Wait()
+		// if the timeout is reached, no response was sent on `res`, therefore
+		// send an error
+		if !backExp.Wait() {
+			resChan <- waitResult{err: fmt.Errorf(
+				"timed out waiting for Fleet Server to start after %s",
+				timeout)}
+		}
+
 		state, err := getDaemonState(innerCtx)
 		if errors.Is(err, context.Canceled) {
 			resChan <- waitResult{err: err}
diff --git a/internal/pkg/agent/cmd/enroll_cmd_test.go b/internal/pkg/agent/cmd/enroll_cmd_test.go
index 8b7a88232f8..a9597336760 100644
--- a/internal/pkg/agent/cmd/enroll_cmd_test.go
+++ b/internal/pkg/agent/cmd/enroll_cmd_test.go
@@ -577,6 +577,28 @@ func TestDaemonReloadWithBackoff(t *testing.T) {
 	}
 }
 
+func TestWaitForFleetServer_timeout(t *testing.T) {
+	log, _ := logger.NewTesting("TestWaitForFleetServer_timeout")
+	timeout := 5 * time.Second
+	testTimeout := 2 * timeout
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	var got string
+	var err error
+	require.Eventuallyf(t,
+		func() bool {
+			got, err = waitForFleetServer(ctx, make(chan *os.ProcessState, 1), log, timeout)
+			return true
+		},
+		testTimeout,
+		500*time.Millisecond,
+		"waitForFleetServer never returned")
+
+	assert.Empty(t, got, "waitForFleetServer should have returned and empty enrollmentToken")
+	assert.Error(t, err, "waitForFleetServer should have returned an error")
+}
+
 func withServer(
 	m func(t *testing.T) *http.ServeMux,
 	test func(t *testing.T, host string),
diff --git a/internal/pkg/core/backoff/exponential.go b/internal/pkg/core/backoff/exponential.go
index 51b5b4e0cb5..a959521c457 100644
--- a/internal/pkg/core/backoff/exponential.go
+++ b/internal/pkg/core/backoff/exponential.go
@@ -45,7 +45,10 @@ func (b *ExpBackoff) NextWait() time.Duration {
 	return nextWait
 }
 
-// Wait block until either the timer is completed or channel is done.
+// Wait blocks until either the exponential backoff timer is completed or the
+// done channel is closed.
+// Wait returns true until done is closed. When done is closed, wait returns
+// immediately, therefore callers should always check the return value.
 func (b *ExpBackoff) Wait() bool {
 	b.duration = b.NextWait()