diff --git a/changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml b/changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml new file mode 100644 index 00000000000..d39f4fa2f41 --- /dev/null +++ b/changelog/fragments/1696530758-bugfix-upgrade-progress-reporter.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; a 80ish characters long description of the change. +summary: Periodically report progress of Elastic Agent artifact download during upgrade + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3548 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/upgrade/artifact/download/http/downloader.go b/internal/pkg/agent/application/upgrade/artifact/download/http/downloader.go index 4336c4153d8..17945ddccb9 100644 --- a/internal/pkg/agent/application/upgrade/artifact/download/http/downloader.go +++ b/internal/pkg/agent/application/upgrade/artifact/download/http/downloader.go @@ -35,6 +35,10 @@ const ( // the default timeout is 10 minutes and this will have it log every 30 seconds. downloadProgressIntervalPercentage = 0.05 + // downloadProgressDefaultInterval defines the default interval at which the current download progress will be reported. + // This value is used if the timeout is not specified (and therefore equal to 0). + downloadProgressMinInterval = 10 * time.Second + // warningProgressIntervalPercentage defines how often to log messages as a warning once the amount of time // passed is this percentage or more of the total allotted time to download. warningProgressIntervalPercentage = 0.75 @@ -223,7 +227,6 @@ func (e *Downloader) downloadFile(ctx context.Context, artifactName, filename, f type downloadProgressReporter struct { log progressLogger sourceURI string - timeout time.Duration interval time.Duration warnTimeout time.Duration length float64 @@ -233,11 +236,15 @@ type downloadProgressReporter struct { } func newDownloadProgressReporter(log progressLogger, sourceURI string, timeout time.Duration, length int) *downloadProgressReporter { + interval := time.Duration(float64(timeout) * downloadProgressIntervalPercentage) + if interval == 0 { + interval = downloadProgressMinInterval + } + return &downloadProgressReporter{ log: log, sourceURI: sourceURI, - timeout: timeout, - interval: time.Duration(float64(timeout) * downloadProgressIntervalPercentage), + interval: interval, warnTimeout: time.Duration(float64(timeout) * warningProgressIntervalPercentage), length: float64(length), } @@ -259,7 +266,7 @@ func (dp *downloadProgressReporter) Report(ctx context.Context) { interval := dp.interval go func() { - t := time.NewTimer(interval) + t := time.NewTicker(interval) defer t.Stop() for { select { diff --git a/internal/pkg/agent/application/upgrade/artifact/download/http/downloader_test.go b/internal/pkg/agent/application/upgrade/artifact/download/http/downloader_test.go index a49c9b6d154..4d6087f479b 100644 --- a/internal/pkg/agent/application/upgrade/artifact/download/http/downloader_test.go +++ b/internal/pkg/agent/application/upgrade/artifact/download/http/downloader_test.go @@ -75,11 +75,11 @@ func TestDownloadBodyError(t *testing.T) { } func TestDownloadLogProgressWithLength(t *testing.T) { - fileSize := 100 * units.MiB + fileSize := 100 * units.MB chunks := 100 chunk := make([]byte, fileSize/chunks) delayBetweenChunks := 10 * time.Millisecond - totalTime := time.Duration(chunks) * (delayBetweenChunks + 1*time.Millisecond) + totalTime := time.Duration(chunks) * delayBetweenChunks srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Length", strconv.Itoa(fileSize)) @@ -120,16 +120,24 @@ func TestDownloadLogProgressWithLength(t *testing.T) { log.lock.RLock() defer log.lock.RUnlock() - // 2 files are downloaded so 4 log messages are expected in the info level and only the complete is over the warn - // window as 2 log messages for warn. - require.Len(t, log.info, 4) - assert.Equal(t, log.info[0].record, "download progress from %s is %s/%s (%.2f%% complete) @ %sps") - assert.Equal(t, log.info[1].record, "download from %s completed in %s @ %sps") - assert.Equal(t, log.info[2].record, "download progress from %s is %s/%s (%.2f%% complete) @ %sps") - assert.Equal(t, log.info[3].record, "download from %s completed in %s @ %sps") - require.Len(t, log.warn, 2) - assert.Equal(t, log.warn[0].record, "download from %s completed in %s @ %sps") - assert.Equal(t, log.warn[1].record, "download from %s completed in %s @ %sps") + expectedProgressMsg := "download progress from %s is %s/%s (%.2f%% complete) @ %sps" + + // Two files are downloaded. Each file is being downloaded in 100 chunks with a delay of 10ms between chunks. The + // expected time to download is, therefore, 100 * 10ms = 1000ms. In reality, the actual download time will be a bit + // more than 1000ms because some time is spent downloading the chunk, in between inter-chunk delays. + // Reporting happens every 0.05 * 1000ms = 50ms. We expect there to be as many log messages at that INFO level as + // the actual total download time / 50ms, for each file. That works out to at least 1000ms / 50ms = 20 INFO log + // messages, for each file, about its download progress. Additionally, we should expect 1 INFO log message, for + // each file, about the download completing. + assertLogs(t, log.info, 20, expectedProgressMsg) + + // By similar math as above, since the download of each file is expected to take 1000ms, and the progress logger + // starts issuing WARN messages once the download has taken more than 75% of the expected time, + // we should see warning messages for at least the last 250 seconds of the download. Given that + // reporting happens every 50 seconds, we should see at least 250s / 50s = 5 WARN log messages, for each file, + // about its download progress. Additionally, we should expect 1 WARN message, for each file, about the download + // completing. + assertLogs(t, log.warn, 5, expectedProgressMsg) } func TestDownloadLogProgressWithoutLength(t *testing.T) { @@ -137,7 +145,7 @@ func TestDownloadLogProgressWithoutLength(t *testing.T) { chunks := 100 chunk := make([]byte, fileSize/chunks) delayBetweenChunks := 10 * time.Millisecond - totalTime := time.Duration(chunks) * (delayBetweenChunks + 1*time.Millisecond) + totalTime := time.Duration(chunks) * delayBetweenChunks srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusOK) @@ -177,16 +185,24 @@ func TestDownloadLogProgressWithoutLength(t *testing.T) { log.lock.RLock() defer log.lock.RUnlock() - // 2 files are downloaded so 4 log messages are expected in the info level and only the complete is over the warn - // window as 2 log messages for warn. - require.Len(t, log.info, 4) - assert.Equal(t, log.info[0].record, "download progress from %s has fetched %s @ %sps") - assert.Equal(t, log.info[1].record, "download from %s completed in %s @ %sps") - assert.Equal(t, log.info[2].record, "download progress from %s has fetched %s @ %sps") - assert.Equal(t, log.info[3].record, "download from %s completed in %s @ %sps") - require.Len(t, log.warn, 2) - assert.Equal(t, log.warn[0].record, "download from %s completed in %s @ %sps") - assert.Equal(t, log.warn[1].record, "download from %s completed in %s @ %sps") + expectedProgressMsg := "download progress from %s has fetched %s @ %sps" + + // Two files are downloaded. Each file is being downloaded in 100 chunks with a delay of 10ms between chunks. The + // expected time to download is, therefore, 100 * 10ms = 1000ms. In reality, the actual download time will be a bit + // more than 1000ms because some time is spent downloading the chunk, in between inter-chunk delays. + // Reporting happens every 0.05 * 1000ms = 50ms. We expect there to be as many log messages at that INFO level as + // the actual total download time / 50ms, for each file. That works out to at least 1000ms / 50ms = 20 INFO log + // messages, for each file, about its download progress. Additionally, we should expect 1 INFO log message, for + // each file, about the download completing. + assertLogs(t, log.info, 20, expectedProgressMsg) + + // By similar math as above, since the download of each file is expected to take 1000ms, and the progress logger + // starts issuing WARN messages once the download has taken more than 75% of the expected time, + // we should see warning messages for at least the last 250 seconds of the download. Given that + // reporting happens every 50 seconds, we should see at least 250s / 50s = 5 WARN log messages, for each file, + // about its download progress. Additionally, we should expect 1 WARN message, for each file, about the download + // completing. + assertLogs(t, log.warn, 5, expectedProgressMsg) } type logMessage struct { @@ -227,3 +243,31 @@ func containsMessage(logs []logMessage, msg string) bool { } return false } +func assertLogs(t *testing.T, logs []logMessage, minExpectedProgressLogs int, expectedProgressMsg string) { + t.Helper() + + // Verify that we've logged at least minExpectedProgressLogs (about download progress) + 1 log + // message (about download completion), for each of the two files being downloaded. + require.GreaterOrEqual(t, len(logs), (minExpectedProgressLogs+1)*2) + + // Verify that the first minExpectedProgressLogs messages are about the download progress (for the first file). + i := 0 + for ; i < minExpectedProgressLogs; i++ { + assert.Equal(t, logs[i].record, expectedProgressMsg) + } + + // Find the next message that's about the download being completed (for the first file). + found := false + for ; i < len(logs) && !found; i++ { + found = logs[i].record == "download from %s completed in %s @ %sps" + } + assert.True(t, found) + + // Verify that the next minExpectedProgressLogs messages are about the download progress (for the second file). + for j := 0; j < minExpectedProgressLogs; j++ { + assert.Equal(t, logs[i+j].record, expectedProgressMsg) + } + + // Verify that the last message is about the download being completed (for the second file). + assert.Equal(t, logs[len(logs)-1].record, "download from %s completed in %s @ %sps") +}