diff --git a/changelog/fragments/1694190260-improve-upgrade-logging.yaml b/changelog/fragments/1694190260-improve-upgrade-logging.yaml new file mode 100644 index 00000000000..a471629b90a --- /dev/null +++ b/changelog/fragments/1694190260-improve-upgrade-logging.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; a 80ish characters long description of the change. +summary: Improve logging during Agent upgrades + +# Long description; in case the summary is not enough to describe the change +# this field accommodate a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/3382 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/upgrade/cleanup.go b/internal/pkg/agent/application/upgrade/cleanup.go index 4464a1d8f70..cc6bacf4635 100644 --- a/internal/pkg/agent/application/upgrade/cleanup.go +++ b/internal/pkg/agent/application/upgrade/cleanup.go @@ -19,7 +19,7 @@ import ( // cleanNonMatchingVersionsFromDownloads will remove files that do not have the passed version number from the downloads directory. func cleanNonMatchingVersionsFromDownloads(log *logger.Logger, version string) error { downloadsPath := paths.Downloads() - log.Debugw("Cleaning up non-matching downloaded versions", "version", version, "downloads.path", downloadsPath) + log.Infow("Cleaning up non-matching downloaded versions", "version", version, "downloads.path", downloadsPath) files, err := os.ReadDir(downloadsPath) if os.IsNotExist(err) { diff --git a/internal/pkg/agent/application/upgrade/crash_checker.go b/internal/pkg/agent/application/upgrade/crash_checker.go index fd6016208e0..67bc2bf862c 100644 --- a/internal/pkg/agent/application/upgrade/crash_checker.go +++ b/internal/pkg/agent/application/upgrade/crash_checker.go @@ -7,7 +7,6 @@ package upgrade import ( "context" "fmt" - "os" "sync" "time" @@ -53,7 +52,7 @@ func NewCrashChecker(ctx context.Context, ch chan error, log *logger.Logger, che return nil, err } - log.Debugf("running checks using '%s' controller", c.sc.Name()) + log.Infof("running checks using '%s' controller", c.sc.Name()) return c, nil } @@ -62,9 +61,8 @@ func NewCrashChecker(ctx context.Context, ch chan error, log *logger.Logger, che func (ch *CrashChecker) Run(ctx context.Context) { defer ch.sc.Close() - ch.log.Debug("Crash checker started") + ch.log.Info("Crash checker started") for { - ch.log.Debugf("watcher having PID: %d", os.Getpid()) t := time.NewTimer(ch.checkInterval) select { @@ -77,7 +75,7 @@ func (ch *CrashChecker) Run(ctx context.Context) { ch.log.Error(err) } - ch.log.Debugf("retrieved service PID [%d]", pid) + ch.log.Infof("retrieved service PID [%d]", pid) ch.q.Push(pid) // We decide if the Agent process has crashed in either of diff --git a/internal/pkg/agent/application/upgrade/error_checker.go b/internal/pkg/agent/application/upgrade/error_checker.go index d6bad9f3e4d..a0a3516c94a 100644 --- a/internal/pkg/agent/application/upgrade/error_checker.go +++ b/internal/pkg/agent/application/upgrade/error_checker.go @@ -49,7 +49,7 @@ func NewErrorChecker(ch chan error, log *logger.Logger, checkInterval time.Durat // Run runs the checking loop. func (ch *ErrorChecker) Run(ctx context.Context) { - ch.log.Debug("Error checker started") + ch.log.Info("Error checker started") for { t := time.NewTimer(ch.checkInterval) select { diff --git a/internal/pkg/agent/application/upgrade/rollback.go b/internal/pkg/agent/application/upgrade/rollback.go index ac835cd67e2..ae3c568acf1 100644 --- a/internal/pkg/agent/application/upgrade/rollback.go +++ b/internal/pkg/agent/application/upgrade/rollback.go @@ -55,7 +55,7 @@ func Rollback(ctx context.Context, log *logger.Logger, prevHash string, currentH // Cleanup removes all artifacts and files related to a specified version. func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs bool) error { - log.Debugw("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker) + log.Infow("Cleaning up upgrade", "hash", currentHash, "remove_marker", removeMarker) <-time.After(afterRestartDelay) // remove upgrade marker @@ -78,7 +78,7 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs // remove symlink to avoid upgrade failures, ignore error prevSymlink := prevSymlinkPath() - log.Debugw("Removing previous symlink path", "file.path", prevSymlinkPath()) + log.Infow("Removing previous symlink path", "file.path", prevSymlinkPath()) _ = os.Remove(prevSymlink) dirPrefix := fmt.Sprintf("%s-", agentName) @@ -93,7 +93,7 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs } hashedDir := filepath.Join(paths.Data(), dir) - log.Debugw("Removing hashed data directory", "file.path", hashedDir) + log.Infow("Removing hashed data directory", "file.path", hashedDir) var ignoredDirs []string if keepLogs { ignoredDirs = append(ignoredDirs, "logs") @@ -110,20 +110,28 @@ func Cleanup(log *logger.Logger, currentHash string, removeMarker bool, keepLogs // agent during upgrade period. func InvokeWatcher(log *logger.Logger) error { if !IsUpgradeable() { - log.Debug("agent is not upgradable, not starting watcher") + log.Info("agent is not upgradable, not starting watcher") return nil } cmd := invokeCmd() defer func() { if cmd.Process != nil { - log.Debugf("releasing watcher %v", cmd.Process.Pid) + log.Infof("releasing watcher %v", cmd.Process.Pid) _ = cmd.Process.Release() } }() - log.Debugw("Starting upgrade watcher", "path", cmd.Path, "args", cmd.Args, "env", cmd.Env, "dir", cmd.Dir) - return cmd.Start() + log.Infow("Starting upgrade watcher", "path", cmd.Path, "args", cmd.Args, "env", cmd.Env, "dir", cmd.Dir) + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start Upgrade Watcher: %w", err) + } + + upgradeWatcherPID := cmd.Process.Pid + agentPID := os.Getpid() + log.Infow("Upgrade Watcher invoked", "agent.upgrade.watcher.process.pid", upgradeWatcherPID, "agent.process.pid", agentPID) + + return nil } func restartAgent(ctx context.Context, log *logger.Logger) error { diff --git a/internal/pkg/agent/application/upgrade/step_download.go b/internal/pkg/agent/application/upgrade/step_download.go index 9de9f9ffe13..a1f3bd79ca2 100644 --- a/internal/pkg/agent/application/upgrade/step_download.go +++ b/internal/pkg/agent/application/upgrade/step_download.go @@ -54,7 +54,7 @@ func (u *Upgrader) downloadArtifact(ctx context.Context, version, sourceURI stri } } - u.log.Debugw("Downloading upgrade artifact", "version", version, + u.log.Infow("Downloading upgrade artifact", "version", version, "source_uri", settings.SourceURI, "drop_path", settings.DropPath, "target_path", settings.TargetDirectory, "install_path", settings.InstallPath) @@ -162,7 +162,7 @@ func (u *Upgrader) downloadWithRetries( opFn := func() error { attempt++ - u.log.Debugf("download attempt %d", attempt) + u.log.Infof("download attempt %d", attempt) downloader, err := downloaderCtor(version, u.log, settings) if err != nil { diff --git a/internal/pkg/agent/application/upgrade/step_mark.go b/internal/pkg/agent/application/upgrade/step_mark.go index fa337e3907a..0e9cf0bd92a 100644 --- a/internal/pkg/agent/application/upgrade/step_mark.go +++ b/internal/pkg/agent/application/upgrade/step_mark.go @@ -139,7 +139,7 @@ func UpdateActiveCommit(log *logger.Logger, hash string) error { // CleanMarker removes a marker from disk. func CleanMarker(log *logger.Logger) error { markerFile := markerFilePath() - log.Debugw("Removing marker file", "file.path", markerFile) + log.Infow("Removing marker file", "file.path", markerFile) if err := os.Remove(markerFile); !os.IsNotExist(err) { return err } diff --git a/internal/pkg/agent/application/upgrade/upgrade.go b/internal/pkg/agent/application/upgrade/upgrade.go index 13c787ab592..e653cf54525 100644 --- a/internal/pkg/agent/application/upgrade/upgrade.go +++ b/internal/pkg/agent/application/upgrade/upgrade.go @@ -11,11 +11,11 @@ import ( "path/filepath" "strings" - "github.com/elastic/elastic-agent/internal/pkg/config" - "github.com/otiai10/copy" "go.elastic.co/apm" + "github.com/elastic/elastic-agent/internal/pkg/config" + "github.com/elastic/elastic-agent/internal/pkg/agent/application/info" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/application/reexec" @@ -172,7 +172,7 @@ func (u *Upgrader) Upgrade(ctx context.Context, version string, sourceURI string cb := shutdownCallback(u.log, paths.Home(), release.Version(), version, release.TrimCommit(newHash)) // Clean everything from the downloads dir - u.log.Debugw("Removing downloads directory", "file.path", paths.Downloads()) + u.log.Infow("Removing downloads directory", "file.path", paths.Downloads()) err = os.RemoveAll(paths.Downloads()) if err != nil { u.log.Errorw("Unable to clean downloads after update", "error.message", err, "file.path", paths.Downloads()) @@ -231,11 +231,11 @@ func copyActionStore(log *logger.Logger, newHash string) error { // copies legacy action_store.yml, state.yml and state.enc encrypted file if exists storePaths := []string{paths.AgentActionStoreFile(), paths.AgentStateStoreYmlFile(), paths.AgentStateStoreFile()} newHome := filepath.Join(filepath.Dir(paths.Home()), fmt.Sprintf("%s-%s", agentName, newHash)) - log.Debugw("Copying action store", "new_home_path", newHome) + log.Infow("Copying action store", "new_home_path", newHome) for _, currentActionStorePath := range storePaths { newActionStorePath := filepath.Join(newHome, filepath.Base(currentActionStorePath)) - log.Debugw("Copying action store path", "from", currentActionStorePath, "to", newActionStorePath) + log.Infow("Copying action store path", "from", currentActionStorePath, "to", newActionStorePath) currentActionStore, err := os.ReadFile(currentActionStorePath) if os.IsNotExist(err) { // nothing to copy @@ -266,7 +266,7 @@ func copyRunDirectory(log *logger.Logger, newHash string) error { err := copyDir(log, oldRunPath, newRunPath, true) if os.IsNotExist(err) { // nothing to copy, operation ok - log.Debugw("Run directory not present", "old_run_path", oldRunPath) + log.Infow("Run directory not present", "old_run_path", oldRunPath) return nil } if err != nil { diff --git a/internal/pkg/agent/cmd/run.go b/internal/pkg/agent/cmd/run.go index c5cc9f4b315..2ab23dae7c2 100644 --- a/internal/pkg/agent/cmd/run.go +++ b/internal/pkg/agent/cmd/run.go @@ -151,6 +151,8 @@ func run(override cfgOverrider, testingMode bool, fleetInitTimeout time.Duration "source": agentName, }) + l.Infow("Elastic Agent started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion()) + cfg, err = tryDelayEnroll(ctx, l, cfg, override) if err != nil { err = errors.New(err, "failed to perform delayed enrollment") diff --git a/internal/pkg/agent/cmd/watch.go b/internal/pkg/agent/cmd/watch.go index 23009a3366b..f475cf85669 100644 --- a/internal/pkg/agent/cmd/watch.go +++ b/internal/pkg/agent/cmd/watch.go @@ -13,6 +13,8 @@ import ( "syscall" "time" + "github.com/elastic/elastic-agent/version" + "github.com/elastic/elastic-agent/internal/pkg/config" "github.com/spf13/cobra" @@ -62,6 +64,7 @@ func newWatchCommandWithArgs(_ []string, streams *cli.IOStreams) *cobra.Command } func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error { + log.Infow("Upgrade Watcher started", "process.pid", os.Getpid(), "agent.version", version.GetAgentPackageVersion()) marker, err := upgrade.LoadMarker() if err != nil { log.Error("failed to load marker", err) @@ -69,14 +72,14 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error { } if marker == nil { // no marker found we're not in upgrade process - log.Debugf("update marker not present at '%s'", paths.Data()) + log.Infof("update marker not present at '%s'", paths.Data()) return nil } locker := filelock.NewAppLocker(paths.Top(), watcherLockFile) if err := locker.TryLock(); err != nil { if errors.Is(err, filelock.ErrAppAlreadyRunning) { - log.Debugf("exiting, lock already exists") + log.Info("exiting, lock already exists") return nil } @@ -89,7 +92,7 @@ func watchCmd(log *logp.Logger, cfg *configuration.Configuration) error { isWithinGrace, tilGrace := gracePeriod(marker, cfg.Settings.Upgrade.Watcher.GracePeriod) if !isWithinGrace { - log.Debugf("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String()) + log.Infof("not within grace [updatedOn %v] %v", marker.UpdatedOn.String(), time.Since(marker.UpdatedOn).String()) // if it is started outside of upgrade loop // if we're not within grace and marker is still there it might mean // that cleanup was not performed ok, cleanup everything except current version diff --git a/internal/pkg/agent/install/testblocking/main.go b/internal/pkg/agent/install/testblocking/main.go new file mode 100644 index 00000000000..adbd771d22d --- /dev/null +++ b/internal/pkg/agent/install/testblocking/main.go @@ -0,0 +1,15 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License; +// you may not use this file except in compliance with the Elastic License. + +package main + +import ( + "math" + "time" +) + +// Simple program that blocks forever to ensure exes running from a directory on Windows can be removed during uninstall. +func main() { + <-time.After(time.Duration(math.MaxInt64)) +} diff --git a/internal/pkg/agent/install/uninstall_windows_test.go b/internal/pkg/agent/install/uninstall_windows_test.go index 31812988b68..e029b471513 100644 --- a/internal/pkg/agent/install/uninstall_windows_test.go +++ b/internal/pkg/agent/install/uninstall_windows_test.go @@ -12,37 +12,34 @@ import ( "path/filepath" "testing" + "github.com/otiai10/copy" "github.com/stretchr/testify/require" ) -const simpleBlockForever = ` -package main - -import ( - "math" - "time" -) - -func main() { - <-time.After(time.Duration(math.MaxInt64)) -} -` - func TestRemovePath(t *testing.T) { - dir := filepath.Join(t.TempDir(), "subdir") - err := os.Mkdir(dir, 0644) + var ( + pkgName = "testblocking" + binaryName = pkgName + ".exe" + ) + + // Create a temporary directory that we can safely remove. The directory is created as a new + // sub-directory. This avoids having Microsoft Defender quarantine the file if it is exec'd from + // the default temporary directory. + destDir, err := os.MkdirTemp(pkgName, t.Name()) require.NoError(t, err) - src := filepath.Join(dir, "main.go") - err = os.WriteFile(src, []byte(simpleBlockForever), 0644) - require.NoError(t, err) + // Copy the test executable to the new temporary directory. + destpath, err := filepath.Abs(filepath.Join(destDir, binaryName)) + require.NoErrorf(t, err, "failed dest abs %s + %s", destDir, binaryName) + + srcPath, err := filepath.Abs(filepath.Join(pkgName, binaryName)) + require.NoErrorf(t, err, "failed src abs %s + %s", pkgName, binaryName) - binary := filepath.Join(dir, "main.exe") - cmd := exec.Command("go", "build", "-o", binary, src) - _, err = cmd.CombinedOutput() + err = copy.Copy(srcPath, destpath, copy.Options{Sync: true}) require.NoError(t, err) - cmd = exec.Command(binary) + // Execute the test executable asynchronously. + cmd := exec.Command(destpath) err = cmd.Start() require.NoError(t, err) defer func() { @@ -50,6 +47,7 @@ func TestRemovePath(t *testing.T) { _ = cmd.Wait() }() - err = RemovePath(dir) + // Ensure the directory containing the executable can be removed. + err = RemovePath(destDir) require.NoError(t, err) } diff --git a/magefile.go b/magefile.go index 91d8167d224..3d0e6de1e9e 100644 --- a/magefile.go +++ b/magefile.go @@ -256,16 +256,19 @@ func (Build) Clean() { // TestBinaries build the required binaries for the test suite. func (Build) TestBinaries() error { wd, _ := os.Getwd() - p := filepath.Join(wd, "pkg", "component", "fake") - for _, name := range []string{"component", "shipper"} { - binary := name + testBinaryPkgs := []string{ + filepath.Join(wd, "pkg", "component", "fake", "component"), + filepath.Join(wd, "pkg", "component", "fake", "shipper"), + filepath.Join(wd, "internal", "pkg", "agent", "install", "testblocking"), + } + for _, pkg := range testBinaryPkgs { + binary := filepath.Base(pkg) if runtime.GOOS == "windows" { binary += ".exe" } - fakeDir := filepath.Join(p, name) - outputName := filepath.Join(fakeDir, binary) - err := RunGo("build", "-o", outputName, filepath.Join(fakeDir)) + outputName := filepath.Join(pkg, binary) + err := RunGo("build", "-o", outputName, filepath.Join(pkg)) if err != nil { return err }