From 15f0b2e8e7c5cdcee2145633ccc909bd5d83a7d0 Mon Sep 17 00:00:00 2001 From: Karl Isenberg Date: Mon, 7 Oct 2024 13:53:07 -0700 Subject: [PATCH] chore: cleanup some logs (#1446) - Simplify & fix applier/destroyer logs on success & failure - Add the component in front of a few logs to make it clear where in the code the log came from, to simplify debugging. - Log resource conflicts as warnings with explainations of how remediation will retry. - Only log the keys from the GVK maps in the remediator manager. - Fix remediator shutdown log to go to info instead of warning. --- pkg/parse/updater.go | 11 ++---- pkg/reconciler/finalizer/base_finalizer.go | 11 ++---- pkg/remediator/conflict/handler.go | 6 ++- pkg/remediator/reconcile/reconciler.go | 3 ++ pkg/remediator/watch/filteredwatcher.go | 6 ++- pkg/remediator/watch/manager.go | 37 +++++++++++++------ .../controllers/watch/manager.go | 8 +++- 7 files changed, 53 insertions(+), 29 deletions(-) diff --git a/pkg/parse/updater.go b/pkg/parse/updater.go index aaddc349cc..ed97cf002d 100644 --- a/pkg/parse/updater.go +++ b/pkg/parse/updater.go @@ -209,23 +209,20 @@ func (u *Updater) apply(ctx context.Context, objs []client.Object, commit string } } } - klog.V(1).Info("Applier starting...") + klog.Info("Applier starting...") start := time.Now() u.SyncErrorCache.ResetApplyErrors() objStatusMap, syncStats := u.Applier.Apply(ctx, eventHandler, objs) - if syncStats.Empty() { - klog.V(4).Info("Applier made no new progress") - } else { + if !syncStats.Empty() { klog.Infof("Applier made new progress: %s", syncStats.String()) objStatusMap.Log(klog.V(0)) } metrics.RecordApplyDuration(ctx, metrics.StatusTagKey(err), commit, start) if err != nil { - klog.Warningf("Failed to apply declared resources: %v", err) + klog.Warningf("Applier failed: %v", err) return err } - klog.V(4).Info("Apply completed without error: all resources are up to date.") - klog.V(3).Info("Applier stopped") + klog.Info("Applier succeeded") return nil } diff --git a/pkg/reconciler/finalizer/base_finalizer.go b/pkg/reconciler/finalizer/base_finalizer.go index 745fb20127..56c168c60f 100644 --- a/pkg/reconciler/finalizer/base_finalizer.go +++ b/pkg/reconciler/finalizer/base_finalizer.go @@ -39,12 +39,10 @@ func (bf *baseFinalizer) destroy(ctx context.Context) status.MultiError { } } } - klog.V(1).Info("Destroyer starting...") + klog.Info("Destroyer starting...") // start := time.Now() objStatusMap, syncStats := bf.Destroyer.Destroy(ctx, eventHandler) - if syncStats.Empty() { - klog.V(4).Info("Destroyer made no new progress") - } else { + if !syncStats.Empty() { klog.Infof("Destroyer made new progress: %s", syncStats.String()) objStatusMap.Log(klog.V(0)) } @@ -52,10 +50,9 @@ func (bf *baseFinalizer) destroy(ctx context.Context) status.MultiError { // We don't have the commit here, so we can't send the apply metric. // metrics.RecordApplyDuration(ctx, metrics.StatusTagKey(errs), commit, start) if err != nil { - klog.Warningf("Failed to destroy declared resources: %v", err) + klog.Warningf("Destroyer failed: %v", err) return err } - klog.V(4).Info("Destroyer completed without error: all resources are deleted.") - klog.V(3).Info("Applier stopped") + klog.Info("Destroyer succeeded") return nil } diff --git a/pkg/remediator/conflict/handler.go b/pkg/remediator/conflict/handler.go index 86d9517417..8e81191cb3 100644 --- a/pkg/remediator/conflict/handler.go +++ b/pkg/remediator/conflict/handler.go @@ -45,8 +45,10 @@ type Handler interface { // Record a management conflict error, including log and metric. func Record(ctx context.Context, handler Handler, err status.ManagementConflictError, commit string) { - klog.Errorf("Management conflict detected. "+ - "Reconciler %q received a watch event for object %q, which is manage by namespace reconciler %q. ", + klog.Errorf("Remediator detected a management conflict: "+ + "reconciler %q received a watch event for object %q, which is managed by namespace reconciler %q. "+ + "To resolve the conflict, remove the object from one of the sources of truth "+ + "so that the object is only managed by one reconciler.", err.DesiredManager(), err.ObjectID(), err.CurrentManager()) handler.AddConflictError(err.ObjectID(), err) // TODO: Use separate metrics for management conflicts vs resource conflicts diff --git a/pkg/remediator/reconcile/reconciler.go b/pkg/remediator/reconcile/reconciler.go index ab35820275..f871aae609 100644 --- a/pkg/remediator/reconcile/reconciler.go +++ b/pkg/remediator/reconcile/reconciler.go @@ -98,6 +98,9 @@ func (r *reconciler) Remediate(ctx context.Context, id core.ID, obj client.Objec switch err.Code() { case syncerclient.ResourceConflictCode: // Record cache conflict (create/delete/update failure due to found/not-found/resource-version) + klog.Warningf("Remediator encountered a resource conflict: "+ + "%v. To resolve the conflict, the remediator will fetch the latest object from the cluster "+ + "and requeue it for remediation", err) metrics.RecordResourceConflict(ctx, commit) case status.ManagementConflictErrorCode: if mce, ok := err.(status.ManagementConflictError); ok { diff --git a/pkg/remediator/watch/filteredwatcher.go b/pkg/remediator/watch/filteredwatcher.go index 6e0290d159..3437942e06 100644 --- a/pkg/remediator/watch/filteredwatcher.go +++ b/pkg/remediator/watch/filteredwatcher.go @@ -350,8 +350,12 @@ func (w *filteredWatcher) start(ctx context.Context, resourceVersion string) (bo if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return false, status.InternalWrapf(err, "failed to start watch for %s", w.gvk) } else if apierrors.IsNotFound(err) { + statusErr := syncerclient.ConflictWatchResourceDoesNotExist(err, w.gvk) + klog.Warningf("Remediator encountered a resource conflict: "+ + "%v. To resolve the conflict, the remediator will enqueue a resync "+ + "and restart the resource watch after the sync has succeeded.", statusErr) metrics.RecordResourceConflict(ctx, w.getLatestCommit()) - return false, syncerclient.ConflictWatchResourceDoesNotExist(err, w.gvk) + return false, statusErr } return false, status.APIServerErrorf(err, "failed to start watch for %s", w.gvk) } diff --git a/pkg/remediator/watch/manager.go b/pkg/remediator/watch/manager.go index 5ad92cd9fe..6c808053fe 100644 --- a/pkg/remediator/watch/manager.go +++ b/pkg/remediator/watch/manager.go @@ -19,6 +19,7 @@ import ( "errors" "sync" + "golang.org/x/exp/maps" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" @@ -147,7 +148,7 @@ func (m *Manager) AddWatches(ctx context.Context, gvkMap map[schema.GroupVersion defer m.mux.Unlock() m.watching = true - klog.V(3).Infof("AddWatches(%v)", gvkMap) + klog.V(3).Infof("AddWatches(%v)", maps.Keys(gvkMap)) var startedWatches uint64 @@ -165,7 +166,12 @@ func (m *Manager) AddWatches(ctx context.Context, gvkMap map[schema.GroupVersion if _, err := m.mapper.RESTMapping(gvk.GroupKind(), gvk.Version); err != nil { switch { case meta.IsNoMatchError(err): - klog.Infof("Remediator skipped adding watch for resource %v: %v: resource watch will be started after apply is successful", gvk, err) + statusErr := syncerclient.ConflictWatchResourceDoesNotExist(err, gvk) + klog.Infof("Remediator skipped starting resource watch: "+ + "%v. The remediator will start the resource watch after the sync has succeeded.", statusErr) + // This is expected behavior before a sync attempt. + // It likely means a CR and CRD are in the same ApplySet. + // So don't record a resource conflict metric or return an error here. default: errs = status.Append(errs, status.APIServerErrorWrap(err)) } @@ -180,9 +186,9 @@ func (m *Manager) AddWatches(ctx context.Context, gvkMap map[schema.GroupVersion } if startedWatches > 0 { - klog.Infof("The remediator made new progress: started %d new watches", startedWatches) + klog.Infof("Remediator started %d new watches", startedWatches) } else { - klog.V(4).Infof("The remediator made no new progress") + klog.V(4).Infof("Remediator watches unchanged") } return errs } @@ -200,7 +206,7 @@ func (m *Manager) UpdateWatches(ctx context.Context, gvkMap map[schema.GroupVers defer m.mux.Unlock() m.watching = true - klog.V(3).Infof("UpdateWatches(%v)", gvkMap) + klog.V(3).Infof("UpdateWatches(%v)", maps.Keys(gvkMap)) m.needsUpdate = false @@ -228,8 +234,15 @@ func (m *Manager) UpdateWatches(ctx context.Context, gvkMap map[schema.GroupVers if _, err := m.mapper.RESTMapping(gvk.GroupKind(), gvk.Version); err != nil { switch { case meta.IsNoMatchError(err): + statusErr := syncerclient.ConflictWatchResourceDoesNotExist(err, gvk) + klog.Warningf("Remediator encountered a resource conflict: "+ + "%v. To resolve the conflict, the remediator will enqueue a resync "+ + "and restart the resource watch after the sync has succeeded.", statusErr) + // This is unexpected behavior after a successful sync. + // It likely means that some other controller deleted managed objects shortly after they were applied. + // So record a resource conflict metric and return an error. metrics.RecordResourceConflict(ctx, commit) - errs = status.Append(errs, syncerclient.ConflictWatchResourceDoesNotExist(err, gvk)) + errs = status.Append(errs, statusErr) default: errs = status.Append(errs, status.APIServerErrorWrap(err)) } @@ -244,9 +257,9 @@ func (m *Manager) UpdateWatches(ctx context.Context, gvkMap map[schema.GroupVers } if startedWatches > 0 || stoppedWatches > 0 { - klog.Infof("The remediator made new progress: started %d new watches, and stopped %d watches", startedWatches, stoppedWatches) + klog.Infof("Remediator started %d new watches and stopped %d watches", startedWatches, stoppedWatches) } else { - klog.V(4).Infof("The remediator made no new progress") + klog.V(4).Infof("Remediator watches unchanged") } return errs } @@ -284,10 +297,12 @@ func (m *Manager) startWatcher(ctx context.Context, gvk schema.GroupVersionKind, // threadsafe. func (m *Manager) runWatcher(ctx context.Context, r Runnable, gvk schema.GroupVersionKind) { if err := r.Run(ctx); err != nil { - if errors.Is(err, context.Canceled) { - klog.Infof("Watcher stopped for %s: %v", gvk, status.FormatSingleLine(err)) + // TODO: Make status.Error work with errors.Is unwrapping. + // For now, check the Cause directly, to avoid logging a warning on shutdown. + if errors.Is(err.Cause(), context.Canceled) { + klog.Infof("Watcher stopped for %s: %v", gvk, context.Canceled) } else { - klog.Warningf("Error running watcher for %s: %v", gvk, status.FormatSingleLine(err)) + klog.Warningf("Watcher errored for %s: %v", gvk, status.FormatSingleLine(err)) } m.mux.Lock() delete(m.watcherMap, gvk) diff --git a/pkg/resourcegroup/controllers/watch/manager.go b/pkg/resourcegroup/controllers/watch/manager.go index 9280a3746e..0c63fdc63c 100644 --- a/pkg/resourcegroup/controllers/watch/manager.go +++ b/pkg/resourcegroup/controllers/watch/manager.go @@ -16,6 +16,7 @@ package watch import ( "context" + "errors" "net/http" "sync" @@ -191,7 +192,12 @@ func (m *Manager) startWatcher(ctx context.Context, gvk schema.GroupVersionKind) // threadsafe. func (m *Manager) runWatcher(ctx context.Context, r Runnable, gvk schema.GroupVersionKind) { if err := r.Run(ctx); err != nil { - klog.Warningf("Error running watcher for %s: %v", gvk.String(), err) + // Avoid logging a warning on shutdown. + if errors.Is(err, context.Canceled) { + klog.Infof("Watcher stopped for %s: %v", gvk, context.Canceled) + } else { + klog.Warningf("Watcher errored for %s: %v", gvk, err) + } m.mux.Lock() delete(m.watcherMap, gvk) m.needsUpdate = true