Skip to content

Commit

Permalink
chore: cleanup some logs (#1446)
Browse files Browse the repository at this point in the history
- Simplify & fix applier/destroyer logs on success & failure
- Add the component in front of a few logs to make it clear where
  in the code the log came from, to simplify debugging.
- Log resource conflicts as warnings with explanations of how
  remediation will retry.
- Only log the keys from the GVK maps in the remediator manager.
- Fix remediator shutdown log to go to info instead of warning.
  • Loading branch information
karlkfi authored Oct 7, 2024
1 parent 2edae49 commit 15f0b2e
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 29 deletions.
11 changes: 4 additions & 7 deletions pkg/parse/updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,23 +209,20 @@ func (u *Updater) apply(ctx context.Context, objs []client.Object, commit string
}
}
}
klog.V(1).Info("Applier starting...")
klog.Info("Applier starting...")
start := time.Now()
u.SyncErrorCache.ResetApplyErrors()
objStatusMap, syncStats := u.Applier.Apply(ctx, eventHandler, objs)
if syncStats.Empty() {
klog.V(4).Info("Applier made no new progress")
} else {
if !syncStats.Empty() {
klog.Infof("Applier made new progress: %s", syncStats.String())
objStatusMap.Log(klog.V(0))
}
metrics.RecordApplyDuration(ctx, metrics.StatusTagKey(err), commit, start)
if err != nil {
klog.Warningf("Failed to apply declared resources: %v", err)
klog.Warningf("Applier failed: %v", err)
return err
}
klog.V(4).Info("Apply completed without error: all resources are up to date.")
klog.V(3).Info("Applier stopped")
klog.Info("Applier succeeded")
return nil
}

Expand Down
11 changes: 4 additions & 7 deletions pkg/reconciler/finalizer/base_finalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,23 +39,20 @@ func (bf *baseFinalizer) destroy(ctx context.Context) status.MultiError {
}
}
}
klog.V(1).Info("Destroyer starting...")
klog.Info("Destroyer starting...")
// start := time.Now()
objStatusMap, syncStats := bf.Destroyer.Destroy(ctx, eventHandler)
if syncStats.Empty() {
klog.V(4).Info("Destroyer made no new progress")
} else {
if !syncStats.Empty() {
klog.Infof("Destroyer made new progress: %s", syncStats.String())
objStatusMap.Log(klog.V(0))
}
// TODO: should we have a destroy duration metric?
// We don't have the commit here, so we can't send the apply metric.
// metrics.RecordApplyDuration(ctx, metrics.StatusTagKey(errs), commit, start)
if err != nil {
klog.Warningf("Failed to destroy declared resources: %v", err)
klog.Warningf("Destroyer failed: %v", err)
return err
}
klog.V(4).Info("Destroyer completed without error: all resources are deleted.")
klog.V(3).Info("Applier stopped")
klog.Info("Destroyer succeeded")
return nil
}
6 changes: 4 additions & 2 deletions pkg/remediator/conflict/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,10 @@ type Handler interface {

// Record a management conflict error, including log and metric.
func Record(ctx context.Context, handler Handler, err status.ManagementConflictError, commit string) {
klog.Errorf("Management conflict detected. "+
"Reconciler %q received a watch event for object %q, which is manage by namespace reconciler %q. ",
klog.Errorf("Remediator detected a management conflict: "+
"reconciler %q received a watch event for object %q, which is managed by namespace reconciler %q. "+
"To resolve the conflict, remove the object from one of the sources of truth "+
"so that the object is only managed by one reconciler.",
err.DesiredManager(), err.ObjectID(), err.CurrentManager())
handler.AddConflictError(err.ObjectID(), err)
// TODO: Use separate metrics for management conflicts vs resource conflicts
Expand Down
3 changes: 3 additions & 0 deletions pkg/remediator/reconcile/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ func (r *reconciler) Remediate(ctx context.Context, id core.ID, obj client.Objec
switch err.Code() {
case syncerclient.ResourceConflictCode:
// Record cache conflict (create/delete/update failure due to found/not-found/resource-version)
klog.Warningf("Remediator encountered a resource conflict: "+
"%v. To resolve the conflict, the remediator will fetch the latest object from the cluster "+
"and requeue it for remediation", err)
metrics.RecordResourceConflict(ctx, commit)
case status.ManagementConflictErrorCode:
if mce, ok := err.(status.ManagementConflictError); ok {
Expand Down
6 changes: 5 additions & 1 deletion pkg/remediator/watch/filteredwatcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,8 +350,12 @@ func (w *filteredWatcher) start(ctx context.Context, resourceVersion string) (bo
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return false, status.InternalWrapf(err, "failed to start watch for %s", w.gvk)
} else if apierrors.IsNotFound(err) {
statusErr := syncerclient.ConflictWatchResourceDoesNotExist(err, w.gvk)
klog.Warningf("Remediator encountered a resource conflict: "+
"%v. To resolve the conflict, the remediator will enqueue a resync "+
"and restart the resource watch after the sync has succeeded.", statusErr)
metrics.RecordResourceConflict(ctx, w.getLatestCommit())
return false, syncerclient.ConflictWatchResourceDoesNotExist(err, w.gvk)
return false, statusErr
}
return false, status.APIServerErrorf(err, "failed to start watch for %s", w.gvk)
}
Expand Down
37 changes: 26 additions & 11 deletions pkg/remediator/watch/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"errors"
"sync"

"golang.org/x/exp/maps"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down Expand Up @@ -147,7 +148,7 @@ func (m *Manager) AddWatches(ctx context.Context, gvkMap map[schema.GroupVersion
defer m.mux.Unlock()
m.watching = true

klog.V(3).Infof("AddWatches(%v)", gvkMap)
klog.V(3).Infof("AddWatches(%v)", maps.Keys(gvkMap))

var startedWatches uint64

Expand All @@ -165,7 +166,12 @@ func (m *Manager) AddWatches(ctx context.Context, gvkMap map[schema.GroupVersion
if _, err := m.mapper.RESTMapping(gvk.GroupKind(), gvk.Version); err != nil {
switch {
case meta.IsNoMatchError(err):
klog.Infof("Remediator skipped adding watch for resource %v: %v: resource watch will be started after apply is successful", gvk, err)
statusErr := syncerclient.ConflictWatchResourceDoesNotExist(err, gvk)
klog.Infof("Remediator skipped starting resource watch: "+
"%v. The remediator will start the resource watch after the sync has succeeded.", statusErr)
// This is expected behavior before a sync attempt.
// It likely means a CR and CRD are in the same ApplySet.
// So don't record a resource conflict metric or return an error here.
default:
errs = status.Append(errs, status.APIServerErrorWrap(err))
}
Expand All @@ -180,9 +186,9 @@ func (m *Manager) AddWatches(ctx context.Context, gvkMap map[schema.GroupVersion
}

if startedWatches > 0 {
klog.Infof("The remediator made new progress: started %d new watches", startedWatches)
klog.Infof("Remediator started %d new watches", startedWatches)
} else {
klog.V(4).Infof("The remediator made no new progress")
klog.V(4).Infof("Remediator watches unchanged")
}
return errs
}
Expand All @@ -200,7 +206,7 @@ func (m *Manager) UpdateWatches(ctx context.Context, gvkMap map[schema.GroupVers
defer m.mux.Unlock()
m.watching = true

klog.V(3).Infof("UpdateWatches(%v)", gvkMap)
klog.V(3).Infof("UpdateWatches(%v)", maps.Keys(gvkMap))

m.needsUpdate = false

Expand Down Expand Up @@ -228,8 +234,15 @@ func (m *Manager) UpdateWatches(ctx context.Context, gvkMap map[schema.GroupVers
if _, err := m.mapper.RESTMapping(gvk.GroupKind(), gvk.Version); err != nil {
switch {
case meta.IsNoMatchError(err):
statusErr := syncerclient.ConflictWatchResourceDoesNotExist(err, gvk)
klog.Warningf("Remediator encountered a resource conflict: "+
"%v. To resolve the conflict, the remediator will enqueue a resync "+
"and restart the resource watch after the sync has succeeded.", statusErr)
// This is unexpected behavior after a successful sync.
// It likely means that some other controller deleted managed objects shortly after they were applied.
// So record a resource conflict metric and return an error.
metrics.RecordResourceConflict(ctx, commit)
errs = status.Append(errs, syncerclient.ConflictWatchResourceDoesNotExist(err, gvk))
errs = status.Append(errs, statusErr)
default:
errs = status.Append(errs, status.APIServerErrorWrap(err))
}
Expand All @@ -244,9 +257,9 @@ func (m *Manager) UpdateWatches(ctx context.Context, gvkMap map[schema.GroupVers
}

if startedWatches > 0 || stoppedWatches > 0 {
klog.Infof("The remediator made new progress: started %d new watches, and stopped %d watches", startedWatches, stoppedWatches)
klog.Infof("Remediator started %d new watches and stopped %d watches", startedWatches, stoppedWatches)
} else {
klog.V(4).Infof("The remediator made no new progress")
klog.V(4).Infof("Remediator watches unchanged")
}
return errs
}
Expand Down Expand Up @@ -284,10 +297,12 @@ func (m *Manager) startWatcher(ctx context.Context, gvk schema.GroupVersionKind,
// threadsafe.
func (m *Manager) runWatcher(ctx context.Context, r Runnable, gvk schema.GroupVersionKind) {
if err := r.Run(ctx); err != nil {
if errors.Is(err, context.Canceled) {
klog.Infof("Watcher stopped for %s: %v", gvk, status.FormatSingleLine(err))
// TODO: Make status.Error work with errors.Is unwrapping.
// For now, check the Cause directly, to avoid logging a warning on shutdown.
if errors.Is(err.Cause(), context.Canceled) {
klog.Infof("Watcher stopped for %s: %v", gvk, context.Canceled)
} else {
klog.Warningf("Error running watcher for %s: %v", gvk, status.FormatSingleLine(err))
klog.Warningf("Watcher errored for %s: %v", gvk, status.FormatSingleLine(err))
}
m.mux.Lock()
delete(m.watcherMap, gvk)
Expand Down
8 changes: 7 additions & 1 deletion pkg/resourcegroup/controllers/watch/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package watch

import (
"context"
"errors"
"net/http"
"sync"

Expand Down Expand Up @@ -191,7 +192,12 @@ func (m *Manager) startWatcher(ctx context.Context, gvk schema.GroupVersionKind)
// threadsafe.
func (m *Manager) runWatcher(ctx context.Context, r Runnable, gvk schema.GroupVersionKind) {
if err := r.Run(ctx); err != nil {
klog.Warningf("Error running watcher for %s: %v", gvk.String(), err)
// Avoid logging a warning on shutdown.
if errors.Is(err, context.Canceled) {
klog.Infof("Watcher stopped for %s: %v", gvk, context.Canceled)
} else {
klog.Warningf("Watcher errored for %s: %v", gvk, err)
}
m.mux.Lock()
delete(m.watcherMap, gvk)
m.needsUpdate = true
Expand Down

0 comments on commit 15f0b2e

Please sign in to comment.