From e49dcd9cb9755848dbca3ea5ebf325c03bd89bf7 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Mon, 6 Jan 2025 19:19:13 +0100 Subject: [PATCH 01/16] [icfg] Implements the minimum restart=2 for standby resources Extract of restart keyword: Resources with `standby=true` have `restart` forced to a minimum of 2, to increase chances of a restart success. --- daemon/icfg/main.go | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/daemon/icfg/main.go b/daemon/icfg/main.go index c496dbb28..efaec661a 100644 --- a/daemon/icfg/main.go +++ b/daemon/icfg/main.go @@ -70,6 +70,10 @@ var ( errConfigFileCheck = errors.New("config file check") + // standbyDefaultRestart defines the default minimum restart threshold for + // standby resources. + standbyDefaultRestart = 2 + keyApp = key.New("DEFAULT", "app") keyChildren = key.New("DEFAULT", "children") keyEnv = key.New("DEFAULT", "env") @@ -415,12 +419,17 @@ func (t *Manager) getResources(cf *xconfig.T) instance.ResourceConfigs { if resourceset.IsSubsetSection(section) { continue } + restart := cf.GetInt(key.New(section, "restart")) + isStandby := cf.GetBool(key.New(section, "standby")) + if isStandby && restart < standbyDefaultRestart { + restart = standbyDefaultRestart + } m[section] = instance.ResourceConfig{ RestartDelay: cf.GetDuration(key.New(section, "restart_delay")), - Restart: cf.GetInt(key.New(section, "restart")), + Restart: restart, IsDisabled: cf.GetBool(key.New(section, "disable")), IsMonitored: cf.GetBool(key.New(section, "monitor")), - IsStandby: cf.GetBool(key.New(section, "standby")), + IsStandby: isStandby, } } return m From bb599051cfcfb56888b74dc536fc8ed0e81beac9 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Wed, 8 Jan 2025 18:35:13 +0100 Subject: [PATCH 02/16] [imon] Refactor resource restart handling for possible race Added `cmdResourceRestart` structure to handle the resource restarted once delay is reached. --- daemon/imon/main.go | 12 ++ daemon/imon/orchestration_resource_restart.go | 186 +++++++++--------- 2 files changed, 101 insertions(+), 97 deletions(-) diff --git a/daemon/imon/main.go b/daemon/imon/main.go index 958200419..bb513b60d 100644 --- a/daemon/imon/main.go +++ b/daemon/imon/main.go @@ -138,6 +138,16 @@ type ( newState instance.MonitorState } + // cmdResourceRestart is a structure representing a command to restart resources. + // It can be used from imon goroutines to schedule a future resource restart + // handled during imon main loop. + // rids is a slice of resource IDs to restart. + // standby indicates whether the resources should restart in standby mode. + cmdResourceRestart struct { + rids []string + standby bool + } + Factory struct { DrainDuration time.Duration SubQS pubsub.QueueSizer @@ -368,6 +378,8 @@ func (t *Manager) worker(initialNodes []string) { switch c := i.(type) { case cmdOrchestrate: t.needOrchestrate(c) + case cmdResourceRestart: + t.resourceRestart(c.rids, c.standby) } case <-t.delayTimer.C: t.onDelayTimer() diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 665fda7c7..d5e846913 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -178,6 +178,8 @@ func (t *Manager) pubMonitorAction(rid string, action instance.MonitorAction) { t.labelLocalhost) } +// orchestrateResourceRestart manages the restart orchestration process for resources, +// handling delays, timers, and retries. 
func (t *Manager) orchestrateResourceRestart() { todoRestart := newTodoMap() todoStandby := newTodoMap() @@ -252,101 +254,6 @@ func (t *Manager) orchestrateResourceRestart() { } } - getRidsAndDelay := func(todo todoMap) ([]string, time.Duration) { - var maxDelay time.Duration - rids := make([]string, 0) - now := time.Now() - for rid := range todo { - rcfg := t.instConfig.Resources.Get(rid) - if rcfg == nil { - continue - } - rmon := t.state.Resources.Get(rid) - if rmon == nil { - continue - } - if rcfg.RestartDelay != nil { - notBefore := rmon.Restart.LastAt.Add(*rcfg.RestartDelay) - if now.Before(notBefore) { - delay := notBefore.Sub(now) - if delay > maxDelay { - maxDelay = delay - } - } - } - rids = append(rids, rid) - } - return rids, maxDelay - } - - doRestart := func() { - rids, delay := getRidsAndDelay(todoRestart) - if len(rids) == 0 { - return - } - timer := time.AfterFunc(delay, func() { - now := time.Now() - for _, rid := range rids { - rmon := t.state.Resources.Get(rid) - if rmon == nil { - continue - } - rmon.Restart.LastAt = now - rmon.Restart.Timer = nil - t.state.Resources.Set(rid, *rmon) - t.change = true - } - action := func() error { - return t.queueResourceStart(rids) - } - t.doTransitionAction(action, instance.MonitorStateStarting, instance.MonitorStateIdle, instance.MonitorStateStartFailed) - }) - for _, rid := range rids { - rmon := t.state.Resources.Get(rid) - if rmon == nil { - continue - } - rmon.DecRestartRemaining() - rmon.Restart.Timer = timer - t.state.Resources.Set(rid, *rmon) - t.change = true - } - } - - doStandby := func() { - rids, delay := getRidsAndDelay(todoStandby) - if len(rids) == 0 { - return - } - timer := time.AfterFunc(delay, func() { - now := time.Now() - for _, rid := range rids { - rmon := t.state.Resources.Get(rid) - if rmon == nil { - continue - } - rmon.Restart.LastAt = now - rmon.Restart.Timer = nil - t.state.Resources.Set(rid, *rmon) - t.change = true - } - action := func() error { - return t.queueResourceStartStandby(rids) - } - t.doTransitionAction(action, instance.MonitorStateStarting, instance.MonitorStateIdle, instance.MonitorStateStartFailed) - }) - for _, rid := range rids { - rmon := t.state.Resources.Get(rid) - if rmon == nil { - continue - } - rmon.DecRestartRemaining() - rmon.Restart.Timer = timer - t.state.Resources.Set(rid, *rmon) - t.change = true - } - } - // discard the cluster object if t.path.String() == "cluster" { return @@ -417,6 +324,91 @@ func (t *Manager) orchestrateResourceRestart() { for rid, rstat := range t.instStatus[t.localhost].Resources { planFor(rid, rstat.Status, started) } - doStandby() - doRestart() + t.resourceRestartSchedule(todoStandby, true) + t.resourceRestartSchedule(todoRestart, false) +} + +// resourceRestartSchedule schedules a restart for resources based on the provided resource map and standby mode. +// It updates the state of resources with associated restart timers and logs the operation. 
+func (t *Manager) resourceRestartSchedule(todo todoMap, standby bool) { + rids, delay := t.getRidsAndDelay(todo) + if len(rids) == 0 { + return + } + if standby { + t.log.Infof("schedule restart standby resources %v in %s", rids, delay) + } else { + t.log.Infof("schedule restart resources %v in %s", rids, delay) + } + timer := time.AfterFunc(delay, func() { + t.cmdC <- cmdResourceRestart{ + rids: rids, + standby: standby, + } + }) + for _, rid := range rids { + rmon := t.state.Resources.Get(rid) + if rmon == nil { + continue + } + rmon.DecRestartRemaining() + rmon.Restart.Timer = timer + t.state.Resources.Set(rid, *rmon) + t.change = true + } +} + +// resourceRestart restarts the specified resources and updates their state in the resource monitor. +// Accepts a list of resource IDs and a boolean indicating if standby mode should be used. +// Queues the appropriate start operation and initiates a state transition. +func (t *Manager) resourceRestart(resourceRids []string, standby bool) { + now := time.Now() + rids := make([]string, 0, len(resourceRids)) + for _, rid := range resourceRids { + rmon := t.state.Resources.Get(rid) + if rmon == nil { + continue + } + rids = append(rids, rid) + rmon.Restart.LastAt = now + rmon.Restart.Timer = nil + t.state.Resources.Set(rid, *rmon) + t.change = true + } + queueFunc := t.queueResourceStart + if standby { + queueFunc = t.queueResourceStartStandby + } + action := func() error { + return queueFunc(rids) + } + t.doTransitionAction(action, instance.MonitorStateStarting, instance.MonitorStateIdle, instance.MonitorStateStartFailed) +} + +// getRidsAndDelay processes a todoMap to retrieve resource IDs and calculates the maximum required restart delay. +func (t *Manager) getRidsAndDelay(todo todoMap) ([]string, time.Duration) { + var maxDelay time.Duration + rids := make([]string, 0) + now := time.Now() + for rid := range todo { + rcfg := t.instConfig.Resources.Get(rid) + if rcfg == nil { + continue + } + rmon := t.state.Resources.Get(rid) + if rmon == nil { + continue + } + if rcfg.RestartDelay != nil { + notBefore := rmon.Restart.LastAt.Add(*rcfg.RestartDelay) + if now.Before(notBefore) { + delay := notBefore.Sub(now) + if delay > maxDelay { + maxDelay = delay + } + } + } + rids = append(rids, rid) + } + return rids, maxDelay } From b6d338bdb75e459640bcee19d7670b1a315859e4 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Wed, 8 Jan 2025 18:46:22 +0100 Subject: [PATCH 03/16] [imon] Reordered and fix the standby restart (start vs startstandby) standby resources was restarted with `start` instead of `startstandby` when local expected is started: daemon: imon: restarts: resource fs#2 status stdby down, restart remaining 2 out of 2 ... daemon: imon: restarts: schedule restart resources [fs#2] in 0s ... daemon: imon: restarts: change state idle -> starting ... daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid fs#2] ... daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid fs#2] . instead of daemon: imon: restarts: resource fs#2 status stdby down, standby restart remaining 2 out of 2 ... daemon: imon: restarts: schedule restart standby resources [fs#2] in 0s ... daemon: imon: restarts: change state idle -> starting ... daemon: imon: restarts: -> exec [/usr/bin/om restarts startstandby --local --rid fs#2] ... 
daemon: imon: restarts: <- exec [/usr/bin/om restarts startstandby --local --rid fs#2 --- daemon/imon/orchestration_resource_restart.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index d5e846913..921d95eec 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -237,6 +237,9 @@ func (t *Manager) orchestrateResourceRestart() { t.log.Debugf("resource %s restart skip: already has a delay timer", rid) case t.monitorActionCalled(): t.log.Debugf("resource %s restart skip: already ran the monitor action", rid) + case rcfg.IsStandby: + t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) + todoStandby.Add(rid) case started: t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) if rmon.Restart.Remaining == 0 { @@ -245,9 +248,6 @@ func (t *Manager) orchestrateResourceRestart() { } else { todoRestart.Add(rid) } - case rcfg.IsStandby: - t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - todoStandby.Add(rid) default: t.log.Debugf("resource %s restart skip: instance not started", rid) resetTimer(rid, rmon) From 2d3abea9a050e9c4c3c98d788f46b2f8b09fea8f Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Wed, 8 Jan 2025 23:51:34 +0100 Subject: [PATCH 04/16] [imon] Ensure eviction actions only occur for monitored resources --- daemon/imon/orchestration_resource_restart.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 921d95eec..1fb100547 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -196,7 +196,7 @@ func (t *Manager) orchestrateResourceRestart() { resetRemaining := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor) { if rmon.Restart.Remaining != rcfg.Restart { - t.log.Infof("resource %s is up, reset restart count to the max (%d -> %d)", rid, rmon.Restart.Remaining, rcfg.Restart) + t.log.Infof("resource %s: reset restart count to the max (%d -> %d)", rid, rmon.Restart.Remaining, rcfg.Restart) rmon.Restart.Remaining = rcfg.Restart // reset the last monitor action execution time, to rearm the next monitor action t.state.MonitorActionExecutedAt = time.Time{} @@ -241,11 +241,12 @@ func (t *Manager) orchestrateResourceRestart() { t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) todoStandby.Add(rid) case started: - t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - if rmon.Restart.Remaining == 0 { - t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action: %s", disableMonitorMsg) + if rmon.Restart.Remaining == 0 && rcfg.IsMonitored { + t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) + t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action evicting: %s", disableMonitorMsg) t.doMonitorAction(rid, 0) } else { + t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) 
todoRestart.Add(rid) } default: From 0df7ce23a38bc4f0ccb5284e0c958ff0bbcabd6b Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 00:02:49 +0100 Subject: [PATCH 05/16] [imon] Don't restart monitored resources when remaining attempts is 0 Previously, the restart logic incorrectly added resources to the restart list regardless of remaining attempts. Remaining attempts is 0 when restart kw is undefined. This change ensures resources are only added when remaining restarts are greater than zero, preventing unnecessary actions. --- daemon/imon/orchestration_resource_restart.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 1fb100547..ab661bcd4 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -245,7 +245,7 @@ func (t *Manager) orchestrateResourceRestart() { t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action evicting: %s", disableMonitorMsg) t.doMonitorAction(rid, 0) - } else { + } else if rmon.Restart.Remaining > 0 { t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) todoRestart.Add(rid) } From 95e34851d94bc6f0ab4d9f60369b18f7dd25cb98 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 00:19:11 +0100 Subject: [PATCH 06/16] [imon] Drop unnecessary resourceRestartSchedule call when todo is empty --- daemon/imon/orchestration_resource_restart.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index ab661bcd4..96ef8938e 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -325,8 +325,14 @@ func (t *Manager) orchestrateResourceRestart() { for rid, rstat := range t.instStatus[t.localhost].Resources { planFor(rid, rstat.Status, started) } - t.resourceRestartSchedule(todoStandby, true) - t.resourceRestartSchedule(todoRestart, false) + + // Prepare scheduled resource restart + if len(todoStandby) > 0 { + t.resourceRestartSchedule(todoStandby, true) + } + if len(todoRestart) > 0 { + t.resourceRestartSchedule(todoRestart, false) + } } // resourceRestartSchedule schedules a restart for resources based on the provided resource map and standby mode. From f69fb543db9596e69ab0312a4890542995a3ac50 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 00:49:05 +0100 Subject: [PATCH 07/16] Add Monitor flag population in GetStatus for ResourceFlagsString This ensures the Monitor flag is included when generating ResourceFlagsString, providing more accurate and informative logging. Updates will now reflect the monitored status of resources in the output. 
=> app#foo1 .M.../.4 down forking app.forking is now app#foo1 ...../.4 down forking app.forking --- core/resource/resource.go | 1 + 1 file changed, 1 insertion(+) diff --git a/core/resource/resource.go b/core/resource/resource.go index 308c9e8db..883a9b7de 100644 --- a/core/resource/resource.go +++ b/core/resource/resource.go @@ -1174,6 +1174,7 @@ func GetStatus(ctx context.Context, r Driver) Status { Log: r.StatusLog().Entries(), Provisioned: getProvisionStatus(r), Info: getStatusInfo(ctx, r), + Monitor: MonitorFlag(r.IsMonitored()), Restart: RestartFlag(r.RestartCount()), Optional: OptionalFlag(r.IsOptional()), Standby: StandbyFlag(r.IsStandby()), From f49aac71e9a5b71d8461731297bc7b1e0185e50b Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 01:42:41 +0100 Subject: [PATCH 08/16] [imon] Enhance resource restart reset logging with detailed reasons --- daemon/imon/orchestration_resource_restart.go | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 96ef8938e..ef20d6b76 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -194,9 +194,9 @@ func (t *Manager) orchestrateResourceRestart() { } } - resetRemaining := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor) { + resetRemaining := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor, reason string) { if rmon.Restart.Remaining != rcfg.Restart { - t.log.Infof("resource %s: reset restart count to the max (%d -> %d)", rid, rmon.Restart.Remaining, rcfg.Restart) + t.log.Infof("resource %s %s: reset restart count to the max (%d -> %d)", rid, reason, rmon.Restart.Remaining, rcfg.Restart) rmon.Restart.Remaining = rcfg.Restart // reset the last monitor action execution time, to rearm the next monitor action t.state.MonitorActionExecutedAt = time.Time{} @@ -205,8 +205,8 @@ func (t *Manager) orchestrateResourceRestart() { } } - resetRemainingAndTimer := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor) { - resetRemaining(rid, rcfg, rmon) + resetRemainingAndTimer := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor, reason string) { + resetRemaining(rid, rcfg, rmon, reason) resetTimer(rid, rmon) } @@ -226,13 +226,10 @@ func (t *Manager) orchestrateResourceRestart() { return case rcfg.IsDisabled: t.log.Debugf("resource %s restart skip: disable=%v", rid, rcfg.IsDisabled) - resetRemainingAndTimer(rid, rcfg, rmon) - case resStatus.Is(status.NotApplicable, status.Undef): + resetRemainingAndTimer(rid, rcfg, rmon, "is disabled") + case resStatus.Is(status.NotApplicable, status.Undef, status.Up, status.StandbyUp): t.log.Debugf("resource %s restart skip: status=%s", rid, resStatus) - resetRemainingAndTimer(rid, rcfg, rmon) - case resStatus.Is(status.Up, status.StandbyUp): - t.log.Debugf("resource %s restart skip: status=%s", rid, resStatus) - resetRemainingAndTimer(rid, rcfg, rmon) + resetRemainingAndTimer(rid, rcfg, rmon, fmt.Sprintf("status is %s", resStatus)) case rmon.Restart.Timer != nil: t.log.Debugf("resource %s restart skip: already has a delay timer", rid) case t.monitorActionCalled(): From bfaf39990e825ca49bb4e85c23d32b5448b685ca Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 03:01:59 +0100 Subject: [PATCH 09/16] [imon] Resets resource restart timers during resource monitor init Ensure that any pending resource 
restart timers are stopped when initializing the resource monitor to prevent unintended resource restart. On instance config updated, the resource monitor is re-initialized. The previous resource monitor restart timers must be stopped. --- daemon/imon/main_cmd.go | 9 +++++++++ daemon/imon/orchestration_resource_restart.go | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/daemon/imon/main_cmd.go b/daemon/imon/main_cmd.go index 30cacd754..aabf6f2df 100644 --- a/daemon/imon/main_cmd.go +++ b/daemon/imon/main_cmd.go @@ -335,6 +335,7 @@ func (t *Manager) onInstanceConfigUpdated(srcNode string, srcCmd *msgbus.Instanc } }() t.instConfig = srcCmd.Value + t.log.Debugf("refresh resource monitor states on local instance config updated") t.initResourceMonitor() janitorInstStatus(srcCmd.Value.Scope) janitorRelations(srcCmd.Value.Children, "Child", t.state.Children) @@ -1046,6 +1047,14 @@ func (t *Manager) doLastAction(action func() error, newState, successState, erro } func (t *Manager) initResourceMonitor() { + // Stop any pending restart timers before init. We may be called after + // instance config refreshed with some previous resource restart scheduled. + if t.state.Resources != nil && len(t.state.Resources) > 0 { + t.log.Infof("drop pending schedule restart resources: local instance config has been updated") + for _, rmon := range t.state.Resources { + rmon.StopRestartTimer() + } + } m := make(instance.ResourceMonitors, 0) for rid, rcfg := range t.instConfig.Resources { m[rid] = instance.ResourceMonitor{ diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index ef20d6b76..15c02555d 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -225,10 +225,10 @@ func (t *Manager) orchestrateResourceRestart() { case rmon == nil: return case rcfg.IsDisabled: - t.log.Debugf("resource %s restart skip: disable=%v", rid, rcfg.IsDisabled) + t.log.Debugf("resource %s restart skip: is disabled", rid, rcfg.IsDisabled) resetRemainingAndTimer(rid, rcfg, rmon, "is disabled") case resStatus.Is(status.NotApplicable, status.Undef, status.Up, status.StandbyUp): - t.log.Debugf("resource %s restart skip: status=%s", rid, resStatus) + t.log.Debugf("resource %s restart skip: status is %s", rid, resStatus) resetRemainingAndTimer(rid, rcfg, rmon, fmt.Sprintf("status is %s", resStatus)) case rmon.Restart.Timer != nil: t.log.Debugf("resource %s restart skip: already has a delay timer", rid) From 4fff23e31e8a6ceee14e13c24f1944e35c64f518 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 14:57:47 +0100 Subject: [PATCH 10/16] [imon] Re-introduce monitor action for standby resources --- daemon/imon/orchestration_resource_restart.go | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 15c02555d..f0f30e14e 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -234,17 +234,19 @@ func (t *Manager) orchestrateResourceRestart() { t.log.Debugf("resource %s restart skip: already has a delay timer", rid) case t.monitorActionCalled(): t.log.Debugf("resource %s restart skip: already ran the monitor action", rid) - case rcfg.IsStandby: - t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - todoStandby.Add(rid) - case started: + 
case rcfg.IsStandby || started: if rmon.Restart.Remaining == 0 && rcfg.IsMonitored { t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action evicting: %s", disableMonitorMsg) t.doMonitorAction(rid, 0) } else if rmon.Restart.Remaining > 0 { - t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - todoRestart.Add(rid) + if rcfg.IsStandby { + t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) + todoStandby.Add(rid) + } else { + t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) + todoRestart.Add(rid) + } } default: t.log.Debugf("resource %s restart skip: instance not started", rid) From 38c398a8de4be40a2ea6f30463e7be090f11c077 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 15:14:42 +0100 Subject: [PATCH 11/16] [imon] Add validation and alerting for monitor actions (none is skipped) Introduced `initialMonitorAction` value that is created/refreshed during initResourceMonitor func. It is called during initial imon startup or on local instance config update. initResourceMonitor func validates config keywords: `monitor_action` and `monitor` and may log warning messages: resource %s is monitored, but monitor action is none unsupported monitor action: %s --- daemon/imon/main.go | 5 ++ daemon/imon/main_cmd.go | 13 ++++ daemon/imon/orchestration_resource_restart.go | 68 +++++++++++-------- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/daemon/imon/main.go b/daemon/imon/main.go index bb513b60d..383f93596 100644 --- a/daemon/imon/main.go +++ b/daemon/imon/main.go @@ -130,6 +130,11 @@ type ( // It is used during enableDelayTimer(): // When false the delay timer is reset with delayDuration delayTimerEnabled bool + + // initialMonitorAction specifies the initial (stage 0) monitor action + // for monitoring as defined by the MonitorAction type. + // Its Value is created/refreshed during func initResourceMonitor. 
+ initialMonitorAction instance.MonitorAction } // cmdOrchestrate can be used from post action go routines diff --git a/daemon/imon/main_cmd.go b/daemon/imon/main_cmd.go index aabf6f2df..61fe18702 100644 --- a/daemon/imon/main_cmd.go +++ b/daemon/imon/main_cmd.go @@ -1055,6 +1055,15 @@ func (t *Manager) initResourceMonitor() { rmon.StopRestartTimer() } } + + if monitorAction, ok := t.getValidMonitorAction(0); !ok { + t.initialMonitorAction = instance.MonitorActionNone + } else { + t.initialMonitorAction = monitorAction + } + + hasMonitorActionNone := t.initialMonitorAction == instance.MonitorActionNone + m := make(instance.ResourceMonitors, 0) for rid, rcfg := range t.instConfig.Resources { m[rid] = instance.ResourceMonitor{ @@ -1062,8 +1071,12 @@ func (t *Manager) initResourceMonitor() { Remaining: rcfg.Restart, }, } + if rcfg.IsMonitored && hasMonitorActionNone { + t.log.Warnf("resource %s is monitored, but monitor action is none", rid) + } } t.state.Resources = m + t.change = true } diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index f0f30e14e..8bd243a19 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -91,47 +91,29 @@ func (t *Manager) monitorActionCalled() bool { return !t.state.MonitorActionExecutedAt.IsZero() } -func (t *Manager) doMonitorAction(rid string, stage int) { +func (t *Manager) doMonitorAction(rid string, action instance.MonitorAction) { t.state.MonitorActionExecutedAt = time.Now() - monitorActionCount := len(t.instConfig.MonitorAction) - if monitorActionCount < stage+1 { - t.log.Errorf("skip monitor action: stage %d action no longer configured", stage+1) + if !t.isValidMonitorAction(action) { return } - - monitorAction := t.instConfig.MonitorAction[stage] - - switch monitorAction { - case instance.MonitorActionCrash: - case instance.MonitorActionFreezeStop: - case instance.MonitorActionReboot: - case instance.MonitorActionSwitch: - case instance.MonitorActionNone: - t.log.Infof("skip monitor action: not configured") - return - default: - t.log.Errorf("skip monitor action: not supported: %s", monitorAction) - return - } - if err := t.doPreMonitorAction(); err != nil { t.log.Errorf("pre monitor action: %s", err) } - t.log.Infof("do monitor action %d/%d: %s", stage+1, len(t.instConfig.MonitorAction), monitorAction) - t.pubMonitorAction(rid, monitorAction) + t.log.Infof("do monitor action: %s", action) + t.pubMonitorAction(rid, action) - switch monitorAction { + switch action { case instance.MonitorActionCrash: if err := toc.Crash(); err != nil { - t.log.Errorf("monitor action: %s", err) + t.log.Errorf("monitor action %s: %s", action, err) } case instance.MonitorActionFreezeStop: t.doFreezeStop() t.doStop() case instance.MonitorActionReboot: if err := toc.Reboot(); err != nil { - t.log.Errorf("monitor action: %s", err) + t.log.Errorf("monitor action %s: %s", action, err) } case instance.MonitorActionSwitch: t.createPendingWithDuration(stopDuration) @@ -235,10 +217,10 @@ func (t *Manager) orchestrateResourceRestart() { case t.monitorActionCalled(): t.log.Debugf("resource %s restart skip: already ran the monitor action", rid) case rcfg.IsStandby || started: - if rmon.Restart.Remaining == 0 && rcfg.IsMonitored { + if rmon.Restart.Remaining == 0 && rcfg.IsMonitored && t.initialMonitorAction != instance.MonitorActionNone { t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) 
t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action evicting: %s", disableMonitorMsg) - t.doMonitorAction(rid, 0) + t.doMonitorAction(rid, t.initialMonitorAction) } else if rmon.Restart.Remaining > 0 { if rcfg.IsStandby { t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) @@ -285,8 +267,12 @@ func (t *Manager) orchestrateResourceRestart() { } if t.state.LocalExpect == instance.MonitorLocalExpectEvicted && t.state.State == instance.MonitorStateStopFailed { - t.disableMonitor("orchestrate resource restart recover from evicted and stop failed") - t.doMonitorAction("", 1) + if action, ok := t.getValidMonitorAction(1); ok { + t.disableMonitor("initial monitor action failed, try alternate monitor action %s", action) + t.doMonitorAction("", action) + } else { + t.disableMonitor("initial monitor action failed, no alternate monitor action") + } } // don't run on frozen instances @@ -418,3 +404,27 @@ func (t *Manager) getRidsAndDelay(todo todoMap) ([]string, time.Duration) { } return rids, maxDelay } + +func (t *Manager) getValidMonitorAction(stage int) (action instance.MonitorAction, ok bool) { + if stage >= len(t.instConfig.MonitorAction) { + return + } + action = t.instConfig.MonitorAction[stage] + ok = t.isValidMonitorAction(action) + return +} + +func (t *Manager) isValidMonitorAction(action instance.MonitorAction) bool { + switch action { + case instance.MonitorActionCrash, + instance.MonitorActionFreezeStop, + instance.MonitorActionReboot, + instance.MonitorActionSwitch: + return true + case instance.MonitorActionNone: + return false + default: + t.log.Errorf("unsupported monitor action: %s", action) + return false + } +} From 2897b4513ded4781eb3d6fc77a0445aa0565cda4 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Thu, 9 Jan 2025 15:55:44 +0100 Subject: [PATCH 12/16] [imon] Add support for the "no-op" monitor action Introduces a new "no-op" action for monitored resources, allowing a no-operation behavior while setting the state to 'evicted'. This can be useful for demonstration purposes or cases where no action is required. --- core/instance/monitor.go | 25 ++++++++++++++++--- core/object/text/kw/core/monitor_action | 14 +++++++++-- core/resource/resource.go | 13 +++++----- daemon/imon/orchestration_resource_restart.go | 3 ++- 4 files changed, 42 insertions(+), 13 deletions(-) diff --git a/core/instance/monitor.go b/core/instance/monitor.go index d71fb66f1..8c1ab035d 100644 --- a/core/instance/monitor.go +++ b/core/instance/monitor.go @@ -303,12 +303,29 @@ var ( ErrSameGlobalExpect = errors.New("instance monitor global expect is already set to the same value") ErrSameLocalExpect = errors.New("instance monitor local expect is already set to the same value") ErrSameState = errors.New("instance monitor state is already set to the same value") +) + +var ( + // MonitorActionNone: monitor action is disabled. + MonitorActionNone MonitorAction = "none" + + // MonitorActionCrash represents the monitor action that will try system crash/panic + MonitorActionCrash MonitorAction = "crash" - MonitorActionNone MonitorAction = "none" - MonitorActionCrash MonitorAction = "crash" + // MonitorActionFreezeStop represents the monitor action that will try freeze and subsequently stop + // the monitored instance. 
 	MonitorActionFreezeStop MonitorAction = "freezestop"
-	MonitorActionReboot     MonitorAction = "reboot"
-	MonitorActionSwitch     MonitorAction = "switch"
+
+	// MonitorActionReboot represents the monitor action that will reboot the system.
+	MonitorActionReboot MonitorAction = "reboot"
+
+	// MonitorActionSwitch represents the monitor action that stops the instance to allow
+	// any other cluster node to take it over.
+	MonitorActionSwitch MonitorAction = "switch"
+
+	// MonitorActionNoOp represents the no-operation behavior while setting the state to 'evicted'.
+	// This can be useful for demonstration purposes or cases where no action is required.
+	MonitorActionNoOp MonitorAction = "no-op"
 )
 
 func (t MonitorState) Is(states ...MonitorState) bool {
diff --git a/core/object/text/kw/core/monitor_action b/core/object/text/kw/core/monitor_action
index bb0312d0a..51f294c2e 100644
--- a/core/object/text/kw/core/monitor_action
+++ b/core/object/text/kw/core/monitor_action
@@ -1,7 +1,7 @@
 The action to trigger when a monitored resource is no longer in the "up" or
 "standby up" state, and all restart attempts for the resource have failed.
 
-The reboot and crash monitor actions do not attempt to cleanly stop any
+The `reboot` and `crash` monitor actions do not attempt to cleanly stop any
 processes. On Linux, they utilize system-level sysrq triggers.
 
 This behavior is designed to ensure that the host stops writing to shared
@@ -10,5 +10,15 @@ is critical because a failover node is likely preparing to write to the same
 shared disks.
 
 You can append a fallback monitor action to this keyword. A common example
-is "freezestop reboot". In this case, the reboot action will be executed
+is `freezestop reboot`. In this case, the reboot action will be executed
 if the stop fails or times out.
+
+Other monitor_action values:
+ - `none`: the default value; the monitor action is disabled (the `monitor` keyword
+   must also be `false` or undefined).
+ - `freezestop`: freeze and then stop the monitored instance.
+ - `switch`: stop the monitored instance to allow any other cluster node to
+   take over the instance.
+ - `no-op`: the monitor action is called but does nothing. It
+   may be used for demonstration. The final local expect after the call will
+   be set to `evicted`. 
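For illustration only, a hypothetical object configuration using the fallback syntax described by this keyword text (the section layout and values are invented for the example, not taken from this patch):

    [DEFAULT]
    monitor_action = freezestop reboot

    [app#foo1]
    monitor = true
    restart = 4

With such a sketch, the daemon would first try `freezestop` when `app#foo1` exhausts its restart attempts, and fall back to `reboot` if the stop fails or times out, as the keyword text above describes.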
diff --git a/core/resource/resource.go b/core/resource/resource.go index 883a9b7de..412bad002 100644 --- a/core/resource/resource.go +++ b/core/resource/resource.go @@ -1174,12 +1174,13 @@ func GetStatus(ctx context.Context, r Driver) Status { Log: r.StatusLog().Entries(), Provisioned: getProvisionStatus(r), Info: getStatusInfo(ctx, r), - Monitor: MonitorFlag(r.IsMonitored()), - Restart: RestartFlag(r.RestartCount()), - Optional: OptionalFlag(r.IsOptional()), - Standby: StandbyFlag(r.IsStandby()), - Disable: DisableFlag(r.IsDisabled()), - Encap: EncapFlag(r.IsEncap()), + + Monitor: MonitorFlag(r.IsMonitored()), + Restart: RestartFlag(r.RestartCount()), + Optional: OptionalFlag(r.IsOptional()), + Standby: StandbyFlag(r.IsStandby()), + Disable: DisableFlag(r.IsDisabled()), + Encap: EncapFlag(r.IsEncap()), } } diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 8bd243a19..94fcdbcdc 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -419,7 +419,8 @@ func (t *Manager) isValidMonitorAction(action instance.MonitorAction) bool { case instance.MonitorActionCrash, instance.MonitorActionFreezeStop, instance.MonitorActionReboot, - instance.MonitorActionSwitch: + instance.MonitorActionSwitch, + instance.MonitorActionNoOp: return true case instance.MonitorActionNone: return false From d95b2a99f8ee7548d2d759f1e2f1d29c332dc948 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Fri, 10 Jan 2025 00:10:02 +0100 Subject: [PATCH 13/16] [daemonapi] log info level when monitor action are miss configured - unusable monitor action: resource %s is monitored, but monitor action is none - unsupported monitor action: %s --- daemon/imon/main_cmd.go | 2 +- daemon/imon/orchestration_resource_restart.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/daemon/imon/main_cmd.go b/daemon/imon/main_cmd.go index 61fe18702..4731a925d 100644 --- a/daemon/imon/main_cmd.go +++ b/daemon/imon/main_cmd.go @@ -1072,7 +1072,7 @@ func (t *Manager) initResourceMonitor() { }, } if rcfg.IsMonitored && hasMonitorActionNone { - t.log.Warnf("resource %s is monitored, but monitor action is none", rid) + t.log.Infof("unusable monitor action: resource %s is monitored, but monitor action is none", rid) } } t.state.Resources = m diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index 94fcdbcdc..affb4373b 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -425,7 +425,7 @@ func (t *Manager) isValidMonitorAction(action instance.MonitorAction) bool { case instance.MonitorActionNone: return false default: - t.log.Errorf("unsupported monitor action: %s", action) + t.log.Infof("unsupported monitor action: %s", action) return false } } From 10ae0dbf9a7759678fe8cff74840d6bb89195f90 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Fri, 10 Jan 2025 00:44:38 +0100 Subject: [PATCH 14/16] [imon] Fix non restarted resource after 1 fixed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We now use 2 dedicated timers and maps for standby and non-standby resources. Replaced individual resource timers with centralized timers for standby and non-standby resources to improve clarity and reduce redundancy. Added mechanisms to log and reset scheduled restarts, ensuring better state consistency. Bug Example (app#foo? 
needs 4 start retries to succeed) Jan 09 19:34:07.053 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 4 out of 4 ... Jan 09 19:34:07.053 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 4 out of 4 ... Jan 09 19:34:07.053 │INF│ daemon: imon: restarts: schedule restart resources [app#foo1 app#foo3] in 0s ... Jan 09 19:34:07.054 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 19:34:07.158 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid app#foo1,app#foo3] ... Jan 09 19:34:07.235 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid app#foo1,app#foo3] ... Jan 09 19:34:07.235 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 19:34:07.487 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 3 out of 4 ... Jan 09 19:34:07.487 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 3 out of 4 ... Jan 09 19:34:07.487 │INF│ daemon: imon: restarts: schedule restart resources [app#foo3 app#foo1] in 19.566563857s ... ---- start app#foo1 and post status Jan 09 19:34:16.737 │INF│ daemon: imon: restarts: resource app#foo1 status is up: reset restart count to the max (2 -> 4) ... Jan 09 19:34:16.737 │INF│ daemon: imon: restarts: resource app#foo1 is up, reset delayed restart ... ---- the schedule restart resources of app#foo3 will be never called After the fix: Example where some resources are fixed by sysadmin, during a resource restart orchestration (app#foo? needs 4 start retries to succeed) Jan 09 22:49:53.966 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 4 out of 4 ... Jan 09 22:49:53.966 │INF│ daemon: imon: restarts: resource app#foo2 status stdby down, standby restart remaining 6 out of 6 ... Jan 09 22:49:53.966 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 4 out of 4 ... Jan 09 22:49:53.966 │INF│ daemon: imon: restarts: schedule restart standby resources [app#foo2] in 0s ... Jan 09 22:49:53.966 │INF│ daemon: imon: restarts: schedule restart resources [app#foo1 app#foo3] in 0s ... Jan 09 22:49:53.967 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:49:54.122 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid app#foo1,app#foo3] ... Jan 09 22:49:54.197 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid app#foo1,app#foo3] ... Jan 09 22:49:54.198 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:49:54.198 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:49:54.321 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:49:54.391 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:49:54.392 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:49:54.643 │INF│ daemon: imon: restarts: resource app#foo2 status stdby down, standby restart remaining 5 out of 6 ... Jan 09 22:49:54.643 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 3 out of 4 ... Jan 09 22:49:54.643 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 3 out of 4 ... Jan 09 22:49:54.644 │INF│ daemon: imon: restarts: schedule restart standby resources [app#foo2] in 9.554244155s ... 
Jan 09 22:49:54.644 │INF│ daemon: imon: restarts: schedule restart resources [app#foo3 app#foo1] in 9.322901395s ... ... manual start foo#1 and post status Jan 09 22:49:58.990 │INF│ daemon: imon: restarts: resource app#foo1 status is up: reset restart count to the max (2 -> 4) ... Jan 09 22:49:58.990 │INF│ daemon: imon: restarts: resource app#foo1 status is up, reset delayed restart ... ... manual start foo#2 that is standby and post status Jan 09 22:50:02.834 │INF│ daemon: imon: restarts: resource app#foo2 status is stdby up: reset restart count to the max (4 -> 6) ... Jan 09 22:50:02.834 │INF│ daemon: imon: restarts: resource app#foo2 status is stdby up, reset delayed restart standby ... Jan 09 22:50:02.834 │INF│ daemon: imon: restarts: reset scheduled restart standby resources ... ... the scheduled restart standby resources have been reset (no more standby resources need restart) Jan 09 22:50:03.967 │INF│ daemon: imon: restarts: skip resource restart app#foo1: not anymore candidate ... Jan 09 22:50:03.967 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:04.122 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid app#foo3] ... Jan 09 22:50:04.185 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid app#foo3] ... Jan 09 22:50:04.186 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:04.436 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 2 out of 4 ... Jan 09 22:50:04.437 │INF│ daemon: imon: restarts: schedule restart resources [app#foo3] in 9.530711539s ... ... manual start foo#3 and post status Jan 09 22:50:09.941 │INF│ daemon: imon: restarts: resource app#foo3 status is up: reset restart count to the max (1 -> 4) ... Jan 09 22:50:09.941 │INF│ daemon: imon: restarts: resource app#foo3 status is up, reset delayed restart ... Jan 09 22:50:09.941 │INF│ daemon: imon: restarts: reset scheduled restart resources ... ... the scheduled restart resources have been reset (no more resources need restart) After the fix: Example off full automatic resource orchestration (app#foo? needs 4 start retries to succeed) Jan 09 22:50:25.523 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 4 out of 4 ... Jan 09 22:50:25.523 │INF│ daemon: imon: restarts: resource app#foo2 status stdby down, standby restart remaining 6 out of 6 ... Jan 09 22:50:25.524 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 4 out of 4 ... Jan 09 22:50:25.525 │INF│ daemon: imon: restarts: schedule restart standby resources [app#foo2] in 0s ... Jan 09 22:50:25.525 │INF│ daemon: imon: restarts: schedule restart resources [app#foo1 app#foo3] in 0s ... Jan 09 22:50:25.526 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:25.722 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:50:25.786 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:50:25.786 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:25.786 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:25.921 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid app#foo1,app#foo3] ... Jan 09 22:50:26.000 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid app#foo1,app#foo3] ... 
Jan 09 22:50:26.001 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:26.037 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 3 out of 4 ... Jan 09 22:50:26.037 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 3 out of 4 ... Jan 09 22:50:26.037 │INF│ daemon: imon: restarts: resource app#foo2 status stdby down, standby restart remaining 5 out of 6 ... Jan 09 22:50:26.037 │INF│ daemon: imon: restarts: schedule restart standby resources [app#foo2] in 9.488514985s ... Jan 09 22:50:26.037 │INF│ daemon: imon: restarts: schedule restart resources [app#foo3 app#foo1] in 9.748987131s ... Jan 09 22:50:35.527 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:35.721 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:50:35.783 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:50:35.784 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:35.787 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:35.922 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid app#foo3,app#foo1] ... Jan 09 22:50:36.002 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid app#foo3,app#foo1] ... Jan 09 22:50:36.002 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:36.034 │INF│ daemon: imon: restarts: resource app#foo2 status stdby down, standby restart remaining 4 out of 6 ... Jan 09 22:50:36.034 │INF│ daemon: imon: restarts: resource app#foo3 status down, restart remaining 2 out of 4 ... Jan 09 22:50:36.035 │INF│ daemon: imon: restarts: resource app#foo1 status down, restart remaining 2 out of 4 ... Jan 09 22:50:36.035 │INF│ daemon: imon: restarts: schedule restart standby resources [app#foo2] in 9.492020723s ... Jan 09 22:50:36.035 │INF│ daemon: imon: restarts: schedule restart resources [app#foo3 app#foo1] in 9.751725604s ... Jan 09 22:50:45.527 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:45.722 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:50:45.789 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts startstandby --local --rid app#foo2] ... Jan 09 22:50:45.789 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:45.790 │INF│ daemon: imon: restarts: change state idle -> starting ... Jan 09 22:50:45.922 │INF│ daemon: imon: restarts: -> exec [/usr/bin/om restarts start --local --rid app#foo3,app#foo1] ... Jan 09 22:50:46.004 │INF│ daemon: imon: restarts: <- exec [/usr/bin/om restarts start --local --rid app#foo3,app#foo1] ... Jan 09 22:50:46.005 │INF│ daemon: imon: restarts: change state starting -> idle ... Jan 09 22:50:46.040 │INF│ daemon: imon: restarts: resource app#foo3 status is up: reset restart count to the max (1 -> 4) ... Jan 09 22:50:46.040 │INF│ daemon: imon: restarts: resource app#foo1 status is up: reset restart count to the max (1 -> 4) ... Jan 09 22:50:46.040 │INF│ daemon: imon: restarts: resource app#foo2 status is stdby up: reset restart count to the max (3 -> 6) ... 
``` --- core/instance/monitor.go | 15 +-- daemon/imon/main.go | 16 +++ daemon/imon/main_cmd.go | 20 ++- daemon/imon/orchestration_resource_restart.go | 120 +++++++++++++----- 4 files changed, 121 insertions(+), 50 deletions(-) diff --git a/core/instance/monitor.go b/core/instance/monitor.go index 8c1ab035d..2296a0569 100644 --- a/core/instance/monitor.go +++ b/core/instance/monitor.go @@ -78,9 +78,8 @@ type ( Restart ResourceMonitorRestart `json:"restart"` } ResourceMonitorRestart struct { - Remaining int `json:"remaining"` - LastAt time.Time `json:"last_at"` - Timer *time.Timer `json:"-"` + Remaining int `json:"remaining"` + LastAt time.Time `json:"last_at"` } MonitorState int @@ -434,16 +433,6 @@ func (rmon *ResourceMonitor) DecRestartRemaining() { } } -func (rmon *ResourceMonitor) StopRestartTimer() bool { - if rmon.Restart.Timer == nil { - return false - } else { - rmon.Restart.Timer.Stop() - rmon.Restart.Timer = nil - return true - } -} - func (m ResourceMonitors) Set(rid string, rmon ResourceMonitor) { m[rid] = rmon } diff --git a/daemon/imon/main.go b/daemon/imon/main.go index 383f93596..ba056bee1 100644 --- a/daemon/imon/main.go +++ b/daemon/imon/main.go @@ -135,6 +135,22 @@ type ( // for monitoring as defined by the MonitorAction type. // Its Value is created/refreshed during func initResourceMonitor. initialMonitorAction instance.MonitorAction + + // resourceRestartTimer represents a timer used to schedule the restart + // of non-standby resources. + resourceRestartTimer *time.Timer + + // resourceStandbyRestartTimer is a timer used to schedule the restart + // of standby resources. + resourceStandbyRestartTimer *time.Timer + + // resourceWithRestartScheduled maintains a mapping of non-standby + // resource identifiers to their restart scheduled as boolean values. + resourceWithRestartScheduled map[string]bool + + // resourceStandbyWithRestartScheduled maintains a mapping of standby + // resource identifiers to their restart scheduled as boolean values. + resourceStandbyWithRestartScheduled map[string]bool } // cmdOrchestrate can be used from post action go routines diff --git a/daemon/imon/main_cmd.go b/daemon/imon/main_cmd.go index 4731a925d..4d7a3afca 100644 --- a/daemon/imon/main_cmd.go +++ b/daemon/imon/main_cmd.go @@ -1049,12 +1049,24 @@ func (t *Manager) doLastAction(action func() error, newState, successState, erro func (t *Manager) initResourceMonitor() { // Stop any pending restart timers before init. We may be called after // instance config refreshed with some previous resource restart scheduled. 
- if t.state.Resources != nil && len(t.state.Resources) > 0 { - t.log.Infof("drop pending schedule restart resources: local instance config has been updated") - for _, rmon := range t.state.Resources { - rmon.StopRestartTimer() + t.resetResourceMonitorTimers() + + logDropped := func(l map[string]bool, comment string) { + if l != nil { + dropped := make([]string, 0) + for rid := range l { + dropped = append(dropped, rid) + } + if len(dropped) > 0 { + t.log.Infof("instance config has been updated: drop previously scheduled restart %s %v", comment, dropped) + } } } + logDropped(t.resourceWithRestartScheduled, "resources") + logDropped(t.resourceStandbyWithRestartScheduled, "standby resources") + + t.resourceWithRestartScheduled = make(map[string]bool) + t.resourceStandbyWithRestartScheduled = make(map[string]bool) if monitorAction, ok := t.getValidMonitorAction(0); !ok { t.initialMonitorAction = instance.MonitorActionNone diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index affb4373b..eec119e27 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -166,13 +166,23 @@ func (t *Manager) orchestrateResourceRestart() { todoRestart := newTodoMap() todoStandby := newTodoMap() - resetTimer := func(rid string, rmon *instance.ResourceMonitor) { - todoRestart.Del(rid) - todoStandby.Del(rid) - if rmon.Restart.Timer != nil { - t.log.Infof("resource %s is up, reset delayed restart", rid) - t.change = rmon.StopRestartTimer() - t.state.Resources.Set(rid, *rmon) + dropScheduled := func(rid string, standby bool, reason string) { + var scheduled map[string]bool + var msg string + if standby { + scheduled = t.resourceStandbyWithRestartScheduled + msg = "delayed restart standby" + } else { + scheduled = t.resourceWithRestartScheduled + msg = "delayed restart" + } + if _, ok := scheduled[rid]; ok { + delete(scheduled, rid) + t.change = true + t.log.Infof("resource %s %s, reset %s", rid, reason, msg) + if len(scheduled) == 0 { + t.resetResourceMonitorTimer(standby) + } } } @@ -189,18 +199,16 @@ func (t *Manager) orchestrateResourceRestart() { resetRemainingAndTimer := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor, reason string) { resetRemaining(rid, rcfg, rmon, reason) - resetTimer(rid, rmon) - } - - resetTimers := func() { - for rid, rmon := range t.state.Resources { - resetTimer(rid, &rmon) + if rcfg != nil { + dropScheduled(rid, rcfg.IsStandby, reason) } } planFor := func(rid string, resStatus status.T, started bool) { rcfg := t.instConfig.Resources.Get(rid) rmon := t.state.Resources.Get(rid) + _, aleadyScheduled := t.resourceWithRestartScheduled[rid] + _, aleadyScheduledStandby := t.resourceStandbyWithRestartScheduled[rid] switch { case rcfg == nil: return @@ -212,8 +220,10 @@ func (t *Manager) orchestrateResourceRestart() { case resStatus.Is(status.NotApplicable, status.Undef, status.Up, status.StandbyUp): t.log.Debugf("resource %s restart skip: status is %s", rid, resStatus) resetRemainingAndTimer(rid, rcfg, rmon, fmt.Sprintf("status is %s", resStatus)) - case rmon.Restart.Timer != nil: - t.log.Debugf("resource %s restart skip: already has a delay timer", rid) + case aleadyScheduledStandby && rcfg.IsStandby: + t.log.Debugf("resource %s restart skipped: already registered for restart standby", rid) + case aleadyScheduled && !rcfg.IsStandby: + t.log.Debugf("resource %s restart skipped: already registered for restart", rid) case t.monitorActionCalled(): 
t.log.Debugf("resource %s restart skip: already ran the monitor action", rid) case rcfg.IsStandby || started: @@ -231,8 +241,7 @@ func (t *Manager) orchestrateResourceRestart() { } } default: - t.log.Debugf("resource %s restart skip: instance not started", rid) - resetTimer(rid, rmon) + dropScheduled(rid, rcfg.IsStandby, "not standby or instance is not started") } } @@ -250,19 +259,22 @@ func (t *Manager) orchestrateResourceRestart() { // discard if the instance status does not exist if _, ok := t.instStatus[t.localhost]; !ok { - resetTimers() + t.log.Errorf("skip restart: missing instance status") + t.resetResourceMonitorTimers() return } // don't run on frozen nodes if t.nodeStatus[t.localhost].IsFrozen() { - resetTimers() + t.log.Errorf("skip restart: node is frozen") + t.resetResourceMonitorTimers() return } // don't run when the node is not idle if t.nodeMonitor[t.localhost].State != node.MonitorStateIdle { - resetTimers() + t.log.Errorf("skip restart: node is %s", t.nodeMonitor[t.localhost].State) + t.resetResourceMonitorTimers() return } @@ -277,22 +289,23 @@ func (t *Manager) orchestrateResourceRestart() { // don't run on frozen instances if t.instStatus[t.localhost].IsFrozen() { - resetTimers() + t.log.Errorf("skip restart: instance is frozen") + t.resetResourceMonitorTimers() return } // discard not provisioned if instanceStatus := t.instStatus[t.localhost]; instanceStatus.Provisioned.IsOneOf(provisioned.False, provisioned.Mixed, provisioned.Undef) { - t.log.Debugf("skip restart: provisioned=%s", instanceStatus.Provisioned) - resetTimers() + t.log.Errorf("skip restart: provisioned is %s", instanceStatus.Provisioned) + t.resetResourceMonitorTimers() return } // discard if the instance has no monitor data instMonitor, ok := t.GetInstanceMonitor(t.localhost) if !ok { - t.log.Debugf("skip restart: no instance monitor") - resetTimers() + t.log.Errorf("skip restart: no instance monitor") + t.resetResourceMonitorTimers() return } @@ -323,31 +336,37 @@ func (t *Manager) orchestrateResourceRestart() { // resourceRestartSchedule schedules a restart for resources based on the provided resource map and standby mode. // It updates the state of resources with associated restart timers and logs the operation. func (t *Manager) resourceRestartSchedule(todo todoMap, standby bool) { + var scheduled map[string]bool rids, delay := t.getRidsAndDelay(todo) if len(rids) == 0 { return } + onTimer := func() { + t.cmdC <- cmdResourceRestart{ + rids: rids, + standby: standby, + } + } if standby { t.log.Infof("schedule restart standby resources %v in %s", rids, delay) + t.resourceStandbyRestartTimer = time.AfterFunc(delay, onTimer) + scheduled = t.resourceStandbyWithRestartScheduled } else { t.log.Infof("schedule restart resources %v in %s", rids, delay) + t.resourceRestartTimer = time.AfterFunc(delay, onTimer) + scheduled = t.resourceWithRestartScheduled } - timer := time.AfterFunc(delay, func() { - t.cmdC <- cmdResourceRestart{ - rids: rids, - standby: standby, - } - }) for _, rid := range rids { rmon := t.state.Resources.Get(rid) if rmon == nil { continue } rmon.DecRestartRemaining() - rmon.Restart.Timer = timer t.state.Resources.Set(rid, *rmon) t.change = true + scheduled[rid] = true } + } // resourceRestart restarts the specified resources and updates their state in the resource monitor. 
@@ -356,16 +375,34 @@ func (t *Manager) resourceRestartSchedule(todo todoMap, standby bool) { func (t *Manager) resourceRestart(resourceRids []string, standby bool) { now := time.Now() rids := make([]string, 0, len(resourceRids)) + var scheduled map[string]bool + var skipMessage string + if standby { + skipMessage = "skip resource restart standby" + scheduled = t.resourceStandbyWithRestartScheduled + } else { + skipMessage = "skip resource restart" + scheduled = t.resourceWithRestartScheduled + } for _, rid := range resourceRids { rmon := t.state.Resources.Get(rid) if rmon == nil { continue } + + if _, ok := scheduled[rid]; !ok { + t.log.Infof("%s %s: not anymore candidate", skipMessage, rid) + continue + } rids = append(rids, rid) rmon.Restart.LastAt = now - rmon.Restart.Timer = nil t.state.Resources.Set(rid, *rmon) t.change = true + delete(scheduled, rid) + } + if len(rids) == 0 { + t.log.Infof("%s: no more candidates", skipMessage) + return } queueFunc := t.queueResourceStart if standby { @@ -429,3 +466,20 @@ func (t *Manager) isValidMonitorAction(action instance.MonitorAction) bool { return false } } + +func (t *Manager) resetResourceMonitorTimers() { + t.resetResourceMonitorTimer(true) + t.resetResourceMonitorTimer(false) +} + +func (t *Manager) resetResourceMonitorTimer(standby bool) { + if standby && t.resourceStandbyRestartTimer != nil { + t.log.Infof("reset scheduled restart standby resources") + t.resourceStandbyRestartTimer.Stop() + t.resourceStandbyRestartTimer = nil + } else if !standby && t.resourceRestartTimer != nil { + t.log.Infof("reset scheduled restart resources") + t.resourceRestartTimer.Stop() + t.resourceRestartTimer = nil + } +} From 5b9b08c83e6bdc9958e4666a5b36ed6179b39a2f Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Fri, 10 Jan 2025 20:25:45 +0100 Subject: [PATCH 15/16] [imon] Refactor with orchestrationResource Replaced individual timers and scheduling maps with a unified orchestrationResource struct for better modularity and maintainability. --- daemon/imon/main.go | 28 +- daemon/imon/main_cmd.go | 20 +- daemon/imon/orchestration_resource_restart.go | 330 ++++++++++-------- 3 files changed, 211 insertions(+), 167 deletions(-) diff --git a/daemon/imon/main.go b/daemon/imon/main.go index ba056bee1..d37136919 100644 --- a/daemon/imon/main.go +++ b/daemon/imon/main.go @@ -136,21 +136,11 @@ type ( // Its Value is created/refreshed during func initResourceMonitor. initialMonitorAction instance.MonitorAction - // resourceRestartTimer represents a timer used to schedule the restart - // of non-standby resources. - resourceRestartTimer *time.Timer + // standbyResourceOrchestrate is the orchestrationResource for standby resources + standbyResourceOrchestrate orchestrationResource - // resourceStandbyRestartTimer is a timer used to schedule the restart - // of standby resources. - resourceStandbyRestartTimer *time.Timer - - // resourceWithRestartScheduled maintains a mapping of non-standby - // resource identifiers to their restart scheduled as boolean values. - resourceWithRestartScheduled map[string]bool - - // resourceStandbyWithRestartScheduled maintains a mapping of standby - // resource identifiers to their restart scheduled as boolean values. 
-		resourceStandbyWithRestartScheduled map[string]bool
+		// regularResourceOrchestrate is the orchestrationResource for regular resources
+		regularResourceOrchestrate orchestrationResource
 	}
 
 	// cmdOrchestrate can be used from post action go routines
@@ -245,6 +235,9 @@ func start(parent context.Context, qs pubsub.QueueSizer, p naming.Path, nodes []
 	}
 	t.log = t.newLogger(uuid.Nil)
 
+	t.regularResourceOrchestrate.log = t.newResourceLogger("regular resource")
+	t.standbyResourceOrchestrate.log = t.newResourceLogger("standby resource")
+
 	t.startSubscriptions(qs)
 
 	go func() {
@@ -254,6 +247,12 @@ func start(parent context.Context, qs pubsub.QueueSizer, p naming.Path, nodes []
 	return nil
 }
 
+func (t *Manager) newResourceLogger(s string) *plog.Logger {
+	return naming.LogWithPath(plog.NewDefaultLogger(), t.path).
+		Attr("pkg", "daemon/imon").
+		WithPrefix(fmt.Sprintf("daemon: imon: %s: %s: ", t.path.String(), s))
+}
+
 func (t *Manager) newLogger(i uuid.UUID) *plog.Logger {
 	return naming.LogWithPath(plog.NewDefaultLogger(), t.path).
 		Attr("pkg", "daemon/imon").
@@ -261,6 +260,7 @@ func (t *Manager) newLogger(i uuid.UUID) *plog.Logger {
 		WithPrefix(fmt.Sprintf("daemon: imon: %s: ", t.path.String()))
 }
 
+
 func (t *Manager) startSubscriptions(qs pubsub.QueueSizer) {
 	sub := t.pubsubBus.Sub("daemon.imon "+t.id, qs)
 	sub.AddFilter(&msgbus.NodeConfigUpdated{}, t.labelLocalhost)
diff --git a/daemon/imon/main_cmd.go b/daemon/imon/main_cmd.go
index 4d7a3afca..a6433d46b 100644
--- a/daemon/imon/main_cmd.go
+++ b/daemon/imon/main_cmd.go
@@ -1049,24 +1049,24 @@ func (t *Manager) doLastAction(action func() error, newState, successState, erro
 func (t *Manager) initResourceMonitor() {
 	// Stop any pending restart timers before init. We may be called after
 	// instance config refreshed with some previous resource restart scheduled.
- t.resetResourceMonitorTimers() + t.cancelResourceOrchestrateSchedules() - logDropped := func(l map[string]bool, comment string) { - if l != nil { + logDropped := func(or *orchestrationResource) { + if or != nil && or.scheduled != nil { dropped := make([]string, 0) - for rid := range l { + for rid := range or.scheduled { dropped = append(dropped, rid) } if len(dropped) > 0 { - t.log.Infof("instance config has been updated: drop previously scheduled restart %s %v", comment, dropped) + or.log.Infof("instance config has been updated: drop previously scheduled restarts %v", dropped) } } } - logDropped(t.resourceWithRestartScheduled, "resources") - logDropped(t.resourceStandbyWithRestartScheduled, "standby resources") + logDropped(&t.regularResourceOrchestrate) + logDropped(&t.standbyResourceOrchestrate) - t.resourceWithRestartScheduled = make(map[string]bool) - t.resourceStandbyWithRestartScheduled = make(map[string]bool) + t.regularResourceOrchestrate.scheduled = make(map[string]bool) + t.standbyResourceOrchestrate.scheduled = make(map[string]bool) if monitorAction, ok := t.getValidMonitorAction(0); !ok { t.initialMonitorAction = instance.MonitorActionNone @@ -1084,7 +1084,7 @@ func (t *Manager) initResourceMonitor() { }, } if rcfg.IsMonitored && hasMonitorActionNone { - t.log.Infof("unusable monitor action: resource %s is monitored, but monitor action is none", rid) + t.orchestrationResource(rcfg.IsStandby).log.Infof("rid %s is monitored, but monitor action is none", rid) } } t.state.Resources = m diff --git a/daemon/imon/orchestration_resource_restart.go b/daemon/imon/orchestration_resource_restart.go index eec119e27..9e3a6d97f 100644 --- a/daemon/imon/orchestration_resource_restart.go +++ b/daemon/imon/orchestration_resource_restart.go @@ -13,11 +13,30 @@ import ( "github.com/opensvc/om3/core/status" "github.com/opensvc/om3/daemon/msgbus" "github.com/opensvc/om3/util/command" + "github.com/opensvc/om3/util/plog" "github.com/opensvc/om3/util/toc" ) type ( todoMap map[string]bool + + // orchestrationResource manages the resource orchestration state, scheduling, + // and actions for restart/monitoring. + orchestrationResource struct { + // scheduler represents a timer used to schedule the restart of a group + // of resources with same standby value. + scheduler *time.Timer + + // scheduled tracks the set of resources identified by their IDs that + // are currently scheduled for a specific action. + scheduled map[string]bool + + // standby indicates whether the resource is in a standby mode, + // or in regular mode (non-standby). + standby bool + + log *plog.Logger + } ) const ( @@ -25,6 +44,10 @@ const ( disableMonitorMsg = "disable resource restart and monitoring" ) +var ( + resourceOrchestrableKinds = naming.NewKinds(naming.KindSvc, naming.KindVol) +) + // disableMonitor disables the resource restart and monitoring by setting // the local expectation to "none". // format is used to log changing reason, format == "" => no logging. @@ -163,118 +186,30 @@ func (t *Manager) pubMonitorAction(rid string, action instance.MonitorAction) { // orchestrateResourceRestart manages the restart orchestration process for resources, // handling delays, timers, and retries. 
func (t *Manager) orchestrateResourceRestart() { - todoRestart := newTodoMap() - todoStandby := newTodoMap() - - dropScheduled := func(rid string, standby bool, reason string) { - var scheduled map[string]bool - var msg string - if standby { - scheduled = t.resourceStandbyWithRestartScheduled - msg = "delayed restart standby" - } else { - scheduled = t.resourceWithRestartScheduled - msg = "delayed restart" - } - if _, ok := scheduled[rid]; ok { - delete(scheduled, rid) - t.change = true - t.log.Infof("resource %s %s, reset %s", rid, reason, msg) - if len(scheduled) == 0 { - t.resetResourceMonitorTimer(standby) - } - } - } - - resetRemaining := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor, reason string) { - if rmon.Restart.Remaining != rcfg.Restart { - t.log.Infof("resource %s %s: reset restart count to the max (%d -> %d)", rid, reason, rmon.Restart.Remaining, rcfg.Restart) - rmon.Restart.Remaining = rcfg.Restart - // reset the last monitor action execution time, to rearm the next monitor action - t.state.MonitorActionExecutedAt = time.Time{} - t.state.Resources.Set(rid, *rmon) - t.change = true - } - } - - resetRemainingAndTimer := func(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor, reason string) { - resetRemaining(rid, rcfg, rmon, reason) - if rcfg != nil { - dropScheduled(rid, rcfg.IsStandby, reason) - } - } - - planFor := func(rid string, resStatus status.T, started bool) { - rcfg := t.instConfig.Resources.Get(rid) - rmon := t.state.Resources.Get(rid) - _, aleadyScheduled := t.resourceWithRestartScheduled[rid] - _, aleadyScheduledStandby := t.resourceStandbyWithRestartScheduled[rid] - switch { - case rcfg == nil: - return - case rmon == nil: - return - case rcfg.IsDisabled: - t.log.Debugf("resource %s restart skip: is disabled", rid, rcfg.IsDisabled) - resetRemainingAndTimer(rid, rcfg, rmon, "is disabled") - case resStatus.Is(status.NotApplicable, status.Undef, status.Up, status.StandbyUp): - t.log.Debugf("resource %s restart skip: status is %s", rid, resStatus) - resetRemainingAndTimer(rid, rcfg, rmon, fmt.Sprintf("status is %s", resStatus)) - case aleadyScheduledStandby && rcfg.IsStandby: - t.log.Debugf("resource %s restart skipped: already registered for restart standby", rid) - case aleadyScheduled && !rcfg.IsStandby: - t.log.Debugf("resource %s restart skipped: already registered for restart", rid) - case t.monitorActionCalled(): - t.log.Debugf("resource %s restart skip: already ran the monitor action", rid) - case rcfg.IsStandby || started: - if rmon.Restart.Remaining == 0 && rcfg.IsMonitored && t.initialMonitorAction != instance.MonitorActionNone { - t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action evicting: %s", disableMonitorMsg) - t.doMonitorAction(rid, t.initialMonitorAction) - } else if rmon.Restart.Remaining > 0 { - if rcfg.IsStandby { - t.log.Infof("resource %s status %s, standby restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - todoStandby.Add(rid) - } else { - t.log.Infof("resource %s status %s, restart remaining %d out of %d", rid, resStatus, rmon.Restart.Remaining, rcfg.Restart) - todoRestart.Add(rid) - } - } - default: - dropScheduled(rid, rcfg.IsStandby, "not standby or instance is not started") - } - } - - // discard the cluster object - if t.path.String() == "cluster" { - return - } - - // discard all except svc and vol - 
switch t.path.Kind { - case naming.KindSvc, naming.KindVol: - default: + // only available for svc or vol + if !resourceOrchestrableKinds.Has(t.path.Kind) { + // discard non svc or vol return } // discard if the instance status does not exist if _, ok := t.instStatus[t.localhost]; !ok { - t.log.Errorf("skip restart: missing instance status") - t.resetResourceMonitorTimers() + t.log.Infof("skip restart: missing instance status") + t.cancelResourceOrchestrateSchedules() return } // don't run on frozen nodes if t.nodeStatus[t.localhost].IsFrozen() { - t.log.Errorf("skip restart: node is frozen") - t.resetResourceMonitorTimers() + t.log.Debugf("skip restart: node is frozen") + t.cancelResourceOrchestrateSchedules() return } // don't run when the node is not idle if t.nodeMonitor[t.localhost].State != node.MonitorStateIdle { - t.log.Errorf("skip restart: node is %s", t.nodeMonitor[t.localhost].State) - t.resetResourceMonitorTimers() + t.log.Debugf("skip restart: node is %s", t.nodeMonitor[t.localhost].State) + t.cancelResourceOrchestrateSchedules() return } @@ -289,23 +224,23 @@ func (t *Manager) orchestrateResourceRestart() { // don't run on frozen instances if t.instStatus[t.localhost].IsFrozen() { - t.log.Errorf("skip restart: instance is frozen") - t.resetResourceMonitorTimers() + t.log.Debugf("skip restart: instance is frozen") + t.cancelResourceOrchestrateSchedules() return } // discard not provisioned if instanceStatus := t.instStatus[t.localhost]; instanceStatus.Provisioned.IsOneOf(provisioned.False, provisioned.Mixed, provisioned.Undef) { - t.log.Errorf("skip restart: provisioned is %s", instanceStatus.Provisioned) - t.resetResourceMonitorTimers() + t.log.Debugf("skip restart: provisioned is %s", instanceStatus.Provisioned) + t.cancelResourceOrchestrateSchedules() return } // discard if the instance has no monitor data instMonitor, ok := t.GetInstanceMonitor(t.localhost) if !ok { - t.log.Errorf("skip restart: no instance monitor") - t.resetResourceMonitorTimers() + t.log.Debugf("skip restart: no instance monitor") + t.cancelResourceOrchestrateSchedules() return } @@ -318,10 +253,36 @@ func (t *Manager) orchestrateResourceRestart() { return } - started := instMonitor.LocalExpect == instance.MonitorLocalExpectStarted + todoRestart := newTodoMap() + todoStandby := newTodoMap() + + started := t.state.LocalExpect == instance.MonitorLocalExpectStarted for rid, rstat := range t.instStatus[t.localhost].Resources { - planFor(rid, rstat.Status, started) + rcfg := t.instConfig.Resources.Get(rid) + if rcfg == nil { + continue + } + rmon := t.state.Resources.Get(rid) + if rmon == nil { + continue + } + needRestart, needMonitorAction, err := t.orchestrateResourcePlan(rid, rcfg, rmon, rstat.Status, started) + if err != nil { + t.log.Errorf("orchestrate resource plan for resource %s: %s", rid, err) + t.cancelResourceOrchestrateSchedules() + return + } else if needMonitorAction { + t.setLocalExpect(instance.MonitorLocalExpectEvicted, "monitor action evicting: %s", disableMonitorMsg) + t.doMonitorAction(rid, t.initialMonitorAction) + t.cancelResourceOrchestrateSchedules() + } else if needRestart { + if rcfg.IsStandby { + todoStandby.Add(rid) + } else { + todoRestart.Add(rid) + } + } } // Prepare scheduled resource restart @@ -336,26 +297,20 @@ func (t *Manager) orchestrateResourceRestart() { // resourceRestartSchedule schedules a restart for resources based on the provided resource map and standby mode. // It updates the state of resources with associated restart timers and logs the operation. 
func (t *Manager) resourceRestartSchedule(todo todoMap, standby bool) { - var scheduled map[string]bool rids, delay := t.getRidsAndDelay(todo) if len(rids) == 0 { return } - onTimer := func() { + + or := t.orchestrationResource(standby) + or.log.Infof("schedule restart resources %v in %s", rids, delay) + or.scheduler = time.AfterFunc(delay, func() { t.cmdC <- cmdResourceRestart{ rids: rids, standby: standby, } - } - if standby { - t.log.Infof("schedule restart standby resources %v in %s", rids, delay) - t.resourceStandbyRestartTimer = time.AfterFunc(delay, onTimer) - scheduled = t.resourceStandbyWithRestartScheduled - } else { - t.log.Infof("schedule restart resources %v in %s", rids, delay) - t.resourceRestartTimer = time.AfterFunc(delay, onTimer) - scheduled = t.resourceWithRestartScheduled - } + }) + for _, rid := range rids { rmon := t.state.Resources.Get(rid) if rmon == nil { @@ -364,7 +319,7 @@ func (t *Manager) resourceRestartSchedule(todo todoMap, standby bool) { rmon.DecRestartRemaining() t.state.Resources.Set(rid, *rmon) t.change = true - scheduled[rid] = true + or.scheduled[rid] = true } } @@ -375,33 +330,25 @@ func (t *Manager) resourceRestartSchedule(todo todoMap, standby bool) { func (t *Manager) resourceRestart(resourceRids []string, standby bool) { now := time.Now() rids := make([]string, 0, len(resourceRids)) - var scheduled map[string]bool - var skipMessage string - if standby { - skipMessage = "skip resource restart standby" - scheduled = t.resourceStandbyWithRestartScheduled - } else { - skipMessage = "skip resource restart" - scheduled = t.resourceWithRestartScheduled - } + or := t.orchestrationResource(standby) for _, rid := range resourceRids { rmon := t.state.Resources.Get(rid) if rmon == nil { continue } - if _, ok := scheduled[rid]; !ok { - t.log.Infof("%s %s: not anymore candidate", skipMessage, rid) + if _, ok := or.scheduled[rid]; !ok { + or.log.Infof("drop restart rid %s: not anymore candidate", rid) continue } rids = append(rids, rid) rmon.Restart.LastAt = now t.state.Resources.Set(rid, *rmon) t.change = true - delete(scheduled, rid) + delete(or.scheduled, rid) } if len(rids) == 0 { - t.log.Infof("%s: no more candidates", skipMessage) + or.log.Infof("abort restart: no more candidates") return } queueFunc := t.queueResourceStart @@ -467,19 +414,116 @@ func (t *Manager) isValidMonitorAction(action instance.MonitorAction) bool { } } -func (t *Manager) resetResourceMonitorTimers() { - t.resetResourceMonitorTimer(true) - t.resetResourceMonitorTimer(false) +// orchestrationResource selects and returns the appropriate orchestration resource +// based on the standby mode flag provided. +func (t *Manager) orchestrationResource(standby bool) *orchestrationResource { + if standby { + return &t.standbyResourceOrchestrate + } else { + return &t.regularResourceOrchestrate + } +} + +func (t *Manager) cancelResourceOrchestrateSchedules() { + t.standbyResourceOrchestrate.cancelSchedule() + t.regularResourceOrchestrate.cancelSchedule() +} + +// orchestrateResourcePlan determines the plan for resource from its configuration and state. +// It returns flags indicating if a restart, or a monitor action is needed. 
+// The monitor action is needed when all the following conditions are met:
+//
+//	the resource status is not in [NotApplicable, Undef, Up, StandbyUp]
+//	started is true or the resource is configured as standby
+//	the remaining restart count is 0
+//	the `monitor` value is true
+//	the `monitor_action` is not `MonitorActionNone`
+func (t *Manager) orchestrateResourcePlan(rid string, rcfg *instance.ResourceConfig, rmon *instance.ResourceMonitor, rStatus status.T, started bool) (needRestart, needMonitorAction bool, err error) {
+	if rcfg == nil {
+		err = fmt.Errorf("orchestrate resource plan called with nil resource config")
+		return
+	} else if rmon == nil {
+		err = fmt.Errorf("orchestrate resource plan called with nil resource monitor")
+		return
+	}
+
+	or := t.orchestrationResource(rcfg.IsStandby)
+
+	dropScheduled := func(rid string, reason string) {
+		if changed := or.dropScheduled(rid, reason); changed {
+			t.change = true
+		}
+	}
+
+	resetRemaining := func(rid string, reason string) {
+		if rmon.Restart.Remaining != rcfg.Restart {
+			or.log.Infof("rid %s %s: reset restart count to config value (%d -> %d)", rid, reason, rmon.Restart.Remaining, rcfg.Restart)
+			rmon.Restart.Remaining = rcfg.Restart
+			// reset the last monitor action execution time, to rearm the next monitor action
+			t.state.MonitorActionExecutedAt = time.Time{}
+			t.state.Resources.Set(rid, *rmon)
+			t.change = true
+		}
+	}
+
+	switch {
+	case rcfg.IsDisabled:
+		reason := "is disabled"
+		or.log.Debugf("planFor rid %s skipped: %s", rid, reason)
+		dropScheduled(rid, reason)
+		resetRemaining(rid, reason)
+	case rStatus.Is(status.NotApplicable, status.Undef, status.Up, status.StandbyUp):
+		reason := fmt.Sprintf("status is %s", rStatus)
+		or.log.Debugf("planFor rid %s skipped: %s", rid, reason)
+		dropScheduled(rid, reason)
+		resetRemaining(rid, reason)
+	case or.alreadyScheduled(rid):
+		or.log.Debugf("planFor rid %s skipped: already scheduled", rid)
+	case t.monitorActionCalled():
+		or.log.Debugf("planFor rid %s skipped: monitor action has already been called", rid)
+	case rcfg.IsStandby || started:
+		if rmon.Restart.Remaining == 0 && rcfg.IsMonitored && t.initialMonitorAction != instance.MonitorActionNone {
+			or.log.Infof("rid %s status %s, restart remaining %d out of %d: need monitor action", rid, rStatus, rmon.Restart.Remaining, rcfg.Restart)
+			needMonitorAction = true
+		} else if rmon.Restart.Remaining > 0 {
+			or.log.Infof("rid %s status %s, restart remaining %d out of %d", rid, rStatus, rmon.Restart.Remaining, rcfg.Restart)
+			needRestart = true
+		}
+	default:
+		dropScheduled(rid, "not standby or instance is not started")
+	}
+	return
+}
+
+// cancelSchedule stops and clears any active scheduler associated with the
+// orchestration resource. Logs the cancellation action.
+func (or *orchestrationResource) cancelSchedule() {
+	if or.scheduler != nil {
+		or.log.Infof("cancel previously scheduled restart")
+		or.scheduler.Stop()
+		or.scheduler = nil
+	}
 }
 
-func (t *Manager) resetResourceMonitorTimer(standby bool) {
-	if standby && t.resourceStandbyRestartTimer != nil {
-		t.log.Infof("reset scheduled restart standby resources")
-		t.resourceStandbyRestartTimer.Stop()
-		t.resourceStandbyRestartTimer = nil
-	} else if !standby && t.resourceRestartTimer != nil {
-		t.log.Infof("reset scheduled restart resources")
-		t.resourceRestartTimer.Stop()
-		t.resourceRestartTimer = nil
+// dropScheduled removes a resource from the scheduled list using its ID and
+// logs the action with the given reason.
+// If the scheduled list becomes empty, it cancels any pending schedule. +// Returns true if the resource was found and removed, otherwise returns false. +func (or *orchestrationResource) dropScheduled(rid string, reason string) (change bool) { + if _, ok := or.scheduled[rid]; ok { + delete(or.scheduled, rid) + change = true + or.log.Infof("rid %s %s: drop delayed restart", rid, reason) + if len(or.scheduled) == 0 { + or.cancelSchedule() + } } + return +} + +// alreadyScheduled returns true if a resource, identified by its ID, has been +// scheduled for a restart (is already in the scheduled map). +func (or *orchestrationResource) alreadyScheduled(rid string) bool { + _, ok := or.scheduled[rid] + return ok } From d1906187b0cc219cbf400a8fc83aba6f6fce92e6 Mon Sep 17 00:00:00 2001 From: Cyril Galibern Date: Fri, 10 Jan 2025 21:29:27 +0100 Subject: [PATCH 16/16] [kw] Add `no-io` to `monitor_action` candidates --- core/object/core_keywords.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/core/object/core_keywords.go b/core/object/core_keywords.go index fe97a05f9..9b0bc78af 100644 --- a/core/object/core_keywords.go +++ b/core/object/core_keywords.go @@ -5,6 +5,7 @@ import ( "fmt" "github.com/opensvc/om3/core/driver" + "github.com/opensvc/om3/core/instance" "github.com/opensvc/om3/core/keyop" "github.com/opensvc/om3/core/keywords" "github.com/opensvc/om3/core/naming" @@ -219,16 +220,23 @@ var keywordStore = keywords.Store{ Text: keywords.NewText(fs, "text/kw/core/encapnodes"), }, { - Candidates: []string{"crash", "freezestop", "none", "reboot", "switch"}, - Converter: converters.List, - Default: "none", - Kind: naming.NewKinds(naming.KindSvc, naming.KindVol), - Example: "reboot", - Inherit: keywords.InheritHead, - Option: "monitor_action", - Scopable: true, - Section: "DEFAULT", - Text: keywords.NewText(fs, "text/kw/core/monitor_action"), + Candidates: []string{ + string(instance.MonitorActionCrash), + string(instance.MonitorActionFreezeStop), + string(instance.MonitorActionNone), + string(instance.MonitorActionReboot), + string(instance.MonitorActionSwitch), + string(instance.MonitorActionNoOp), + }, + Converter: converters.List, + Default: string(instance.MonitorActionNone), + Kind: naming.NewKinds(naming.KindSvc, naming.KindVol), + Example: string(instance.MonitorActionReboot), + Inherit: keywords.InheritHead, + Option: "monitor_action", + Scopable: true, + Section: "DEFAULT", + Text: keywords.NewText(fs, "text/kw/core/monitor_action"), }, { Example: "/bin/true",