Skip to content

Commit

Permalink
Merge pull request #656 from cgalibern/dev1
Browse files Browse the repository at this point in the history
Refactor orchestrate resource restart
  • Loading branch information
cgalibern authored Jan 10, 2025
2 parents 4604c19 + d190618 commit 5c4a3ac
Show file tree
Hide file tree
Showing 8 changed files with 456 additions and 247 deletions.
40 changes: 23 additions & 17 deletions core/instance/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,8 @@ type (
Restart ResourceMonitorRestart `json:"restart"`
}
ResourceMonitorRestart struct {
Remaining int `json:"remaining"`
LastAt time.Time `json:"last_at"`
Timer *time.Timer `json:"-"`
Remaining int `json:"remaining"`
LastAt time.Time `json:"last_at"`
}

MonitorState int
Expand Down Expand Up @@ -303,12 +302,29 @@ var (
ErrSameGlobalExpect = errors.New("instance monitor global expect is already set to the same value")
ErrSameLocalExpect = errors.New("instance monitor local expect is already set to the same value")
ErrSameState = errors.New("instance monitor state is already set to the same value")
)

var (
// MonitorActionNone: monitor action is disabled.
MonitorActionNone MonitorAction = "none"

MonitorActionNone MonitorAction = "none"
MonitorActionCrash MonitorAction = "crash"
// MonitorActionCrash represents the monitor action that will try to crash/panic the system.
MonitorActionCrash MonitorAction = "crash"

// MonitorActionFreezeStop represents the monitor action that will try freeze and subsequently stop
// the monitored instance.
MonitorActionFreezeStop MonitorAction = "freezestop"
MonitorActionReboot MonitorAction = "reboot"
MonitorActionSwitch MonitorAction = "switch"

// MonitorActionReboot represents the monitor action that will reboot the system.
MonitorActionReboot MonitorAction = "reboot"

// MonitorActionSwitch represents the monitor action that will stop the instance to allow
// any other cluster node to take over the instance.
MonitorActionSwitch MonitorAction = "switch"

// MonitorActionNoOp represents the no-operation behavior while setting the state to 'evicted'.
// This can be useful for demonstration purposes or cases where no action is required.
MonitorActionNoOp MonitorAction = "no-op"
)

func (t MonitorState) Is(states ...MonitorState) bool {
Expand Down Expand Up @@ -417,16 +433,6 @@ func (rmon *ResourceMonitor) DecRestartRemaining() {
}
}

// StopRestartTimer stops the restart timer held in rmon.Restart.Timer and
// clears the field. It reports whether a timer was set: false when the field
// was already nil, true otherwise.
func (rmon *ResourceMonitor) StopRestartTimer() bool {
	timer := rmon.Restart.Timer
	if timer == nil {
		// Nothing scheduled, nothing to stop.
		return false
	}
	timer.Stop()
	rmon.Restart.Timer = nil
	return true
}

// Set stores rmon in m under the resource id rid, replacing any entry
// already present for that id.
func (m ResourceMonitors) Set(rid string, rmon ResourceMonitor) {
m[rid] = rmon
}
Expand Down
28 changes: 18 additions & 10 deletions core/object/core_keywords.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"

"github.com/opensvc/om3/core/driver"
"github.com/opensvc/om3/core/instance"
"github.com/opensvc/om3/core/keyop"
"github.com/opensvc/om3/core/keywords"
"github.com/opensvc/om3/core/naming"
Expand Down Expand Up @@ -219,16 +220,23 @@ var keywordStore = keywords.Store{
Text: keywords.NewText(fs, "text/kw/core/encapnodes"),
},
{
Candidates: []string{"crash", "freezestop", "none", "reboot", "switch"},
Converter: converters.List,
Default: "none",
Kind: naming.NewKinds(naming.KindSvc, naming.KindVol),
Example: "reboot",
Inherit: keywords.InheritHead,
Option: "monitor_action",
Scopable: true,
Section: "DEFAULT",
Text: keywords.NewText(fs, "text/kw/core/monitor_action"),
Candidates: []string{
string(instance.MonitorActionCrash),
string(instance.MonitorActionFreezeStop),
string(instance.MonitorActionNone),
string(instance.MonitorActionReboot),
string(instance.MonitorActionSwitch),
string(instance.MonitorActionNoOp),
},
Converter: converters.List,
Default: string(instance.MonitorActionNone),
Kind: naming.NewKinds(naming.KindSvc, naming.KindVol),
Example: string(instance.MonitorActionReboot),
Inherit: keywords.InheritHead,
Option: "monitor_action",
Scopable: true,
Section: "DEFAULT",
Text: keywords.NewText(fs, "text/kw/core/monitor_action"),
},
{
Example: "/bin/true",
Expand Down
14 changes: 12 additions & 2 deletions core/object/text/kw/core/monitor_action
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
The action to trigger when a monitored resource is no longer in the "up" or
"standby up" state, and all restart attempts for the resource have failed.

The reboot and crash monitor actions do not attempt to cleanly stop any
The `reboot` and `crash` monitor actions do not attempt to cleanly stop any
processes. On Linux, they utilize system-level sysrq triggers.

This behavior is designed to ensure that the host stops writing to shared
Expand All @@ -10,5 +10,15 @@ is critical because a failover node is likely preparing to write to the same
shared disks.

You can append a fallback monitor action to this keyword. A common example
is "freezestop reboot". In this case, the reboot action will be executed
is `freezestop reboot`. In this case, the reboot action will be executed
if the stop fails or times out.

Other monitor_action values:
- `none`: the default value. The monitor action is disabled (the `monitor` keyword
must also be `false` or undefined).
- `freezestop`: freeze and subsequently stop the monitored instance.
- `switch`: try to stop the monitored instance to allow any other cluster node to
take over the instance.
- `no-op`: The monitor action No Operation is called but does nothing. It
may be used for demonstration. The final local expect after call will
be set to `evicted`.
12 changes: 7 additions & 5 deletions core/resource/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -1174,11 +1174,13 @@ func GetStatus(ctx context.Context, r Driver) Status {
Log: r.StatusLog().Entries(),
Provisioned: getProvisionStatus(r),
Info: getStatusInfo(ctx, r),
Restart: RestartFlag(r.RestartCount()),
Optional: OptionalFlag(r.IsOptional()),
Standby: StandbyFlag(r.IsStandby()),
Disable: DisableFlag(r.IsDisabled()),
Encap: EncapFlag(r.IsEncap()),

Monitor: MonitorFlag(r.IsMonitored()),
Restart: RestartFlag(r.RestartCount()),
Optional: OptionalFlag(r.IsOptional()),
Standby: StandbyFlag(r.IsStandby()),
Disable: DisableFlag(r.IsDisabled()),
Encap: EncapFlag(r.IsEncap()),
}
}

Expand Down
13 changes: 11 additions & 2 deletions daemon/icfg/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ var (

errConfigFileCheck = errors.New("config file check")

// standbyDefaultRestart defines the default minimum restart threshold for
// standby resources.
standbyDefaultRestart = 2

keyApp = key.New("DEFAULT", "app")
keyChildren = key.New("DEFAULT", "children")
keyEnv = key.New("DEFAULT", "env")
Expand Down Expand Up @@ -415,12 +419,17 @@ func (t *Manager) getResources(cf *xconfig.T) instance.ResourceConfigs {
if resourceset.IsSubsetSection(section) {
continue
}
restart := cf.GetInt(key.New(section, "restart"))
isStandby := cf.GetBool(key.New(section, "standby"))
if isStandby && restart < standbyDefaultRestart {
restart = standbyDefaultRestart
}
m[section] = instance.ResourceConfig{
RestartDelay: cf.GetDuration(key.New(section, "restart_delay")),
Restart: cf.GetInt(key.New(section, "restart")),
Restart: restart,
IsDisabled: cf.GetBool(key.New(section, "disable")),
IsMonitored: cf.GetBool(key.New(section, "monitor")),
IsStandby: cf.GetBool(key.New(section, "standby")),
IsStandby: isStandby,
}
}
return m
Expand Down
33 changes: 33 additions & 0 deletions daemon/imon/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,17 @@ type (
// It is used during enableDelayTimer():
// When false the delay timer is reset with delayDuration
delayTimerEnabled bool

// initialMonitorAction specifies the initial (stage 0) monitor action
// for monitoring as defined by the MonitorAction type.
// Its value is set/refreshed by initResourceMonitor.
initialMonitorAction instance.MonitorAction

// standbyResourceOrchestrate is the orchestrationResource for standby resources
standbyResourceOrchestrate orchestrationResource

// regularResourceOrchestrate is the orchestrationResource for regular resources
regularResourceOrchestrate orchestrationResource
}

// cmdOrchestrate can be used from post action go routines
Expand All @@ -138,6 +149,16 @@ type (
newState instance.MonitorState
}

// cmdResourceRestart is a command asking for a resource restart.
// It can be used from imon goroutines to schedule a future resource restart
// handled during the imon main loop.
cmdResourceRestart struct {
// rids is the slice of resource IDs to restart.
rids []string
// standby indicates whether the resources should restart in standby mode.
standby bool
}

Factory struct {
DrainDuration time.Duration
SubQS pubsub.QueueSizer
Expand Down Expand Up @@ -214,6 +235,9 @@ func start(parent context.Context, qs pubsub.QueueSizer, p naming.Path, nodes []
}

t.log = t.newLogger(uuid.Nil)
t.regularResourceOrchestrate.log = t.newResourceLogger("regular resource")
t.standbyResourceOrchestrate.log = t.newResourceLogger( "standby resource")

t.startSubscriptions(qs)

go func() {
Expand All @@ -223,13 +247,20 @@ func start(parent context.Context, qs pubsub.QueueSizer, p naming.Path, nodes []
return nil
}

// newResourceLogger returns a logger built from naming.LogWithPath for t.path,
// tagged with the "pkg" attribute "daemon/imon" and using the message prefix
// "daemon: imon: <path>: <s>: ", where s labels the kind of resource
// orchestration the logger is used for.
func (t *Manager) newResourceLogger(s string) *plog.Logger {
	logger := naming.LogWithPath(plog.NewDefaultLogger(), t.path).
		Attr("pkg", "daemon/imon")
	prefix := fmt.Sprintf("daemon: imon: %s: %s: ", t.path.String(), s)
	return logger.WithPrefix(prefix)
}

// newLogger returns a logger built from naming.LogWithPath for t.path, tagged
// with the "pkg" attribute "daemon/imon" and the "orchestration_id" attribute
// set to i, and using the message prefix "daemon: imon: <path>: ".
func (t *Manager) newLogger(i uuid.UUID) *plog.Logger {
	logger := naming.LogWithPath(plog.NewDefaultLogger(), t.path).
		Attr("pkg", "daemon/imon").
		Attr("orchestration_id", i.String())
	prefix := fmt.Sprintf("daemon: imon: %s: ", t.path.String())
	return logger.WithPrefix(prefix)
}


func (t *Manager) startSubscriptions(qs pubsub.QueueSizer) {
sub := t.pubsubBus.Sub("daemon.imon "+t.id, qs)
sub.AddFilter(&msgbus.NodeConfigUpdated{}, t.labelLocalhost)
Expand Down Expand Up @@ -368,6 +399,8 @@ func (t *Manager) worker(initialNodes []string) {
switch c := i.(type) {
case cmdOrchestrate:
t.needOrchestrate(c)
case cmdResourceRestart:
t.resourceRestart(c.rids, c.standby)
}
case <-t.delayTimer.C:
t.onDelayTimer()
Expand Down
34 changes: 34 additions & 0 deletions daemon/imon/main_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,7 @@ func (t *Manager) onInstanceConfigUpdated(srcNode string, srcCmd *msgbus.Instanc
}
}()
t.instConfig = srcCmd.Value
t.log.Debugf("refresh resource monitor states on local instance config updated")
t.initResourceMonitor()
janitorInstStatus(srcCmd.Value.Scope)
janitorRelations(srcCmd.Value.Children, "Child", t.state.Children)
Expand Down Expand Up @@ -1046,15 +1047,48 @@ func (t *Manager) doLastAction(action func() error, newState, successState, erro
}

// initResourceMonitor resets the resource monitoring state from the current
// instance config (t.instConfig):
//   - previously scheduled resource restarts are cancelled and dropped,
//   - the initial (stage 0) monitor action is refreshed,
//   - t.state.Resources is rebuilt with each resource's remaining restart
//     count set to its configured restart value.
//
// It finally sets t.change.
func (t *Manager) initResourceMonitor() {
	// This may run after an instance config refresh while restarts scheduled
	// from the previous config are still pending: cancel them first.
	t.cancelResourceOrchestrateSchedules()

	// Report the scheduled restarts that are about to be dropped, regular
	// resources first, then standby resources.
	for _, or := range []*orchestrationResource{
		&t.regularResourceOrchestrate,
		&t.standbyResourceOrchestrate,
	} {
		if len(or.scheduled) == 0 {
			continue
		}
		dropped := make([]string, 0, len(or.scheduled))
		for rid := range or.scheduled {
			dropped = append(dropped, rid)
		}
		or.log.Infof("instance config has been updated: drop previously scheduled restarts %v", dropped)
	}

	t.regularResourceOrchestrate.scheduled = make(map[string]bool)
	t.standbyResourceOrchestrate.scheduled = make(map[string]bool)

	// Fall back to the "none" monitor action when no valid action is found.
	if action, ok := t.getValidMonitorAction(0); ok {
		t.initialMonitorAction = action
	} else {
		t.initialMonitorAction = instance.MonitorActionNone
	}

	actionIsNone := t.initialMonitorAction == instance.MonitorActionNone

	monitors := make(instance.ResourceMonitors, len(t.instConfig.Resources))
	for rid, rcfg := range t.instConfig.Resources {
		monitors[rid] = instance.ResourceMonitor{
			Restart: instance.ResourceMonitorRestart{
				Remaining: rcfg.Restart,
			},
		}
		// A monitored resource has no effect without a monitor action:
		// make the inconsistency visible in the logs.
		if actionIsNone && rcfg.IsMonitored {
			t.orchestrationResource(rcfg.IsStandby).log.Infof("rid %s is monitored, but monitor action is none", rid)
		}
	}
	t.state.Resources = monitors

	t.change = true
}

Expand Down
Loading

0 comments on commit 5c4a3ac

Please sign in to comment.