Skip to content

Commit

Permalink
deprecate unexpected healing failed counters (minio#19705)
Browse files Browse the repository at this point in the history
simplify this to avoid verbose metrics, and make
room for valid metrics to be reported for alerting
etc.
  • Loading branch information
harshavardhana authored May 9, 2024
1 parent 7b7d2ea commit b534dc6
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 80 deletions.
41 changes: 22 additions & 19 deletions cmd/admin-heal-ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,8 @@ type healSequence struct {
// Number of total items healed against item type
healedItemsMap map[madmin.HealItemType]int64

// Number of total items where healing failed against endpoint and drive state
healFailedItemsMap map[string]int64
// Number of total items where healing failed against item type
healFailedItemsMap map[madmin.HealItemType]int64

// The time of the last scan/heal activity
lastHealActivity time.Time
Expand Down Expand Up @@ -497,7 +497,7 @@ func newHealSequence(ctx context.Context, bucket, objPrefix, clientAddr string,
ctx: ctx,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
healFailedItemsMap: make(map[madmin.HealItemType]int64),
}
}

Expand Down Expand Up @@ -543,42 +543,40 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {

// getHealFailedItemsMap - returns a copy of the map of all items where heal
// failed, keyed by heal item type
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
func (h *healSequence) getHealFailedItemsMap() map[madmin.HealItemType]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()

// Make a copy before returning the value
retMap := make(map[string]int64, len(h.healFailedItemsMap))
retMap := make(map[madmin.HealItemType]int64, len(h.healFailedItemsMap))
for k, v := range h.healFailedItemsMap {
retMap[k] = v
}

return retMap
}

func (h *healSequence) countFailed(res madmin.HealResultItem) {
func (h *healSequence) countFailed(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()

for _, d := range res.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}

h.healFailedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}

func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
func (h *healSequence) countScanned(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()

if !healed {
h.scannedItemsMap[healType]++
} else {
h.healedItemsMap[healType]++
}
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}

// countHealed - increments the healed-items counter for the given heal item
// type and sets lastHealActivity to the current UTCNow() value. The update is
// performed while holding the heal sequence's write lock.
func (h *healSequence) countHealed(healType madmin.HealItemType) {
h.mutex.Lock()
defer h.mutex.Unlock()

h.healedItemsMap[healType]++
h.lastHealActivity = UTCNow()
}

Expand Down Expand Up @@ -734,7 +732,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
task.opts.ScanMode = madmin.HealNormalScan
}

h.countHeals(healType, false)
h.countScanned(healType)

if source.noWait {
select {
Expand Down Expand Up @@ -766,6 +764,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
// task queued, now wait for the response.
select {
case res := <-task.respCh:
if res.err == nil {
h.countHealed(healType)
} else {
h.countFailed(healType)
}
if !h.reportProgress {
if errors.Is(res.err, errSkipFile) { // this is only sent usually by nopHeal
return nil
Expand Down
19 changes: 10 additions & 9 deletions cmd/background-heal-ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,19 +133,20 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *
}
}

if bgSeq != nil {
// We increment relevant counter based on the heal result for prometheus reporting.
if err != nil {
bgSeq.countFailed(res)
} else {
bgSeq.countHeals(res.Type, false)
}
}

if task.respCh != nil {
task.respCh <- healResult{result: res, err: err}
continue
}

// when respCh is not set caller is not waiting but we
// update the relevant metrics for them
if bgSeq != nil {
if err == nil {
bgSeq.countHealed(res.Type)
} else {
bgSeq.countFailed(res.Type)
}
}
case <-ctx.Done():
return
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/global-heal.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ func newBgHealSequence() *healSequence {
reportProgress: false,
scannedItemsMap: make(map[madmin.HealItemType]int64),
healedItemsMap: make(map[madmin.HealItemType]int64),
healFailedItemsMap: make(map[string]int64),
healFailedItemsMap: make(map[madmin.HealItemType]int64),
}
}

Expand Down
Loading

0 comments on commit b534dc6

Please sign in to comment.