Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

删除节点后缓存数据未清除及metrics接口依然存在被删除节点的监控指标数据的bug修复 #615

Merged
merged 12 commits into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion cluster/calcium/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
enginefactory "github.com/projecteru2/core/engine/factory"
enginetypes "github.com/projecteru2/core/engine/types"
"github.com/projecteru2/core/log"
"github.com/projecteru2/core/metrics"
"github.com/projecteru2/core/resource/plugins"
resourcetypes "github.com/projecteru2/core/resource/types"
"github.com/projecteru2/core/types"
Expand Down Expand Up @@ -89,7 +90,13 @@ func (c *Calcium) RemoveNode(ctx context.Context, nodename string) error {
},
// then: remove node resource metadata
func(ctx context.Context) error {
return c.rmgr.RemoveNode(ctx, nodename)
err = c.rmgr.RemoveNode(ctx, nodename)
if err != nil {
return err
}
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
enginefactory.RemoveEngineFromCache(ctx, node.Endpoint, node.Ca, node.Cert, node.Key)
metrics.Client.DeleteInvalidNodeLabelValues([]string{nodename})
return nil
},
// rollback: do nothing
func(ctx context.Context, failureByCond bool) error {
Expand Down
3 changes: 3 additions & 0 deletions metrics/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,17 @@ func (m *Metrics) ResourceMiddleware(cluster cluster.Cluster) func(http.Handler)
if err != nil {
logger.Error(ctx, err, "Get all nodes err")
}
activeNodeMap := make(map[string]*types.Node, 0)
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
for node := range nodes {
metrics, err := m.rmgr.GetNodeMetrics(ctx, node)
if err != nil {
logger.Error(ctx, err, "Get metrics failed")
continue
}
activeNodeMap[node.Name] = node
m.SendMetrics(ctx, metrics...)
}
m.DeleteInactiveNodeCacheAndMetrics(activeNodeMap)
h.ServeHTTP(w, r)
})
}
Expand Down
76 changes: 76 additions & 0 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strconv"
"sync"

enginefactory "github.com/projecteru2/core/engine/factory"
"github.com/projecteru2/core/log"
"github.com/projecteru2/core/resource"
"github.com/projecteru2/core/resource/cobalt"
Expand Down Expand Up @@ -85,6 +86,81 @@ func (m *Metrics) SendMetrics(ctx context.Context, metrics ...*plugintypes.Metri
}
}

func (m *Metrics) DeleteInactiveNodeCacheAndMetrics(activeNodeMap map[string]*types.Node) {
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
metricNodeNameMap := m.GetNodeNameMapFromMetrics()
// 计算差集
inactiveNodeNameList := make([]string, 0)
for nodeName := range metricNodeNameMap {
if node, exists := activeNodeMap[nodeName]; !exists {
inactiveNodeNameList = append(inactiveNodeNameList, nodeName)
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
enginefactory.RemoveEngineFromCache(context.Background(), node.Endpoint, node.Ca, node.Cert, node.Key)
}
}
if len(inactiveNodeNameList) > 0 {
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
m.DeleteInvalidNodeLabelValues(inactiveNodeNameList)
}
}

func (m *Metrics) GetNodeNameMapFromMetrics() map[string]bool {
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
metrics, _ := prometheus.DefaultGatherer.Gather()
nodeNameMap := make(map[string]bool, 0)
for _, metric := range metrics {
for _, mf := range metric.GetMetric() {
if len(mf.Label) == 0 {
continue
}
for _, label := range mf.Label {
if label.GetName() == "nodename" {
nodeNameMap[label.GetValue()] = true
break
}
}
}
}
return nodeNameMap
}

// DeleteInvalidNodeLabelValues 清除多余的metric标签值
func (m *Metrics) DeleteInvalidNodeLabelValues(nodeNameToRemoveList []string) {
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
for _, collector := range m.Collectors {
if collector == nil {
return
}
metrics, _ := prometheus.DefaultGatherer.Gather()
for _, metric := range metrics {
for _, mf := range metric.GetMetric() {
if len(mf.Label) == 0 {
continue
}
bFind := false
for _, label := range mf.Label {
for _, nodeNameToRemove := range nodeNameToRemoveList {
if label.GetName() == "nodename" && label.GetValue() == nodeNameToRemove {
bFind = true
break
}
}
}
if !bFind {
continue
}
nyl1001 marked this conversation as resolved.
Show resolved Hide resolved
labels := prometheus.Labels{}
for _, label := range mf.Label {
labels[label.GetName()] = label.GetValue()
}
// 删除符合条件的度量标签
switch c := collector.(type) {
case *prometheus.GaugeVec:
c.Delete(labels)
case *prometheus.CounterVec:
c.Delete(labels)
}
}
}
// 添加更多的条件来处理其他类型的Collector
}
}

// Lazy connect
func (m *Metrics) checkConn(ctx context.Context) error {
if m.statsdClient != nil {
Expand Down