Skip to content

Commit

Permalink
* [FEATURE] 新增ksm指标计算节点cpu/mem 请求/限制率等指标
Browse files Browse the repository at this point in the history
* [BUGFIX] ksm启动不再sleep等待,因为push的瓶颈在transfer已经解决了
  • Loading branch information
ning1875 committed Jan 28, 2021
1 parent c2ac8b8 commit 13a631e
Show file tree
Hide file tree
Showing 6 changed files with 230 additions and 37 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## v2.0.4 / 2021-01-28
* [FEATURE] 新增ksm指标计算节点cpu/mem 请求/限制率等指标
* [BUGFIX] ksm启动不再sleep等待,因为push的瓶颈在transfer已经解决了



## v2.0.3 / 2021-01-27
* [ENHANCEMENT] 修改大盘文件,测试导入内还在功能
* [CHANGE] 完善readme
Expand Down
68 changes: 67 additions & 1 deletion collect/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,67 @@ func checkFloatValidate(qu float64) (isValidate bool) {
return math.IsNaN(qu) || math.IsInf(qu, 1) || math.IsInf(qu, 1)
}

// NewMetricFunc builds a single metric sample and returns metricList with
// that sample appended.
//
// The sample is stamped with the current Unix time, has its CounterType set
// to config.METRIC_TYPE_GAUGE, and stores value in both Value and
// ValueUntyped. appendTags is attached as-is (not copied), so the caller
// must not modify the map afterwards if the sample is still in use.
func NewMetricFunc(nid string, newMetricName string, value float64, step int64, appendTags map[string]string, metricList []dataobj.MetricValue) []dataobj.MetricValue {
	sample := dataobj.MetricValue{
		Nid:          nid,
		Metric:       newMetricName,
		Timestamp:    time.Now().Unix(),
		Step:         step,
		CounterType:  config.METRIC_TYPE_GAUGE,
		ValueUntyped: value,
		Value:        value,
		TagsMap:      appendTags,
	}
	return append(metricList, sample)
}

// PercentComputeForKsm emits two metrics for every key that is present in
// both mfenzi (numerators) and mfenmu (denominators) and returns metricList
// with them appended:
//
//   - newMetricName + "_percent": fenzi / fenmu * 100, or 0 when fenmu is 0
//   - newMetricName + "_value":   the raw numerator
//
// Both metrics have CounterType config.METRIC_TYPE_GAUGE and carry their own
// tag map: a copy of appendTags plus the tag sameKeyName=<key>. Keys that
// are missing from mfenmu are reported on stdout and skipped.
//
// appendTags is never modified and may be nil. Previously the same map
// instance was shared by every emitted metric and written to on each
// iteration, so all metrics ended up tagged with whichever key happened to
// be iterated last and the caller's map was polluted across calls.
func PercentComputeForKsm(mfenzi map[string]float64, mfenmu map[string]float64, nid string, newMetricName string, sameKeyName string, step int64, appendTags map[string]string, metricList []dataobj.MetricValue) []dataobj.MetricValue {

	// tagsFor returns a fresh tag map for one metric so that no two metrics
	// (and no caller) share mutable state.
	tagsFor := func(key string) map[string]string {
		tags := make(map[string]string, len(appendTags)+1)
		for k, v := range appendTags {
			tags[k] = v
		}
		tags[sameKeyName] = key
		return tags
	}

	fmt.Println(newMetricName, mfenzi, mfenmu)
	for sameKey, fenzi := range mfenzi {
		fenmu, loaded := mfenmu[sameKey]
		if !loaded {
			fmt.Println("[not_loaded]", newMetricName, sameKey, fenzi)
			continue
		}

		// A zero denominator yields 0 instead of +Inf/NaN.
		var percent float64
		if fenmu != 0 {
			percent = (fenzi / fenmu) * 100
		}

		// One timestamp for both samples keeps the pair aligned.
		now := time.Now().Unix()

		metricPercent := dataobj.MetricValue{}
		metricPercent.Nid = nid
		metricPercent.Metric = newMetricName + "_percent"
		metricPercent.Timestamp = now
		metricPercent.Step = step
		metricPercent.CounterType = config.METRIC_TYPE_GAUGE
		metricPercent.ValueUntyped = percent
		metricPercent.Value = percent
		metricPercent.TagsMap = tagsFor(sameKey)
		metricList = append(metricList, metricPercent)

		metricValue := dataobj.MetricValue{}
		metricValue.Nid = nid
		metricValue.Metric = newMetricName + "_value"
		metricValue.Timestamp = now
		metricValue.Step = step
		metricValue.CounterType = config.METRIC_TYPE_GAUGE
		metricValue.ValueUntyped = fenzi
		metricValue.Value = fenzi
		metricValue.TagsMap = tagsFor(sameKey)
		metricList = append(metricList, metricValue)

		fmt.Println(newMetricName, metricValue, metricPercent)
	}

	return metricList
}

func avgCompute(m map[string]float64, nid string, metricName string, step int64, appendTags map[string]string) (metricList []dataobj.MetricValue) {
sum := m["sum"]
count := m["count"]
Expand Down Expand Up @@ -170,6 +231,7 @@ func successfulRate(m map[string]float64, nid string, metricName string, step in
var (
suSum float64 = 0
allSum float64 = 0
value float64 = 0
)
for label, sum := range m {
if strings.HasPrefix(label, "2") || strings.HasPrefix(label, "3") {
Expand All @@ -178,7 +240,11 @@ func successfulRate(m map[string]float64, nid string, metricName string, step in
allSum += sum

}
value := suSum / allSum
if allSum == 0 {
value = 0
} else {
value = (suSum / allSum) * 100
}

metric := dataobj.MetricValue{}
metric.Nid = nid
Expand Down
60 changes: 60 additions & 0 deletions collect/kube_state_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,28 @@ func DoKubeStatsMetricsCollect(cg *config.Config, logger log.Logger, funcName st
// 整理label:

var metricList []dataobj.MetricValue
kube_pod_container_resource_requests_cpu_cores_m := make(map[string]float64)
kube_pod_container_resource_limits_cpu_cores_m := make(map[string]float64)
kube_node_status_allocatable_cpu_cores_m := make(map[string]float64)

kube_pod_container_resource_requests_memory_bytes_m := make(map[string]float64)
kube_pod_container_resource_limits_memory_bytes_m := make(map[string]float64)
kube_node_status_capacity_memory_bytes_m := make(map[string]float64)

kube_pod_info_m := make(map[string]float64)
kube_node_status_capacity_pods_m := make(map[string]float64)

kube_pod_container_resource_requests_cpu_cores := "kube_pod_container_resource_requests_cpu_cores"
kube_pod_container_resource_limits_cpu_cores := "kube_pod_container_resource_limits_cpu_cores"
kube_node_status_allocatable_cpu_cores := "kube_node_status_allocatable_cpu_cores"

kube_pod_container_resource_requests_memory_bytes := "kube_pod_container_resource_requests_memory_bytes"
kube_pod_container_resource_limits_memory_bytes := "kube_pod_container_resource_limits_memory_bytes"
kube_node_status_capacity_memory_bytes := "kube_node_status_capacity_memory_bytes"

kube_pod_info := "kube_pod_info"
kube_node_status_capacity_pods := "kube_node_status_capacity_pods"

for _, metric := range metrics {
// 去掉kube_<>_labels
//if _, loaded := rm[metric.Metric]; loaded {
Expand All @@ -170,6 +192,30 @@ func DoKubeStatsMetricsCollect(cg *config.Config, logger log.Logger, funcName st
delete(metric.TagsMap, k)
}
}
labelNode := metric.TagsMap["node"]
switch metric.Metric {

// cpu
case kube_pod_container_resource_requests_cpu_cores:
kube_pod_container_resource_requests_cpu_cores_m[labelNode] += metric.Value
case kube_pod_container_resource_limits_cpu_cores:
kube_pod_container_resource_limits_cpu_cores_m[labelNode] += metric.Value
case kube_node_status_allocatable_cpu_cores:
kube_node_status_allocatable_cpu_cores_m[labelNode] += metric.Value

// mem
case kube_pod_container_resource_requests_memory_bytes:
kube_pod_container_resource_requests_memory_bytes_m[labelNode] += metric.Value
case kube_pod_container_resource_limits_memory_bytes:
kube_pod_container_resource_limits_memory_bytes_m[labelNode] += metric.Value
case kube_node_status_capacity_memory_bytes:
kube_node_status_capacity_memory_bytes_m[labelNode] += metric.Value
// pod num
case kube_pod_info:
kube_pod_info_m[labelNode] += metric.Value
case kube_node_status_capacity_pods:
kube_node_status_capacity_pods_m[labelNode] += metric.Value
}

if metric.CounterType == config.METRIC_TYPE_COUNTER {
metric.Metric = metric.Metric + config.COUNTER_TO_GAUGE_METRIC_NAME_SUFFIX
Expand All @@ -181,6 +227,20 @@ func DoKubeStatsMetricsCollect(cg *config.Config, logger log.Logger, funcName st
metricList = append(metricList, metric)

}

newtagsm := map[string]string{
cg.MultiFuncUniqueLabel: funcName,
}
// 计算百分比
// cpu
metricList = PercentComputeForKsm(kube_pod_container_resource_requests_cpu_cores_m, kube_node_status_allocatable_cpu_cores_m, cg.ServerSideNid, "kube_node_pod_container_cpu_requests", "node", cg.Step, newtagsm, metricList)
metricList = PercentComputeForKsm(kube_pod_container_resource_limits_cpu_cores_m, kube_node_status_allocatable_cpu_cores_m, cg.ServerSideNid, "kube_node_pod_container_cpu_limits", "node", cg.Step, newtagsm, metricList)
// mem
metricList = PercentComputeForKsm(kube_pod_container_resource_requests_memory_bytes_m, kube_node_status_capacity_memory_bytes_m, cg.ServerSideNid, "kube_node_pod_container_memory_requests", "node", cg.Step, newtagsm, metricList)
metricList = PercentComputeForKsm(kube_pod_container_resource_limits_memory_bytes_m, kube_node_status_capacity_memory_bytes_m, cg.ServerSideNid, "kube_node_pod_container_memory_limits", "node", cg.Step, newtagsm, metricList)
// pod
metricList = PercentComputeForKsm(kube_pod_info_m, kube_node_status_capacity_pods_m, cg.ServerSideNid, "kube_node_pod_num", "node", cg.Step, newtagsm, metricList)

level.Info(logger).Log("msg", "DoCollectSuccessfullyReadyToPush", "funcName", funcName, "metrics_num", len(metricList), "time_took_seconds", time.Since(start).Seconds(), "metric_addr", cg.KubeStatsC.Addr)

go PushWork(cg.PushServerAddr, cg.TimeOutSeconds, metricList, logger, funcName)
Expand Down
2 changes: 0 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,6 @@ func main() {
if sConfig.KubeStatsC != nil {
// kube-stats-metrics
g.Add(func() error {
// ksm指标多延迟启动
time.Sleep(2)
err := collect.CommonCollectTicker(sConfig, ctxAll, logger, collect.DoKubeStatsMetricsCollect, config.FUNCNAME_KUBESTATSMETRICS)
if err != nil {
level.Error(logger).Log("msg", "kube-stats-metrics collect-manager stopped")
Expand Down
25 changes: 24 additions & 1 deletion metrics-detail/preaggregation.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,30 @@
| etcd_debugging_snap_save_total_duration_seconds_avg |gauge| dsnapshot save延迟平均值| |


## 成功率

## 节点资源汇总
- 节点cpu请求核数 `sum(kube_pod_container_resource_requests_cpu_cores{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"}) by (node)`
- 节点cpu 请求率 `sum(kube_pod_container_resource_requests_cpu_cores{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node) / sum(kube_node_status_allocatable_cpu_cores{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node)`
- 节点cpu限制 `sum(kube_pod_container_resource_limits_cpu_cores{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"}) by (node)`
- 节点cpu限制率 `sum(kube_pod_container_resource_limits_cpu_cores{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node) / sum(kube_node_status_allocatable_cpu_cores{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node)`

- 节点内存请求 `sum(kube_pod_container_resource_requests_memory_bytes{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"}) by (node)`
- 节点内存请求% `sum(kube_pod_container_resource_requests_memory_bytes{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node) / sum(kube_node_status_capacity_memory_bytes{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node)`
- 节点内存限制 `sum(kube_pod_container_resource_limits_memory_bytes{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"}) by (node)`
- 节点内存限制% `sum(kube_pod_container_resource_limits_memory_bytes{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node) / sum(kube_node_status_capacity_memory_bytes{origin_prometheus=~"$origin_prometheus",node=~"^$Node$"})by (node)`




## 成功率/百分比
| 指标名 | 类型|含义 | 说明 |
| --- | --- | --- | --- |
| apiserver_request_successful_rate |gauge| apiserver请求成功率 | |
| kube_node_pod_container_cpu_limits_value |gauge|节点cpu限制 | |
| kube_node_pod_container_cpu_limits_percent|gauge|节点cpu限制率 | |
| kube_node_pod_container_cpu_requests_value|gauge|节点cpu 请求| |
| kube_node_pod_container_cpu_requests_percent|gauge|节点cpu 请求率 | `sum(kube_pod_container_resource_requests_cpu_cores)by (node) / sum(kube_node_status_allocatable_cpu_cores)by (node)`|
| kube_node_pod_container_memory_requests_value|gauge|节点内存请求| |
| kube_node_pod_container_memory_requests_percent|gauge|节点内存请求率| |
| kube_node_pod_container_memory_limits_value|gauge|节点内存限制| |
| kube_node_pod_container_memory_limits_percent|gauge|节点内存限制率| `sum(kube_pod_container_resource_limits_memory_bytes)by (node) / sum(kube_node_status_capacity_memory_bytes)by (node)` |
Loading

0 comments on commit 13a631e

Please sign in to comment.