From 851b89d8dde7aa162b8e0d88573b1ddefef9152a Mon Sep 17 00:00:00 2001 From: Monokaix Date: Thu, 26 Dec 2024 09:55:14 +0800 Subject: [PATCH] Add deploy yaml&guide Signed-off-by: Monokaix --- .github/workflows/code_verify.yaml | 2 +- Makefile | 4 +- README.md | 123 +++++++++++++++- .../volcano-descheduler-development.yaml | 139 ++++++++++++++++++ 4 files changed, 264 insertions(+), 4 deletions(-) create mode 100644 installer/volcano-descheduler-development.yaml diff --git a/.github/workflows/code_verify.yaml b/.github/workflows/code_verify.yaml index 08a2e3f..035e4ca 100644 --- a/.github/workflows/code_verify.yaml +++ b/.github/workflows/code_verify.yaml @@ -34,6 +34,6 @@ jobs: - name: Run verify test run: | make verify - make all + make image sudo make unit-test working-directory: ./src/github.com/${{ github.repository }} diff --git a/Makefile b/Makefile index 689f128..8a17b13 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ else GOARCH?=$(OSARCH) endif -# Run `make images DOCKER_PLATFORMS="linux/amd64,linux/arm64" BUILDX_OUTPUT_TYPE=registry IMAGE_PREFIX=[yourregistry]` to push multi-platform +# Run `make image DOCKER_PLATFORMS="linux/amd64,linux/arm64" BUILDX_OUTPUT_TYPE=registry IMAGE_PREFIX=[yourregistry]` to push multi-platform DOCKER_PLATFORMS ?= "linux/${GOARCH}" GOOS ?= linux @@ -74,7 +74,7 @@ vc-descheduler: init image_bins: vc-descheduler -images: +image: for name in descheduler; do\ docker buildx build -t "${IMAGE_PREFIX}/vc-$$name:$(TAG)" . -f ./installer/dockerfile/$$name/Dockerfile --output=type=${BUILDX_OUTPUT_TYPE} --platform ${DOCKER_PLATFORMS} --build-arg APK_MIRROR=${APK_MIRROR} --build-arg OPEN_EULER_IMAGE_TAG=${OPEN_EULER_IMAGE_TAG}; \ done diff --git a/README.md b/README.md index 4d81e67..c9337ce 100644 --- a/README.md +++ b/README.md @@ -42,4 +42,125 @@ The principle of LoadAware is shown in the figure above: - Over-utilized nodes: nodes with resource utilization higher than 80%. 
Hotspot nodes will evict some Pods and reduce the load level to no more than 80%. The descheduler will schedule the Pods on the hotspot nodes to the idle nodes. -- Under-utilized nodes: nodes with resource utilization lower than 30%. \ No newline at end of file +- Under-utilized nodes: nodes with resource utilization lower than 30%. + +# Quick start + +## Prepare + +Install [prometheus](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus) or [prometheus-adapter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-adapter), and [prometheus-node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter). The real load of the node is exposed to the `Volcano descheduler` through node-exporter and prometheus. + +Add the following automatic discovery and node label replacement rules for the node-exporter service in the `scrape_configs` configuration of prometheus. This step is very important, otherwise `Volcano descheduler` cannot get the real load metrics of the node. For more details about `scrape_configs`, please refer to [Configuration | Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). + +```yaml +scrape_configs: +- job_name: 'kubernetes-service-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_pod_node_name] + action: replace + target_label: instance +``` + +## Install Volcano descheduler + +### Install via yaml + +```shell +# create ns first. +kubectl create ns volcano-system +# deploy descheduler yaml. +kubectl apply -f https://raw.githubusercontent.com/volcano-sh/descheduler/main/installer/volcano-descheduler-development.yaml +``` + +## Configurations + +The default descheduling configuration is in the `volcano-descheduler` configMap under the `volcano-system` namespace. 
You can update the descheduling configuration by modifying the data in the configMap. The plugins enabled by default are `LoadAware` and `DefaultEvictor`, which perform load-aware descheduling and eviction respectively. + +```yaml +apiVersion: "descheduler/v1alpha2" +kind: "DeschedulerPolicy" +profiles: +- name: default + pluginConfig: + - args: + ignorePvcPods: true + nodeFit: true + priorityThreshold: + value: 10000 + name: DefaultEvictor + - args: + evictableNamespaces: + exclude: + - kube-system + metrics: + address: null + type: null + targetThresholds: + cpu: 80 # Eviction will be triggered when the node CPU utilization exceeds 80% + memory: 85 # Eviction will be triggered when the node memory utilization exceeds 85% + thresholds: + cpu: 30 # Pods can be scheduled to nodes whose CPU resource utilization is less than 30% + memory: 30 # Pods can be scheduled to nodes whose memory resource utilization is less than 30%. + name: LoadAware + plugins: + balance: + enabled: + - LoadAware +``` + +For the full configuration and parameter description of the `DefaultEvictor` plugin, please refer to: [DefaultEvictor Configuration](https://github.com/kubernetes-sigs/descheduler/tree/master#evictor-plugin-configuration-default-evictor). + +`LoadAware` plugin parameter description: + +| Name | type | Default Value | Description | +| :-----------------: | :------------------: | :-----------: | :----------------------------------------------------------: | +| nodeSelector | string | nil | Limiting the nodes which are processed | +| evictableNamespaces | map(string:[]string) | nil | Exclude evicting pods under excluded namespaces | +| nodeFit | bool | false | Set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them. | +| numberOfNodes | int | 0 | This parameter can be configured to activate the strategy only when the number of under utilized nodes are above the configured value. 
This could be helpful in large clusters where a few nodes could become under-utilized frequently or for a short period of time. | +| duration | string | 2m | The time range specified when querying the actual utilization metrics of nodes; it only takes effect when `metrics.type` is configured as `prometheus`. | +| metrics | map(string:string) | nil | **Required Field**
Contains two parameters:
type: The type of metrics source, only supports `prometheus` and `prometheus_adaptor`.
address: The service address of `prometheus`. | +| targetThresholds | map(string:int) | nil | **Required Field**
Supported configuration keys are `cpu`, `memory`, and `pods`.
When the node resource utilization (for `cpu` or `memory`) exceeds the configured threshold, Pod eviction is triggered on the node; the unit is a percentage (%).
When the number of Pods on the node exceeds the configured threshold, Pod eviction is triggered on the node; the unit is a Pod count. | +| thresholds | map(string:int) | nil | **Required Field**
The evicted Pods should be scheduled to nodes with utilization below the `thresholds`.
The threshold for the same resource type cannot exceed the threshold set in `targetThresholds`. | + +In addition to the above `LoadAware plugin` enhancements, `Volcano descheduler` also supports native descheduler functions and plugins. If you want to configure other native plugins, please refer to: [kubernetes-sigs/descheduler](https://github.com/kubernetes-sigs/descheduler/blob/master/docs/user-guide.md). + +# Best practices + +When the Pods on the node with relatively high resource utilization are evicted, we expect that the newly created Pods should avoid being scheduled to the node with relatively high resource utilization again. Therefore, the `Volcano scheduler` also needs to enable the plugin `usage` based on real load awareness. For a detailed description and configuration of `usage`, please refer to: [volcano usage plugin](https://github.com/volcano-sh/volcano/blob/master/docs/design/usage-based-scheduling.md). + +# Troubleshooting + +When the configuration parameter `metrics.type` of the LoadAware plugin is set to `prometheus`, `Volcano descheduler` queries the actual utilization of cpu and memory through the following `PromQL` statement. When the expected eviction behavior does not occur, you can query it manually through prometheus, check whether the node metrics are correctly exposed, and compare it with the log of `Volcano descheduler` to judge its actual behavior. 
+ +**cpu:** + +```shell +avg_over_time((1 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle",instance="$replace_with_your_node_name"}[30s])) * 1))[2m:30s]) +``` + +**memory:** + +```shell +avg_over_time(((1-node_memory_MemAvailable_bytes{instance="$replace_with_your_node_name"}/node_memory_MemTotal_bytes{instance="$replace_with_your_node_name"}))[2m:30s]) +``` + +# Development + +## build binary + +```shell +make vc-descheduler +``` + +## build image + +```shell +make image +``` + +# Release Guide + +The release cadence of the `descheduler` is not synchronized with that of [Volcano](https://github.com/volcano-sh/volcano). This is because the `descheduler` is a sub-repository under volcano-sh, and its code and feature changes are relatively minor. We will adapt to the upstream Kubernetes community's descheduler project as needed and release new versions accordingly. \ No newline at end of file diff --git a/installer/volcano-descheduler-development.yaml b/installer/volcano-descheduler-development.yaml new file mode 100644 index 0000000..92ddb63 --- /dev/null +++ b/installer/volcano-descheduler-development.yaml @@ -0,0 +1,139 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: volcano-descheduler + namespace: volcano-system + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: volcano-descheduler + namespace: volcano-system +data: + policy.yaml: | + apiVersion: "descheduler/v1alpha2" + kind: "DeschedulerPolicy" + profiles: + - name: default + pluginConfig: + - args: + ignorePvcPods: true + nodeFit: true + priorityThreshold: + value: 10000 + name: DefaultEvictor + - args: + evictableNamespaces: + exclude: + - kube-system + metrics: + address: null + type: null + targetThresholds: + cpu: 80 + memory: 85 + thresholds: + cpu: 30 + memory: 30 + name: LoadAware + plugins: + balance: + enabled: + - LoadAware + +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: volcano-descheduler +rules: + - apiGroups: 
["events.k8s.io"] + resources: ["events"] + verbs: ["create", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "watch", "list"] + - apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "watch", "list"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list", "delete"] + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + - apiGroups: ["scheduling.k8s.io"] + resources: ["priorityclasses"] + verbs: ["get", "watch", "list"] + - apiGroups: ["metrics.k8s.io"] + resources: ["pods"] + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: volcano-descheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: volcano-descheduler +subjects: + - kind: ServiceAccount + name: volcano-descheduler + namespace: volcano-system + +--- +kind: Deployment +apiVersion: apps/v1 +metadata: + name: volcano-descheduler + namespace: volcano-system + labels: + app: descheduler + k8s-app: descheduler +spec: + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: descheduler + k8s-app: descheduler + template: + metadata: + labels: + app: descheduler + k8s-app: descheduler + spec: + serviceAccountName: volcano-descheduler + volumes: + - name: policy-volume + configMap: + name: volcano-descheduler + - name: log + hostPath: + path: /var/log/volcano/descheduler + containers: + - name: descheduler + image: docker.io/volcanosh/vc-descheduler:latest + command: ["sh", "-c"] + args: + - > + /vc-descheduler --descheduling-interval-cron-expression='*/10 * * * *' + --descheduling-interval=10m + --policy-config-file=/policy-dir/policy.yaml + --leader-elect=false + --leader-elect-resource-namespace=volcano-system + --v=3 1>>/var/log/volcano/descheduler/descheduler.log 2>&1 + imagePullPolicy: Always + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - mountPath: /policy-dir + 
name: policy-volume + - name: log + mountPath: /var/log/volcano/descheduler