Merge pull request #1 from bitovi/fixes-and-improvements

bitovi · Sep 20, 2024 · 0e95e4e · 0e95e4e
2 parents a26267b + 86e0896
commit 0e95e4e
Show file tree

Hide file tree

Showing 7 changed files with 74 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,14 @@ Bring Temporal Cloud Metrics into your Kubernetes cluster to inform autoscaling
 
 ![Metrics Dashboard Demo](./img/metrics-dashboard.jpg)
 
+## How it works
+
+This project is essentially a proxy server: Kubernetes makes an HTTP call to this service, which in turn pulls metrics from Temporal Cloud, converts them to the format that Kubernetes expects, and returns them to Kubernetes.
+
+Kubernetes will poll our service for metrics, which then become available to HPAs living in the same Kubernetes namespace.
+
+![Architecture Diagram](./img/diagram.png)
+
 ## Setup
 
 ### Prerequisites
@@ -41,7 +49,7 @@ sum by(temporal_namespace) (
     temporal_cloud_v0_poll_success_sync_count{}[1m]
   )
 )
--
+/
 sum by(temporal_namespace) (
   rate(
     temporal_cloud_v0_poll_success_count{}[1m]
@@ -51,27 +59,40 @@ sum by(temporal_namespace) (
 
 __After__
 
-We've made two important changes here: (1) we've swapped the places of the two underlying metrics to invert the resulting number so it will now be positive and increase as the Sync Match Rate falls, (2) use clamp_min to set a lower bound of zero, and (3) we default the resulting value to zero in the event no data points are available within the specified time window.
+We've made two important changes here: (1) we've swapped the places of the two underlying metrics to invert the resulting number so it will now be positive and increase as the Sync Match Rate falls, and (2) we default the resulting value to `1` in the event no data points are available within the specified time window.
+
+The result is a decimal that starts at `1` when there is a perfect Sync Match Rate and rises as the Sync Match Rate declines.
 
 ```
-sum(
-  clamp_min(
-    (
-      sum by(temporal_namespace) (
-        rate(
-          temporal_cloud_v0_poll_success_count{}[1m]
-        )
-      )
-      -
-      sum by(temporal_namespace) (
-          rate(
-              temporal_cloud_v0_poll_success_sync_count{}[1m]
-          )
-      )
-    ),
-    0
+(
+  sum by(temporal_namespace) (
+    rate(
+      temporal_cloud_v0_poll_success_count{
+        temporal_namespace="bitovi.x72yu"
+      }[1m]
+    )
   )
-) or vector(0)
+  /
+  sum by(temporal_namespace) (
+    rate(
+      temporal_cloud_v0_poll_success_sync_count{
+        temporal_namespace="bitovi.x72yu"
+      }[1m]
+    )
+  )
+)
+unless
+(
+  sum by(temporal_namespace) (
+    rate(
+      temporal_cloud_v0_poll_success_sync_count{
+        temporal_namespace="bitovi.x72yu"
+      }[1m]
+    )
+  ) == 0
+)
+or label_replace(vector(1), "temporal_namespace", "bitovi.x72yu", "", "")
+
 ```
 
 ### Step 3: HPA
@@ -175,8 +196,8 @@ You can adjust the how quickly the cluster scales up and down our workers.
             temporal_namespace: xyz.123
       target:
         type: Value
-        # Scale up when the target metric exceeds 50 milli values (0.05)
-        value: 50m
+        # Scale up when the target metric exceeds 1500 milli values (1.5)
+        value: 1500m
   behavior:
     scaleUp:
       # The highest value in the last 10 seconds will be used to determine the need to scale up

diff --git a/chart/templates/demo-worker.yaml b/chart/templates/demo-worker.yaml
@@ -22,7 +22,7 @@ spec:
         imagePullPolicy: Always
         ports: []
         volumeMounts:
-        - name: mtls-certs
+        - name: tcm-mtls-certs
           mountPath: "/app/certs"
           readOnly: true
         env:
@@ -37,7 +37,7 @@ spec:
         - name: TEMPORAL_QUEUE
           value: autoscaler_demo
       volumes:
-      - name: mtls-certs
+      - name: tcm-mtls-certs
         secret:
-          secretName: mtls-certs
+          secretName: tcm-mtls-certs
 {{- end }}
diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml
@@ -36,7 +36,7 @@ spec:
         - mountPath: /var/run/serving-cert
           name: serving-cert
           readOnly: false
-        - name: mtls-certs
+        - name: tcm-mtls-certs
           mountPath: "/app/certs"
           readOnly: true
         - name: config
@@ -59,9 +59,9 @@ spec:
         emptyDir: {}
       - name: serving-cert
         emptyDir: {}
-      - name: mtls-certs
+      - name: tcm-mtls-certs
         secret:
-          secretName: mtls-certs
+          secretName: tcm-mtls-certs
       - name: config
         configMap:
           name: adapter-configuration
diff --git a/chart/templates/hpa.yaml b/chart/templates/hpa.yaml
@@ -8,8 +8,8 @@ spec:
     apiVersion: apps/v1
     kind: Deployment
     name: {{ .Values.worker.deployment }}
-  minReplicas: 1
-  maxReplicas: 20
+  minReplicas: 2
+  maxReplicas: 50
   metrics:
   - type: External
     external:
@@ -20,19 +20,19 @@ spec:
             temporal_namespace: "{{ .Values.temporal.namespace }}"
       target:
         type: Value
-        value: 50m
+        value: 1500m
   behavior:
     scaleUp:
-      stabilizationWindowSeconds: 10
+      stabilizationWindowSeconds: 5
       selectPolicy: Max
       policies:
         - type: Pods
-          value: 5
-          periodSeconds: 10
+          value: 10
+          periodSeconds: 5
     scaleDown:
-      stabilizationWindowSeconds: 60
+      stabilizationWindowSeconds: 5
       selectPolicy: Max
       policies:
         - type: Pods
-          value: 3
-          periodSeconds: 30
+          value: 10
+          periodSeconds: 5
diff --git a/chart/templates/mtls-certificates.yaml b/chart/templates/mtls-certificates.yaml
@@ -5,5 +5,5 @@ data:
 kind: Secret
 metadata:
   creationTimestamp: null
-  name: mtls-certs
+  name: tcm-mtls-certs
   namespace: {{ .Release.Namespace }}
diff --git a/img/diagram.png b/img/diagram.png
diff --git a/sample-config.yaml b/sample-config.yaml
@@ -8,35 +8,22 @@ temporal_cloud:
 metrics:
   temporal_cloud_sync_match_rate:
     query: >
-      sum(
-        clamp_min(
-          (
-            sum by(temporal_namespace) (
-              rate(
-                temporal_cloud_v0_poll_success_count{}[1m]
-              )
-            )
-            -
-            sum by(temporal_namespace) (
-                rate(
-                    temporal_cloud_v0_poll_success_sync_count{}[1m]
-                )
-            )
-          ),
-          0
+      (
+        sum by(temporal_namespace) (
+          rate(
+            temporal_cloud_v0_poll_success_count{
+              temporal_namespace="123.xyz",
+              task_queue="hello_world"
+            }[1m]
+          )
         )
-      ) or vector(0)
-  temporal_cloud_service_latency:
-    query: >
-      sum(
-        clamp_min(
-          sum by(temporal_namespace) (
-            rate(temporal_cloud_v0_service_latency_count{}[1m])
+        /
+        sum by(temporal_namespace) (
+          rate(
+            temporal_cloud_v0_poll_success_sync_count{
+              temporal_namespace="123.xyz",
+              task_queue="hello_world"
+            }[1m]
           )
-          -
-          sum by(temporal_namespace) (
-            rate(temporal_cloud_v0_service_latency_sum{}[1m])
-          ),
-          0
         )
-      ) or vector(0)
+      ) or vector(1)