Skip to content

Commit

Permalink
feat: dashboard in gcp (#11201)
Browse files Browse the repository at this point in the history
Adds a [new dashboard](https://console.cloud.google.com/monitoring/dashboards/builder/30d2d0d2-8dd2-4535-8074-e551dbc773aa;duration=PT15M?f.mlabel.k8s_namespace_name.namespace=mitch&f.mlabel.aztec_circuit_protocol_circuit_name.protocol_circuit=&project=testnet-440309) in GCP.

It also has
[traces](https://console.cloud.google.com/traces/list?project=testnet-440309),
and the [logs](https://cloudlogging.app.goo.gl/kV6xa4jZzP8ScDLM8) are
much nicer looking now.

We have a new env var, `USE_GCLOUD_OBSERVABILITY`, which takes precedence
over the OpenTelemetry (`OTEL_*`) env vars. The "old" OTel env vars can still
be used to point at a custom metrics stack, e.g. in local testing or in CI.
  • Loading branch information
just-mitch authored Jan 14, 2025
1 parent b42da6d commit 2790bd7
Show file tree
Hide file tree
Showing 31 changed files with 386 additions and 88 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ else
fi

# Configure OTEL_COLLECTOR_ENDPOINT if not set in values file
if [ "${TELEMETRY:-false}" = "true" ] && [ "${OTEL_COLLECTOR_ENDPOINT}" = "" ]; then
if [ "${TELEMETRY:-false}" = "true" ] && [ "${OTEL_COLLECTOR_ENDPOINT}" = "" ] && [ "${USE_GCLOUD_OBSERVABILITY:-false}" = "false" ]; then
OTEL_COLLECTOR_PORT=${OTEL_COLLECTOR_PORT:-4318}
OTEL_COLLECTOR_ENDPOINT="http://metrics-opentelemetry-collector.metrics:$OTEL_COLLECTOR_PORT"
fi
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ Service Address Setup Container
value: "{{ .Values.proverNode.service.nodePort }}"
- name: PROVER_BROKER_PORT
value: "{{ .Values.proverBroker.service.nodePort }}"
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
- name: SERVICE_NAME
value: {{ include "aztec-network.fullname" . }}
volumeMounts:
Expand Down
19 changes: 12 additions & 7 deletions spartan/aztec-network/templates/boot-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,6 @@ spec:
sleep 5
done
echo "Ethereum node is ready!"
{{- if .Values.telemetry.enabled }}
until curl --head --silent $OTEL_COLLECTOR_ENDPOINT > /dev/null; do
echo "Waiting for OpenTelemetry collector $OTEL_COLLECTOR_ENDPOINT..."
sleep 5
done
echo "OpenTelemetry collector is ready!"
{{- end }}
volumeMounts:
- name: config
mountPath: /shared/config
Expand Down Expand Up @@ -123,6 +116,12 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
{{- end }}
containers:
- name: boot-node
Expand Down Expand Up @@ -181,6 +180,10 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: boot-node
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.bootNode.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand Down Expand Up @@ -235,6 +238,8 @@ spec:
value: "{{ .Values.storage.dataStoreMapSize }}"
- name: WS_DB_MAP_SIZE_KB
value: "{{ .Values.storage.worldStateMapSize }}"
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
ports:
- containerPort: {{ .Values.bootNode.service.nodePort }}
- containerPort: {{ .Values.bootNode.service.p2pTcpPort }}
Expand Down
6 changes: 6 additions & 0 deletions spartan/aztec-network/templates/deploy-l1-verifier.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: NODE_NO_WARNINGS
value: "1"
- name: LOG_LEVEL
Expand All @@ -108,6 +112,8 @@ spec:
value: "{{ .Values.proverNode.service.nodePort }}"
- name: SERVICE_NAME
value: {{ include "aztec-network.fullname" . }}
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
volumeMounts:
- name: config
mountPath: /shared/config
Expand Down
2 changes: 2 additions & 0 deletions spartan/aztec-network/templates/faucet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: faucet
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
ports:
- name: http
containerPort: {{ .Values.faucet.service.nodePort }}
Expand Down
13 changes: 6 additions & 7 deletions spartan/aztec-network/templates/prover-agent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,6 @@ spec:
sleep 5
done
echo "Broker is ready!"
{{- if .Values.telemetry.enabled }}
until curl --head --silent $OTEL_COLLECTOR_ENDPOINT > /dev/null; do
echo "Waiting for OpenTelemetry collector $OTEL_COLLECTOR_ENDPOINT..."
sleep 5
done
echo "OpenTelemetry collector is ready!"
{{- end }}
volumeMounts:
- name: config
mountPath: /shared/config
Expand All @@ -90,6 +83,10 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-agent
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: AZTEC_PORT
value: "{{ .Values.proverAgent.service.nodePort }}"
- name: LOG_LEVEL
Expand All @@ -106,6 +103,8 @@ spec:
value: {{ join "," .Values.proverAgent.proofTypes | quote }}
- name: OTEL_RESOURCE_ATTRIBUTES
value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }}
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
resources:
{{- toYaml .Values.proverAgent.resources | nindent 12 }}
{{- end }}
13 changes: 6 additions & 7 deletions spartan/aztec-network/templates/prover-broker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,6 @@ spec:
- |
source /shared/config/service-addresses
cat /shared/config/service-addresses
{{- if .Values.telemetry.enabled }}
until curl --head --silent $OTEL_COLLECTOR_ENDPOINT > /dev/null; do
echo "Waiting for OpenTelemetry collector $OTEL_COLLECTOR_ENDPOINT..."
sleep 5
done
echo "OpenTelemetry collector is ready!"
{{- end }}
volumeMounts:
- name: config
mountPath: /shared/config
Expand Down Expand Up @@ -89,6 +82,10 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-broker
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.proverBroker.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand All @@ -109,6 +106,8 @@ spec:
value: "{{ .Values.storage.dataStoreMapSize }}"
- name: OTEL_RESOURCE_ATTRIBUTES
value: service.name={{ .Release.Name }},service.namespace={{ .Release.Namespace }},service.version={{ .Chart.AppVersion }},environment={{ .Values.environment | default "production" }}
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
resources:
{{- toYaml .Values.proverBroker.resources | nindent 12 }}
volumes:
Expand Down
13 changes: 6 additions & 7 deletions spartan/aztec-network/templates/prover-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,6 @@ spec:
echo "Using built-in job broker"
fi
{{- if .Values.telemetry.enabled }}
until curl --head --silent $OTEL_COLLECTOR_ENDPOINT > /dev/null; do
echo "Waiting for OpenTelemetry collector $OTEL_COLLECTOR_ENDPOINT..."
sleep 5
done
echo "OpenTelemetry collector is ready!"
{{- end }}
until curl --head --silent $BOOT_NODE_HOST/status; do
echo "Waiting for boot node..."
sleep 5
Expand Down Expand Up @@ -132,6 +125,10 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: prover-node
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_IP
valueFrom:
fieldRef:
Expand Down Expand Up @@ -194,6 +191,8 @@ spec:
value: "{{ .Values.storage.dataStoreMapSize }}"
- name: WS_DB_MAP_SIZE_KB
value: "{{ .Values.storage.worldStateMapSize }}"
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
ports:
- containerPort: {{ .Values.proverNode.service.nodePort }}
- containerPort: {{ .Values.proverNode.service.p2pTcpPort }}
Expand Down
6 changes: 6 additions & 0 deletions spartan/aztec-network/templates/pxe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: pxe
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: AZTEC_PORT
value: "{{ .Values.pxe.service.nodePort }}"
- name: LOG_JSON
Expand All @@ -99,6 +103,8 @@ spec:
value: "{{ .Values.pxe.logLevel }}"
- name: PXE_PROVER_ENABLED
value: "{{ .Values.aztec.realProofs }}"
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
ports:
- name: http
containerPort: {{ .Values.pxe.service.nodePort }}
Expand Down
6 changes: 6 additions & 0 deletions spartan/aztec-network/templates/setup-l2-contracts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: TELEMETRY
value: "{{ .Values.telemetry.enabled }}"
- name: LOG_LEVEL
Expand All @@ -96,4 +100,6 @@ spec:
value: "{{ .Values.proverNode.service.nodePort }}"
- name: SERVICE_NAME
value: {{ include "aztec-network.fullname" . }}
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
{{ end }}
6 changes: 6 additions & 0 deletions spartan/aztec-network/templates/transaction-bot.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ spec:
fieldPath: metadata.name
- name: OTEL_SERVICE_NAME
value: bot
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: AZTEC_PORT
value: "{{ .Values.bot.service.nodePort }}"
- name: LOG_JSON
Expand Down Expand Up @@ -113,6 +117,8 @@ spec:
value: "{{ .Values.bot.maxErrors }}"
- name: BOT_STOP_WHEN_UNHEALTHY
value: "{{ .Values.bot.stopIfUnhealthy }}"
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
ports:
- name: http
containerPort: {{ .Values.bot.service.nodePort }}
Expand Down
13 changes: 6 additions & 7 deletions spartan/aztec-network/templates/validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,6 @@ spec:
done
echo "Ethereum node is ready!"
{{- if .Values.telemetry.enabled }}
until curl --head --silent $OTEL_COLLECTOR_ENDPOINT > /dev/null; do
echo "Waiting for OpenTelemetry collector $OTEL_COLLECTOR_ENDPOINT..."
sleep 5
done
echo "OpenTelemetry collector is ready!"
{{- end }}
if [ "{{ .Values.validator.dynamicBootNode }}" = "true" ]; then
echo "{{ include "aztec-network.pxeUrl" . }}" > /shared/pxe/pxe_url
Expand Down Expand Up @@ -164,6 +157,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.podIP
- name: K8S_NAMESPACE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: NODE_OPTIONS
value: "--max-old-space-size={{ .Values.validator.maxOldSpaceSize}}"
- name: AZTEC_PORT
Expand Down Expand Up @@ -220,6 +217,8 @@ spec:
value: "{{ .Values.storage.dataStoreMapSize }}"
- name: WS_DB_MAP_SIZE_KB
value: "{{ .Values.storage.worldStateMapSize }}"
- name: USE_GCLOUD_OBSERVABILITY
value: "{{ .Values.telemetry.useGcloudObservability }}"
ports:
- containerPort: {{ .Values.validator.service.nodePort }}
- containerPort: {{ .Values.validator.service.p2pTcpPort }}
Expand Down
1 change: 1 addition & 0 deletions spartan/aztec-network/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ storage:
telemetry:
enabled: false
otelCollectorEndpoint:
useGcloudObservability: false

images:
aztec:
Expand Down
1 change: 0 additions & 1 deletion spartan/aztec-network/values/exp-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ aztec:

telemetry:
enabled: true
otelCollectorEndpoint: http://35.197.100.168:4318

images:
aztec:
Expand Down
3 changes: 1 addition & 2 deletions spartan/aztec-network/values/exp-2.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
telemetry:
enabled: true
otelCollectorEndpoint: http://metrics-opentelemetry-collector.metrics:4318

network:
setupL2Contracts: false
Expand Down Expand Up @@ -29,4 +28,4 @@ proverNode:
proverPublisherPrivateKey:

bot:
txIntervalSeconds: 20
txIntervalSeconds: 20
1 change: 0 additions & 1 deletion spartan/aztec-network/values/rc-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ images:

telemetry:
enabled: true
otelCollectorEndpoint: http://35.197.100.168:4318

validator:
storageSize: "100Gi"
Expand Down
1 change: 0 additions & 1 deletion spartan/aztec-network/values/rc-2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ images:

telemetry:
enabled: true
otelCollectorEndpoint: http://35.197.100.168:4318

validator:
replicas: 48
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
telemetry:
enabled: true
otelCollectorEndpoint: http://metrics-opentelemetry-collector.metrics:4318

network:
setupL2Contracts: false
Expand Down Expand Up @@ -29,4 +28,4 @@ proverNode:
proverPublisherPrivateKey:

bot:
txIntervalSeconds: 20
txIntervalSeconds: 20
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
telemetry:
enabled: true
otelCollectorEndpoint: http://metrics-opentelemetry-collector.metrics:4318

network:
setupL2Contracts: false
Expand Down Expand Up @@ -75,4 +74,4 @@ proverNode:
proverPublisherPrivateKey:

bot:
txIntervalSeconds: 5
txIntervalSeconds: 5
5 changes: 5 additions & 0 deletions spartan/terraform/deploy-release/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ resource "helm_release" "aztec-gke-cluster" {
value = var.L1_DEPLOYMENT_SALT
}

set {
name = "telemetry.useGcloudObservability"
value = "true"
}

# Setting timeout and wait conditions
timeout = 1200 # 20 minutes in seconds
wait = true
Expand Down
4 changes: 3 additions & 1 deletion yarn-project/foundation/src/config/env_var.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ export type EnvVar =
| 'NETWORK'
| 'NO_PXE'
| 'COIN_ISSUER_CONTRACT_ADDRESS'
| 'USE_GCLOUD_OBSERVABILITY'
| 'OTEL_EXPORTER_OTLP_METRICS_ENDPOINT'
| 'OTEL_EXPORTER_OTLP_TRACES_ENDPOINT'
| 'OTEL_EXPORTER_OTLP_LOGS_ENDPOINT'
Expand Down Expand Up @@ -202,4 +203,5 @@ export type EnvVar =
| 'FAUCET_INTERVAL_MS'
| 'FAUCET_L1_ASSETS'
| 'K8S_POD_NAME'
| 'K8S_POD_UID';
| 'K8S_POD_UID'
| 'K8S_NAMESPACE_NAME';
Loading

0 comments on commit 2790bd7

Please sign in to comment.