From 75b78cdd3d66e568a1702a44c061afeb37303d54 Mon Sep 17 00:00:00 2001 From: chahat sagar <109112505+chahatsagarmain@users.noreply.github.com> Date: Fri, 17 Jan 2025 00:40:48 +0530 Subject: [PATCH] [ci] Scrape and verify metrics at the end of e2e tests (#6330) ## Which problem is this PR solving? - Part of #6278 ## Description of the changes - scrape script and usage in cit workflow - diff calculating script - cache save and restore from main workflow runs - A sample diff (txt) ``` --- +++ @@ -303,2 +303,2 @@ -rpc_server_requests_per_rpc{le="+Inf",rpc_grpc_status_code="0",rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TestService",rpc_system="grpc",service_name="jaeger",service_version=""} -rpc_server_requests_per_rpc{le="0",rpc_grpc_status_code="0",rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="jaeger",service_version=""} +rpc_server_requests_per_rpc{le="+Inf",rpc_grpc_status_code="0",rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="jaeger",service_version=""} +rpc_server_requests_per_rpc{le="0",rpc_grpc_status_code="1",rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="jaeger",service_version=""} @@ -321 +321 @@ -rpc_server_response_size{le="+Inf",rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="jaeger",service_version=""} +rpc_server_response_size{le="+Inf",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="jaeger",service_version="",test_change="Export"} @@ -338 +338 @@ -rpc_server_response_size{rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="jaeger",service_version=""} 
+rpc_server_response_size{rpc_method="Export",rpc_service="opentelemetry.proto.collector.trace.v1.TraceService",rpc_system="grpc",service_name="test-jaeger",service_version=""} ``` ## How was this change tested? - ## Checklist - [x] I have read https://github.com/jaegertracing/jaeger/blob/master/CONTRIBUTING_GUIDELINES.md - [x] I have signed all commits - [ ] I have added unit tests for the new functionality - [x] I have run lint and test steps successfully - for `jaeger`: `make lint test` - for `jaeger-ui`: `yarn lint` and `yarn test` --------- Signed-off-by: chahatsagarmain Signed-off-by: chahat sagar <109112505+chahatsagarmain@users.noreply.github.com> Signed-off-by: Yuri Shkuro Co-authored-by: Yuri Shkuro --- .../verify-metrics-snapshot/action.yaml | 70 +++++++++++++++++ .github/workflows/ci-e2e-all.yml | 4 - .github/workflows/ci-e2e-badger.yaml | 7 +- .github/workflows/ci-e2e-cassandra.yml | 8 ++ .github/workflows/ci-e2e-elasticsearch.yml | 12 +-- .github/workflows/ci-e2e-grpc.yml | 6 ++ .github/workflows/ci-e2e-kafka.yml | 6 ++ .github/workflows/ci-e2e-memory.yaml | 9 ++- .github/workflows/ci-e2e-opensearch.yml | 10 ++- .gitignore | 1 + .../internal/integration/e2e_integration.go | 21 +++++ scripts/e2e/compare_metrics.py | 77 +++++++++++++++++++ 12 files changed, 215 insertions(+), 16 deletions(-) create mode 100644 .github/actions/verify-metrics-snapshot/action.yaml create mode 100644 scripts/e2e/compare_metrics.py diff --git a/.github/actions/verify-metrics-snapshot/action.yaml b/.github/actions/verify-metrics-snapshot/action.yaml new file mode 100644 index 00000000000..5319f587a54 --- /dev/null +++ b/.github/actions/verify-metrics-snapshot/action.yaml @@ -0,0 +1,70 @@ +# Copyright (c) 2023 The Jaeger Authors. 
+# SPDX-License-Identifier: Apache-2.0 + +name: 'Verify Metric Snapshot and Upload Metrics' +description: 'Upload or cache the metrics data after verification' +inputs: + snapshot: + description: 'Path to the metric file' + required: true + artifact_key: + description: 'Artifact key used for uploading and fetching artifacts' + required: true +runs: + using: 'composite' + steps: + - name: Upload current metrics snapshot + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + with: + name: ${{ inputs.artifact_key }} + path: ./.metrics/${{ inputs.snapshot }}.txt + retention-days: 7 + + # A cache is restored only to the same path it was saved under. + # So the snapshot is renamed to baseline_<snapshot>.txt before it is cached; the restore step below uses that same path. + - name: Change file name before caching + if: github.ref_name == 'main' + shell: bash + run: | + mv ./.metrics/${{ inputs.snapshot }}.txt ./.metrics/baseline_${{ inputs.snapshot }}.txt + + - name: Cache metrics snapshot on main branch for longer retention + if: github.ref_name == 'main' + uses: actions/cache/save@1bd1e32a3bdc45362d1e726936510720a7c30a57 + with: + path: ./.metrics/baseline_${{ inputs.snapshot }}.txt + key: ${{ inputs.artifact_key }}_${{ github.run_id }} + + # Use restore-keys to match by key prefix and fetch the latest cache. + # restore-keys is an ordered list of key prefixes to be matched. + - name: Download the cached tagged metrics + id: download-release-snapshot + if: github.ref_name != 'main' + uses: actions/cache/restore@1bd1e32a3bdc45362d1e726936510720a7c30a57 + with: + path: ./.metrics/baseline_${{ inputs.snapshot }}.txt + key: ${{ inputs.artifact_key }} + restore-keys: | + ${{ inputs.artifact_key }} + + - name: Calculate diff between the snapshots + id: compare-snapshots + if: ${{ (github.ref_name != 'main') && (steps.download-release-snapshot.outputs.cache-matched-key != '') }} + shell: bash + run: | + python3 -m pip install
prometheus-client + python3 ./scripts/e2e/compare_metrics.py --file1 ./.metrics/${{ inputs.snapshot }}.txt --file2 ./.metrics/baseline_${{ inputs.snapshot }}.txt --output ./.metrics/diff_${{ inputs.snapshot }}.txt || status=$? + if [ "${status:-0}" -ne 0 ]; then + echo "🛑 Differences found in metrics" + exit 1 + fi + + - name: Upload the diff artifact + if: ${{ failure() && (github.ref_name != 'main') && (steps.compare-snapshots.outcome == 'failure') }} # failure() overrides the implicit success() check + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 + with: + name: diff_${{ inputs.artifact_key }} + path: ./.metrics/diff_${{ inputs.snapshot }}.txt + retention-days: 7 + + \ No newline at end of file diff --git a/.github/workflows/ci-e2e-all.yml b/.github/workflows/ci-e2e-all.yml index fb41a17b5c5..082270f5488 100644 --- a/.github/workflows/ci-e2e-all.yml +++ b/.github/workflows/ci-e2e-all.yml @@ -36,7 +36,3 @@ jobs: opensearch: uses: ./.github/workflows/ci-e2e-opensearch.yml - - - - diff --git a/.github/workflows/ci-e2e-badger.yaml b/.github/workflows/ci-e2e-badger.yaml index 808b73fc6ce..000aec55847 100644 --- a/.github/workflows/ci-e2e-badger.yaml +++ b/.github/workflows/ci-e2e-badger.yaml @@ -25,7 +25,6 @@ jobs: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 with: go-version: 1.23.x @@ -41,6 +40,12 @@ jobs: ;; esac + - uses: ./.github/actions/verify-metrics-snapshot + if: matrix.version == 'v2' + with: + snapshot: metrics_snapshot_badger + artifact_key: metrics_snapshot_badger_${{ matrix.version }} + - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov with: diff --git a/.github/workflows/ci-e2e-cassandra.yml b/.github/workflows/ci-e2e-cassandra.yml index 1d866828736..6ddb129c396 100644 --- a/.github/workflows/ci-e2e-cassandra.yml +++ b/.github/workflows/ci-e2e-cassandra.yml @@ -48,9 +48,17 @@ jobs: run: bash
scripts/e2e/cassandra.sh ${{ matrix.version.major }} ${{ matrix.version.schema }} ${{ matrix.jaeger-version }} env: SKIP_APPLY_SCHEMA: ${{ matrix.create-schema == 'auto' && true || false }} + + - uses: ./.github/actions/verify-metrics-snapshot + if: matrix.jaeger-version == 'v2' + with: + snapshot: metrics_snapshot_cassandra + artifact_key: metrics_snapshot_cassandras_${{ matrix.version.major }}_${{ matrix.version.schema }}_${{ matrix.jaeger-version }}_${{ matrix.create-schema }} - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov with: files: cover.out flags: cassandra-${{ matrix.version.major }}-${{ matrix.jaeger-version }}-${{ matrix.create-schema }} + + diff --git a/.github/workflows/ci-e2e-elasticsearch.yml b/.github/workflows/ci-e2e-elasticsearch.yml index d6d90ed55f3..2510026cb4b 100644 --- a/.github/workflows/ci-e2e-elasticsearch.yml +++ b/.github/workflows/ci-e2e-elasticsearch.yml @@ -40,11 +40,7 @@ jobs: - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 with: submodules: true - - - name: Fetch git tags - run: | - git fetch --prune --unshallow --tags - + - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 with: go-version: 1.23.x @@ -58,6 +54,12 @@ jobs: - name: Run ${{ matrix.version.distribution }} integration tests id: test-execution run: bash scripts/e2e/elasticsearch.sh ${{ matrix.version.distribution }} ${{ matrix.version.major }} ${{ matrix.version.jaeger }} + + - uses: ./.github/actions/verify-metrics-snapshot + if: matrix.version.jaeger == 'v2' + with: + snapshot: metrics_snapshot_elasticsearch + artifact_key: metrics_snapshot_elasticsearch_${{ matrix.version.major }}_${{ matrix.version.jaeger}} - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov diff --git a/.github/workflows/ci-e2e-grpc.yml b/.github/workflows/ci-e2e-grpc.yml index 4126c10b352..bea28085eec 100644 --- a/.github/workflows/ci-e2e-grpc.yml +++ b/.github/workflows/ci-e2e-grpc.yml @@ -41,6 
+41,12 @@ jobs: ;; esac + - uses: ./.github/actions/verify-metrics-snapshot + if: matrix.version == 'v2' + with: + snapshot: metrics_snapshot_grpc + artifact_key: metrics_snapshot_grpc_${{ matrix.version }} + - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov with: diff --git a/.github/workflows/ci-e2e-kafka.yml b/.github/workflows/ci-e2e-kafka.yml index c6b3750b420..b7f1c796ddd 100644 --- a/.github/workflows/ci-e2e-kafka.yml +++ b/.github/workflows/ci-e2e-kafka.yml @@ -35,6 +35,12 @@ jobs: - name: Run kafka integration tests id: test-execution run: bash scripts/e2e/kafka.sh -j ${{ matrix.jaeger-version }} -v ${{ matrix.kafka-version }} + + - uses: ./.github/actions/verify-metrics-snapshot + if: matrix.jaeger-version == 'v2' + with: + snapshot: metrics_snapshot_kafka + artifact_key: metrics_snapshot_kafka_${{ matrix.jaeger-version }} - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov diff --git a/.github/workflows/ci-e2e-memory.yaml b/.github/workflows/ci-e2e-memory.yaml index c8dd9123c8e..d615487f532 100644 --- a/.github/workflows/ci-e2e-memory.yaml +++ b/.github/workflows/ci-e2e-memory.yaml @@ -21,7 +21,7 @@ jobs: egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs - uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - + - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 with: go-version: 1.23.x @@ -29,7 +29,12 @@ jobs: - name: Run Memory storage integration tests run: | STORAGE=memory_v2 make jaeger-v2-storage-integration-test - + + - uses: ./.github/actions/verify-metrics-snapshot + with: + snapshot: metrics_snapshot_memory + artifact_key: metrics_snapshot_memory + - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov with: diff --git a/.github/workflows/ci-e2e-opensearch.yml b/.github/workflows/ci-e2e-opensearch.yml index c4d79a3c2ad..1e1405a604e 100644 --- a/.github/workflows/ci-e2e-opensearch.yml +++ 
b/.github/workflows/ci-e2e-opensearch.yml @@ -38,10 +38,6 @@ jobs: with: submodules: true - - name: Fetch git tags - run: | - git fetch --prune --unshallow --tags - - uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 with: go-version: 1.23.x @@ -52,6 +48,12 @@ jobs: id: test-execution run: bash scripts/e2e/elasticsearch.sh ${{ matrix.version.distribution }} ${{ matrix.version.major }} ${{ matrix.version.jaeger }} + - uses: ./.github/actions/verify-metrics-snapshot + if: matrix.version.jaeger == 'v2' + with: + snapshot: metrics_snapshot_opensearch + artifact_key: metrics_snapshot_opensearch_${{ matrix.version.major }} + - name: Upload coverage to codecov uses: ./.github/actions/upload-codecov with: diff --git a/.gitignore b/.gitignore index e8719d8386d..95bc230a2d3 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ sha256sum.combined.txt resource.syso .gocache test-results.json +.metrics/ \ No newline at end of file diff --git a/cmd/jaeger/internal/integration/e2e_integration.go b/cmd/jaeger/internal/integration/e2e_integration.go index 5882d1f42e0..a86bbac6522 100644 --- a/cmd/jaeger/internal/integration/e2e_integration.go +++ b/cmd/jaeger/internal/integration/e2e_integration.go @@ -99,11 +99,32 @@ func (s *E2EStorageIntegration) e2eInitialize(t *testing.T, storage string) { require.NoError(t, err) t.Cleanup(func() { + scrapeMetrics(t, storage) require.NoError(t, s.TraceReader.(io.Closer).Close()) require.NoError(t, s.TraceWriter.(io.Closer).Close()) }) } +func scrapeMetrics(t *testing.T, storage string) { + req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, "http://localhost:8888/metrics", nil) + require.NoError(t, err) + + client := &http.Client{} + resp, err := client.Do(req) + require.NoError(t, err) + defer resp.Body.Close() + + outputDir := "../../../../.metrics" + require.NoError(t, os.MkdirAll(outputDir, os.ModePerm)) + + metricsFile, err := os.Create(fmt.Sprintf("%s/metrics_snapshot_%v.txt", 
outputDir, storage)) + require.NoError(t, err) + defer metricsFile.Close() + + _, err = io.Copy(metricsFile, resp.Body) + require.NoError(t, err) +} + func createStorageCleanerConfig(t *testing.T, configFile string, storage string) string { data, err := os.ReadFile(configFile) require.NoError(t, err) diff --git a/scripts/e2e/compare_metrics.py b/scripts/e2e/compare_metrics.py new file mode 100644 index 00000000000..c30edb4114f --- /dev/null +++ b/scripts/e2e/compare_metrics.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024 The Jaeger Authors. +# SPDX-License-Identifier: Apache-2.0 + +import argparse +from difflib import unified_diff +from bisect import insort +from prometheus_client.parser import text_string_to_metric_families + +def read_metric_file(file_path): + with open(file_path, 'r') as f: + return f.readlines() + +def parse_metrics(content): + metrics = [] + for family in text_string_to_metric_families(content): + for sample in family.samples: + labels = dict(sample.labels) + #simply pop undesirable metric labels + labels.pop('service_instance_id',None) + label_pairs = sorted(labels.items(), key=lambda x: x[0]) + label_str = ','.join(f'{k}="{v}"' for k,v in label_pairs) + metric = f"{family.name}{{{label_str}}}" + insort(metrics , metric) + + return metrics + + +def generate_diff(file1_content, file2_content): + if isinstance(file1_content, list): + file1_content = ''.join(file1_content) + if isinstance(file2_content, list): + file2_content = ''.join(file2_content) + + metrics1 = parse_metrics(file1_content) + metrics2 = parse_metrics(file2_content) + + diff = unified_diff(metrics1, metrics2,lineterm='',n=0) + + return '\n'.join(diff) + +def write_diff_file(diff_lines, output_path): + + with open(output_path, 'w') as f: + f.write(diff_lines) + f.write('\n') # Add final newline + print(f"Diff file successfully written to: {output_path}") + +def main(): + parser = argparse.ArgumentParser(description='Generate diff between two Jaeger metric files') + 
parser.add_argument('--file1', required=True, help='Path to first metric file') + parser.add_argument('--file2', required=True, help='Path to second metric file') + parser.add_argument('--output', '-o', default='metrics_diff.txt', + help='Output diff file path (default: metrics_diff.txt)') + + args = parser.parse_args() + + # Read input files + file1_lines = read_metric_file(args.file1) + file2_lines = read_metric_file(args.file2) + + # Generate diff + diff_lines = generate_diff(file1_lines, file2_lines) + + # A non-empty diff means the snapshots differ: report it, save it, and return 1 (used as the exit status) + if diff_lines: + print("differences found between the metric files.") + print("=== Metrics Comparison Results ===") + print(diff_lines) + write_diff_file(diff_lines, args.output) + + return 1 + + print("no difference found") + return 0 + +if __name__ == '__main__': + raise SystemExit(main()) \ No newline at end of file