From 2a0e7513a4c85cec230433dd0bd328426105b637 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Thu, 14 Nov 2024 09:58:34 +0000 Subject: [PATCH 1/2] roachtest: fix health and consistency checks Epic: none Release note: none --- pkg/cmd/roachtest/BUILD.bazel | 1 - pkg/cmd/roachtest/cluster.go | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/cmd/roachtest/BUILD.bazel b/pkg/cmd/roachtest/BUILD.bazel index daa466d53926..f540e958285c 100644 --- a/pkg/cmd/roachtest/BUILD.bazel +++ b/pkg/cmd/roachtest/BUILD.bazel @@ -50,7 +50,6 @@ go_library( "//pkg/testutils/skip", "//pkg/util/allstacks", "//pkg/util/ctxgroup", - "//pkg/util/httputil", "//pkg/util/leaktest", "//pkg/util/quotapool", "//pkg/util/randutil", diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 264327488673..acc480ce719c 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -47,7 +47,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce" - "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" @@ -1546,9 +1545,10 @@ func (c *clusterImpl) HealthStatus( if err != nil { return nil, errors.WithDetail(err, "Unable to get admin UI address(es)") } + client := roachtestutil.DefaultHTTPClient(c, l) getStatus := func(ctx context.Context, node int) *HealthStatusResult { url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1]) - resp, err := httputil.Get(ctx, url) + resp, err := client.Get(ctx, url) if err != nil { return newHealthStatusResult(node, 0, nil, err) } From c6433cd3dbadb919a1867214b7ca76cbf7b7c915 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Thu, 14 Nov 2024 11:24:08 +0000 Subject: [PATCH 2/2] roachtest: fix consistency check connection tenant Epic: none Release note: none --- pkg/cmd/roachtest/test_runner.go | 44 ++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 1c3217e4e407..861dada4e6d5 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -7,7 +7,6 @@ package main import ( "context" - gosql "database/sql" "encoding/json" "fmt" "html" @@ -1458,8 +1457,7 @@ func (r *testRunner) postTestAssertions( postAssertionErr(errors.WithDetail(err, "Unable to check health status")) } - var db *gosql.DB - var validationNode int + validationNode := 0 for _, s := range statuses { if s.Err != nil { t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err) @@ -1471,9 +1469,8 @@ func (r *testRunner) postTestAssertions( continue } - if db == nil { - db = c.Conn(ctx, t.L(), s.Node) - validationNode = s.Node + if validationNode == 0 { + validationNode = s.Node // NB: s.Node is never zero } t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node) } @@ -1486,25 +1483,34 @@ func (r *testRunner) postTestAssertions( // // TODO(testinfra): figure out why this can still get stuck despite the // above. - if db != nil { - defer db.Close() - t.L().Printf("running validation checks on node %d (<10m)", validationNode) - // If this validation fails due to a timeout, it is very likely that - // the replica divergence check below will also fail. - if t.spec.SkipPostValidations®istry.PostValidationInvalidDescriptors == 0 { + if validationNode == 0 { + t.L().Printf("no live node found, skipping validation checks") + return + } + + t.L().Printf("running validation checks on node %d (<10m)", validationNode) + // If this validation fails due to a timeout, it is very likely that + // the replica divergence check below will also fail. + if t.spec.SkipPostValidations®istry.PostValidationInvalidDescriptors == 0 { + func() { + db := c.Conn(ctx, t.L(), validationNode) + defer db.Close() if err := roachtestutil.CheckInvalidDescriptors(ctx, db); err != nil { postAssertionErr(errors.WithDetail(err, "invalid descriptors check failed")) } - } - // Detect replica divergence (i.e. ranges in which replicas have arrived - // at the same log position with different states). - if t.spec.SkipPostValidations®istry.PostValidationReplicaDivergence == 0 { + }() + } + // Detect replica divergence (i.e. ranges in which replicas have arrived + // at the same log position with different states). + if t.spec.SkipPostValidations®istry.PostValidationReplicaDivergence == 0 { + func() { + // NB: the consistency checks should run at the system tenant level. + db := c.Conn(ctx, t.L(), validationNode, option.VirtualClusterName("system")) + defer db.Close() if err := c.assertConsistentReplicas(ctx, db, t); err != nil { postAssertionErr(errors.WithDetail(err, "consistency check failed")) } - } - } else { - t.L().Printf("no live node found, skipping validation checks") + }() } })