Skip to content

Commit

Permalink
Merge pull request #135286 from pav-kv/backport24.3-135156
Browse files: browse the repository at this point in the history
release-24.3: roachtest: fix health and consistency checks
  • Loading branch information
pav-kv authored Nov 15, 2024
2 parents 4aff47f + c6433cd commit bd78b4e
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 22 deletions.
1 change: 0 additions & 1 deletion pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ go_library(
"//pkg/testutils/skip",
"//pkg/util/allstacks",
"//pkg/util/ctxgroup",
- "//pkg/util/httputil",
"//pkg/util/leaktest",
"//pkg/util/quotapool",
"//pkg/util/randutil",
Expand Down
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
- "github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -1546,9 +1545,10 @@ func (c *clusterImpl) HealthStatus(
if err != nil {
return nil, errors.WithDetail(err, "Unable to get admin UI address(es)")
}
+ client := roachtestutil.DefaultHTTPClient(c, l)
getStatus := func(ctx context.Context, node int) *HealthStatusResult {
url := fmt.Sprintf(`https://%s/health?ready=1`, adminAddrs[node-1])
- resp, err := httputil.Get(ctx, url)
+ resp, err := client.Get(ctx, url)
if err != nil {
return newHealthStatusResult(node, 0, nil, err)
}
Expand Down
44 changes: 25 additions & 19 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ package main

import (
"context"
- gosql "database/sql"
"encoding/json"
"fmt"
"html"
Expand Down Expand Up @@ -1458,8 +1457,7 @@ func (r *testRunner) postTestAssertions(
postAssertionErr(errors.WithDetail(err, "Unable to check health status"))
}

- var db *gosql.DB
- var validationNode int
+ validationNode := 0
for _, s := range statuses {
if s.Err != nil {
t.L().Printf("n%d:/health?ready=1 error=%s", s.Node, s.Err)
Expand All @@ -1471,9 +1469,8 @@ func (r *testRunner) postTestAssertions(
continue
}

- if db == nil {
- db = c.Conn(ctx, t.L(), s.Node)
- validationNode = s.Node
+ if validationNode == 0 {
+ validationNode = s.Node // NB: s.Node is never zero
}
t.L().Printf("n%d:/health?ready=1 status=200 ok", s.Node)
}
Expand All @@ -1486,25 +1483,34 @@ func (r *testRunner) postTestAssertions(
//
// TODO(testinfra): figure out why this can still get stuck despite the
// above.
- if db != nil {
- defer db.Close()
- t.L().Printf("running validation checks on node %d (<10m)", validationNode)
- // If this validation fails due to a timeout, it is very likely that
- // the replica divergence check below will also fail.
- if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
+ if validationNode == 0 {
+ t.L().Printf("no live node found, skipping validation checks")
+ return
+ }
+
+ t.L().Printf("running validation checks on node %d (<10m)", validationNode)
+ // If this validation fails due to a timeout, it is very likely that
+ // the replica divergence check below will also fail.
+ if t.spec.SkipPostValidations&registry.PostValidationInvalidDescriptors == 0 {
+ func() {
+ db := c.Conn(ctx, t.L(), validationNode)
+ defer db.Close()
if err := roachtestutil.CheckInvalidDescriptors(ctx, db); err != nil {
postAssertionErr(errors.WithDetail(err, "invalid descriptors check failed"))
}
- }
- // Detect replica divergence (i.e. ranges in which replicas have arrived
- // at the same log position with different states).
- if t.spec.SkipPostValidations&registry.PostValidationReplicaDivergence == 0 {
+ }()
+ }
+ // Detect replica divergence (i.e. ranges in which replicas have arrived
+ // at the same log position with different states).
+ if t.spec.SkipPostValidations&registry.PostValidationReplicaDivergence == 0 {
+ func() {
+ // NB: the consistency checks should run at the system tenant level.
+ db := c.Conn(ctx, t.L(), validationNode, option.VirtualClusterName("system"))
+ defer db.Close()
if err := c.assertConsistentReplicas(ctx, db, t); err != nil {
postAssertionErr(errors.WithDetail(err, "consistency check failed"))
}
- }
- } else {
- t.L().Printf("no live node found, skipping validation checks")
+ }()
}
})

Expand Down

0 comments on commit bd78b4e

Please sign in to comment.