Skip to content

Commit

Permalink
Vitess: ignore unhealthy replicas with realtime stats (#136)
Browse files Browse the repository at this point in the history
* Ignore Vitess replicas without running replication

* Fix typo

* Method rename

* Add to comment

* Copy logic vtgate uses to filter tablets, minus lag+tablet count

* Only check if &TabletRealtimeStats{} is nil, not the HealthError

* Add test for nil realtime stats

* Do not ignore 'serving: false' tablets

* Improve test comments
  • Loading branch information
timvaillancourt authored Mar 18, 2021
1 parent 67e708a commit 8d8b3f5
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 19 deletions.
30 changes: 29 additions & 1 deletion pkg/vitess/api_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,25 @@ import (

// defaultTimeout is a fixed duration of five seconds.
const defaultTimeout = 5 * time.Second

// TabletRealtimeStats holds the realtime portion of the stats reported for a
// running vttablet instance.
type TabletRealtimeStats struct {
	// HealthError is the health error text carried in the realtime stats. It is
	// omitted from the JSON encoding when empty.
	HealthError string `json:"health_error,omitempty"`
}

// TabletStats holds the stats reported for a running vttablet instance. Every
// field is optional in the JSON encoding.
type TabletStats struct {
	// LastError is the most recent error text; Tablet.IsServeable treats any
	// non-empty value as disqualifying.
	LastError string `json:"last_error,omitempty"`

	// Realtime points at the realtime stats, or is nil when none were
	// provided; Tablet.IsServeable treats nil as disqualifying.
	Realtime *TabletRealtimeStats `json:"realtime,omitempty"`

	// Serving is decoded from the "serving" key. It is not consulted by
	// Tablet.IsServeable.
	Serving bool `json:"serving,omitempty"`

	// Up is decoded from the "up" key.
	// NOTE(review): presumably whether the tablet is reachable — confirm against the Vitess API.
	Up bool `json:"up,omitempty"`
}

// Tablet describes a running vttablet instance as decoded from JSON. Every
// field is optional in the JSON encoding.
type Tablet struct {
	// Alias is the topodata alias of the tablet (it carries the cell name).
	Alias *topodata.TabletAlias `json:"alias,omitempty"`

	// MysqlHostname is the hostname of the tablet's MySQL server.
	MysqlHostname string `json:"mysql_hostname,omitempty"`

	// MysqlPort is the port of the tablet's MySQL server.
	MysqlPort int32 `json:"mysql_port,omitempty"`

	// Stats holds the tablet's stats, or is nil when none were provided.
	Stats *TabletStats `json:"stats,omitempty"`

	// Type is the topodata tablet type (for example REPLICA or MASTER).
	Type topodata.TabletType `json:"type,omitempty"`
}

Expand All @@ -36,9 +50,23 @@ func (t Tablet) HasValidCell(validCells []string) bool {
return false
}

// IsServeable reports whether the tablet's stats allow it to be treated as
// eligible to serve traffic.
//
// A tablet with no stats at all is considered serveable; this keeps setups
// working where realtime stats are disabled (backwards compatibility). When
// stats are present, the tablet is serveable only if LastError is empty and the
// Realtime stats pointer is non-nil. The intent is to approximate how vtgate
// picks tablets for read queries, while deliberately leaving out the 'serving'
// flag, the minimum tablet count (not important to freno) and replication lag
// (freno polls its own replication lag).
func (t Tablet) IsServeable() bool {
	stats := t.Stats
	// No stats reported: assume healthy.
	if stats == nil {
		return true
	}
	// Any recorded error disqualifies the tablet.
	if stats.LastError != "" {
		return false
	}
	// Only the presence of realtime stats is checked, not their HealthError.
	return stats.Realtime != nil
}

// IsValidReplica reports whether the tablet is of type REPLICA and is also
// considered serveable according to IsServeable.
func (t Tablet) IsValidReplica() bool {
	// Tablets of any other type are never valid replicas, whatever their stats say.
	if t.Type != topodata.TabletType_REPLICA {
		return false
	}
	// A REPLICA only counts when its stats do not mark it as unhealthy.
	return t.IsServeable()
}

var httpClient = http.Client{
Expand Down
59 changes: 41 additions & 18 deletions pkg/vitess/api_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package vitess

import (
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"testing"
Expand All @@ -16,44 +15,68 @@ func TestParseTablets(t *testing.T) {
vitessApi := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.URL.String() {
case "/api/keyspace/test/tablets/00", "/api/keyspace/test/tablets/00?cells=cell2":
data, _ := json.Marshal([]Tablet{
json.NewEncoder(w).Encode([]Tablet{
{
Alias: &topodata.TabletAlias{Cell: "cell1"},
MysqlHostname: "master",
Type: topodata.TabletType_MASTER,
// primary (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell1"},
Type: topodata.TabletType_MASTER,
},
{
// replica without realtime tablet stats enabled (assumed to be healthy)
Alias: &topodata.TabletAlias{Cell: "cell2"},
MysqlHostname: "replica1",
Type: topodata.TabletType_REPLICA,
},
{
// replica with healthy realtime tablet stats
Alias: &topodata.TabletAlias{Cell: "cell3"},
MysqlHostname: "replica2",
Type: topodata.TabletType_REPLICA,
Stats: &TabletStats{
Realtime: &TabletRealtimeStats{},
},
Type: topodata.TabletType_REPLICA,
},
{
// replica with nil realtime stats (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell1"},
MysqlHostname: "replica3",
Stats: &TabletStats{
Realtime: nil,
},
},
{
// replica with realtime tablet stats and 'replication not running' error (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell2"},
MysqlHostname: "spare",
Type: topodata.TabletType_SPARE,
MysqlHostname: "replica4",
Stats: &TabletStats{
LastError: "vttablet error: replication is not running",
Realtime: &TabletRealtimeStats{
HealthError: "replication is not running",
},
},
Type: topodata.TabletType_REPLICA,
},
{
Alias: &topodata.TabletAlias{Cell: "cell3"},
MysqlHostname: "batch",
Type: topodata.TabletType_BATCH,
// spare tablet (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell2"},
Type: topodata.TabletType_SPARE,
},
{
Alias: &topodata.TabletAlias{Cell: "cell2"},
MysqlHostname: "backup",
Type: topodata.TabletType_BACKUP,
// batch tablet (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell3"},
Type: topodata.TabletType_BATCH,
},
{
Alias: &topodata.TabletAlias{Cell: "cell1"},
MysqlHostname: "restore",
Type: topodata.TabletType_RESTORE,
// backup tablet (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell2"},
Type: topodata.TabletType_BACKUP,
},
{
// restore tablet (should be ignored)
Alias: &topodata.TabletAlias{Cell: "cell1"},
Type: topodata.TabletType_RESTORE,
},
})
fmt.Fprint(w, string(data))
default:
w.WriteHeader(http.StatusNotFound)
}
Expand Down

0 comments on commit 8d8b3f5

Please sign in to comment.