Skip to content

Commit

Permalink
[statspro] Bootstrap database statistics once on startup (#8036)
Browse files Browse the repository at this point in the history
* [statspro] Bootstrap database statistics once on startup

* fix some bats

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* accommodate missing default branch bugs

* bootstrap ignores db not found error

* load stats for read replica db

* disable bootstrap stats from query plan tests

* abort bootstrap for >2mm rows

* bad merge

* zach comment

---------

Co-authored-by: max-hoffman <[email protected]>
  • Loading branch information
max-hoffman and max-hoffman authored Jun 24, 2024
1 parent 7059d8a commit c5c05b7
Show file tree
Hide file tree
Showing 10 changed files with 151 additions and 18 deletions.
2 changes: 2 additions & 0 deletions go/libraries/doltcore/env/actions/branch.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ func RenameBranch(ctx context.Context, dbData env.DbData, oldBranch, newBranch s
}
}

// todo: update default branch variable

return DeleteBranch(ctx, dbData, oldBranch, DeleteOptions{Force: true, AllowDeletingCurrentBranch: true}, remoteDbPro, rsc)
}

Expand Down
1 change: 1 addition & 0 deletions go/libraries/doltcore/sqle/dsess/variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ const (
DoltClusterAckWritesTimeoutSecs = "dolt_cluster_ack_writes_timeout_secs"

DoltStatsAutoRefreshEnabled = "dolt_stats_auto_refresh_enabled"
DoltStatsBootstrapEnabled = "dolt_stats_bootstrap_enabled"
DoltStatsAutoRefreshThreshold = "dolt_stats_auto_refresh_threshold"
DoltStatsAutoRefreshInterval = "dolt_stats_auto_refresh_interval"
DoltStatsMemoryOnly = "dolt_stats_memory_only"
Expand Down
1 change: 1 addition & 0 deletions go/libraries/doltcore/sqle/enginetest/dolt_engine_tests.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ func RunQueryTestPlans(t *testing.T, harness DoltEnginetestHarness) {
}

defer harness.Close()
sql.SystemVariables.SetGlobal(dsess.DoltStatsBootstrapEnabled, 0)
enginetest.TestQueryPlans(t, harness, queries.PlanTests)
}

Expand Down
5 changes: 2 additions & 3 deletions go/libraries/doltcore/sqle/statsnoms/iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (

"github.com/dolthub/go-mysql-server/sql"
"github.com/dolthub/go-mysql-server/sql/planbuilder"
"github.com/dolthub/go-mysql-server/sql/stats"
"gopkg.in/errgo.v2/errors"

"github.com/dolthub/dolt/go/libraries/doltcore/schema"
Expand Down Expand Up @@ -114,15 +113,15 @@ func (s *statsIter) Next(ctx *sql.Context) (sql.Row, error) {
upperBoundCnt := row[schema.StatsUpperBoundCntTag].(int64)
createdAt := row[schema.StatsCreatedAtTag].(time.Time)

typs := strings.Split(typesStr, ",")
typs := strings.Split(typesStr, "\n")
for i, t := range typs {
typs[i] = strings.TrimSpace(t)
}

qual := sql.NewStatQualifier(dbName, tableName, indexName)
if curQual := qual.String(); !strings.EqualFold(curQual, s.currentQual) {
s.currentQual = curQual
s.currentTypes, err = stats.ParseTypeStrings(typs)
s.currentTypes, err = parseTypeStrings(typs)
if err != nil {
return nil, err
}
Expand Down
21 changes: 17 additions & 4 deletions go/libraries/doltcore/sqle/statsnoms/load.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

"github.com/dolthub/go-mysql-server/sql"
"github.com/dolthub/go-mysql-server/sql/planbuilder"
"github.com/dolthub/go-mysql-server/sql/stats"

"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
Expand Down Expand Up @@ -68,7 +69,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
upperBoundCnt := row[schema.StatsUpperBoundCntTag].(uint64)
createdAt := row[schema.StatsCreatedAtTag].(time.Time)

typs := strings.Split(typesStr, ",")
typs := strings.Split(typesStr, "\n")
for i, t := range typs {
typs[i] = strings.TrimSpace(t)
}
Expand All @@ -90,7 +91,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St

mcvs := make([]sql.Row, numMcvs)
for i, v := range row[schema.StatsMcv1Tag:schema.StatsMcvCountsTag] {
if v != nil {
if v != nil && v != "" {
row, err := iter.ParseRow(v.(string))
if err != nil {
return nil, err
Expand Down Expand Up @@ -136,7 +137,7 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
}

if currentStat.Statistic.Hist == nil {
currentStat.Statistic.Typs, err = stats.ParseTypeStrings(typs)
currentStat.Statistic.Typs, err = parseTypeStrings(typs)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -180,6 +181,18 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
return qualToStats, nil
}

func parseTypeStrings(typs []string) ([]sql.Type, error) {
var ret []sql.Type
for _, typ := range typs {
ct, err := planbuilder.ParseColumnTypeString(typ)
if err != nil {
return nil, err
}
ret = append(ret, ct)
}
return ret, nil
}

func loadLowerBound(ctx *sql.Context, qual sql.StatQualifier) (sql.Row, error) {
dSess := dsess.DSessFromSess(ctx.Session)
roots, ok := dSess.GetRoots(ctx, qual.Db())
Expand Down Expand Up @@ -216,7 +229,7 @@ func loadLowerBound(ctx *sql.Context, qual sql.StatQualifier) (sql.Row, error) {
}

firstKey := keyBuilder.Build(buffPool)
var firstRow sql.Row
firstRow := make(sql.Row, keyBuilder.Desc.Count())
for i := 0; i < keyBuilder.Desc.Count(); i++ {
firstRow[i], err = tree.GetField(ctx, prollyMap.KeyDesc(), i, firstKey, prollyMap.NodeStore())
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion go/libraries/doltcore/sqle/statsnoms/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func putIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *stat
sep := ""
for _, t := range dStats.Statistic.Typs {
typesB.WriteString(sep + t.String())
sep = ","
sep = "\n"
}
typesStr := typesB.String()

Expand Down
61 changes: 60 additions & 1 deletion go/libraries/doltcore/sqle/statspro/analyze.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,62 @@ import (
"github.com/dolthub/dolt/go/store/prolly/tree"
)

const (
boostrapRowLimit = 2e6
)

func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db string) error {
dSess := dsess.DSessFromSess(ctx.Session)
branch, err := dSess.GetBranch()
if err != nil {
return err
}
return p.RefreshTableStatsWithBranch(ctx, table, db, branch)
}

func (p *Provider) BootstrapDatabaseStats(ctx *sql.Context, db string) error {
dSess := dsess.DSessFromSess(ctx.Session)
branches := p.getStatsBranches(ctx)
var rows uint64
for _, branch := range branches {
sqlDb, err := dSess.Provider().Database(ctx, p.branchQualifiedDatabase(db, branch))
if err != nil {
if sql.ErrDatabaseNotFound.Is(err) {
// default branch is not valid
continue
}
return err
}
tables, err := sqlDb.GetTableNames(ctx)
if err != nil {
return err
}
for _, table := range tables {
sqlTable, _, err := GetLatestTable(ctx, table, sqlDb)
if err != nil {
return err
}

if st, ok := sqlTable.(sql.StatisticsTable); ok {
cnt, ok, err := st.RowCount(ctx)
if ok && err == nil {
rows += cnt
}
}
if rows >= boostrapRowLimit {
return fmt.Errorf("stats bootstrap aborted because %s exceeds the default row limit; manually run \"ANALYZE <table>\" or \"call dolt_stats_restart()\" to collect statistics", db)
}

if err := p.RefreshTableStatsWithBranch(ctx, sqlTable, db, branch); err != nil {
return err
}
}
}
return nil
}

func (p *Provider) RefreshTableStatsWithBranch(ctx *sql.Context, table sql.Table, db string, branch string) error {
dSess := dsess.DSessFromSess(ctx.Session)

sqlDb, err := dSess.Provider().Database(ctx, p.branchQualifiedDatabase(db, branch))
if err != nil {
Expand Down Expand Up @@ -143,7 +193,16 @@ func (p *Provider) branchQualifiedDatabase(db, branch string) string {

// GetLatestTable will get the WORKING root table for the current database/branch
func GetLatestTable(ctx *sql.Context, tableName string, sqlDb sql.Database) (sql.Table, *doltdb.Table, error) {
sqlTable, ok, err := sqlDb.(sqle.Database).GetTableInsensitive(ctx, tableName)
var db sqle.Database
switch d := sqlDb.(type) {
case sqle.Database:
db = d
case sqle.ReadReplicaDatabase:
db = d.Database
default:
return nil, nil, fmt.Errorf("expected sqle.Database, found %T", sqlDb)
}
sqlTable, ok, err := db.GetTableInsensitive(ctx, tableName)
if err != nil {
return nil, nil, err
}
Expand Down
10 changes: 8 additions & 2 deletions go/libraries/doltcore/sqle/statspro/configure.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co
branches := p.getStatsBranches(loadCtx)

var autoEnabled bool
var startupEnabled bool
var intervalSec time.Duration
var thresholdf64 float64
if _, enabled, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsAutoRefreshEnabled); enabled == int8(1) {
Expand All @@ -55,6 +56,8 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co

p.pro.InitDatabaseHooks = append(p.pro.InitDatabaseHooks, NewStatsInitDatabaseHook(p, ctxFactory, bThreads))
p.pro.DropDatabaseHooks = append(p.pro.DropDatabaseHooks, NewStatsDropDatabaseHook(p))
} else if _, startupStats, _ := sql.SystemVariables.GetGlobal(dsess.DoltStatsBootstrapEnabled); startupStats == int8(1) {
startupEnabled = true
}

eg, ctx := loadCtx.NewErrgroup()
Expand All @@ -69,7 +72,6 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co
} else {
err = fmt.Errorf("%w: %v", ErrFailedToLoad, r)
}

return
}
}()
Expand All @@ -84,6 +86,10 @@ func (p *Provider) Configure(ctx context.Context, ctxFactory func(ctx context.Co
}
if autoEnabled {
return p.InitAutoRefreshWithParams(ctxFactory, db.Name(), bThreads, intervalSec, thresholdf64, branches)
} else if startupEnabled {
if err := p.BootstrapDatabaseStats(loadCtx, db.Name()); err != nil {
return err
}
}
return nil
})
Expand All @@ -109,7 +115,7 @@ func (p *Provider) getStatsBranches(ctx *sql.Context) []string {
}

if branches == nil {
branches = []string{p.pro.DefaultBranch()}
branches = append(branches, p.pro.DefaultBranch())
}
return branches
}
Expand Down
7 changes: 7 additions & 0 deletions go/libraries/doltcore/sqle/system_variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,13 @@ func AddDoltSystemVariables() {
Type: types.NewSystemBoolType(dsess.DoltStatsAutoRefreshEnabled),
Default: int8(0),
},
&sql.MysqlSystemVariable{
Name: dsess.DoltStatsBootstrapEnabled,
Dynamic: true,
Scope: sql.GetMysqlScope(sql.SystemVariableScope_Global),
Type: types.NewSystemBoolType(dsess.DoltStatsBootstrapEnabled),
Default: int8(1),
},
&sql.MysqlSystemVariable{
Name: dsess.DoltStatsMemoryOnly,
Dynamic: true,
Expand Down
59 changes: 52 additions & 7 deletions integration-tests/bats/stats.bats
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ teardown() {
@test "stats: empty initial stats" {
cd repo2

# disable bootstrap, can only make stats with ANALYZE or background thread
dolt sql -q "set @@PERSIST.dolt_stats_bootstrap_enabled = 0;"

dolt sql -q "insert into xy values (0,0), (1,1)"

start_sql_server
Expand Down Expand Up @@ -88,6 +91,16 @@ teardown() {
[ "${lines[1]}" = "8" ]
}

@test "stats: bootrap on engine startup" {
cd repo2

dolt sql -q "set @@PERSIST.dolt_stats_bootstrap_enabled = 1;"
dolt sql -q "insert into xy values (0,0), (1,1)"
run dolt sql -r csv -q "select count(*) from dolt_statistics"
[ "$status" -eq 0 ]
[ "${lines[1]}" = "2" ]
}

@test "stats: deletes refresh" {
cd repo2

Expand Down Expand Up @@ -219,20 +232,15 @@ teardown() {

@test "stats: multi db" {
cd repo1

dolt sql -q "insert into ab values (0,0), (1,1)"

cd ../repo2

dolt sql -q "insert into ab values (0,0), (1,1)"
dolt sql -q "insert into xy values (0,0), (1,1)"

cd ..
start_sql_server
sleep 1
stop_sql_server

run dolt sql -r csv -q "select count(*) from dolt_statistics"
[ "$status" -eq 0 ]
[ "${lines[1]}" = "0" ]

dolt sql -q "SET @@persist.dolt_stats_auto_refresh_enabled = 1;"
dolt sql -q "SET @@persist.dolt_stats_auto_refresh_threshold = 0.5"
Expand Down Expand Up @@ -327,3 +335,40 @@ SQL
[ "${lines[2]}" = "2" ]
}

@test "stats: boostrap abort over 1mm rows" {
cat <<EOF > data.py
import random
import os
rows = 2*1000*1000+1
def main():
f = open("data.csv","w+")
f.write("id,hostname\n")
for i in range(rows):
hostname = random.getrandbits(100)
f.write(f"{i},{hostname}\n")
if i % (500*1000) == 0:
print("row :", i)
f.flush()
f.close()
if __name__ == "__main__":
main()
EOF

mkdir repo3
cd repo3
python3 ../data.py

dolt init
dolt sql -q "create table f (id int primary key, hostname int)"
dolt table import -u --continue f data.csv

run dolt sql -r csv -q "select count(*) from dolt_statistics"
[ "$status" -eq 0 ]
[[ "${lines[0]}" =~ "stats bootstrap aborted" ]] || false
[ "${lines[2]}" = "0" ]
}

0 comments on commit c5c05b7

Please sign in to comment.