Skip to content

Commit

Permalink
Index rebuilds with external key sorting (#7754)
Browse files Browse the repository at this point in the history
* starter for external sorting index rebuilds

* fixes

* don't flush in progress

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* formatting

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* testing edits

* add tests

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* edits

* fix string compare bug

* aaron comments

* standardize error passing

* bit more cleanup

* use io.ReadFull rather than retrying buf.Read

* Update go/store/prolly/sort/external.go

Co-authored-by: Aaron Son <[email protected]>

* aaron comments

* test windows tmpdir edit

* test windows tmpdir edit2

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

---------

Co-authored-by: max-hoffman <[email protected]>
Co-authored-by: Aaron Son <[email protected]>
  • Loading branch information
3 people authored May 2, 2024
1 parent 76da4e5 commit 9ec3ce2
Show file tree
Hide file tree
Showing 5 changed files with 1,138 additions and 51 deletions.
4 changes: 2 additions & 2 deletions go/libraries/doltcore/schema/collation_comparator.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ func compareCollatedStrings(collation sql.CollationID, left, right []byte) int {
}

li := i
for ; li >= 0 && !utf8.RuneStart(left[li]); li-- {
for ; li > 0 && !utf8.RuneStart(left[li]); li-- {
}
left = left[li:]

ri := i
for ; ri >= 0 && !utf8.RuneStart(right[ri]); ri-- {
for ; ri > 0 && !utf8.RuneStart(right[ri]); ri-- {
}
right = right[ri:]

Expand Down
142 changes: 142 additions & 0 deletions go/libraries/doltcore/table/editor/creation/external_build_index.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package creation

import (
"errors"
"io"

"github.com/dolthub/go-mysql-server/sql"

"github.com/dolthub/dolt/go/libraries/doltcore/doltdb/durable"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
"github.com/dolthub/dolt/go/libraries/doltcore/sqle/index"
"github.com/dolthub/dolt/go/store/prolly"
"github.com/dolthub/dolt/go/store/prolly/sort"
"github.com/dolthub/dolt/go/store/prolly/tree"
"github.com/dolthub/dolt/go/store/types"
"github.com/dolthub/dolt/go/store/util/tempfiles"
"github.com/dolthub/dolt/go/store/val"
)

// Settings handed to sort.NewTupleSorter by BuildProllyIndexExternal.
const (
	// batchSize is the sorter's first argument (32MB).
	// NOTE(review): presumably the in-memory budget for one sorted batch
	// before it is spilled — confirm against sort.NewTupleSorter.
	batchSize = 32 << 20

	// fileMax is the sorter's second argument.
	// NOTE(review): presumably a cap on the number of intermediate files
	// before they are merged — confirm against sort.NewTupleSorter.
	fileMax = 1 << 7
)

// BuildProllyIndexExternal builds unique and non-unique indexes with a
// single prolly tree materialization by presorting the index keys in an
// intermediate file format.
//
// Every row of |primary| is converted to a secondary index key and inserted
// into a sort.NewTupleSorter whose comparison orders keys by their first
// idx.Count() fields. After the sorter is flushed, the keys are read back in
// sorted order and written into an initially empty index map.
//
// |uniqCb| selects unique-index handling. When it is non-nil, keys whose
// prefix contains a NULL are skipped, and the callback is invoked with each
// pair of adjacent sorted keys whose prefixes compare equal; a non-nil error
// from the callback aborts the build and is returned unchanged. When it is
// nil no uniqueness checks are made.
//
// Returns the populated index, or the first error encountered.
func BuildProllyIndexExternal(
	ctx *sql.Context,
	vrw types.ValueReadWriter,
	ns tree.NodeStore,
	sch schema.Schema,
	tableName string,
	idx schema.Index,
	primary prolly.Map,
	uniqCb DupEntryCb,
) (durable.Index, error) {
	empty, err := durable.NewEmptyIndex(ctx, vrw, ns, idx.Schema())
	if err != nil {
		return nil, err
	}
	secondary := durable.ProllyMapFromIndex(empty)
	if schema.IsKeyless(sch) {
		secondary = prolly.ConvertToSecondaryKeylessIndex(secondary)
	}

	iter, err := primary.IterAll(ctx)
	if err != nil {
		return nil, err
	}
	p := primary.Pool()

	// Sorting and duplicate detection both look only at the indexed columns,
	// i.e. the first idx.Count() fields of the secondary key.
	prefixDesc := secondary.KeyDesc().PrefixDesc(idx.Count())
	secondaryBld, err := index.NewSecondaryKeyBuilder(ctx, tableName, sch, idx, secondary.KeyDesc(), p, secondary.NodeStore())
	if err != nil {
		return nil, err
	}

	sorter := sort.NewTupleSorter(batchSize, fileMax, func(t1, t2 val.Tuple) bool {
		return prefixDesc.Compare(t1, t2) < 0
	}, tempfiles.MovableTempFileProvider)
	defer sorter.Close()

	// Phase 1: derive an index key from every primary row and feed it to the
	// sorter.
	for {
		k, v, err := iter.Next(ctx)
		// Use errors.Is, as the read-back loop below does, so a wrapped EOF
		// also ends iteration instead of being reported as a failure.
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, err
		}

		idxKey, err := secondaryBld.SecondaryKeyFromRow(ctx, k, v)
		if err != nil {
			return nil, err
		}

		// NOTE(review): for unique builds, keys with a NULL in the prefix are
		// dropped before sorting, so they are never written to the index
		// either — confirm this matches how unique indexes are maintained
		// elsewhere.
		if uniqCb != nil && prefixDesc.HasNulls(idxKey) {
			continue
		}

		if err := sorter.Insert(ctx, idxKey); err != nil {
			return nil, err
		}
	}

	sortedKeys, err := sorter.Flush(ctx)
	if err != nil {
		return nil, err
	}
	defer sortedKeys.Close()

	mut := secondary.Mutate()
	it, err := sortedKeys.IterAll(ctx)
	if err != nil {
		return nil, err
	}
	defer it.Close()

	// Phase 2: write the keys in sorted order. Keys with equal prefixes are
	// adjacent after sorting, so comparing each key with its predecessor is
	// enough to detect duplicates.
	var lastKey val.Tuple
	for {
		key, err := it.Next(ctx)
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			return nil, err
		}
		// The comparison is only needed when uniqueness is being enforced.
		if uniqCb != nil && lastKey != nil && prefixDesc.Compare(lastKey, key) == 0 {
			// register a constraint violation if |key| collides with |lastKey|
			if err := uniqCb(ctx, lastKey, key); err != nil {
				return nil, err
			}
		}
		if err = mut.Put(ctx, key, val.EmptyTuple); err != nil {
			return nil, err
		}
		lastKey = key
	}

	secondary, err = mut.Map(ctx)
	if err != nil {
		return nil, err
	}

	return durable.IndexFromProllyMap(secondary), nil
}
52 changes: 3 additions & 49 deletions go/libraries/doltcore/table/editor/creation/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,61 +168,15 @@ func BuildSecondaryProllyIndex(
idx schema.Index,
primary prolly.Map,
) (durable.Index, error) {
var uniqCb DupEntryCb
if idx.IsUnique() {
kd := idx.Schema().GetKeyDescriptor()
return BuildUniqueProllyIndex(ctx, vrw, ns, sch, tableName, idx, primary, func(ctx context.Context, existingKey, newKey val.Tuple) error {
uniqCb = func(ctx context.Context, existingKey, newKey val.Tuple) error {
msg := FormatKeyForUniqKeyErr(newKey, kd)
return sql.NewUniqueKeyErr(msg, false, nil)
})
}

empty, err := durable.NewEmptyIndex(ctx, vrw, ns, idx.Schema())
if err != nil {
return nil, err
}
if idx.IsFullText() {
return empty, nil
}
secondary := durable.ProllyMapFromIndex(empty)
if schema.IsKeyless(sch) {
secondary = prolly.ConvertToSecondaryKeylessIndex(secondary)
}

p := primary.Pool()
mut := secondary.Mutate()
secondaryBld, err := index.NewSecondaryKeyBuilder(ctx, tableName, sch, idx, secondary.KeyDesc(), p, secondary.NodeStore())
if err != nil {
return nil, err
}

iter, err := primary.IterAll(ctx)
if err != nil {
return nil, err
}

for {
var k, v val.Tuple
k, v, err = iter.Next(ctx)
if err == io.EOF {
break
} else if err != nil {
return nil, err
}

idxKey, err := secondaryBld.SecondaryKeyFromRow(ctx, k, v)
if err != nil {
return nil, err
}
if err = mut.Put(ctx, idxKey, val.EmptyTuple); err != nil {
return nil, err
}
}

secondary, err = mut.Map(ctx)
if err != nil {
return nil, err
}
return durable.IndexFromProllyMap(secondary), nil
return BuildProllyIndexExternal(ctx, vrw, ns, sch, tableName, idx, primary, uniqCb)
}

// FormatKeyForUniqKeyErr formats the given tuple |key| using |d|. The resulting
Expand Down
Loading

0 comments on commit 9ec3ce2

Please sign in to comment.