diff --git a/db.go b/db.go
index 2760f2ccd7e..cc5d77cc286 100644
--- a/db.go
+++ b/db.go
@@ -502,6 +502,14 @@ type DB struct {
 			// validating is set to true when validation is running.
 			validating bool
 		}
+
+		// annotators contains various instances of manifest.Annotator which
+		// should be protected from concurrent access.
+		annotators struct {
+			totalSizeAnnotator    *manifest.Annotator[uint64]
+			remoteSizeAnnotator   *manifest.Annotator[uint64]
+			externalSizeAnnotator *manifest.Annotator[uint64]
+		}
 	}
 
 	// Normally equal to time.Now() but may be overridden in tests.
@@ -2226,6 +2234,31 @@ func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) {
 	return destLevels, nil
 }
 
+// makeFileSizeAnnotator returns an annotator that computes the total size of
+// files that meet some criteria defined by filter.
+func (d *DB) makeFileSizeAnnotator(filter func(f *fileMetadata) bool) *manifest.Annotator[uint64] {
+	return &manifest.Annotator[uint64]{
+		Aggregator: manifest.SumAggregator{
+			AccumulateFunc: func(f *fileMetadata) (uint64, bool) {
+				if filter(f) {
+					return f.Size, true
+				}
+				return 0, true
+			},
+			AccumulatePartialOverlapFunc: func(f *fileMetadata, bounds base.UserKeyBounds) uint64 {
+				if filter(f) {
+					size, err := d.tableCache.estimateSize(f, bounds.Start, bounds.End.Key)
+					if err != nil {
+						return 0
+					}
+					return size
+				}
+				return 0
+			},
+		},
+	}
+}
+
 // EstimateDiskUsage returns the estimated filesystem space used in bytes for
 // storing the range `[start, end]`. The estimation is computed as follows:
 //
@@ -2252,7 +2285,9 @@ func (d *DB) EstimateDiskUsageByBackingType(
 	if err := d.closed.Load(); err != nil {
 		panic(err)
 	}
-	if d.opts.Comparer.Compare(start, end) > 0 {
+
+	bounds := base.UserKeyBoundsInclusive(start, end)
+	if !bounds.Valid(d.cmp) {
 		return 0, 0, 0, errors.New("invalid key-range specified (start > end)")
 	}
 
@@ -2262,70 +2297,13 @@
 	readState := d.loadReadState()
 	defer readState.unref()
 
-	for level, files := range readState.current.Levels {
-		iter := files.Iter()
-		if level > 0 {
-			// We can only use `Overlaps` to restrict `files` at L1+ since at L0 it
-			// expands the range iteratively until it has found a set of files that
-			// do not overlap any other L0 files outside that set.
-			overlaps := readState.current.Overlaps(level, base.UserKeyBoundsInclusive(start, end))
-			iter = overlaps.Iter()
-		}
-		for file := iter.First(); file != nil; file = iter.Next() {
-			if d.opts.Comparer.Compare(start, file.Smallest.UserKey) <= 0 &&
-				d.opts.Comparer.Compare(file.Largest.UserKey, end) <= 0 {
-				// The range fully contains the file, so skip looking it up in
-				// table cache/looking at its indexes, and add the full file size.
-				meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum)
-				if err != nil {
-					return 0, 0, 0, err
-				}
-				if meta.IsRemote() {
-					remoteSize += file.Size
-					if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup {
-						externalSize += file.Size
-					}
-				}
-				totalSize += file.Size
-			} else if d.opts.Comparer.Compare(file.Smallest.UserKey, end) <= 0 &&
-				d.opts.Comparer.Compare(start, file.Largest.UserKey) <= 0 {
-				var size uint64
-				var err error
-				if file.Virtual {
-					err = d.tableCache.withVirtualReader(
-						file.VirtualMeta(),
-						func(r sstable.VirtualReader) (err error) {
-							size, err = r.EstimateDiskUsage(start, end)
-							return err
-						},
-					)
-				} else {
-					err = d.tableCache.withReader(
-						file.PhysicalMeta(),
-						func(r *sstable.Reader) (err error) {
-							size, err = r.EstimateDiskUsage(start, end)
-							return err
-						},
-					)
-				}
-				if err != nil {
-					return 0, 0, 0, err
-				}
-				meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum)
-				if err != nil {
-					return 0, 0, 0, err
-				}
-				if meta.IsRemote() {
-					remoteSize += size
-					if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup {
-						externalSize += size
-					}
-				}
-				totalSize += size
-			}
-		}
-	}
-	return totalSize, remoteSize, externalSize, nil
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	totalSize = *d.mu.annotators.totalSizeAnnotator.VersionRangeAnnotation(readState.current, bounds)
+	remoteSize = *d.mu.annotators.remoteSizeAnnotator.VersionRangeAnnotation(readState.current, bounds)
+	externalSize = *d.mu.annotators.externalSizeAnnotator.VersionRangeAnnotation(readState.current, bounds)
+	return
 }
 
 func (d *DB) walPreallocateSize() int {
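For context, here is a minimal usage sketch of the public API this hunk rewires. It assumes a Pebble build that includes this diff; the directory name, keys, and in-memory FS are placeholders:

```go
package main

import (
	"fmt"
	"log"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
)

func main() {
	db, err := pebble.Open("demo", &pebble.Options{FS: vfs.NewMem()})
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// With this change, repeated estimates over key ranges are served by
	// cached B-Tree range annotations instead of a full scan of the level
	// metadata plus per-file object-provider lookups.
	total, remote, external, err := db.EstimateDiskUsageByBackingType([]byte("a"), []byte("z"))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("total=%d remote=%d external=%d\n", total, remote, external)
}
```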
diff --git a/internal/manifest/annotator.go b/internal/manifest/annotator.go
index edab24db9ad..dd8c126817d 100644
--- a/internal/manifest/annotator.go
+++ b/internal/manifest/annotator.go
@@ -4,6 +4,12 @@
 package manifest
 
+import (
+	"sort"
+
+	"github.com/cockroachdb/pebble/internal/base"
+)
+
 // The Annotator type defined below is used by other packages to lazily
 // compute a value over a B-Tree. Each node of the B-Tree stores one
 // `annotation` per annotator, containing the result of the computation over
 // the node's subtree.
@@ -24,6 +30,10 @@ package manifest
 // computed incrementally as edits are applied to a level.
type Annotator[T any] struct {
 	Aggregator AnnotationAggregator[T]
+
+	// scratch is used to hold the aggregated annotation value when computing
+	// range annotations in order to avoid additional allocations.
+	scratch *T
 }
 
 // An AnnotationAggregator defines how an annotation should be accumulated
@@ -49,6 +59,14 @@ type AnnotationAggregator[T any] interface {
 	Merge(src *T, dst *T) *T
 }
 
+// A PartialOverlapAnnotationAggregator is an extension of AnnotationAggregator
+// that allows for custom accumulation of range annotations for files that only
+// partially overlap with the range.
+type PartialOverlapAnnotationAggregator[T any] interface {
+	AnnotationAggregator[T]
+	AccumulatePartialOverlap(f *FileMetadata, dst *T, bounds base.UserKeyBounds) *T
+}
+
 type annotation struct {
 	// annotator is a pointer to the Annotator that computed this annotation.
 	// NB: This is untyped to allow AnnotationAggregator to use Go generics,
@@ -116,6 +134,89 @@ func (a *Annotator[T]) nodeAnnotation(n *node) (_ *T, cacheOK bool) {
 	return t, annot.valid
 }
 
+// accumulateRangeAnnotation computes this annotator's annotation across all
+// files in the node's subtree which overlap with the range defined by bounds.
+// The computed annotation is accumulated into a.scratch.
+func (a *Annotator[T]) accumulateRangeAnnotation(
+	n *node,
+	cmp base.Compare,
+	bounds base.UserKeyBounds,
+	// fullyWithinLowerBound and fullyWithinUpperBound indicate whether this
+	// node's subtree is already known to be within each bound.
+	fullyWithinLowerBound bool,
+	fullyWithinUpperBound bool,
+) {
+	// If this node's subtree is fully within the bounds, compute a regular
+	// annotation.
+	if fullyWithinLowerBound && fullyWithinUpperBound {
+		v, _ := a.nodeAnnotation(n)
+		a.scratch = a.Aggregator.Merge(v, a.scratch)
+		return
+	}
+
+	// We will accumulate annotations from each item in the end-exclusive
+	// range [leftItem, rightItem).
+	leftItem, rightItem := 0, int(n.count)
+	if !fullyWithinLowerBound {
+		// leftItem is the index of the first item that overlaps the lower bound.
+		leftItem = sort.Search(int(n.count), func(i int) bool {
+			return cmp(bounds.Start, n.items[i].Largest.UserKey) <= 0
+		})
+	}
+	if !fullyWithinUpperBound {
+		// rightItem is the index of the first item that does not overlap the
+		// upper bound.
+		rightItem = sort.Search(int(n.count), func(i int) bool {
+			return !bounds.End.IsUpperBoundFor(cmp, n.items[i].Smallest.UserKey)
+		})
+	}
+
+	// Accumulate annotations from every item that overlaps the bounds.
+	for i := leftItem; i < rightItem; i++ {
+		if i == leftItem || i == rightItem-1 {
+			if agg, ok := a.Aggregator.(PartialOverlapAnnotationAggregator[T]); ok {
+				fb := n.items[i].UserKeyBounds()
+				if cmp(bounds.Start, fb.Start) > 0 || bounds.End.CompareUpperBounds(cmp, fb.End) < 0 {
+					a.scratch = agg.AccumulatePartialOverlap(n.items[i], a.scratch, bounds)
+					continue
+				}
+			}
+		}
+		v, _ := a.Aggregator.Accumulate(n.items[i], a.scratch)
+		a.scratch = v
+	}
+
+	if !n.leaf {
+		// We will accumulate annotations from each child in the end-inclusive
+		// range [leftChild, rightChild].
+		leftChild, rightChild := leftItem, rightItem
+		// If the lower bound overlaps with the item at leftItem, there is no
+		// need to accumulate annotations from the child to its left.
+		if leftItem < int(n.count) && cmp(bounds.Start, n.items[leftItem].Smallest.UserKey) >= 0 {
+			leftChild++
+		}
+		// If the upper bound spans beyond the item at rightItem, we must also
+		// accumulate annotations from the child to its right.
+		if rightItem < int(n.count) && bounds.End.IsUpperBoundFor(cmp, n.items[rightItem].Largest.UserKey) {
+			rightChild++
+		}
+
+		for i := leftChild; i <= rightChild; i++ {
+			a.accumulateRangeAnnotation(
+				n.children[i],
+				cmp,
+				bounds,
+				// If this child is to the right of leftItem, then its entire
+				// subtree is within the lower bound.
+				fullyWithinLowerBound || i > leftItem,
+				// If this child is to the left of rightItem, then its entire
+				// subtree is within the upper bound.
+				fullyWithinUpperBound || i < rightItem,
+			)
+		}
+	}
+}
+
 // InvalidateAnnotation removes any existing cached annotations from this
 // annotator from a node's subtree.
 func (a *Annotator[T]) invalidateNodeAnnotation(n *node) {
@@ -142,8 +243,8 @@ func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) *T {
 	return v
 }
 
-// LevelAnnotation calculates the annotation defined by this Annotator for all
-// files across the given levels. A pointer to the Annotator is used as the
+// MultiLevelAnnotation calculates the annotation defined by this Annotator for
+// all files across the given levels. A pointer to the Annotator is used as the
 // key for pre-calculated values, so the same Annotator must be used to avoid
 // duplicate computation. Annotation must not be called concurrently, and in
 // practice this is achieved by requiring callers to hold DB.mu.
@@ -158,6 +259,42 @@ func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) *T {
 	return aggregated
 }
 
+// LevelRangeAnnotation calculates the annotation defined by this Annotator
+// for the files within LevelMetadata which are within the range defined by
+// bounds. A pointer to the Annotator is used as the key for pre-calculated
+// values, so the same Annotator must be used to avoid duplicate computation.
+// Annotation must not be called concurrently, and in practice this is
+// achieved by requiring callers to hold DB.mu.
+func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds base.UserKeyBounds) *T {
+	if lm.Empty() {
+		return a.Aggregator.Zero(nil)
+	}
+
+	a.scratch = a.Aggregator.Zero(a.scratch)
+	a.accumulateRangeAnnotation(lm.tree.root, lm.tree.cmp, bounds, false, false)
+	return a.scratch
+}
+
+// VersionRangeAnnotation calculates the annotation defined by this Annotator
+// for all files within the given Version which are within the range
+// defined by bounds.
+func (a *Annotator[T]) VersionRangeAnnotation(v *Version, bounds base.UserKeyBounds) *T {
+	accumulateSlice := func(ls LevelSlice) {
+		if ls.Empty() {
+			return
+		}
+		a.accumulateRangeAnnotation(ls.iter.r, v.cmp.Compare, bounds, false, false)
+	}
+	a.scratch = a.Aggregator.Zero(a.scratch)
+	for _, ls := range v.L0SublevelFiles {
+		accumulateSlice(ls)
+	}
+	for _, lm := range v.Levels[1:] {
+		accumulateSlice(lm.Slice())
+	}
+	return a.scratch
+}
+
 // InvalidateAnnotation clears any cached annotations defined by Annotator. A
 // pointer to the Annotator is used as the key for pre-calculated values, so
 // the same Annotator must be used to clear the appropriate cached annotation.
@@ -170,13 +307,15 @@ func (a *Annotator[T]) InvalidateLevelAnnotation(lm LevelMetadata) {
 	a.invalidateNodeAnnotation(lm.tree.root)
 }
 
-// sumAggregator defines an Aggregator which sums together a uint64 value
+// SumAggregator defines an Aggregator which sums together a uint64 value
 // across files.
-type sumAggregator struct {
-	accumulateFunc func(f *FileMetadata) (v uint64, cacheOK bool)
+type SumAggregator struct {
+	AccumulateFunc               func(f *FileMetadata) (v uint64, cacheOK bool)
+	AccumulatePartialOverlapFunc func(f *FileMetadata, bounds base.UserKeyBounds) uint64
 }
 
-func (sa sumAggregator) Zero(dst *uint64) *uint64 {
+// Zero implements AnnotationAggregator.Zero, returning a new uint64 set to 0.
+func (sa SumAggregator) Zero(dst *uint64) *uint64 {
 	if dst == nil {
 		return new(uint64)
 	}
@@ -184,13 +323,31 @@
-func (sa sumAggregator) Accumulate(f *FileMetadata, dst *uint64) (v *uint64, cacheOK bool) {
-	accumulated, ok := sa.accumulateFunc(f)
+// Accumulate implements AnnotationAggregator.Accumulate, accumulating a single
+// file's uint64 value.
+func (sa SumAggregator) Accumulate(f *FileMetadata, dst *uint64) (v *uint64, cacheOK bool) {
+	accumulated, ok := sa.AccumulateFunc(f)
 	*dst += accumulated
 	return dst, ok
 }
 
-func (sa sumAggregator) Merge(src *uint64, dst *uint64) *uint64 {
+// AccumulatePartialOverlap implements
+// PartialOverlapAnnotationAggregator.AccumulatePartialOverlap, accumulating
+// the uint64 value of a single file which only partially overlaps with the
+// range defined by bounds.
+func (sa SumAggregator) AccumulatePartialOverlap(
+	f *FileMetadata, dst *uint64, bounds base.UserKeyBounds,
+) *uint64 {
+	if sa.AccumulatePartialOverlapFunc == nil {
+		v, _ := sa.Accumulate(f, dst)
+		return v
+	}
+	*dst += sa.AccumulatePartialOverlapFunc(f, bounds)
+	return dst
+}
+
+// Merge implements AnnotationAggregator.Merge by summing two uint64 values.
+func (sa SumAggregator) Merge(src *uint64, dst *uint64) *uint64 {
 	*dst += *src
 	return dst
 }
@@ -200,12 +357,20 @@ func (sa sumAggregator) Merge(src *uint64, dst *uint64) *uint64 {
 // files.
 func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *Annotator[uint64] {
 	return &Annotator[uint64]{
-		Aggregator: sumAggregator{
-			accumulateFunc: accumulate,
+		Aggregator: SumAggregator{
+			AccumulateFunc: accumulate,
 		},
 	}
 }
 
+// NumFilesAnnotator is an Annotator which computes an annotation value
+// equal to the number of files included in the annotation. In particular, it
+// can be used to efficiently calculate the number of files in a given key
+// range using range annotations.
+var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
+	return 1, true
+})
+
 // PickFileAggregator implements the AnnotationAggregator interface. It defines
 // an aggregator that picks a single file from a set of eligible files.
 type PickFileAggregator struct {
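To illustrate the new partial-overlap hook, here is a hypothetical in-package (package manifest) sketch. Only `SumAggregator`, its fields, and `LevelRangeAnnotation` come from this diff; the function name and the pro-rating logic are made-up stand-ins:

```go
// exampleProratedSize sums file sizes within bounds, letting the (at most
// two) boundary files contribute a rough estimate rather than their full
// size. In real usage the Annotator should be reused across calls so that
// per-subtree annotations can be cached.
func exampleProratedSize(lm LevelMetadata, bounds base.UserKeyBounds) uint64 {
	a := &Annotator[uint64]{
		Aggregator: SumAggregator{
			// Files fully contained in bounds contribute their full size.
			AccumulateFunc: func(f *FileMetadata) (uint64, bool) {
				return f.Size, true
			},
			// Invoked only for files that straddle either end of bounds.
			AccumulatePartialOverlapFunc: func(f *FileMetadata, b base.UserKeyBounds) uint64 {
				return f.Size / 2 // crude stand-in for a real estimate
			},
		},
	}
	return *a.LevelRangeAnnotation(lm, bounds)
}
```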
diff --git a/internal/manifest/annotator_test.go b/internal/manifest/annotator_test.go
index 07badf6a387..1499e13a63a 100644
--- a/internal/manifest/annotator_test.go
+++ b/internal/manifest/annotator_test.go
@@ -5,54 +5,47 @@
 package manifest
 
 import (
+	"math/rand"
 	"testing"
 
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/stretchr/testify/require"
 )
 
-func makeTestLevelMetadata(count int) (LevelMetadata, []*FileMetadata) {
-	files := make([]*FileMetadata, count)
-	for i := 0; i < count; i++ {
-		files[i] = newItem(key(i))
+// Creates a version with numFiles files in level 6.
+func makeTestVersion(numFiles int) (*Version, []*FileMetadata) {
+	files := make([]*FileMetadata, numFiles)
+	for i := 0; i < numFiles; i++ {
+		// Each file spans 10 keys, e.g. [0->9], [10->19], etc.
+		files[i] = (&FileMetadata{}).ExtendPointKeyBounds(
+			base.DefaultComparer.Compare, key(i*10), key(i*10+9),
+		)
+		files[i].InitPhysicalBacking()
 	}
-	lm := MakeLevelMetadata(base.DefaultComparer.Compare, 6, files)
-	return lm, files
-}
+	var levelFiles [7][]*FileMetadata
+	levelFiles[6] = files
 
-// NumFilesAnnotator is an Annotator which computes an annotation value
-// equal to the number of files included in the annotation.
-var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) {
-	return 1, true
-})
+	v := NewVersion(base.DefaultComparer, 0, levelFiles)
+	return v, files
+}
 
 func TestNumFilesAnnotator(t *testing.T) {
 	const count = 1000
-	lm, _ := makeTestLevelMetadata(0)
+	v, _ := makeTestVersion(0)
 
 	for i := 1; i <= count; i++ {
-		lm.tree.Insert(newItem(key(i)))
-		numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
+		v.Levels[6].tree.Insert(newItem(key(i)))
+		numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
 		require.EqualValues(t, i, numFiles)
 	}
-
-	numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
-	require.EqualValues(t, count, numFiles)
-
-	numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
-	require.EqualValues(t, count, numFiles)
-
-	lm.tree.Delete(newItem(key(count / 2)))
-	numFiles = *NumFilesAnnotator.LevelAnnotation(lm)
-	require.EqualValues(t, count-1, numFiles)
 }
 
 func BenchmarkNumFilesAnnotator(b *testing.B) {
-	lm, _ := makeTestLevelMetadata(0)
+	v, _ := makeTestVersion(0)
 	for i := 1; i <= b.N; i++ {
-		lm.tree.Insert(newItem(key(i)))
-		numFiles := *NumFilesAnnotator.LevelAnnotation(lm)
+		v.Levels[6].tree.Insert(newItem(key(i)))
+		numFiles := *NumFilesAnnotator.LevelAnnotation(v.Levels[6])
 		require.EqualValues(b, uint64(i), numFiles)
 	}
 }
@@ -70,12 +63,115 @@ func TestPickFileAggregator(t *testing.T) {
 		},
 	}
 
-	lm, files := makeTestLevelMetadata(1)
+	v, files := makeTestVersion(1)
 	for i := 1; i <= count; i++ {
-		lm.tree.Insert(newItem(key(i)))
-		pickedFile := a.LevelAnnotation(lm)
+		v.Levels[6].tree.Insert(newItem(key(i)))
+		pickedFile := a.LevelAnnotation(v.Levels[6])
 		// The picked file should always be the one with the smallest key.
 		require.Same(t, files[0], pickedFile)
 	}
 }
+
+func bounds(i int, j int, exclusive bool) base.UserKeyBounds {
+	b := base.UserKeyBoundsEndExclusiveIf(key(i).UserKey, key(j).UserKey, exclusive)
+	return b
+}
+
+func randomBounds(rng *rand.Rand, count int) base.UserKeyBounds {
+	first := rng.Intn(count)
+	second := rng.Intn(count)
+	exclusive := rng.Intn(2) == 0
+	return bounds(min(first, second), max(first, second), exclusive)
+}
+
+func requireMatchOverlaps(t *testing.T, v *Version, bounds base.UserKeyBounds) {
+	overlaps := v.Overlaps(6, bounds)
+	numFiles := *NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], bounds)
+	require.EqualValues(t, overlaps.length, numFiles)
+}
+
+func TestNumFilesRangeAnnotationEmptyRanges(t *testing.T) {
+	const count = 5_000
+	v, files := makeTestVersion(count)
+
+	// Delete files containing key ranges [0, 999] and [24_000, 25_999].
+	for i := 0; i < 100; i++ {
+		v.Levels[6].tree.Delete(files[i])
+	}
+	for i := 2400; i < 2600; i++ {
+		v.Levels[6].tree.Delete(files[i])
+	}
+
+	// Ranges that are completely empty.
+	requireMatchOverlaps(t, v, bounds(1, 999, false))
+	requireMatchOverlaps(t, v, bounds(0, 1000, true))
+	requireMatchOverlaps(t, v, bounds(50_000, 60_000, false))
+	requireMatchOverlaps(t, v, bounds(24_500, 25_500, false))
+	requireMatchOverlaps(t, v, bounds(24_000, 26_000, true))
+
+	// Partial overlaps with empty ranges.
+	requireMatchOverlaps(t, v, bounds(0, 1000, false))
+	requireMatchOverlaps(t, v, bounds(20, 1001, true))
+	requireMatchOverlaps(t, v, bounds(20, 1010, true))
+	requireMatchOverlaps(t, v, bounds(23_000, 27_000, true))
+	requireMatchOverlaps(t, v, bounds(25_000, 40_000, false))
+	requireMatchOverlaps(t, v, bounds(25_500, 26_001, true))
+
+	// Ranges which only span a single table.
+	requireMatchOverlaps(t, v, bounds(45_000, 45_000, true))
+	requireMatchOverlaps(t, v, bounds(30_000, 30_001, true))
+	requireMatchOverlaps(t, v, bounds(23_000, 23_000, false))
+}
+
+func TestNumFilesRangeAnnotationRandomized(t *testing.T) {
+	const count = 10_000
+	const numIterations = 10_000
+
+	v, _ := makeTestVersion(count)
+
+	rng := rand.New(rand.NewSource(int64(0)))
+	for i := 0; i < numIterations; i++ {
+		requireMatchOverlaps(t, v, randomBounds(rng, count*11))
+	}
+}
+
+func BenchmarkNumFilesRangeAnnotation(b *testing.B) {
+	const count = 100_000
+	v, files := makeTestVersion(count)
+
+	rng := rand.New(rand.NewSource(int64(0)))
+	b.Run("annotator", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			rb := randomBounds(rng, count*11)
+			// Randomly delete and reinsert a file to verify that range
+			// annotations are still fast despite small mutations.
+			toDelete := rng.Intn(count)
+			v.Levels[6].tree.Delete(files[toDelete])
+
+			NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], rb)
+
+			v.Levels[6].tree.Insert(files[toDelete])
+		}
+	})
+
+	// Also benchmark an equivalent aggregation using version.Overlaps to show
+	// the difference in performance.
+	b.Run("overlaps", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			rb := randomBounds(rng, count*11)
+			toDelete := rng.Intn(count)
+			v.Levels[6].tree.Delete(files[toDelete])
+
+			overlaps := v.Overlaps(6, rb)
+			iter := overlaps.Iter()
+			numFiles := 0
+			for f := iter.First(); f != nil; f = iter.Next() {
+				numFiles++
+			}
+
+			v.Levels[6].tree.Insert(files[toDelete])
+		}
+	})
+
+}
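A brief note on the bounds helper these tests lean on: judging from its name and the `exclusive` flag in `randomBounds`, `base.UserKeyBoundsEndExclusiveIf` is assumed to build an end-exclusive bound when the flag is true and an end-inclusive one otherwise. A tiny illustrative sketch (hypothetical function name):

```go
func exampleBounds() (base.UserKeyBounds, base.UserKeyBounds) {
	halfOpen := base.UserKeyBoundsEndExclusiveIf([]byte("a"), []byte("c"), true)  // [a, c)
	closed := base.UserKeyBoundsEndExclusiveIf([]byte("a"), []byte("c"), false)   // [a, c]
	return halfOpen, closed
}
```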
diff --git a/internal/manifest/btree.go b/internal/manifest/btree.go
index 350200b612d..02d2f22cfbb 100644
--- a/internal/manifest/btree.go
+++ b/internal/manifest/btree.go
@@ -13,6 +13,7 @@ import (
 	"unsafe"
 
 	"github.com/cockroachdb/errors"
+	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/invariants"
 )
 
@@ -300,14 +301,14 @@ func (n *node) popFront() (*FileMetadata, *node) {
 //
 // This function is for use only as a helper function for internal B-Tree code.
 // Clients should not invoke it directly.
-func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) {
+func (n *node) find(bcmp btreeCmp, item *FileMetadata) (index int, found bool) {
 	// Logic copied from sort.Search. Inlining this gave
 	// an 11% speedup on BenchmarkBTreeDeleteInsert.
 	i, j := 0, int(n.count)
 	for i < j {
 		h := int(uint(i+j) >> 1) // avoid overflow when computing h
 		// i ≤ h < j
-		v := cmp(item, n.items[h])
+		v := bcmp(item, n.items[h])
 		if v == 0 {
 			return h, true
 		} else if v > 0 {
@@ -386,8 +387,8 @@ func (n *node) split(i int) (*FileMetadata, *node) {
 
 // Insert inserts a item into the subtree rooted at this node, making sure no
 // nodes in the subtree exceed maxItems items.
-func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error {
-	i, found := n.find(cmp, item)
+func (n *node) Insert(bcmp btreeCmp, item *FileMetadata) error {
+	i, found := n.find(bcmp, item)
 	if found {
 		// cmp provides a total ordering of the files within a level.
 		// If we're inserting a metadata that's equal to an existing item
@@ -404,7 +405,7 @@ func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error {
 		splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2)
 		n.insertAt(i, splitLa, splitNode)
 
-		switch cmp := cmp(item, n.items[i]); {
+		switch cmp := bcmp(item, n.items[i]); {
 		case cmp < 0:
 			// no change, we want first split node
 		case cmp > 0:
@@ -418,7 +419,7 @@
 		}
 	}
 
-	err := mut(&n.children[i]).Insert(cmp, item)
+	err := mut(&n.children[i]).Insert(bcmp, item)
 	if err == nil {
 		n.subtreeCount++
 	}
@@ -447,8 +448,8 @@ func (n *node) removeMax() *FileMetadata {
 
 // Remove removes a item from the subtree rooted at this node. Returns
 // the item that was removed or nil if no matching item was found.
-func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
-	i, found := n.find(cmp, item)
+func (n *node) Remove(bcmp btreeCmp, item *FileMetadata) (out *FileMetadata) {
+	i, found := n.find(bcmp, item)
 	if n.leaf {
 		if found {
 			out, _ = n.removeAt(i)
@@ -460,7 +461,7 @@
 	if n.children[i].count <= minItems {
 		// Child not large enough to remove from.
 		n.rebalanceOrMerge(i)
-		return n.Remove(cmp, item)
+		return n.Remove(bcmp, item)
 	}
 	child := mut(&n.children[i])
 	if found {
@@ -471,7 +472,7 @@
 		return out
 	}
 	// File is not in this node and child is large enough to remove from.
-	out = child.Remove(cmp, item)
+	out = child.Remove(bcmp, item)
 	if out != nil {
 		n.subtreeCount--
 	}
@@ -635,7 +636,8 @@ func (n *node) verifyInvariants() {
 // goroutines, but Read operations are.
 type btree struct {
 	root *node
-	cmp  btreeCmp
+	cmp  base.Compare
+	bcmp btreeCmp
 }
 
 // Release dereferences and clears the root node of the btree, removing all
@@ -678,7 +680,7 @@ func (t *btree) Delete(item *FileMetadata) (obsolete bool) {
 	if t.root == nil || t.root.count == 0 {
 		return false
 	}
-	if out := mut(&t.root).Remove(t.cmp, item); out != nil {
+	if out := mut(&t.root).Remove(t.bcmp, item); out != nil {
 		obsolete = out.FileBacking.Unref() == 0
 	}
 	if invariants.Enabled {
@@ -712,7 +714,7 @@ func (t *btree) Insert(item *FileMetadata) error {
 		t.root = newRoot
 	}
 	item.FileBacking.Ref()
-	err := mut(&t.root).Insert(t.cmp, item)
+	err := mut(&t.root).Insert(t.bcmp, item)
 	if invariants.Enabled {
 		t.root.verifyInvariants()
 	}
@@ -723,7 +725,7 @@
 // iterator after modifications are made to the tree. If modifications are made,
 // create a new iterator.
 func (t *btree) Iter() iterator {
-	return iterator{r: t.root, pos: -1, cmp: t.cmp}
+	return iterator{r: t.root, pos: -1, cmp: t.bcmp}
 }
 
 // Count returns the number of files contained within the B-Tree.
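After this rename the btree carries two comparators: `bcmp` (a `btreeCmp`) defines the order of items within the tree, while the new `cmp` (a `base.Compare`) compares raw user keys so that `accumulateRangeAnnotation` can binary-search node boundaries. A hypothetical in-package sketch (the function name is made up; the helpers are from this repo):

```go
func exampleTwoComparators(m *FileMetadata) (btree, error) {
	// m is assumed to have a valid FileBacking (e.g. via InitPhysicalBacking),
	// since btree.Insert takes a reference on it.
	var tr btree
	// bcmp orders items within the tree: smallest user key for L1+,
	// sequence numbers for L0.
	tr.bcmp = btreeCmpSmallestKey(base.DefaultComparer.Compare)
	// cmp compares raw user keys; range annotations use it to locate the
	// boundary subtrees that straddle the queried bounds.
	tr.cmp = base.DefaultComparer.Compare
	err := tr.Insert(m)
	return tr, err
}
```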
diff --git a/internal/manifest/btree_test.go b/internal/manifest/btree_test.go
index 1c4b2ed2cd5..c75f33004ea 100644
--- a/internal/manifest/btree_test.go
+++ b/internal/manifest/btree_test.go
@@ -111,7 +111,7 @@ func (n *node) verifyCountAllowed(t *testing.T, root bool) {
 }
 
 func (t *btree) isSorted(tt *testing.T) {
-	t.root.isSorted(tt, t.cmp)
+	t.root.isSorted(tt, t.bcmp)
 }
 
 func (n *node) isSorted(t *testing.T, cmp func(*FileMetadata, *FileMetadata) int) {
@@ -209,7 +209,7 @@ func checkIter(t *testing.T, it iterator, start, end int, keyMemo map[int]InternalKey) {
 // TestBTree tests basic btree operations.
 func TestBTree(t *testing.T) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	keyMemo := make(map[int]InternalKey)
 
 	// With degree == 16 (max-items/node == 31) we need 513 items in order for
@@ -269,7 +269,7 @@ func TestIterClone(t *testing.T) {
 	const count = 65536
 
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	keyMemo := make(map[int]InternalKey)
 
 	for i := 0; i < count; i++ {
@@ -295,7 +295,7 @@ func TestIterClone(t *testing.T) {
 
 func TestIterCmpEdgeCases(t *testing.T) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	t.Run("empty", func(t *testing.T) {
 		a := tr.Iter()
 		b := tr.Iter()
@@ -330,7 +330,7 @@ func TestIterCmpRand(t *testing.T) {
 	const iterCount = 1000
 
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	for i := 0; i < itemCount; i++ {
 		require.NoError(t, tr.Insert(newItem(key(i))))
 	}
@@ -365,7 +365,7 @@ func TestBTreeSeek(t *testing.T) {
 	const count = 513
 
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	for i := 0; i < count; i++ {
 		require.NoError(t, tr.Insert(newItem(key(i*2))))
 	}
@@ -404,7 +404,7 @@ func TestBTreeSeek(t *testing.T) {
 
 func TestBTreeInsertDuplicateError(t *testing.T) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	require.NoError(t, tr.Insert(newItem(key(1))))
 	require.NoError(t, tr.Insert(newItem(key(2))))
 	require.NoError(t, tr.Insert(newItem(key(3))))
@@ -447,7 +447,7 @@ func TestBTreeCloneConcurrentOperations(t *testing.T) {
 	wg.Add(1)
 
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	go populate(&tr, 0)
 	wg.Wait()
 	close(treeC)
@@ -516,7 +516,7 @@ func TestIterStack(t *testing.T) {
 
 func TestIterEndSentinel(t *testing.T) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	require.NoError(t, tr.Insert(newItem(key(1))))
 	require.NoError(t, tr.Insert(newItem(key(2))))
 	require.NoError(t, tr.Insert(newItem(key(3))))
@@ -560,7 +560,7 @@ func TestRandomizedBTree(t *testing.T) {
 	// Use a btree comparator that sorts by file number to make it easier to
 	// prevent duplicates or overlaps.
 	tree := btree{
-		cmp: func(a *FileMetadata, b *FileMetadata) int {
+		bcmp: func(a *FileMetadata, b *FileMetadata) int {
 			return stdcmp.Compare(a.FileNum, b.FileNum)
 		},
 	}
@@ -684,7 +684,7 @@ func BenchmarkBTreeInsert(b *testing.B) {
 		b.ResetTimer()
 		for i := 0; i < b.N; {
 			var tr btree
-			tr.cmp = cmp
+			tr.bcmp = cmp
 			for _, item := range insertP {
 				if err := tr.Insert(item); err != nil {
 					b.Fatal(err)
@@ -706,7 +706,7 @@ func BenchmarkBTreeDelete(b *testing.B) {
 		for i := 0; i < b.N; {
 			b.StopTimer()
 			var tr btree
-			tr.cmp = cmp
+			tr.bcmp = cmp
 			for _, item := range insertP {
 				if err := tr.Insert(item); err != nil {
 					b.Fatal(err)
@@ -732,7 +732,7 @@ func BenchmarkBTreeDeleteInsert(b *testing.B) {
 	forBenchmarkSizes(b, func(b *testing.B, count int) {
 		insertP := perm(count)
 		var tr btree
-		tr.cmp = cmp
+		tr.bcmp = cmp
 		for _, item := range insertP {
 			if err := tr.Insert(item); err != nil {
 				b.Fatal(err)
@@ -755,7 +755,7 @@ func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) {
 	forBenchmarkSizes(b, func(b *testing.B, count int) {
 		insertP := perm(count)
 		var tr btree
-		tr.cmp = cmp
+		tr.bcmp = cmp
 		for _, item := range insertP {
 			if err := tr.Insert(item); err != nil {
 				b.Fatal(err)
@@ -781,8 +781,8 @@ func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) {
 		forBenchmarkSizes(b, func(b *testing.B, count int) {
 			insertP := perm(count)
 			var tr, trRelease btree
-			tr.cmp = cmp
-			trRelease.cmp = cmp
+			tr.bcmp = cmp
+			trRelease.bcmp = cmp
 			for _, item := range insertP {
 				if err := tr.Insert(item); err != nil {
 					b.Fatal(err)
@@ -809,7 +809,7 @@ func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) {
 // BenchmarkBTreeIter measures the cost of creating a btree iterator.
 func BenchmarkBTreeIter(b *testing.B) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 	for i := 0; i < b.N; i++ {
 		it := tr.Iter()
 		it.first()
@@ -823,7 +823,7 @@ func BenchmarkBTreeIterSeekGE(b *testing.B) {
 	forBenchmarkSizes(b, func(b *testing.B, count int) {
 		var keys []InternalKey
 		var tr btree
-		tr.cmp = cmp
+		tr.bcmp = cmp
 
 		for i := 0; i < count; i++ {
 			s := key(i)
@@ -857,7 +857,7 @@ func BenchmarkBTreeIterSeekLT(b *testing.B) {
 	forBenchmarkSizes(b, func(b *testing.B, count int) {
 		var keys []InternalKey
 		var tr btree
-		tr.cmp = cmp
+		tr.bcmp = cmp
 
 		for i := 0; i < count; i++ {
 			k := key(i)
@@ -896,7 +896,7 @@ func BenchmarkBTreeIterSeekLT(b *testing.B) {
 // next item in the tree.
 func BenchmarkBTreeIterNext(b *testing.B) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 
 	const count = 8 << 10
 	for i := 0; i < count; i++ {
@@ -920,7 +920,7 @@ func BenchmarkBTreeIterNext(b *testing.B) {
 // previous item in the tree.
 func BenchmarkBTreeIterPrev(b *testing.B) {
 	var tr btree
-	tr.cmp = cmp
+	tr.bcmp = cmp
 
 	const count = 8 << 10
 	for i := 0; i < count; i++ {
diff --git a/internal/manifest/l0_sublevels.go b/internal/manifest/l0_sublevels.go
index 04cef07ff25..3185b95b3da 100644
--- a/internal/manifest/l0_sublevels.go
+++ b/internal/manifest/l0_sublevels.go
@@ -337,7 +337,7 @@ func NewL0Sublevels(
 	// Construct a parallel slice of sublevel B-Trees.
 	// TODO(jackson): Consolidate and only use the B-Trees.
 	for _, sublevelFiles := range s.levelFiles {
-		tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles)
+		tr, ls := makeBTree(cmp, btreeCmpSmallestKey(cmp), sublevelFiles)
 		s.Levels = append(s.Levels, ls)
 		tr.Release()
 	}
@@ -630,7 +630,7 @@ func (s *L0Sublevels) AddL0Files(
 	// Construct a parallel slice of sublevel B-Trees.
 	// TODO(jackson): Consolidate and only use the B-Trees.
 	for _, sublevel := range updatedSublevels {
-		tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel])
+		tr, ls := makeBTree(newVal.cmp, btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel])
 		if sublevel == len(newVal.Levels) {
 			newVal.Levels = append(newVal.Levels, ls)
 		} else {
diff --git a/internal/manifest/level_metadata.go b/internal/manifest/level_metadata.go
index 7cba73e14af..abe4a13cb36 100644
--- a/internal/manifest/level_metadata.go
+++ b/internal/manifest/level_metadata.go
@@ -49,7 +49,7 @@ func MakeLevelMetadata(cmp Compare, level int, files []*FileMetadata) LevelMetad
 	}
 	var lm LevelMetadata
 	lm.level = level
-	lm.tree, _ = makeBTree(bcmp, files)
+	lm.tree, _ = makeBTree(cmp, bcmp, files)
 	for _, f := range files {
 		lm.totalSize += f.Size
 		if f.Virtual {
@@ -60,9 +60,10 @@ func MakeLevelMetadata(cmp Compare, level int, files []*FileMetadata) LevelMetad
 	return lm
 }
 
-func makeBTree(cmp btreeCmp, files []*FileMetadata) (btree, LevelSlice) {
+func makeBTree(cmp base.Compare, bcmp btreeCmp, files []*FileMetadata) (btree, LevelSlice) {
 	var t btree
 	t.cmp = cmp
+	t.bcmp = bcmp
 	for _, f := range files {
 		t.Insert(f)
 	}
@@ -157,7 +158,7 @@ func (lf LevelFile) Slice() LevelSlice {
 // TODO(jackson): Can we improve this interface or avoid needing to export
 // a slice constructor like this?
 func NewLevelSliceSeqSorted(files []*FileMetadata) LevelSlice {
-	tr, slice := makeBTree(btreeCmpSeqNum, files)
+	tr, slice := makeBTree(nil, btreeCmpSeqNum, files)
 	tr.Release()
 	slice.verifyInvariants()
 	return slice
@@ -168,7 +169,7 @@ func NewLevelSliceSeqSorted(files []*FileMetadata) LevelSlice {
 // TODO(jackson): Can we improve this interface or avoid needing to export
 // a slice constructor like this?
 func NewLevelSliceKeySorted(cmp base.Compare, files []*FileMetadata) LevelSlice {
-	tr, slice := makeBTree(btreeCmpSmallestKey(cmp), files)
+	tr, slice := makeBTree(cmp, btreeCmpSmallestKey(cmp), files)
 	tr.Release()
 	slice.verifyInvariants()
 	return slice
@@ -179,7 +180,7 @@ func NewLevelSliceKeySorted(cmp base.Compare, files []*FileMetadata) LevelSlice {
 // tests.
 // TODO(jackson): Update tests to avoid requiring this and remove it.
 func NewLevelSliceSpecificOrder(files []*FileMetadata) LevelSlice {
-	tr, slice := makeBTree(btreeCmpSpecificOrder(files), files)
+	tr, slice := makeBTree(nil, btreeCmpSpecificOrder(files), files)
 	tr.Release()
 	slice.verifyInvariants()
 	return slice
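The construction paths above now thread the user-key comparator through `makeBTree`; constructors that never serve range annotations (`NewLevelSliceSeqSorted`, `NewLevelSliceSpecificOrder`) pass nil. A sketch mirroring `NewLevelSliceKeySorted` (hypothetical function name):

```go
func exampleKeySortedSlice(cmp base.Compare, files []*FileMetadata) LevelSlice {
	// The base.Compare is stored on the tree for use by range annotations,
	// while the btreeCmp continues to define item order within the tree.
	tr, slice := makeBTree(cmp, btreeCmpSmallestKey(cmp), files)
	tr.Release()
	return slice
}
```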
diff --git a/internal/manifest/version.go b/internal/manifest/version.go
index a179d0382a2..1babb573e32 100644
--- a/internal/manifest/version.go
+++ b/internal/manifest/version.go
@@ -1068,12 +1068,12 @@ func NewVersion(
 		// order to test consistency checking, etc. Once we've constructed the
 		// initial B-Tree, we swap out the btreeCmp for the correct one.
 		// TODO(jackson): Adjust or remove the tests and remove this.
-		v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l])
+		v.Levels[l].tree, _ = makeBTree(comparer.Compare, btreeCmpSpecificOrder(files[l]), files[l])
 		v.Levels[l].level = l
 		if l == 0 {
-			v.Levels[l].tree.cmp = btreeCmpSeqNum
+			v.Levels[l].tree.bcmp = btreeCmpSeqNum
 		} else {
-			v.Levels[l].tree.cmp = btreeCmpSmallestKey(comparer.Compare)
+			v.Levels[l].tree.bcmp = btreeCmpSmallestKey(comparer.Compare)
 		}
 		for _, f := range files[l] {
 			v.Levels[l].totalSize += f.Size
@@ -1501,7 +1501,7 @@ func (v *Version) Overlaps(level int, bounds base.UserKeyBounds) LevelSlice {
 		if !restart {
 			// Construct a B-Tree containing only the matching items.
 			var tr btree
-			tr.cmp = v.Levels[level].tree.cmp
+			tr.bcmp = v.Levels[level].tree.bcmp
 			for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() {
 				if selectedIndices[i] {
 					err := tr.Insert(meta)
diff --git a/open.go b/open.go
index cb77a52bcef..c1de69543ce 100644
--- a/open.go
+++ b/open.go
@@ -408,6 +408,24 @@ func Open(dirname string, opts *Options) (db *DB, err error) {
 	d.newIters = d.tableCache.newIters
 	d.tableNewRangeKeyIter = tableNewRangeKeyIter(d.newIters)
 
+	d.mu.annotators.totalSizeAnnotator = d.makeFileSizeAnnotator(func(f *manifest.FileMetadata) bool {
+		return true
+	})
+	d.mu.annotators.remoteSizeAnnotator = d.makeFileSizeAnnotator(func(f *manifest.FileMetadata) bool {
+		meta, err := d.objProvider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
+		if err != nil {
+			return false
+		}
+		return meta.IsRemote()
+	})
+	d.mu.annotators.externalSizeAnnotator = d.makeFileSizeAnnotator(func(f *manifest.FileMetadata) bool {
+		meta, err := d.objProvider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
+		if err != nil {
+			return false
+		}
+		return meta.IsRemote() && meta.Remote.CleanupMethod == objstorage.SharedNoCleanup
+	})
+
 	var previousOptionsFileNum base.DiskFileNum
 	var previousOptionsFilename string
 	for _, filename := range ls {
diff --git a/table_cache.go b/table_cache.go
index 849a842c14a..67bba2e43bf 100644
--- a/table_cache.go
+++ b/table_cache.go
@@ -210,27 +210,11 @@ func (c *tableCacheContainer) metrics() (CacheMetrics, FilterMetrics) {
 func (c *tableCacheContainer) estimateSize(
 	meta *fileMetadata, lower, upper []byte,
 ) (size uint64, err error) {
-	if meta.Virtual {
-		err = c.withVirtualReader(
-			meta.VirtualMeta(),
-			func(r sstable.VirtualReader) (err error) {
-				size, err = r.EstimateDiskUsage(lower, upper)
-				return err
-			},
-		)
-	} else {
-		err = c.withReader(
-			meta.PhysicalMeta(),
-			func(r *sstable.Reader) (err error) {
-				size, err = r.EstimateDiskUsage(lower, upper)
-				return err
-			},
-		)
-	}
-	if err != nil {
-		return 0, err
-	}
-	return size, nil
+	err = c.withCommonReader(meta, func(cr sstable.CommonReader) error {
+		size, err = cr.EstimateDiskUsage(lower, upper)
+		return err
+	})
+	return size, err
 }
 
 // createCommonReader creates a Reader for this file.
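Note the added `err =` assignment on the `withCommonReader` call above: without capturing its return value, a failure to obtain a reader would silently yield `(0, nil)`. The cleanup relies on `sstable.CommonReader` being implemented by both physical and virtual readers; judging only from the call sites in this diff, the interface is assumed to include at least the following subset (a minimal sketch, not the real definition from the sstable package):

```go
// Minimal subset sketch; the real sstable.CommonReader interface is larger.
type commonReaderSubset interface {
	EstimateDiskUsage(start, end []byte) (uint64, error)
}
```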