From 79bdbc1578aead8e8a8011acd09b3184a183d151 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag
Date: Mon, 15 Jul 2024 12:02:32 -0400
Subject: [PATCH] manifest: add range annotations

This change adds a "range annotation" feature to Annotators, which are computations that aggregate some value over a specific key range within a level. Level-wide annotations are now computed internally as a range annotation with a key range spanning the whole level. Range annotations use the same B-tree caching behavior as regular annotations, so queries remain fast even with thousands of tables because they avoid a sequential iteration over a level's files.

This PR only sets up range annotations without changing any existing behavior. However, there are a number of potential use cases for range annotations which could be added next:

- Calculating the number of keys shadowed by a tombstone-dense key range, for use in the heuristic proposed at #3719
- Computing the total file size that a read compaction overlaps with, which is used to prevent read compactions that are too wide [here](https://github.com/cockroachdb/pebble/blob/9a4ea4dfc5a8129937e3fdc811ea87543d88565b/compaction_picker.go#L1930)
- Estimating disk usage for a key range without having to iterate over files, which is done [here](https://github.com/jbowens/pebble/blob/master/db.go#L2249)
- Calculating the average value size and compression ratio for a key range, which we [currently use when estimating the potential space that compacting point tombstones would reclaim](https://github.com/jbowens/pebble/blob/646c6bab1af3c72dc7db59a0dcc38b5955fc15cc/table_stats.go#L350). Range annotations could also be used to implement the TODO from @jbowens.
- Estimating the space reclaimed from compacting range deletions, for which we also [currently use sequential iteration](https://github.com/jbowens/pebble/blob/master/table_stats.go#L557)
- Because annotations are in-memory, if we can find a way to refactor those last two cases without using I/O at all, this would eliminate the need to defer table stats collection to a separate goroutine for newly written tables.
- Refactoring [`db.ScanStatistics`](https://github.com/jbowens/pebble/blob/646c6bab1af3c72dc7db59a0dcc38b5955fc15cc/db.go#L2823), which could increase performance significantly over the current implementation, which scans every key in the DB.
- Expanding the LSM visualizer tool (#508) or the LSM viewer (#3339) to show aggregate statistics about key ranges. Range annotations would allow us to efficiently compute statistics including the number of sstables, number of keys, etc. in chunks of the keyspace and visualize this on a graph showing overlapping ranges from each level.
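To illustrate the intended usage, here is a minimal sketch. It assumes code living inside the pebble repo (the `manifest` and `base` packages are internal), and the names `totalSizeAnnotator` and `levelStatsInRange` are hypothetical; `SumAnnotator`, `NumFilesAnnotator`, `LevelRangeAnnotation`, and `base.UserKeyBoundsEndExclusiveIf` are the APIs added or exercised by this patch:

```go
package pebble // hypothetical placement inside the pebble repo

import (
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/manifest"
)

// totalSizeAnnotator sums file sizes. f.Size is fixed for the lifetime of a
// file, so the computed value is marked as cacheable.
var totalSizeAnnotator = manifest.SumAnnotator(func(f *manifest.FileMetadata) (uint64, bool) {
	return f.Size, true
})

// levelStatsInRange reports the number of L6 files overlapping the user key
// range [lower, upper), along with their total size. Both queries reuse any
// cached B-tree annotations rather than iterating over the level's files.
func levelStatsInRange(v *manifest.Version, lower, upper []byte) (numFiles, totalSize uint64) {
	bounds := base.UserKeyBoundsEndExclusiveIf(lower, upper, true)
	numFiles = manifest.NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], &bounds)
	totalSize = totalSizeAnnotator.LevelRangeAnnotation(v.Levels[6], &bounds)
	return numFiles, totalSize
}
```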
`BenchmarkNumFilesRangeAnnotation` shows that range annotations are significantly faster than using `version.Overlaps` to aggregate over a key range:

```
pkg: github.com/cockroachdb/pebble/internal/manifest
BenchmarkNumFilesRangeAnnotation/annotator-10    232282      4716 ns/op    112 B/op    7 allocs/op
BenchmarkNumFilesRangeAnnotation/overlaps-10       2110    545482 ns/op    400 B/op    9 allocs/op
```

--- compaction_picker.go | 204 +++++------ compaction_picker_test.go | 2 +- compaction_test.go | 2 +- db.go | 16 +- format_major_version.go | 2 +- internal/manifest/annotator.go | 327 ++++++++++++++++++++++++++++ internal/manifest/annotator_test.go | 154 +++++++++++++ internal/manifest/btree.go | 151 ++----------- internal/manifest/btree_test.go | 96 ++------ internal/manifest/l0_sublevels.go | 4 +- internal/manifest/level_metadata.go | 36 +-- internal/manifest/version.go | 8 +- table_stats.go | 230 ++++---------- 13 files changed, 648 insertions(+), 584 deletions(-) create mode 100644 internal/manifest/annotator.go create mode 100644 internal/manifest/annotator_test.go diff --git a/compaction_picker.go b/compaction_picker.go index 4724a4b4c86..18664cde52c 100644 --- a/compaction_picker.go +++ b/compaction_picker.go @@ -639,39 +639,13 @@ func compensatedSize(f *fileMetadata) uint64 { return f.Size + fileCompensation(f) } -// compensatedSizeAnnotator implements manifest.Annotator, annotating B-Tree -// nodes with the sum of the files' compensated sizes. Its annotation type is -// a *uint64. Compensated sizes may change once a table's stats are loaded -// asynchronously, so its values are marked as cacheable only if a file's -// stats have been loaded. -type compensatedSizeAnnotator struct { -} - -var _ manifest.Annotator = compensatedSizeAnnotator{} - -func (a compensatedSizeAnnotator) Zero(dst interface{}) interface{} { - if dst == nil { - return new(uint64) - } - v := dst.(*uint64) - *v = 0 - return v -} - -func (a compensatedSizeAnnotator) Accumulate( - f *fileMetadata, dst interface{}, -) (v interface{}, cacheOK bool) { - vptr := dst.(*uint64) - *vptr = *vptr + compensatedSize(f) - return vptr, f.StatsValid() -} - -func (a compensatedSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} { - srcV := src.(*uint64) - dstV := dst.(*uint64) - *dstV = *dstV + *srcV - return dstV -} +// compensatedSizeAnnotator is a manifest.Annotator that annotates B-Tree +// nodes with the sum of the files' compensated sizes. Compensated sizes may +// change once a table's stats are loaded asynchronously, so its values are +// marked as cacheable only if a file's stats have been loaded. +var compensatedSizeAnnotator = manifest.SumAnnotator(func(f *fileMetadata) (uint64, bool) { + return compensatedSize(f), f.StatsValid() +}) // totalCompensatedSize computes the compensated size over a file metadata // iterator.
Note that this function is linear in the files available to the @@ -912,10 +886,6 @@ func calculateSizeAdjust(inProgressCompactions []compactionInfo) [numLevels]leve return sizeAdjust } -func levelCompensatedSize(lm manifest.LevelMetadata) uint64 { - return *lm.Annotation(compensatedSizeAnnotator{}).(*uint64) -} - func (p *compactionPickerByScore) calculateLevelScores( inProgressCompactions []compactionInfo, ) [numLevels]candidateLevelInfo { @@ -932,7 +902,7 @@ func (p *compactionPickerByScore) calculateLevelScores( } sizeAdjust := calculateSizeAdjust(inProgressCompactions) for level := 1; level < numLevels; level++ { - compensatedLevelSize := levelCompensatedSize(p.vers.Levels[level]) + sizeAdjust[level].compensated() + compensatedLevelSize := compensatedSizeAnnotator.LevelAnnotation(p.vers.Levels[level]) + sizeAdjust[level].compensated() scores[level].compensatedScore = float64(compensatedLevelSize) / float64(p.levelMaxBytes[level]) scores[level].uncompensatedScore = float64(p.vers.Levels[level].Size()+sizeAdjust[level].actual()) / float64(p.levelMaxBytes[level]) } @@ -1393,109 +1363,51 @@ func (p *compactionPickerByScore) addScoresToPickedCompactionMetrics( } } -// elisionOnlyAnnotator implements the manifest.Annotator interface, -// annotating B-Tree nodes with the *fileMetadata of a file meeting the -// obsolete keys criteria for an elision-only compaction within the subtree. -// If multiple files meet the criteria, it chooses whichever file has the -// lowest LargestSeqNum. The lowest LargestSeqNum file will be the first -// eligible for an elision-only compaction once snapshots less than or equal -// to its LargestSeqNum are closed. -type elisionOnlyAnnotator struct{} - -var _ manifest.Annotator = elisionOnlyAnnotator{} - -func (a elisionOnlyAnnotator) Zero(interface{}) interface{} { - return nil -} - -func (a elisionOnlyAnnotator) Accumulate(f *fileMetadata, dst interface{}) (interface{}, bool) { - if f.IsCompacting() { - return dst, true - } - if !f.StatsValid() { - return dst, false - } - // Bottommost files are large and not worthwhile to compact just - // to remove a few tombstones. Consider a file ineligible if its - // own range deletions delete less than 10% of its data and its - // deletion tombstones make up less than 10% of its entries. - // - // TODO(jackson): This does not account for duplicate user keys - // which may be collapsed. Ideally, we would have 'obsolete keys' - // statistics that would include tombstones, the keys that are - // dropped by tombstones and duplicated user keys. See #847. - // - // Note that tables that contain exclusively range keys (i.e. no point keys, - // `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded - // from elision-only compactions. - // TODO(travers): Consider an alternative heuristic for elision of range-keys. - if f.Stats.RangeDeletionsBytesEstimate*10 < f.Size && - f.Stats.NumDeletions*10 <= f.Stats.NumEntries { - return dst, true - } - if dst == nil { - return f, true - } else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum { - return f, true - } - return dst, true -} - -func (a elisionOnlyAnnotator) Merge(v interface{}, accum interface{}) interface{} { - if v == nil { - return accum - } - // If we haven't accumulated an eligible file yet, or f's LargestSeqNum is - // less than the accumulated file's, use f. 
- if accum == nil { - return v - } - f := v.(*fileMetadata) - accumV := accum.(*fileMetadata) - if accumV == nil || accumV.LargestSeqNum > f.LargestSeqNum { - return f - } - return accumV -} - -// markedForCompactionAnnotator implements the manifest.Annotator interface, -// annotating B-Tree nodes with the *fileMetadata of a file that is marked for -// compaction within the subtree. If multiple files meet the criteria, it -// chooses whichever file has the lowest LargestSeqNum. -type markedForCompactionAnnotator struct{} - -var _ manifest.Annotator = markedForCompactionAnnotator{} - -func (a markedForCompactionAnnotator) Zero(interface{}) interface{} { - return nil -} - -func (a markedForCompactionAnnotator) Accumulate( - f *fileMetadata, dst interface{}, -) (interface{}, bool) { - if !f.MarkedForCompaction { - // Not marked for compaction; return dst. - return dst, true - } - return markedMergeHelper(f, dst) -} - -func (a markedForCompactionAnnotator) Merge(v interface{}, accum interface{}) interface{} { - if v == nil { - return accum - } - accum, _ = markedMergeHelper(v.(*fileMetadata), accum) - return accum -} - -// REQUIRES: f is non-nil, and f.MarkedForCompaction=true. -func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) { - if dst == nil { - return f, true - } else if dstV := dst.(*fileMetadata); dstV.LargestSeqNum > f.LargestSeqNum { - return f, true - } - return dst, true +// elisionOnlyAnnotator is a manifest.Annotator that annotates B-Tree +// nodes with the *fileMetadata of a file meeting the obsolete keys criteria +// for an elision-only compaction within the subtree. If multiple files meet +// the criteria, it chooses whichever file has the lowest LargestSeqNum. The +// lowest LargestSeqNum file will be the first eligible for an elision-only +// compaction once snapshots less than or equal to its LargestSeqNum are closed. +var elisionOnlyAnnotator = &manifest.Annotator[*fileMetadata]{ + Aggregator: manifest.PickFileAggregator{ + Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) { + // Bottommost files are large and not worthwhile to compact just + // to remove a few tombstones. Consider a file eligible only if + // either its own range deletions delete at least 10% of its data or + // its deletion tombstones make up at least 10% of its entries. + // + // TODO(jackson): This does not account for duplicate user keys + // which may be collapsed. Ideally, we would have 'obsolete keys' + // statistics that would include tombstones, the keys that are + // dropped by tombstones and duplicated user keys. See #847. + // + // Note that tables that contain exclusively range keys (i.e. no point keys, + // `NumEntries` and `RangeDeletionsBytesEstimate` are both zero) are excluded + // from elision-only compactions. + // TODO(travers): Consider an alternative heuristic for elision of range-keys. + deletionCriteria := f.Stats.RangeDeletionsBytesEstimate*10 >= f.Size || f.Stats.NumDeletions*10 > f.Stats.NumEntries + return !f.IsCompacting() && f.StatsValid() && deletionCriteria, f.StatsValid() + }, + Compare: func(f1 *fileMetadata, f2 *fileMetadata) bool { + return f1.LargestSeqNum < f2.LargestSeqNum + }, + }, +} + +// markedForCompactionAnnotator is a manifest.Annotator that annotates B-Tree +// nodes with the *fileMetadata of a file that is marked for compaction +// within the subtree. If multiple files meet the criteria, it chooses +// whichever file has the lowest LargestSeqNum.
+var markedForCompactionAnnotator = &manifest.Annotator[*fileMetadata]{ + Aggregator: manifest.PickFileAggregator{ + Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) { + return f.MarkedForCompaction, true + }, + Compare: func(f1 *fileMetadata, f2 *fileMetadata) bool { + return f1.LargestSeqNum < f2.LargestSeqNum + }, + }, } // pickElisionOnlyCompaction looks for compactions of sstables in the @@ -1506,11 +1418,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction( if p.opts.private.disableElisionOnlyCompactions { return nil } - v := p.vers.Levels[numLevels-1].Annotation(elisionOnlyAnnotator{}) - if v == nil { + candidate := elisionOnlyAnnotator.LevelAnnotation(p.vers.Levels[numLevels-1]) + if candidate == nil { return nil } - candidate := v.(*fileMetadata) if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum { return nil } @@ -1542,12 +1453,11 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction( // the input level. func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) { for l := numLevels - 1; l >= 0; l-- { - v := p.vers.Levels[l].Annotation(markedForCompactionAnnotator{}) - if v == nil { + candidate := markedForCompactionAnnotator.LevelAnnotation(p.vers.Levels[l]) + if candidate == nil { // Try the next level. continue } - candidate := v.(*fileMetadata) if candidate.IsCompacting() { // Try the next level. continue diff --git a/compaction_picker_test.go b/compaction_picker_test.go index d305b1678f1..bd2edb32d8b 100644 --- a/compaction_picker_test.go +++ b/compaction_picker_test.go @@ -563,7 +563,7 @@ func TestCompactionPickerL0(t *testing.T) { } f.MarkedForCompaction = true picker.vers.Stats.MarkedForCompaction++ - picker.vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) + markedForCompactionAnnotator.InvalidateLevelAnnotation(picker.vers.Levels[l]) return fmt.Sprintf("marked L%d.%s", l, f.FileNum) } } diff --git a/compaction_test.go b/compaction_test.go index 4a7e95ddbc8..68be949158f 100644 --- a/compaction_test.go +++ b/compaction_test.go @@ -2505,7 +2505,7 @@ func TestMarkedForCompaction(t *testing.T) { } f.MarkedForCompaction = true vers.Stats.MarkedForCompaction++ - vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) + markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l]) return fmt.Sprintf("marked L%d.%s", l, f.FileNum) } } diff --git a/db.go b/db.go index d879f688256..bf36a3e88e1 100644 --- a/db.go +++ b/db.go @@ -1995,8 +1995,8 @@ func (d *DB) Metrics() *Metrics { metrics.private.optionsFileSize = d.optionsFileSize // TODO(jackson): Consider making these metrics optional. 
- metrics.Keys.RangeKeySetsCount = countRangeKeySetFragments(vers) - metrics.Keys.TombstoneCount = countTombstones(vers) + metrics.Keys.RangeKeySetsCount = rangeKeySetsAnnotator.MultiLevelAnnotation(vers.RangeKeyLevels[:]) + metrics.Keys.TombstoneCount = tombstonesAnnotator.MultiLevelAnnotation(vers.Levels[:]) d.mu.versions.logLock() metrics.private.manifestFileSize = uint64(d.mu.versions.manifest.Size()) @@ -2014,12 +2014,12 @@ func (d *DB) Metrics() *Metrics { metrics.Flush.NumInProgress = 1 } for i := 0; i < numLevels; i++ { - metrics.Levels[i].Additional.ValueBlocksSize = valueBlocksSizeForLevel(vers, i) - unknown, snappy, none, zstd := compressionTypesForLevel(vers, i) - metrics.Table.CompressedCountUnknown += int64(unknown) - metrics.Table.CompressedCountSnappy += int64(snappy) - metrics.Table.CompressedCountZstd += int64(zstd) - metrics.Table.CompressedCountNone += int64(none) + metrics.Levels[i].Additional.ValueBlocksSize = valueBlockSizeAnnotator.LevelAnnotation(vers.Levels[i]) + compressionTypes := compressionTypeAnnotator.LevelAnnotation(vers.Levels[i]) + metrics.Table.CompressedCountUnknown += int64(compressionTypes.unknown) + metrics.Table.CompressedCountSnappy += int64(compressionTypes.snappy) + metrics.Table.CompressedCountZstd += int64(compressionTypes.zstd) + metrics.Table.CompressedCountNone += int64(compressionTypes.none) } d.mu.Unlock() diff --git a/format_major_version.go b/format_major_version.go index b7c501f79dc..90ef9f0b6a2 100644 --- a/format_major_version.go +++ b/format_major_version.go @@ -517,7 +517,7 @@ func (d *DB) markFilesLocked(findFn findFilesFunc) error { // annotations will be out of date. Clear the compaction-picking // annotation, so that it's recomputed the next time the compaction // picker looks for a file marked for compaction. - vers.Levels[l].InvalidateAnnotation(markedForCompactionAnnotator{}) + markedForCompactionAnnotator.InvalidateLevelAnnotation(vers.Levels[l]) } // The 'marked-for-compaction' bit is persisted in the MANIFEST file diff --git a/internal/manifest/annotator.go b/internal/manifest/annotator.go new file mode 100644 index 00000000000..6c42be3b8dd --- /dev/null +++ b/internal/manifest/annotator.go @@ -0,0 +1,327 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "sort" + + "github.com/cockroachdb/pebble/internal/base" +) + +// The Annotator type defined below is used by other packages to lazily +// compute a value over a B-Tree. Each node of the B-Tree stores one +// `annotation` per annotator, containing the result of the computation over +// the node's subtree. +// +// An annotation is marked as valid if it's current with the current subtree +// state. Annotations are marked as invalid whenever a node will be mutated +// (in mut). Annotators may also return `false` from `Accumulate` to signal +// that a computation for a file is not stable and may change in the future. +// Annotations that include these unstable values are also marked as invalid +// on the node, ensuring that future queries for the annotation will recompute +// the value. + +// An Annotator defines a computation over a level's FileMetadata. If the +// computation is stable and uses inputs that are fixed for the lifetime of +// a FileMetadata, the LevelMetadata's internal data structures are annotated +// with the intermediary computations. 
This allows the value to be +// computed incrementally as edits are applied to a level. +type Annotator[T any] struct { + Aggregator AnnotationAggregator[T] +} + +// An AnnotationAggregator defines how an annotation should be accumulated +// from a single FileMetadata and merged with other annotated values. +type AnnotationAggregator[T any] interface { + // Zero returns the zero value of an annotation. This value is returned + // when a LevelMetadata is empty. + Zero() T + + // Accumulate computes the annotation for a single file in a level's + // metadata. It also returns a bool flag indicating whether or not the + // value is stable and okay to cache as an annotation, which must be false + // if the file's value may change over the life of the file. + Accumulate(f *FileMetadata) (v T, cacheOK bool) + + // Merge combines two annotated values v1 and v2, returning the result. + Merge(v1 T, v2 T) T +} + +type annotation struct { + // annotator is a pointer to the Annotator that computed this annotation. + // NB: This is untyped to allow AnnotationAggregator to use Go generics, + // since annotations are stored in a slice on each node and a single + // slice cannot contain elements with different type parameters. + annotator interface{} + // vptr is a pointer to the annotation value, the output of either + // the aggregator's Accumulate or Merge. + // NB: This is untyped for the same reason as annotator above. We store + // a pointer instead of the value itself in order to eliminate a heap + // allocation every time an annotation is computed. + vptr interface{} + // valid indicates whether future reads of the annotation may use the + // value as-is. If false, the value will be zeroed and recalculated. + valid bool +} + +// findAnnotation finds this Annotator's annotation on a node, creating +// one if it doesn't already exist. +func (a *Annotator[T]) findAnnotation(n *node) *annotation { + for i := range n.annot { + if n.annot[i].annotator == a { + return &n.annot[i] + } + } + + // This node has never been annotated by a. Create a new annotation. + n.annot = append(n.annot, annotation{ + annotator: a, + vptr: new(T), + }) + return &n.annot[len(n.annot)-1] +} + +// nodeRangeAnnotation computes this annotator's annotation of a node across +// all files in the range defined by lowerBound and upperBound. The second +// return value indicates whether the annotation is stable and thus cacheable. +func (a *Annotator[T]) nodeRangeAnnotation( + n *node, + cmp base.Compare, + // lowerBound and upperBound may be nil to indicate no lower or upper bound. + lowerBound []byte, + // upperBound is a UserKeyBoundary that may be inclusive or exclusive. + upperBound *base.UserKeyBoundary, +) (v T, cacheOK bool) { + annot := a.findAnnotation(n) + // If the annotation is already marked as valid and this node's + // subtree is fully within the bounds, we can return it without + // recomputing anything. + if lowerBound == nil && upperBound == nil && annot.valid { + return *annot.vptr.(*T), true + } + + aggregated := a.Aggregator.Zero() + valid := true + + // We will accumulate annotations from each item in the end-exclusive + // range [leftItem, rightItem). + leftItem, rightItem := 0, int(n.count) + if lowerBound != nil { + // leftItem is the index of the first item that overlaps the lower bound.
+ leftItem = sort.Search(int(n.count), func(i int) bool { + return cmp(lowerBound, n.items[i].Largest.UserKey) <= 0 + }) + } + if upperBound != nil { + // rightItem is the index of the first item that does not overlap the + // upper bound. + rightItem = sort.Search(int(n.count), func(i int) bool { + return !upperBound.IsUpperBoundFor(cmp, n.items[i].Smallest.UserKey) + }) + } + + // Accumulate annotations from every item that overlaps the bounds. + for i := leftItem; i < rightItem; i++ { + v, ok := a.Aggregator.Accumulate(n.items[i]) + aggregated = a.Aggregator.Merge(v, aggregated) + valid = valid && ok + } + + if !n.leaf { + // We will accumulate annotations from each child in the end-inclusive + // range [leftChild, rightChild]. + leftChild, rightChild := leftItem, rightItem + // If the lower bound overlaps with the child at leftItem, there is no + // need to accumulate annotations from the child to its left. + if leftItem < int(n.count) && cmp(lowerBound, n.items[leftItem].Smallest.UserKey) >= 0 { + leftChild++ + } + // If the upper bound spans beyond the child at rightItem, we must also + // accumulate annotations from the child to its right. + if rightItem < int(n.count) && upperBound.IsUpperBoundFor(cmp, n.items[rightItem].Largest.UserKey) { + rightChild++ + } + + for i := leftChild; i <= rightChild; i++ { + newLowerBound, newUpperBound := lowerBound, upperBound + // If this child is to the right of leftItem, then its entire + // subtree is within the lower bound. + if i > leftItem { + newLowerBound = nil + } + // If this child is to the left of rightItem, then its entire + // subtree is within the upper bound. + if i < rightItem { + newUpperBound = nil + } + + v, ok := a.nodeRangeAnnotation( + n.children[i], + cmp, + newLowerBound, + newUpperBound, + ) + aggregated = a.Aggregator.Merge(v, aggregated) + valid = valid && ok + } + } + + // Update this node's cached annotation only if we accumulated from + // this node's entire subtree. + if lowerBound == nil && upperBound == nil { + *annot.vptr.(*T) = aggregated + annot.valid = valid + } + + return aggregated, valid +} + +// invalidateNodeAnnotation removes any existing cached annotations for this +// annotator from a node's subtree. +func (a *Annotator[T]) invalidateNodeAnnotation(n *node) { + annot := a.findAnnotation(n) + annot.valid = false + if !n.leaf { + for i := int16(0); i <= n.count; i++ { + a.invalidateNodeAnnotation(n.children[i]) + } + } +} + +// LevelAnnotation calculates the annotation defined by this Annotator for all +// files in the given LevelMetadata. A pointer to the Annotator is used as the +// key for pre-calculated values, so the same Annotator must be used to avoid +// duplicate computation. LevelAnnotation must not be called concurrently, and +// in practice this is achieved by requiring callers to hold DB.mu. +func (a *Annotator[T]) LevelAnnotation(lm LevelMetadata) T { + if lm.Empty() { + return a.Aggregator.Zero() + } + + v, _ := a.nodeRangeAnnotation(lm.tree.root, lm.tree.cmp, nil, nil) + return v +} + +// MultiLevelAnnotation calculates the annotation defined by this Annotator for +// all files across the given levels. A pointer to the Annotator is used as the +// key for pre-calculated values, so the same Annotator must be used to avoid +// duplicate computation. MultiLevelAnnotation must not be called concurrently, +// and in practice this is achieved by requiring callers to hold DB.mu.
+func (a *Annotator[T]) MultiLevelAnnotation(lms []LevelMetadata) T { + aggregated := a.Aggregator.Zero() + for l := 0; l < len(lms); l++ { + if !lms[l].Empty() { + v := a.LevelAnnotation(lms[l]) + aggregated = a.Aggregator.Merge(v, aggregated) + } + } + return aggregated +} + +// LevelRangeAnnotation calculates the annotation defined by this Annotator for +// the files within the given LevelMetadata that overlap the given bounds +// (whose end may be inclusive or exclusive). A pointer to the Annotator is used +// as the key for pre-calculated values, so the same Annotator must be used to +// avoid duplicate computation. LevelRangeAnnotation must not be called +// concurrently, and in practice this is achieved by requiring callers to hold +// DB.mu. +func (a *Annotator[T]) LevelRangeAnnotation(lm LevelMetadata, bounds *base.UserKeyBounds) T { + if lm.Empty() { + return a.Aggregator.Zero() + } + + v, _ := a.nodeRangeAnnotation(lm.tree.root, lm.tree.cmp, bounds.Start, &bounds.End) + return v +} + +// InvalidateLevelAnnotation clears any cached annotations defined by this +// Annotator. A pointer to the Annotator is used as the key for pre-calculated +// values, so the same Annotator must be used to clear the appropriate cached +// annotation. InvalidateLevelAnnotation must not be called concurrently, and +// in practice this is achieved by requiring callers to hold DB.mu. +func (a *Annotator[T]) InvalidateLevelAnnotation(lm LevelMetadata) { + if lm.Empty() { + return + } + a.invalidateNodeAnnotation(lm.tree.root) +} + +// sumAggregator defines an Aggregator which sums together a uint64 value +// across files. +type sumAggregator struct { + accumulateFunc func(f *FileMetadata) (v uint64, cacheOK bool) +} + +func (sa sumAggregator) Zero() uint64 { + return 0 +} + +func (sa sumAggregator) Accumulate(f *FileMetadata) (v uint64, cacheOK bool) { + return sa.accumulateFunc(f) +} + +func (sa sumAggregator) Merge(val1 uint64, val2 uint64) uint64 { + return val1 + val2 +} + +// SumAnnotator takes a function that computes a uint64 value from a single +// FileMetadata and returns an Annotator that sums together the values across +// files. +func SumAnnotator(accumulate func(f *FileMetadata) (v uint64, cacheOK bool)) *Annotator[uint64] { + return &Annotator[uint64]{ + Aggregator: sumAggregator{ + accumulateFunc: accumulate, + }, + } +} + +// NumFilesAnnotator is an Annotator which computes an annotation value +// equal to the number of files included in the annotation. In particular, it +// can be used to efficiently calculate the number of files in a given key +// range using range annotations. +var NumFilesAnnotator = SumAnnotator(func(f *FileMetadata) (uint64, bool) { + return 1, true +}) + +// PickFileAggregator implements the AnnotationAggregator interface. It defines +// an aggregator that picks a single file from a set of eligible files. +type PickFileAggregator struct { + // Filter takes a FileMetadata and returns whether it is eligible to be + // picked by this PickFileAggregator. The second return value indicates + // whether this eligibility is stable and thus cacheable. + Filter func(f *FileMetadata) (eligible bool, cacheOK bool) + // Compare compares two instances of FileMetadata and returns true if + // the first one should be picked over the second one. It may assume + // that both arguments are non-nil. + Compare func(f1 *FileMetadata, f2 *FileMetadata) bool +} + +// Zero implements AnnotationAggregator.Zero, returning nil as the zero value.
+func (fa PickFileAggregator) Zero() *FileMetadata { + return nil +} + +// Accumulate implements AnnotationAggregator.Accumulate, accumulating a single +// file as long as it is eligible to be picked. +func (fa PickFileAggregator) Accumulate(f *FileMetadata) (v *FileMetadata, cacheOK bool) { + eligible, ok := fa.Filter(f) + if eligible { + return f, ok + } + return nil, ok +} + +// Merge implements AnnotationAggregator.Merge by picking a single file based +// on the output of PickFileAggregator.Compare. +func (fa PickFileAggregator) Merge(f1 *FileMetadata, f2 *FileMetadata) *FileMetadata { + if f1 == nil { + return f2 + } else if f2 == nil { + return f1 + } + if fa.Compare(f1, f2) { + return f1 + } + return f2 +} diff --git a/internal/manifest/annotator_test.go b/internal/manifest/annotator_test.go new file mode 100644 index 00000000000..c4115de3e2e --- /dev/null +++ b/internal/manifest/annotator_test.go @@ -0,0 +1,154 @@ +// Copyright 2024 The LevelDB-Go and Pebble Authors. All rights reserved. Use +// of this source code is governed by a BSD-style license that can be found in +// the LICENSE file. + +package manifest + +import ( + "math/rand" + "testing" + + "github.com/cockroachdb/pebble/internal/base" + "github.com/stretchr/testify/require" +) + +// Creates a version with numFiles files in level 6. +func makeTestVersion(numFiles int) (*Version, []*FileMetadata) { + files := make([]*FileMetadata, numFiles) + for i := 0; i < numFiles; i++ { + // Each file spans 10 keys, e.g. [0->9], [10->19], etc. + files[i] = (&FileMetadata{}).ExtendPointKeyBounds( + base.DefaultComparer.Compare, key(i*10), key(i*10+9), + ) + files[i].InitPhysicalBacking() + } + + var levelFiles [7][]*FileMetadata + levelFiles[6] = files + + v := NewVersion(base.DefaultComparer, 0, levelFiles) + return v, files +} + +func TestNumFilesAnnotator(t *testing.T) { + const count = 1000 + v, _ := makeTestVersion(0) + + for i := 1; i <= count; i++ { + v.Levels[6].tree.Insert(newItem(key(i))) + numFiles := NumFilesAnnotator.LevelAnnotation(v.Levels[6]) + require.EqualValues(t, i, numFiles) + } +} + +func BenchmarkNumFilesAnnotator(b *testing.B) { + v, _ := makeTestVersion(0) + for i := 1; i <= b.N; i++ { + v.Levels[6].tree.Insert(newItem(key(i))) + numFiles := NumFilesAnnotator.LevelAnnotation(v.Levels[6]) + require.EqualValues(b, uint64(i), numFiles) + } +} + +func bounds(i int, j int, exclusive bool) *base.UserKeyBounds { + b := base.UserKeyBoundsEndExclusiveIf(key(i).UserKey, key(j).UserKey, exclusive) + return &b +} + +func randomBounds(rng *rand.Rand, count int) *base.UserKeyBounds { + first := rng.Intn(count) + second := rng.Intn(count) + exclusive := rng.Intn(2) == 0 + return bounds(min(first, second), max(first, second), exclusive) +} + +func requireMatchOverlaps(t *testing.T, v *Version, bounds *base.UserKeyBounds) { + overlaps := v.Overlaps(6, *bounds) + numFiles := NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], bounds) + require.EqualValues(t, overlaps.length, numFiles) +} + +func TestNumFilesRangeAnnotationEmptyRanges(t *testing.T) { + const count = 5_000 + v, files := makeTestVersion(count) + + // Delete files containing key ranges [0, 999] and [24_000, 25_999]. + for i := 0; i < 100; i++ { + v.Levels[6].tree.Delete(files[i]) + } + for i := 2400; i < 2600; i++ { + v.Levels[6].tree.Delete(files[i]) + } + + // Ranges that are completely empty. 
+ requireMatchOverlaps(t, v, bounds(1, 999, false)) + requireMatchOverlaps(t, v, bounds(0, 1000, true)) + requireMatchOverlaps(t, v, bounds(50_000, 60_000, false)) + requireMatchOverlaps(t, v, bounds(24_500, 25_500, false)) + requireMatchOverlaps(t, v, bounds(24_000, 26_000, true)) + + // Partial overlaps with empty ranges. + requireMatchOverlaps(t, v, bounds(0, 1000, false)) + requireMatchOverlaps(t, v, bounds(20, 1001, true)) + requireMatchOverlaps(t, v, bounds(20, 1010, true)) + requireMatchOverlaps(t, v, bounds(23_000, 27_000, true)) + requireMatchOverlaps(t, v, bounds(25_000, 40_000, false)) + requireMatchOverlaps(t, v, bounds(25_500, 26_001, true)) + + // Ranges that span only a single table. + requireMatchOverlaps(t, v, bounds(45_000, 45_000, true)) + requireMatchOverlaps(t, v, bounds(30_000, 30_001, true)) + requireMatchOverlaps(t, v, bounds(23_000, 23_000, false)) +} + +func TestNumFilesRangeAnnotationRandomized(t *testing.T) { + const count = 10_000 + const numIterations = 100_000 + + v, _ := makeTestVersion(count) + + rng := rand.New(rand.NewSource(int64(0))) + for i := 0; i < numIterations; i++ { + requireMatchOverlaps(t, v, randomBounds(rng, count*11)) + } +} + +func BenchmarkNumFilesRangeAnnotation(b *testing.B) { + const count = 100_000 + v, files := makeTestVersion(count) + + rng := rand.New(rand.NewSource(int64(0))) + b.Run("annotator", func(b *testing.B) { + for i := 0; i < b.N; i++ { + b := randomBounds(rng, count*11) + // Randomly delete and reinsert a file to verify that range + // annotations are still fast despite small mutations. + toDelete := rng.Intn(count) + v.Levels[6].tree.Delete(files[toDelete]) + + NumFilesAnnotator.LevelRangeAnnotation(v.Levels[6], b) + + v.Levels[6].tree.Insert(files[toDelete]) + } + }) + + // Also benchmark an equivalent aggregation using version.Overlaps to show + // the difference in performance. + b.Run("overlaps", func(b *testing.B) { + for i := 0; i < b.N; i++ { + b := randomBounds(rng, count*11) + toDelete := rng.Intn(count) + v.Levels[6].tree.Delete(files[toDelete]) + + overlaps := v.Overlaps(6, *b) + iter := overlaps.Iter() + numFiles := 0 + for f := iter.First(); f != nil; f = iter.Next() { + numFiles++ + } + + v.Levels[6].tree.Insert(files[toDelete]) + } + }) + +} diff --git a/internal/manifest/btree.go b/internal/manifest/btree.go index b098a179bfe..aba5c10e533 100644 --- a/internal/manifest/btree.go +++ b/internal/manifest/btree.go @@ -13,48 +13,10 @@ import ( "unsafe" "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/invariants" ) -// The Annotator type defined below is used by other packages to lazily -// compute a value over a B-Tree. Each node of the B-Tree stores one -// `annotation` per annotator, containing the result of the computation over -// the node's subtree. -// -// An annotation is marked as valid if it's current with the current subtree -// state. Annotations are marked as invalid whenever a node will be mutated -// (in mut). Annotators may also return `false` from `Accumulate` to signal -// that a computation for a file is not stable and may change in the future. -// Annotations that include these unstable values are also marked as invalid -// on the node, ensuring that future queries for the annotation will recompute -// the value. - -// An Annotator defines a computation over a level's FileMetadata.
If the -// computation is stable and uses inputs that are fixed for the lifetime of -// a FileMetadata, the LevelMetadata's internal data structures are annotated -// with the intermediary computations. This allows the computation to be -// computed incrementally as edits are applied to a level. -type Annotator interface { - // Zero returns the zero value of an annotation. This value is returned - // when a LevelMetadata is empty. The dst argument, if non-nil, is an - // obsolete value previously returned by this Annotator and may be - // overwritten and reused to avoid a memory allocation. - Zero(dst interface{}) (v interface{}) - - // Accumulate computes the annotation for a single file in a level's - // metadata. It merges the file's value into dst and returns a bool flag - // indicating whether or not the value is stable and okay to cache as an - // annotation. If the file's value may change over the life of the file, - // the annotator must return false. - // - // Implementations may modify dst and return it to avoid an allocation. - Accumulate(m *FileMetadata, dst interface{}) (v interface{}, cacheOK bool) - - // Merge combines two values src and dst, returning the result. - // Implementations may modify dst and return it to avoid an allocation. - Merge(src interface{}, dst interface{}) interface{} -} - type btreeCmp func(*FileMetadata, *FileMetadata) int func btreeCmpSeqNum(a, b *FileMetadata) int { @@ -91,16 +53,6 @@ const ( minItems = degree - 1 ) -type annotation struct { - annotator Annotator - // v is an annotation value, the output of either - // annotator.Value or annotator.Merge. - v interface{} - // valid indicates whether future reads of the annotation may use v as-is. - // If false, v will be zeroed and recalculated. - valid bool -} - type leafNode struct { ref atomic.Int32 count int16 @@ -349,14 +301,14 @@ func (n *node) popFront() (*FileMetadata, *node) { // // This function is for use only as a helper function for internal B-Tree code. // Clients should not invoke it directly. -func (n *node) find(cmp btreeCmp, item *FileMetadata) (index int, found bool) { +func (n *node) find(bcmp btreeCmp, item *FileMetadata) (index int, found bool) { // Logic copied from sort.Search. Inlining this gave // an 11% speedup on BenchmarkBTreeDeleteInsert. i, j := 0, int(n.count) for i < j { h := int(uint(i+j) >> 1) // avoid overflow when computing h // i ≤ h < j - v := cmp(item, n.items[h]) + v := bcmp(item, n.items[h]) if v == 0 { return h, true } else if v > 0 { @@ -435,8 +387,8 @@ func (n *node) split(i int) (*FileMetadata, *node) { // Insert inserts a item into the subtree rooted at this node, making sure no // nodes in the subtree exceed maxItems items. -func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { - i, found := n.find(cmp, item) +func (n *node) Insert(bcmp btreeCmp, item *FileMetadata) error { + i, found := n.find(bcmp, item) if found { // cmp provides a total ordering of the files within a level. 
// If we're inserting a metadata that's equal to an existing item @@ -453,7 +405,7 @@ func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { splitLa, splitNode := mut(&n.children[i]).split(maxItems / 2) n.insertAt(i, splitLa, splitNode) - switch cmp := cmp(item, n.items[i]); { + switch cmp := bcmp(item, n.items[i]); { case cmp < 0: // no change, we want first split node case cmp > 0: @@ -467,7 +419,7 @@ func (n *node) Insert(cmp btreeCmp, item *FileMetadata) error { } } - err := mut(&n.children[i]).Insert(cmp, item) + err := mut(&n.children[i]).Insert(bcmp, item) if err == nil { n.subtreeCount++ } @@ -496,8 +448,8 @@ func (n *node) removeMax() *FileMetadata { // Remove removes a item from the subtree rooted at this node. Returns // the item that was removed or nil if no matching item was found. -func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { - i, found := n.find(cmp, item) +func (n *node) Remove(bcmp btreeCmp, item *FileMetadata) (out *FileMetadata) { + i, found := n.find(bcmp, item) if n.leaf { if found { out, _ = n.removeAt(i) @@ -509,7 +461,7 @@ func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { if n.children[i].count <= minItems { // Child not large enough to remove from. n.rebalanceOrMerge(i) - return n.Remove(cmp, item) + return n.Remove(bcmp, item) } child := mut(&n.children[i]) if found { @@ -520,7 +472,7 @@ func (n *node) Remove(cmp btreeCmp, item *FileMetadata) (out *FileMetadata) { return out } // File is not in this node and child is large enough to remove from. - out = child.Remove(cmp, item) + out = child.Remove(bcmp, item) if out != nil { n.subtreeCount-- } @@ -659,78 +611,6 @@ func (n *node) rebalanceOrMerge(i int) { } } -// InvalidateAnnotation removes any existing cached annotations for the provided -// annotator from this node's subtree. -func (n *node) InvalidateAnnotation(a Annotator) { - // Find this annotator's annotation on this node. - var annot *annotation - for i := range n.annot { - if n.annot[i].annotator == a { - annot = &n.annot[i] - } - } - - if annot != nil && annot.valid { - annot.valid = false - annot.v = a.Zero(annot.v) - } - if !n.leaf { - for i := int16(0); i <= n.count; i++ { - n.children[i].InvalidateAnnotation(a) - } - } -} - -// Annotation retrieves, computing if not already computed, the provided -// annotator's annotation of this node. The second return value indicates -// whether the future reads of this annotation may use the first return value -// as-is. If false, the annotation is not stable and may change on a subsequent -// computation. -func (n *node) Annotation(a Annotator) (interface{}, bool) { - // Find this annotator's annotation on this node. - var annot *annotation - for i := range n.annot { - if n.annot[i].annotator == a { - annot = &n.annot[i] - } - } - - // If it exists and is marked as valid, we can return it without - // recomputing anything. - if annot != nil && annot.valid { - return annot.v, true - } - - if annot == nil { - // This is n's first time being annotated by a. - // Create a new zeroed annotation. - n.annot = append(n.annot, annotation{ - annotator: a, - v: a.Zero(nil), - }) - annot = &n.annot[len(n.annot)-1] - } else { - // There's an existing annotation that must be recomputed. - // Zero its value. 
- annot.v = a.Zero(annot.v) - } - - annot.valid = true - for i := int16(0); i <= n.count; i++ { - if !n.leaf { - v, ok := n.children[i].Annotation(a) - annot.v = a.Merge(v, annot.v) - annot.valid = annot.valid && ok - } - if i < n.count { - v, ok := a.Accumulate(n.items[i], annot.v) - annot.v = v - annot.valid = annot.valid && ok - } - } - return annot.v, annot.valid -} - func (n *node) verifyInvariants() { recomputedSubtreeCount := int(n.count) if !n.leaf { @@ -756,7 +636,8 @@ func (n *node) verifyInvariants() { // goroutines, but Read operations are. type btree struct { root *node - cmp btreeCmp + cmp base.Compare + bcmp btreeCmp } // Release dereferences and clears the root node of the btree, removing all @@ -799,7 +680,7 @@ func (t *btree) Delete(item *FileMetadata) (obsolete bool) { if t.root == nil || t.root.count == 0 { return false } - if out := mut(&t.root).Remove(t.cmp, item); out != nil { + if out := mut(&t.root).Remove(t.bcmp, item); out != nil { obsolete = out.FileBacking.Unref() == 0 } if invariants.Enabled { @@ -833,7 +714,7 @@ func (t *btree) Insert(item *FileMetadata) error { t.root = newRoot } item.FileBacking.Ref() - err := mut(&t.root).Insert(t.cmp, item) + err := mut(&t.root).Insert(t.bcmp, item) if invariants.Enabled { t.root.verifyInvariants() } @@ -844,7 +725,7 @@ func (t *btree) Insert(item *FileMetadata) error { // iterator after modifications are made to the tree. If modifications are made, // create a new iterator. func (t *btree) Iter() iterator { - return iterator{r: t.root, pos: -1, cmp: t.cmp} + return iterator{r: t.root, pos: -1, cmp: t.bcmp} } // Count returns the number of files contained within the B-Tree. diff --git a/internal/manifest/btree_test.go b/internal/manifest/btree_test.go index cce22a2563b..c75f33004ea 100644 --- a/internal/manifest/btree_test.go +++ b/internal/manifest/btree_test.go @@ -111,7 +111,7 @@ func (n *node) verifyCountAllowed(t *testing.T, root bool) { } func (t *btree) isSorted(tt *testing.T) { - t.root.isSorted(tt, t.cmp) + t.root.isSorted(tt, t.bcmp) } func (n *node) isSorted(t *testing.T, cmp func(*FileMetadata, *FileMetadata) int) { @@ -145,10 +145,10 @@ func (n *node) recurse(f func(child *node, pos int16)) { ////////////////////////////////////////// func key(i int) InternalKey { - if i < 0 || i > 99999 { + if i < 0 || i > 9999999 { panic("key out of bounds") } - return base.MakeInternalKey([]byte(fmt.Sprintf("%05d", i)), 0, base.InternalKeyKindSet) + return base.MakeInternalKey([]byte(fmt.Sprintf("%07d", i)), 0, base.InternalKeyKindSet) } func keyWithMemo(i int, memo map[int]InternalKey) InternalKey { @@ -209,7 +209,7 @@ func checkIter(t *testing.T, it iterator, start, end int, keyMemo map[int]Intern // TestBTree tests basic btree operations. 
func TestBTree(t *testing.T) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp keyMemo := make(map[int]InternalKey) // With degree == 16 (max-items/node == 31) we need 513 items in order for @@ -269,7 +269,7 @@ func TestIterClone(t *testing.T) { const count = 65536 var tr btree - tr.cmp = cmp + tr.bcmp = cmp keyMemo := make(map[int]InternalKey) for i := 0; i < count; i++ { @@ -295,7 +295,7 @@ func TestIterClone(t *testing.T) { func TestIterCmpEdgeCases(t *testing.T) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp t.Run("empty", func(t *testing.T) { a := tr.Iter() b := tr.Iter() @@ -330,7 +330,7 @@ func TestIterCmpRand(t *testing.T) { const iterCount = 1000 var tr btree - tr.cmp = cmp + tr.bcmp = cmp for i := 0; i < itemCount; i++ { require.NoError(t, tr.Insert(newItem(key(i)))) } @@ -365,7 +365,7 @@ func TestBTreeSeek(t *testing.T) { const count = 513 var tr btree - tr.cmp = cmp + tr.bcmp = cmp for i := 0; i < count; i++ { require.NoError(t, tr.Insert(newItem(key(i*2)))) } @@ -404,7 +404,7 @@ func TestBTreeSeek(t *testing.T) { func TestBTreeInsertDuplicateError(t *testing.T) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp require.NoError(t, tr.Insert(newItem(key(1)))) require.NoError(t, tr.Insert(newItem(key(2)))) require.NoError(t, tr.Insert(newItem(key(3)))) @@ -447,7 +447,7 @@ func TestBTreeCloneConcurrentOperations(t *testing.T) { wg.Add(1) var tr btree - tr.cmp = cmp + tr.bcmp = cmp go populate(&tr, 0) wg.Wait() close(treeC) @@ -516,7 +516,7 @@ func TestIterStack(t *testing.T) { func TestIterEndSentinel(t *testing.T) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp require.NoError(t, tr.Insert(newItem(key(1)))) require.NoError(t, tr.Insert(newItem(key(2)))) require.NoError(t, tr.Insert(newItem(key(3)))) @@ -534,56 +534,6 @@ func TestIterEndSentinel(t *testing.T) { require.True(t, iter.iter.valid()) } -type orderStatistic struct{} - -func (o orderStatistic) Zero(dst interface{}) interface{} { - if dst == nil { - return new(int) - } - v := dst.(*int) - *v = 0 - return v -} - -func (o orderStatistic) Accumulate(meta *FileMetadata, dst interface{}) (interface{}, bool) { - v := dst.(*int) - *v++ - return v, true -} - -func (o orderStatistic) Merge(src interface{}, dst interface{}) interface{} { - srcv := src.(*int) - dstv := dst.(*int) - *dstv = *dstv + *srcv - return dstv -} - -func TestAnnotationOrderStatistic(t *testing.T) { - const count = 1000 - ann := orderStatistic{} - - var tr btree - tr.cmp = cmp - for i := 1; i <= count; i++ { - require.NoError(t, tr.Insert(newItem(key(i)))) - - v, ok := tr.root.Annotation(ann) - require.True(t, ok) - vtyped := v.(*int) - require.Equal(t, i, *vtyped) - } - - v, ok := tr.root.Annotation(ann) - require.True(t, ok) - vtyped := v.(*int) - require.Equal(t, count, *vtyped) - - v, ok = tr.root.Annotation(ann) - vtyped = v.(*int) - require.True(t, ok) - require.Equal(t, count, *vtyped) -} - // TestRandomizedBTree tests a random set of Insert, Delete and iteration // operations, checking for equivalence with a map of filenums. func TestRandomizedBTree(t *testing.T) { @@ -610,7 +560,7 @@ func TestRandomizedBTree(t *testing.T) { // Use a btree comparator that sorts by file number to make it easier to // prevent duplicates or overlaps. 
tree := btree{ - cmp: func(a *FileMetadata, b *FileMetadata) int { + bcmp: func(a *FileMetadata, b *FileMetadata) int { return stdcmp.Compare(a.FileNum, b.FileNum) }, } @@ -734,7 +684,7 @@ func BenchmarkBTreeInsert(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; { var tr btree - tr.cmp = cmp + tr.bcmp = cmp for _, item := range insertP { if err := tr.Insert(item); err != nil { b.Fatal(err) @@ -756,7 +706,7 @@ func BenchmarkBTreeDelete(b *testing.B) { for i := 0; i < b.N; { b.StopTimer() var tr btree - tr.cmp = cmp + tr.bcmp = cmp for _, item := range insertP { if err := tr.Insert(item); err != nil { b.Fatal(err) @@ -782,7 +732,7 @@ func BenchmarkBTreeDeleteInsert(b *testing.B) { forBenchmarkSizes(b, func(b *testing.B, count int) { insertP := perm(count) var tr btree - tr.cmp = cmp + tr.bcmp = cmp for _, item := range insertP { if err := tr.Insert(item); err != nil { b.Fatal(err) @@ -805,7 +755,7 @@ func BenchmarkBTreeDeleteInsertCloneOnce(b *testing.B) { forBenchmarkSizes(b, func(b *testing.B, count int) { insertP := perm(count) var tr btree - tr.cmp = cmp + tr.bcmp = cmp for _, item := range insertP { if err := tr.Insert(item); err != nil { b.Fatal(err) @@ -831,8 +781,8 @@ func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { forBenchmarkSizes(b, func(b *testing.B, count int) { insertP := perm(count) var tr, trRelease btree - tr.cmp = cmp - trRelease.cmp = cmp + tr.bcmp = cmp + trRelease.bcmp = cmp for _, item := range insertP { if err := tr.Insert(item); err != nil { b.Fatal(err) @@ -859,7 +809,7 @@ func BenchmarkBTreeDeleteInsertCloneEachTime(b *testing.B) { // BenchmarkBTreeIter measures the cost of creating a btree iterator. func BenchmarkBTreeIter(b *testing.B) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp for i := 0; i < b.N; i++ { it := tr.Iter() it.first() @@ -873,7 +823,7 @@ func BenchmarkBTreeIterSeekGE(b *testing.B) { forBenchmarkSizes(b, func(b *testing.B, count int) { var keys []InternalKey var tr btree - tr.cmp = cmp + tr.bcmp = cmp for i := 0; i < count; i++ { s := key(i) @@ -907,7 +857,7 @@ func BenchmarkBTreeIterSeekLT(b *testing.B) { forBenchmarkSizes(b, func(b *testing.B, count int) { var keys []InternalKey var tr btree - tr.cmp = cmp + tr.bcmp = cmp for i := 0; i < count; i++ { k := key(i) @@ -946,7 +896,7 @@ func BenchmarkBTreeIterSeekLT(b *testing.B) { // next item in the tree. func BenchmarkBTreeIterNext(b *testing.B) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp const count = 8 << 10 for i := 0; i < count; i++ { @@ -970,7 +920,7 @@ func BenchmarkBTreeIterNext(b *testing.B) { // previous item in the tree. func BenchmarkBTreeIterPrev(b *testing.B) { var tr btree - tr.cmp = cmp + tr.bcmp = cmp const count = 8 << 10 for i := 0; i < count; i++ { diff --git a/internal/manifest/l0_sublevels.go b/internal/manifest/l0_sublevels.go index 04cef07ff25..3185b95b3da 100644 --- a/internal/manifest/l0_sublevels.go +++ b/internal/manifest/l0_sublevels.go @@ -337,7 +337,7 @@ func NewL0Sublevels( // Construct a parallel slice of sublevel B-Trees. // TODO(jackson): Consolidate and only use the B-Trees. for _, sublevelFiles := range s.levelFiles { - tr, ls := makeBTree(btreeCmpSmallestKey(cmp), sublevelFiles) + tr, ls := makeBTree(cmp, btreeCmpSmallestKey(cmp), sublevelFiles) s.Levels = append(s.Levels, ls) tr.Release() } @@ -630,7 +630,7 @@ func (s *L0Sublevels) AddL0Files( // Construct a parallel slice of sublevel B-Trees. // TODO(jackson): Consolidate and only use the B-Trees. 
for _, sublevel := range updatedSublevels { - tr, ls := makeBTree(btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) + tr, ls := makeBTree(newVal.cmp, btreeCmpSmallestKey(newVal.cmp), newVal.levelFiles[sublevel]) if sublevel == len(newVal.Levels) { newVal.Levels = append(newVal.Levels, ls) } else { diff --git a/internal/manifest/level_metadata.go b/internal/manifest/level_metadata.go index c3008e809b5..abe4a13cb36 100644 --- a/internal/manifest/level_metadata.go +++ b/internal/manifest/level_metadata.go @@ -49,7 +49,7 @@ func MakeLevelMetadata(cmp Compare, level int, files []*FileMetadata) LevelMetad } var lm LevelMetadata lm.level = level - lm.tree, _ = makeBTree(bcmp, files) + lm.tree, _ = makeBTree(cmp, bcmp, files) for _, f := range files { lm.totalSize += f.Size if f.Virtual { @@ -60,9 +60,10 @@ func MakeLevelMetadata(cmp Compare, level int, files []*FileMetadata) LevelMetad return lm } -func makeBTree(cmp btreeCmp, files []*FileMetadata) (btree, LevelSlice) { +func makeBTree(cmp base.Compare, bcmp btreeCmp, files []*FileMetadata) (btree, LevelSlice) { var t btree t.cmp = cmp + t.bcmp = bcmp for _, f := range files { t.Insert(f) } @@ -140,31 +141,6 @@ func (lm *LevelMetadata) Find(cmp base.Compare, m *FileMetadata) LevelSlice { return LevelSlice{} } -// Annotation lazily calculates and returns the annotation defined by -// Annotator. The Annotator is used as the key for pre-calculated -// values, so equal Annotators must be used to avoid duplicate computations -// and cached annotations. Annotation must not be called concurrently, and in -// practice this is achieved by requiring callers to hold DB.mu. -func (lm *LevelMetadata) Annotation(annotator Annotator) interface{} { - if lm.Empty() { - return annotator.Zero(nil) - } - v, _ := lm.tree.root.Annotation(annotator) - return v -} - -// InvalidateAnnotation clears any cached annotations defined by Annotator. The -// Annotator is used as the key for pre-calculated values, so equal Annotators -// must be used to clear the appropriate cached annotation. InvalidateAnnotation -// must not be called concurrently, and in practice this is achieved by -// requiring callers to hold DB.mu. -func (lm *LevelMetadata) InvalidateAnnotation(annotator Annotator) { - if lm.Empty() { - return - } - lm.tree.root.InvalidateAnnotation(annotator) -} - // LevelFile holds a file's metadata along with its position // within a level of the LSM. type LevelFile struct { @@ -182,7 +158,7 @@ func (lf LevelFile) Slice() LevelSlice { // TODO(jackson): Can we improve this interface or avoid needing to export // a slice constructor like this? func NewLevelSliceSeqSorted(files []*FileMetadata) LevelSlice { - tr, slice := makeBTree(btreeCmpSeqNum, files) + tr, slice := makeBTree(nil, btreeCmpSeqNum, files) tr.Release() slice.verifyInvariants() return slice @@ -193,7 +169,7 @@ func NewLevelSliceSeqSorted(files []*FileMetadata) LevelSlice { // TODO(jackson): Can we improve this interface or avoid needing to export // a slice constructor like this? func NewLevelSliceKeySorted(cmp base.Compare, files []*FileMetadata) LevelSlice { - tr, slice := makeBTree(btreeCmpSmallestKey(cmp), files) + tr, slice := makeBTree(cmp, btreeCmpSmallestKey(cmp), files) tr.Release() slice.verifyInvariants() return slice @@ -204,7 +180,7 @@ func NewLevelSliceKeySorted(cmp base.Compare, files []*FileMetadata) LevelSlice // tests. // TODO(jackson): Update tests to avoid requiring this and remove it. 
func NewLevelSliceSpecificOrder(files []*FileMetadata) LevelSlice { - tr, slice := makeBTree(btreeCmpSpecificOrder(files), files) + tr, slice := makeBTree(nil, btreeCmpSpecificOrder(files), files) tr.Release() slice.verifyInvariants() return slice diff --git a/internal/manifest/version.go b/internal/manifest/version.go index a90717e836d..dc9e45fc730 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -1068,12 +1068,12 @@ func NewVersion( // order to test consistency checking, etc. Once we've constructed the // initial B-Tree, we swap out the btreeCmp for the correct one. // TODO(jackson): Adjust or remove the tests and remove this. - v.Levels[l].tree, _ = makeBTree(btreeCmpSpecificOrder(files[l]), files[l]) + v.Levels[l].tree, _ = makeBTree(comparer.Compare, btreeCmpSpecificOrder(files[l]), files[l]) v.Levels[l].level = l if l == 0 { - v.Levels[l].tree.cmp = btreeCmpSeqNum + v.Levels[l].tree.bcmp = btreeCmpSeqNum } else { - v.Levels[l].tree.cmp = btreeCmpSmallestKey(comparer.Compare) + v.Levels[l].tree.bcmp = btreeCmpSmallestKey(comparer.Compare) } for _, f := range files[l] { v.Levels[l].totalSize += f.Size @@ -1501,7 +1501,7 @@ func (v *Version) Overlaps(level int, bounds base.UserKeyBounds) LevelSlice { if !restart { // Construct a B-Tree containing only the matching items. var tr btree - tr.cmp = v.Levels[level].tree.cmp + tr.bcmp = v.Levels[level].tree.bcmp for i, meta := 0, l0Iter.First(); meta != nil; i, meta = i+1, l0Iter.Next() { if selectedIndices[i] { err := tr.Insert(meta) diff --git a/table_stats.go b/table_stats.go index a670dc11942..49acf1220cb 100644 --- a/table_stats.go +++ b/table_stats.go @@ -989,206 +989,72 @@ func newCombinedDeletionKeyspanIter( return mIter, nil } -// rangeKeySetsAnnotator implements manifest.Annotator, annotating B-Tree nodes -// with the sum of the files' counts of range key fragments. Its annotation type -// is a *uint64. The count of range key sets may change once a table's stats are +// rangeKeySetsAnnotator is a manifest.Annotator that annotates B-Tree nodes +// with the sum of the files' counts of range key fragments. The count of range +// key sets may change once a table's stats are loaded asynchronously, so its +// values are marked as cacheable only if a file's stats have been loaded. +var rangeKeySetsAnnotator = manifest.SumAnnotator(func(f *manifest.FileMetadata) (uint64, bool) { + return f.Stats.NumRangeKeySets, f.StatsValid() +}) + +// tombstonesAnnotator is a manifest.Annotator that annotates B-Tree nodes +// with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDEL +// keys). The count of tombstones may change once a table's stats are loaded +// asynchronously, so its values are marked as cacheable only if a file's stats +// have been loaded. +var tombstonesAnnotator = manifest.SumAnnotator(func(f *manifest.FileMetadata) (uint64, bool) { + return f.Stats.NumDeletions, f.StatsValid() +}) + +// valueBlockSizeAnnotator is a manifest.Annotator that annotates B-Tree +// nodes with the sum of the files' Properties.ValueBlocksSize. The value block +// size may change once a table's stats are loaded asynchronously, so its +// values are marked as cacheable only if a file's stats have been loaded. +var valueBlockSizeAnnotator = manifest.SumAnnotator(func(f *fileMetadata) (uint64, bool) { + return f.Stats.ValueBlocksSize, f.StatsValid() +}) + +// compressionTypeAnnotator is a manifest.Annotator that annotates B-tree +// nodes with the compression type of the file.
+// compressionTypes. The compression type may change once a table's stats are
 // loaded asynchronously, so its values are marked as cacheable only if a file's
 // stats have been loaded.
-type rangeKeySetsAnnotator struct{}
-
-var _ manifest.Annotator = rangeKeySetsAnnotator{}
-
-func (a rangeKeySetsAnnotator) Zero(dst interface{}) interface{} {
-	if dst == nil {
-		return new(uint64)
-	}
-	v := dst.(*uint64)
-	*v = 0
-	return v
-}
-
-func (a rangeKeySetsAnnotator) Accumulate(
-	f *fileMetadata, dst interface{},
-) (v interface{}, cacheOK bool) {
-	vptr := dst.(*uint64)
-	*vptr = *vptr + f.Stats.NumRangeKeySets
-	return vptr, f.StatsValid()
-}
-
-func (a rangeKeySetsAnnotator) Merge(src interface{}, dst interface{}) interface{} {
-	srcV := src.(*uint64)
-	dstV := dst.(*uint64)
-	*dstV = *dstV + *srcV
-	return dstV
+var compressionTypeAnnotator = manifest.Annotator[compressionTypes]{
+	Aggregator: compressionTypeAggregator{},
 }
 
-// countRangeKeySetFragments counts the number of RANGEKEYSET keys across all
-// files of the LSM. It only counts keys in files for which table stats have
-// been loaded. It uses a b-tree annotator to cache intermediate values between
-// calculations when possible.
-func countRangeKeySetFragments(v *version) (count uint64) {
-	for l := 0; l < numLevels; l++ {
-		if v.RangeKeyLevels[l].Empty() {
-			continue
-		}
-		count += *v.RangeKeyLevels[l].Annotation(rangeKeySetsAnnotator{}).(*uint64)
-	}
-	return count
-}
-
-// tombstonesAnnotator implements manifest.Annotator, annotating B-Tree nodes
-// with the sum of the files' counts of tombstones (DEL, SINGLEDEL and RANGEDELk
-// eys). Its annotation type is a *uint64. The count of tombstones may change
-// once a table's stats are loaded asynchronously, so its values are marked as
-// cacheable only if a file's stats have been loaded.
-type tombstonesAnnotator struct{}
-
-var _ manifest.Annotator = tombstonesAnnotator{}
-
-func (a tombstonesAnnotator) Zero(dst interface{}) interface{} {
-	if dst == nil {
-		return new(uint64)
-	}
-	v := dst.(*uint64)
-	*v = 0
-	return v
-}
-
-func (a tombstonesAnnotator) Accumulate(
-	f *fileMetadata, dst interface{},
-) (v interface{}, cacheOK bool) {
-	vptr := dst.(*uint64)
-	*vptr = *vptr + f.Stats.NumDeletions
-	return vptr, f.StatsValid()
-}
-
-func (a tombstonesAnnotator) Merge(src interface{}, dst interface{}) interface{} {
-	srcV := src.(*uint64)
-	dstV := dst.(*uint64)
-	*dstV = *dstV + *srcV
-	return dstV
-}
-
-// countTombstones counts the number of tombstone (DEL, SINGLEDEL and RANGEDEL)
-// internal keys across all files of the LSM. It only counts keys in files for
-// which table stats have been loaded. It uses a b-tree annotator to cache
-// intermediate values between calculations when possible.
-func countTombstones(v *version) (count uint64) {
-	for l := 0; l < numLevels; l++ {
-		if v.Levels[l].Empty() {
-			continue
-		}
-		count += *v.Levels[l].Annotation(tombstonesAnnotator{}).(*uint64)
-	}
-	return count
-}
-
-// valueBlocksSizeAnnotator implements manifest.Annotator, annotating B-Tree
-// nodes with the sum of the files' Properties.ValueBlocksSize. Its annotation
-// type is a *uint64. The value block size may change once a table's stats are
-// loaded asynchronously, so its values are marked as cacheable only if a
-// file's stats have been loaded.
-type valueBlocksSizeAnnotator struct{}
-
-var _ manifest.Annotator = valueBlocksSizeAnnotator{}
-
-func (a valueBlocksSizeAnnotator) Zero(dst interface{}) interface{} {
-	if dst == nil {
-		return new(uint64)
-	}
-	v := dst.(*uint64)
-	*v = 0
-	return v
-}
-
-func (a valueBlocksSizeAnnotator) Accumulate(
-	f *fileMetadata, dst interface{},
-) (v interface{}, cacheOK bool) {
-	vptr := dst.(*uint64)
-	*vptr = *vptr + f.Stats.ValueBlocksSize
-	return vptr, f.StatsValid()
-}
-
-func (a valueBlocksSizeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
-	srcV := src.(*uint64)
-	dstV := dst.(*uint64)
-	*dstV = *dstV + *srcV
-	return dstV
-}
-
-// valueBlocksSizeForLevel returns the Properties.ValueBlocksSize across all
-// files for a level of the LSM. It only includes the size for files for which
-// table stats have been loaded. It uses a b-tree annotator to cache
-// intermediate values between calculations when possible. It must not be
-// called concurrently.
-//
-// REQUIRES: 0 <= level <= numLevels.
-func valueBlocksSizeForLevel(v *version, level int) (count uint64) {
-	if v.Levels[level].Empty() {
-		return 0
-	}
-	return *v.Levels[level].Annotation(valueBlocksSizeAnnotator{}).(*uint64)
-}
-
-// compressionTypeAnnotator implements manifest.Annotator, annotating B-tree
-// nodes with the compression type of the file. Its annotation type is a
-// *compressionTypes. The compression type may change once a table's stats are
-// loaded asynchronously, so its values are marked as cacheable only if a file's
-// stats have been loaded.
-type compressionTypeAnnotator struct{}
+type compressionTypeAggregator struct{}
 
 type compressionTypes struct {
 	snappy, zstd, none, unknown uint64
 }
 
-var _ manifest.Annotator = compressionTypeAnnotator{}
-
-func (a compressionTypeAnnotator) Zero(dst interface{}) interface{} {
-	if dst == nil {
-		return new(compressionTypes)
-	}
-	v := dst.(*compressionTypes)
-	*v = compressionTypes{}
-	return v
+func (a compressionTypeAggregator) Zero() compressionTypes {
+	return compressionTypes{}
 }
 
-func (a compressionTypeAnnotator) Accumulate(
-	f *fileMetadata, dst interface{},
-) (v interface{}, cacheOK bool) {
-	vptr := dst.(*compressionTypes)
+func (a compressionTypeAggregator) Accumulate(f *fileMetadata) (v compressionTypes, cacheOK bool) {
+	types := compressionTypes{}
 	switch f.Stats.CompressionType {
 	case sstable.SnappyCompression:
-		vptr.snappy++
+		types.snappy = 1
 	case sstable.ZstdCompression:
-		vptr.zstd++
+		types.zstd = 1
 	case sstable.NoCompression:
-		vptr.none++
+		types.none = 1
 	default:
-		vptr.unknown++
+		types.unknown = 1
 	}
-	return vptr, f.StatsValid()
-}
-
-func (a compressionTypeAnnotator) Merge(src interface{}, dst interface{}) interface{} {
-	srcV := src.(*compressionTypes)
-	dstV := dst.(*compressionTypes)
-	dstV.snappy = dstV.snappy + srcV.snappy
-	dstV.zstd = dstV.zstd + srcV.zstd
-	dstV.none = dstV.none + srcV.none
-	dstV.unknown = dstV.unknown + srcV.unknown
-	return dstV
+	return types, f.StatsValid()
 }
 
-// compressionTypesForLevel returns the count of sstables by compression type
-// used for a level in the LSM. Sstables with compression type snappy or zstd
-// are returned, while others are ignored.
-func compressionTypesForLevel(v *version, level int) (unknown, snappy, none, zstd uint64) {
-	if v.Levels[level].Empty() {
-		return
-	}
-	compression := v.Levels[level].Annotation(compressionTypeAnnotator{}).(*compressionTypes)
-	if compression == nil {
-		return
+func (a compressionTypeAggregator) Merge(
+	c1 compressionTypes, c2 compressionTypes,
+) compressionTypes {
+	return compressionTypes{
+		snappy:  c1.snappy + c2.snappy,
+		zstd:    c1.zstd + c2.zstd,
+		none:    c1.none + c2.none,
+		unknown: c1.unknown + c2.unknown,
 	}
-	return compression.unknown, compression.snappy, compression.none, compression.zstd
 }
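
As a usage sketch (not part of the patch): the hunks above show the two ways to define an annotator under the new API — `manifest.SumAnnotator` for simple `uint64` sums, and a value-typed aggregator with `Zero`/`Accumulate`/`Merge` methods for compound results. A new annotator in the `pebble` package might look like the following; `fileCountAnnotator`, `maxFileSizeAggregator`, and `maxFileSizeAnnotator` are hypothetical names for illustration, and the underlying `Annotator`/`Aggregator` definitions live in the new `internal/manifest/annotator.go`, which this excerpt does not show.

```
// fileCountAnnotator counts the files in a level (or key range). A file's
// contribution of 1 never changes, so the result is always safe to cache
// in the B-Tree nodes.
var fileCountAnnotator = manifest.SumAnnotator(func(f *fileMetadata) (uint64, bool) {
	return 1, true
})

// maxFileSizeAggregator tracks the largest file size seen, demonstrating the
// three-method aggregator shape used by compressionTypeAggregator above.
type maxFileSizeAggregator struct{}

// Zero returns the identity value for the aggregation.
func (a maxFileSizeAggregator) Zero() uint64 { return 0 }

// Accumulate returns a single file's contribution. f.Size is fixed once the
// file is written, so the value is always cacheable.
func (a maxFileSizeAggregator) Accumulate(f *fileMetadata) (v uint64, cacheOK bool) {
	return f.Size, true
}

// Merge combines the aggregates of two subtrees.
func (a maxFileSizeAggregator) Merge(v1 uint64, v2 uint64) uint64 {
	if v1 > v2 {
		return v1
	}
	return v2
}

var maxFileSizeAnnotator = manifest.Annotator[uint64]{
	Aggregator: maxFileSizeAggregator{},
}
```

Note that the `cacheOK` convention carries over from the old interface: returning false (e.g. when `!f.StatsValid()`) prevents a node's partial aggregate from being cached, so the annotation is recomputed on each query until the file's stats finish loading asynchronously.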