From c4b8017ec27a3f551b6004d91c6f241d18e86390 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Mon, 13 May 2024 11:35:17 -0400 Subject: [PATCH] db: avoid stepping beyond iteration prefix in levelIter During prefix iteration mode, avoid stepping the levelIter beyond the current iteration prefix. This fixes a bug whereby a levelIter.Next may position a levelIter to a file beyond the iteration prefix and a subsequent SeekPrefixGE to a later key with TrySeekUsingNext would fail to observe keys in the skipped file. Fix #3610. --- data_test.go | 10 ++++ internal/base/iterator.go | 16 +++--- level_iter.go | 66 +++++++++++++----------- testdata/iter_histories/prefix_iteration | 26 ++++++++++ 4 files changed, 82 insertions(+), 36 deletions(-) diff --git a/data_test.go b/data_test.go index 3ee315c148..5ea1ba6600 100644 --- a/data_test.go +++ b/data_test.go @@ -842,6 +842,16 @@ func runDBDefineCmdReuseFS(td *datadriven.TestData, opts *Options) (*DB, error) for _, levelOpts := range opts.Levels { levelOpts.BlockSize = size } + case "bloom-bits-per-key": + v, err := strconv.Atoi(arg.Vals[0]) + if err != nil { + return nil, err + } + fp := bloom.FilterPolicy(v) + opts.Filters = map[string]FilterPolicy{fp.Name(): fp} + for i := range opts.Levels { + opts.Levels[i].FilterPolicy = fp + } case "format-major-version": fmv, err := strconv.Atoi(arg.Vals[0]) if err != nil { diff --git a/internal/base/iterator.go b/internal/base/iterator.go index 7cbea1b755..7531e5fbd2 100644 --- a/internal/base/iterator.go +++ b/internal/base/iterator.go @@ -107,13 +107,15 @@ type InternalIterator interface { // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure // that key is greater than or equal to the lower bound. // - // The prefix argument is used by some InternalIterator implementations (e.g. - // sstable.Reader) to avoid expensive operations. This operation is only - // useful when a user-defined Split function is supplied to the Comparer for - // the DB. The supplied prefix will be the prefix of the given key returned by - // that Split function. If the iterator is able to determine that no key with - // the prefix exists, it can return (nil,nilv). Unlike SeekGE, this is not an - // indication that iteration is exhausted. + // The prefix argument is used by some InternalIterator implementations + // (e.g. sstable.Reader) to avoid expensive operations. This operation is + // only useful when a user-defined Split function is supplied to the + // Comparer for the DB. The supplied prefix will be the prefix of the given + // key returned by that Split function. If the iterator is able to determine + // that no key with the prefix exists, it can return (nil,nilv). Unlike + // SeekGE, this is not an indication that iteration is exhausted. The prefix + // byte slice is guaranteed to be stable until the next absolute positioning + // operation. // // Note that the iterator may return keys not matching the prefix. It is up // to the caller to check if the prefix matches. diff --git a/level_iter.go b/level_iter.go index b799dd6053..00af0887d1 100644 --- a/level_iter.go +++ b/level_iter.go @@ -31,21 +31,13 @@ type internalIterOpts struct { // // levelIter is used during compaction and as part of the Iterator // implementation. When used as part of the Iterator implementation, level -// iteration needs to "pause" at sstable boundaries if a range deletion iterator -// is open. In this case, we materialize a "synthetic" InternalKV to return from -// levelIter. This prevents mergingIter from advancing past the sstable until -// the sstable contains the smallest (or largest for reverse iteration) key in -// the merged heap. Note that mergingIter treats a range deletion tombstone -// returned by the point iterator as a no-op. -// -// SeekPrefixGE presents the need for a second type of pausing. If an sstable -// iterator returns "not found" for a SeekPrefixGE operation, we don't want to -// advance to the next sstable as the "not found" does not indicate that all of -// the keys in the sstable are less than the search key. Advancing to the next -// sstable would cause us to skip over range tombstones, violating -// correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the -// kind InternalKeyKindRangeDeletion which will be used to pause the levelIter -// at the sstable until the mergingIter is ready to advance past it. +// iteration needs to "pause" at range deletion boundaries if file contains +// range deletions. In this case, the levelIter uses a keyspan.InterleavingIter +// to materialize InternalKVs at start and end boundaries of range deletions. +// This prevents mergingIter from advancing past the sstable until the sstable +// contains the smallest (or largest for reverse iteration) key in the merged +// heap. Note that mergingIter treats a range deletion tombstone returned by the +// point iterator as a no-op. type levelIter struct { // The context is stored here since (a) iterators are expected to be // short-lived (since they pin sstables), (b) plumbing a context into every @@ -60,6 +52,9 @@ type levelIter struct { // recent call to SetBounds. lower []byte upper []byte + // prefix holds the iteration prefix when the most recent absolute + // positioning method was a SeekPrefixGE. + prefix []byte // The iterator options for the currently open table. If // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary // does not lie within the table bounds. @@ -161,6 +156,7 @@ func (l *levelIter) init( l.err = nil l.level = level l.logger = opts.getLogger() + l.prefix = nil l.lower = opts.LowerBound l.upper = opts.UpperBound l.tableOpts.TableFilter = opts.TableFilter @@ -629,6 +625,7 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV l.err = nil // clear cached iteration error l.exhaustedDir = 0 + l.prefix = nil // NB: the top-level Iterator has already adjusted key based on // IterOptions.LowerBound. loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) @@ -651,9 +648,9 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba if invariants.Enabled && l.lower != nil && l.cmp(key, l.lower) < 0 { panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower)) } - l.err = nil // clear cached iteration error l.exhaustedDir = 0 + l.prefix = prefix // NB: the top-level Iterator has already adjusted key based on // IterOptions.LowerBound. @@ -673,19 +670,6 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba if err := l.iter.Error(); err != nil { return nil } - // It is possible that we are here because bloom filter matching failed. In - // that case it is likely that all keys matching the prefix are wholly - // within the current file and cannot be in the subsequent file. In that - // case we don't want to go to the next file, since loading and seeking in - // there has some cost. Additionally, for sparse key spaces, loading the - // next file will defeat the optimization for the next SeekPrefixGE that is - // called with flags.TrySeekUsingNext(), since for sparse key spaces it is - // likely that the next key will also be contained in the current file. - n := l.split(l.iterFile.LargestPointKey.UserKey) - if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 { - l.exhaustedForward() - return nil - } return l.verify(l.skipEmptyFileForward()) } @@ -696,6 +680,7 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV l.err = nil // clear cached iteration error l.exhaustedDir = 0 + l.prefix = nil // NB: the top-level Iterator has already adjusted key based on // IterOptions.UpperBound. @@ -716,6 +701,7 @@ func (l *levelIter) First() *base.InternalKV { l.err = nil // clear cached iteration error l.exhaustedDir = 0 + l.prefix = nil // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is // set. @@ -736,6 +722,7 @@ func (l *levelIter) Last() *base.InternalKV { l.err = nil // clear cached iteration error l.exhaustedDir = 0 + l.prefix = nil // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is // set. @@ -835,6 +822,27 @@ func (l *levelIter) skipEmptyFileForward() *base.InternalKV { l.exhaustedForward() return nil } + + // If the iterator is in prefix iteration mode, it's possible that we + // are here because bloom filter matching failed. In that case it is + // likely that all keys matching the prefix are wholly within the + // current file and cannot be in a subsequent file. In that case we + // don't want to go to the next file, since loading and seeking in there + // has some cost. + // + // This is not just an optimization. We must not advance to the next + // file if the current file might possibly contain keys relevant to any + // prefix greater than our current iteration prefix. If we did, a + // subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip + // the file's relevant keys. + if l.prefix != nil { + fileLargestPrefix := l.iterFile.LargestPointKey.UserKey[:l.split(l.iterFile.LargestPointKey.UserKey)] + if l.cmp(fileLargestPrefix, l.prefix) > 0 { + l.exhaustedForward() + return nil + } + } + // Current file was exhausted. Move to the next file. if l.loadFile(l.files.Next(), +1) == noFileLoaded { l.exhaustedForward() diff --git a/testdata/iter_histories/prefix_iteration b/testdata/iter_histories/prefix_iteration index 1ea878ea9d..d977b2ecf8 100644 --- a/testdata/iter_histories/prefix_iteration +++ b/testdata/iter_histories/prefix_iteration @@ -336,3 +336,29 @@ seek-prefix-ge c@9 ---- . . + +# Regression test for #3610. +# +# Similar to the above case, this test consists of two SeekPrefixGEs for +# ascending keys, resulting in TrySeekUsingNext()=true for the second seek. +# Previously, during the first SeekPrefixGE the mergingIter could Next the +# levelIter beyond the file containing point keys relevant to both seeks. + +define bloom-bits-per-key=100 +L4 + b@0.SET.10:b@0 +L5 + b@8.RANGEDEL.3:b@1 + c@3.SET.0:c@3 +---- +L4: + 000004:[b@0#10,SET-b@0#10,SET] +L5: + 000005:[b@8#3,RANGEDEL-c@3#0,SET] + +combined-iter +seek-prefix-ge b@10 +seek-prefix-ge c@10 +---- +b@0: (b@0, .) +c@3: (c@3, .)