Skip to content

Commit

Permalink
db: avoid stepping beyond iteration prefix in levelIter
Browse files Browse the repository at this point in the history
During prefix iteration mode, avoid stepping the levelIter beyond the current
iteration prefix. This fixes a bug whereby a levelIter.Next may position a
levelIter to a file beyond the iteration prefix and a subsequent SeekPrefixGE
to a later key with TrySeekUsingNext would fail to observe keys in the skipped
file.

Fix #3610.
  • Loading branch information
jbowens committed May 13, 2024
1 parent 5ee10bd commit c4b8017
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 36 deletions.
10 changes: 10 additions & 0 deletions data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,16 @@ func runDBDefineCmdReuseFS(td *datadriven.TestData, opts *Options) (*DB, error)
for _, levelOpts := range opts.Levels {
levelOpts.BlockSize = size
}
case "bloom-bits-per-key":
v, err := strconv.Atoi(arg.Vals[0])
if err != nil {
return nil, err
}
fp := bloom.FilterPolicy(v)
opts.Filters = map[string]FilterPolicy{fp.Name(): fp}
for i := range opts.Levels {
opts.Levels[i].FilterPolicy = fp
}
case "format-major-version":
fmv, err := strconv.Atoi(arg.Vals[0])
if err != nil {
Expand Down
16 changes: 9 additions & 7 deletions internal/base/iterator.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,15 @@ type InternalIterator interface {
// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
// that key is greater than or equal to the lower bound.
//
// The prefix argument is used by some InternalIterator implementations (e.g.
// sstable.Reader) to avoid expensive operations. This operation is only
// useful when a user-defined Split function is supplied to the Comparer for
// the DB. The supplied prefix will be the prefix of the given key returned by
// that Split function. If the iterator is able to determine that no key with
// the prefix exists, it can return (nil,nilv). Unlike SeekGE, this is not an
// indication that iteration is exhausted.
// The prefix argument is used by some InternalIterator implementations
// (e.g. sstable.Reader) to avoid expensive operations. This operation is
// only useful when a user-defined Split function is supplied to the
// Comparer for the DB. The supplied prefix will be the prefix of the given
// key returned by that Split function. If the iterator is able to determine
// that no key with the prefix exists, it can return (nil,nilv). Unlike
// SeekGE, this is not an indication that iteration is exhausted. The prefix
// byte slice is guaranteed to be stable until the next absolute positioning
// operation.
//
// Note that the iterator may return keys not matching the prefix. It is up
// to the caller to check if the prefix matches.
Expand Down
66 changes: 37 additions & 29 deletions level_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,13 @@ type internalIterOpts struct {
//
// levelIter is used during compaction and as part of the Iterator
// implementation. When used as part of the Iterator implementation, level
// iteration needs to "pause" at sstable boundaries if a range deletion iterator
// is open. In this case, we materialize a "synthetic" InternalKV to return from
// levelIter. This prevents mergingIter from advancing past the sstable until
// the sstable contains the smallest (or largest for reverse iteration) key in
// the merged heap. Note that mergingIter treats a range deletion tombstone
// returned by the point iterator as a no-op.
//
// SeekPrefixGE presents the need for a second type of pausing. If an sstable
// iterator returns "not found" for a SeekPrefixGE operation, we don't want to
// advance to the next sstable as the "not found" does not indicate that all of
// the keys in the sstable are less than the search key. Advancing to the next
// sstable would cause us to skip over range tombstones, violating
// correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the
// kind InternalKeyKindRangeDeletion which will be used to pause the levelIter
// at the sstable until the mergingIter is ready to advance past it.
// iteration needs to "pause" at range deletion boundaries if file contains
// range deletions. In this case, the levelIter uses a keyspan.InterleavingIter
// to materialize InternalKVs at start and end boundaries of range deletions.
// This prevents mergingIter from advancing past the sstable until the sstable
// contains the smallest (or largest for reverse iteration) key in the merged
// heap. Note that mergingIter treats a range deletion tombstone returned by the
// point iterator as a no-op.
type levelIter struct {
// The context is stored here since (a) iterators are expected to be
// short-lived (since they pin sstables), (b) plumbing a context into every
Expand All @@ -60,6 +52,9 @@ type levelIter struct {
// recent call to SetBounds.
lower []byte
upper []byte
// prefix holds the iteration prefix when the most recent absolute
// positioning method was a SeekPrefixGE.
prefix []byte
// The iterator options for the currently open table. If
// tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
// does not lie within the table bounds.
Expand Down Expand Up @@ -161,6 +156,7 @@ func (l *levelIter) init(
l.err = nil
l.level = level
l.logger = opts.getLogger()
l.prefix = nil
l.lower = opts.LowerBound
l.upper = opts.UpperBound
l.tableOpts.TableFilter = opts.TableFilter
Expand Down Expand Up @@ -629,6 +625,7 @@ func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV

l.err = nil // clear cached iteration error
l.exhaustedDir = 0
l.prefix = nil
// NB: the top-level Iterator has already adjusted key based on
// IterOptions.LowerBound.
loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
Expand All @@ -651,9 +648,9 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba
if invariants.Enabled && l.lower != nil && l.cmp(key, l.lower) < 0 {
panic(errors.AssertionFailedf("levelIter SeekGE to key %q violates lower bound %q", key, l.lower))
}

l.err = nil // clear cached iteration error
l.exhaustedDir = 0
l.prefix = prefix

// NB: the top-level Iterator has already adjusted key based on
// IterOptions.LowerBound.
Expand All @@ -673,19 +670,6 @@ func (l *levelIter) SeekPrefixGE(prefix, key []byte, flags base.SeekGEFlags) *ba
if err := l.iter.Error(); err != nil {
return nil
}
// It is possible that we are here because bloom filter matching failed. In
// that case it is likely that all keys matching the prefix are wholly
// within the current file and cannot be in the subsequent file. In that
// case we don't want to go to the next file, since loading and seeking in
// there has some cost. Additionally, for sparse key spaces, loading the
// next file will defeat the optimization for the next SeekPrefixGE that is
// called with flags.TrySeekUsingNext(), since for sparse key spaces it is
// likely that the next key will also be contained in the current file.
n := l.split(l.iterFile.LargestPointKey.UserKey)
if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 {
l.exhaustedForward()
return nil
}
return l.verify(l.skipEmptyFileForward())
}

Expand All @@ -696,6 +680,7 @@ func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV

l.err = nil // clear cached iteration error
l.exhaustedDir = 0
l.prefix = nil

// NB: the top-level Iterator has already adjusted key based on
// IterOptions.UpperBound.
Expand All @@ -716,6 +701,7 @@ func (l *levelIter) First() *base.InternalKV {

l.err = nil // clear cached iteration error
l.exhaustedDir = 0
l.prefix = nil

// NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
// set.
Expand All @@ -736,6 +722,7 @@ func (l *levelIter) Last() *base.InternalKV {

l.err = nil // clear cached iteration error
l.exhaustedDir = 0
l.prefix = nil

// NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
// set.
Expand Down Expand Up @@ -835,6 +822,27 @@ func (l *levelIter) skipEmptyFileForward() *base.InternalKV {
l.exhaustedForward()
return nil
}

// If the iterator is in prefix iteration mode, it's possible that we
// are here because bloom filter matching failed. In that case it is
// likely that all keys matching the prefix are wholly within the
// current file and cannot be in a subsequent file. In that case we
// don't want to go to the next file, since loading and seeking in there
// has some cost.
//
// This is not just an optimization. We must not advance to the next
// file if the current file might possibly contain keys relevant to any
// prefix greater than our current iteration prefix. If we did, a
// subsequent SeekPrefixGE with TrySeekUsingNext could mistakenly skip
// the file's relevant keys.
if l.prefix != nil {
fileLargestPrefix := l.iterFile.LargestPointKey.UserKey[:l.split(l.iterFile.LargestPointKey.UserKey)]
if l.cmp(fileLargestPrefix, l.prefix) > 0 {
l.exhaustedForward()
return nil
}
}

// Current file was exhausted. Move to the next file.
if l.loadFile(l.files.Next(), +1) == noFileLoaded {
l.exhaustedForward()
Expand Down
26 changes: 26 additions & 0 deletions testdata/iter_histories/prefix_iteration
Original file line number Diff line number Diff line change
Expand Up @@ -336,3 +336,29 @@ seek-prefix-ge c@9
----
.
.

# Regression test for #3610.
#
# Similar to the above case, this test consists of two SeekPrefixGEs for
# ascending keys, resulting in TrySeekUsingNext()=true for the second seek.
# Previously, during the first SeekPrefixGE the mergingIter could Next the
# levelIter beyond the file containing point keys relevant to both seeks.

define bloom-bits-per-key=100
L4
[email protected]:b@0
L5
[email protected]:b@1
[email protected]:c@3
----
L4:
000004:[b@0#10,SET-b@0#10,SET]
L5:
000005:[b@8#3,RANGEDEL-c@3#0,SET]

combined-iter
seek-prefix-ge b@10
seek-prefix-ge c@10
----
b@0: (b@0, .)
c@3: (c@3, .)

0 comments on commit c4b8017

Please sign in to comment.