Skip to content

Commit

Permalink
db: verify ingest sst bounds are suffixless or adjacent
Browse files Browse the repository at this point in the history
We now allow sstables to have range key bounds that have suffixes
in them, during ingestion. This is to allow for Cockroach-side
defragmentation of range keys. This change ensures that such
suffixed range keys are adjacent and defrag cleanly with rangekeys
in other ingested sstables, to not cause panics later on in the read
path.

Fixes #3829.
  • Loading branch information
itsbilal committed Nov 15, 2024
1 parent 9ed54bc commit 2165485
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 30 deletions.
112 changes: 87 additions & 25 deletions ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/cache"
"github.com/cockroachdb/pebble/internal/invariants"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/overlap"
"github.com/cockroachdb/pebble/internal/sstableinternal"
Expand Down Expand Up @@ -239,48 +240,86 @@ func ingestLoad1External(
return meta, nil
}

type rangeKeyIngestValidator struct {
// lastRangeKey is the last range key seen in the previous file.
lastRangeKey keyspan.Span
// comparer, if unset, disables range key validation.
comparer *base.Comparer
}

func (r *rangeKeyIngestValidator) Validate(nextFileSmallestKey *keyspan.Span) error {
if r.comparer == nil {
return nil
}
if r.lastRangeKey.Valid() {
if r.comparer.Split.HasSuffix(r.lastRangeKey.End) && (!r.comparer.Equal(r.lastRangeKey.End, nextFileSmallestKey.Start) ||
!keyspan.DefragmentInternal.ShouldDefragment(r.comparer.CompareRangeSuffixes, &r.lastRangeKey, nextFileSmallestKey)) {
// The last range key has a suffix, and it doesn't defragment cleanly with this range key.
return errors.AssertionFailedf("pebble: ingest sstable has suffixed range key that won't defragment with previous sstable: %s",
r.comparer.FormatKey(r.lastRangeKey.End))
}
} else if r.comparer.Split.HasSuffix(nextFileSmallestKey.Start) {
return errors.Newf("pebble: ingest sstable has suffixed range key start that won't defragment: %s",
r.comparer.FormatKey(nextFileSmallestKey.Start))
}
return nil
}

// ingestLoad1 creates the FileMetadata for one file. This file will be owned
// by this store.
//
// prevLastRangeKey is the last range key from the previous file. It is used to
// ensure that the range keys defragment cleanly across files. These checks
// are disabled if disableRangeKeyChecks is true.
func ingestLoad1(
ctx context.Context,
opts *Options,
fmv FormatMajorVersion,
readable objstorage.Readable,
cacheID cache.ID,
fileNum base.FileNum,
) (*fileMetadata, error) {
prevLastRangeKey keyspan.Span,
disableRangeKeyChecks bool,
) (meta *fileMetadata, lastRangeKey keyspan.Span, err error) {
o := opts.MakeReaderOptions()
o.SetInternalCacheOpts(sstableinternal.CacheOptions{
Cache: opts.Cache,
CacheID: cacheID,
FileNum: base.PhysicalTableDiskFileNum(fileNum),
})
rangeKeyValidator := rangeKeyIngestValidator{}
if !disableRangeKeyChecks {
rangeKeyValidator = rangeKeyIngestValidator{
lastRangeKey: prevLastRangeKey,
comparer: opts.Comparer,
}
}
r, err := sstable.NewReader(ctx, readable, o)
if err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
defer r.Close()

// Avoid ingesting tables with format versions this DB doesn't support.
tf, err := r.TableFormat()
if err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() {
return nil, errors.Newf(
return nil, keyspan.Span{}, errors.Newf(
"pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)",
tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(),
)
}
if tf.BlockColumnar() {
if _, ok := opts.KeySchemas[r.Properties.KeySchemaName]; !ok {
return nil, errors.Newf(
return nil, keyspan.Span{}, errors.Newf(
"pebble: table uses key schema %q unknown to the database",
r.Properties.KeySchemaName)
}
}

meta := &fileMetadata{}
meta = &fileMetadata{}
meta.FileNum = fileNum
meta.Size = uint64(readable.Size())
meta.CreationTime = time.Now().Unix()
Expand All @@ -300,52 +339,52 @@ func ingestLoad1(
{
iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */)
if err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
defer iter.Close()
var smallest InternalKey
if kv := iter.First(); kv != nil {
if err := ingestValidateKey(opts, &kv.K); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
smallest = kv.K.Clone()
}
if err := iter.Error(); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
if kv := iter.Last(); kv != nil {
if err := ingestValidateKey(opts, &kv.K); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, kv.K.Clone())
}
if err := iter.Error(); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
}

iter, err := r.NewRawRangeDelIter(ctx, sstable.NoFragmentTransforms)
if err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
if iter != nil {
defer iter.Close()
var smallest InternalKey
if s, err := iter.First(); err != nil {
return nil, err
return nil, keyspan.Span{}, err
} else if s != nil {
key := s.SmallestKey()
if err := ingestValidateKey(opts, &key); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
smallest = key.Clone()
}
if s, err := iter.Last(); err != nil {
return nil, err
return nil, keyspan.Span{}, err
} else if s != nil {
k := s.SmallestKey()
if err := ingestValidateKey(opts, &k); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
largest := s.LargestKey().Clone()
meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest)
Expand All @@ -356,45 +395,54 @@ func ingestLoad1(
{
iter, err := r.NewRawRangeKeyIter(ctx, sstable.NoFragmentTransforms)
if err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
if iter != nil {
defer iter.Close()
var smallest InternalKey
if s, err := iter.First(); err != nil {
return nil, err
return nil, keyspan.Span{}, err
} else if s != nil {
key := s.SmallestKey()
if err := ingestValidateKey(opts, &key); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
smallest = key.Clone()
// Range keys need some additional validation as we need to ensure they
// defragment cleanly with the lastRangeKey from the previous file.
if err := rangeKeyValidator.Validate(s); err != nil {
return nil, keyspan.Span{}, err
}
}
lastRangeKey = keyspan.Span{}
if s, err := iter.Last(); err != nil {
return nil, err
return nil, keyspan.Span{}, err
} else if s != nil {
k := s.SmallestKey()
if err := ingestValidateKey(opts, &k); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}
// As range keys are fragmented, the end key of the last range key in
// the table provides the upper bound for the table.
largest := s.LargestKey().Clone()
meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest)
lastRangeKey = s.Clone()
}
} else {
lastRangeKey = keyspan.Span{}
}
}

if !meta.HasPointKeys && !meta.HasRangeKeys {
return nil, nil
return nil, keyspan.Span{}, nil
}

// Sanity check that the various bounds on the file were set consistently.
if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil {
return nil, err
return nil, keyspan.Span{}, err
}

return meta, nil
return meta, lastRangeKey, nil
}

type ingestLoadResult struct {
Expand Down Expand Up @@ -445,6 +493,15 @@ func ingestLoad(

var result ingestLoadResult
result.local = make([]ingestLocalMeta, 0, len(paths))
var lastRangeKey keyspan.Span
// NB: we disable range key boundary assertions if we have shared or external files
// present in this ingestion. This is because a suffixed range key in a local file
// can possibly defragment with a suffixed range key in a shared or external file.
// We also disable range key boundary assertions if we have CreateOnShared set to
// true, as that means we could have suffixed RangeKeyDels or Unsets in the local
// files that won't ever be surfaced, even if there are no shared or external files
// in the ingestion.
disableRangeKeyChecks := len(shared) > 0 || len(external) > 0 || opts.Experimental.CreateOnShared != remote.CreateOnSharedNone
for i := range paths {
f, err := opts.FS.Open(paths[i])
if err != nil {
Expand All @@ -455,7 +512,8 @@ func ingestLoad(
if err != nil {
return ingestLoadResult{}, err
}
m, err := ingestLoad1(ctx, opts, fmv, readable, cacheID, localFileNums[i])
var m *fileMetadata
m, lastRangeKey, err = ingestLoad1(ctx, opts, fmv, readable, cacheID, localFileNums[i], lastRangeKey, disableRangeKeyChecks)
if err != nil {
return ingestLoadResult{}, err
}
Expand All @@ -466,6 +524,10 @@ func ingestLoad(
})
}
}
if !disableRangeKeyChecks && lastRangeKey.Valid() && opts.Comparer.Split.HasSuffix(lastRangeKey.End) {
return ingestLoadResult{}, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s",
opts.Comparer.FormatKey(lastRangeKey.End))
}

// Sort the shared files according to level.
sort.Sort(sharedByLevel(shared))
Expand Down
7 changes: 7 additions & 0 deletions internal/base/comparer.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,13 @@ func (s Split) Prefix(k []byte) []byte {
return k[:i:i]
}

// HasSuffix returns true if the key k has a suffix remaining after
// Split is called on it. For keys where the entirety of the key is
// returned by Split, HasSuffix will return false.
func (s Split) HasSuffix(k []byte) bool {
return s(k) < len(k)
}

// DefaultSplit is a trivial implementation of Split which always returns the
// full key.
var DefaultSplit Split = func(key []byte) int { return len(key) }
Expand Down
10 changes: 10 additions & 0 deletions iterator_histories_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,16 @@ func TestIterHistories(t *testing.T) {
}
}
return buf.String()
case "build":
if err := runBuildCmd(td, d, d.opts.FS); err != nil {
return err.Error()
}
return ""
case "ingest-existing":
if err := runIngestCmd(td, d, d.opts.FS); err != nil {
return err.Error()
}
return ""
case "ingest":
if err := runBuildCmd(td, d, d.opts.FS); err != nil {
return err.Error()
Expand Down
9 changes: 8 additions & 1 deletion open.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"github.com/cockroachdb/pebble/internal/cache"
"github.com/cockroachdb/pebble/internal/constants"
"github.com/cockroachdb/pebble/internal/invariants"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/manual"
"github.com/cockroachdb/pebble/objstorage"
Expand Down Expand Up @@ -810,17 +811,23 @@ func (d *DB) replayIngestedFlushable(
}

meta := make([]*fileMetadata, len(fileNums))
var lastRangeKey keyspan.Span
for i, n := range fileNums {
readable, err := d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true})
if err != nil {
return nil, errors.Wrap(err, "pebble: error when opening flushable ingest files")
}
// NB: ingestLoad1 will close readable.
meta[i], err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(), readable, d.cacheID, base.PhysicalTableFileNum(n))
meta[i], lastRangeKey, err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(),
readable, d.cacheID, base.PhysicalTableFileNum(n), lastRangeKey, false /* disableRangeKeyChecks */)
if err != nil {
return nil, errors.Wrap(err, "pebble: error when loading flushable ingest files")
}
}
if lastRangeKey.Valid() && d.opts.Comparer.Split.HasSuffix(lastRangeKey.End) {
return nil, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s",
d.opts.Comparer.FormatKey(lastRangeKey.End))
}

numFiles := len(meta)
if exciseSpan.Valid() {
Expand Down
17 changes: 13 additions & 4 deletions testdata/iter_histories/prefix_iteration
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,14 @@ ingest ext1
range-key-set a c@8 @1 bar
set c@9 c@9
----
pebble: last ingest sstable has suffixed range key end c@8

ingest ext2
build ext1
range-key-set a c@8 @1 bar
set c@9 c@9
----

build ext2
range-key-set c@8 e @1 bar
set c@8 c@8
set c@7 c@7
Expand All @@ -152,6 +158,9 @@ set c@3 c@3
set c@2 c@2
----

ingest-existing ext1 ext2
----

ingest ext2
range-key-set y z @1 foo
set z z
Expand All @@ -160,9 +169,9 @@ set z z
lsm
----
L6:
000004:[a#10,RANGEKEYSET-c@8#inf,RANGEKEYSET]
000005:[c@8#11,RANGEKEYSET-e#inf,RANGEKEYSET]
000006:[y#12,RANGEKEYSET-z#12,SET]
000005:[a#10,RANGEKEYSET-c@8#inf,RANGEKEYSET]
000006:[c@8#11,RANGEKEYSET-e#inf,RANGEKEYSET]
000007:[y#12,RANGEKEYSET-z#12,SET]


# The first seek-prefix-ge y@1 converts the iterator from lazy combined iterator
Expand Down

0 comments on commit 2165485

Please sign in to comment.