diff --git a/ingest.go b/ingest.go index 3fe0aae727..23b196d223 100644 --- a/ingest.go +++ b/ingest.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/cache" "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/overlap" "github.com/cockroachdb/pebble/internal/sstableinternal" @@ -241,6 +242,10 @@ func ingestLoad1External( // ingestLoad1 creates the FileMetadata for one file. This file will be owned // by this store. +// +// lastRangeKey is the last range key from the previous file. It is used to +// ensure that the range keys defragment cleanly across files. These checks +// are disabled if disableRangeKeyChecks is true. func ingestLoad1( ctx context.Context, opts *Options, @@ -248,7 +253,9 @@ func ingestLoad1( readable objstorage.Readable, cacheID cache.ID, fileNum base.FileNum, -) (*fileMetadata, error) { + lastRangeKey keyspan.Span, + disableRangeKeyChecks bool, +) (*fileMetadata, keyspan.Span, error) { o := opts.MakeReaderOptions() o.SetInternalCacheOpts(sstableinternal.CacheOptions{ Cache: opts.Cache, @@ -257,24 +264,24 @@ func ingestLoad1( }) r, err := sstable.NewReader(ctx, readable, o) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } defer r.Close() // Avoid ingesting tables with format versions this DB doesn't support. tf, err := r.TableFormat() if err != nil { - return nil, err + return nil, keyspan.Span{}, err } if tf < fmv.MinTableFormat() || tf > fmv.MaxTableFormat() { - return nil, errors.Newf( + return nil, keyspan.Span{}, errors.Newf( "pebble: table format %s is not within range supported at DB format major version %d, (%s,%s)", tf, fmv, fmv.MinTableFormat(), fmv.MaxTableFormat(), ) } if tf.BlockColumnar() { if _, ok := opts.KeySchemas[r.Properties.KeySchemaName]; !ok { - return nil, errors.Newf( + return nil, keyspan.Span{}, errors.Newf( "pebble: table uses key schema %q unknown to the database", r.Properties.KeySchemaName) } @@ -300,52 +307,52 @@ func ingestLoad1( { iter, err := r.NewIter(sstable.NoTransforms, nil /* lower */, nil /* upper */) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } defer iter.Close() var smallest InternalKey if kv := iter.First(); kv != nil { if err := ingestValidateKey(opts, &kv.K); err != nil { - return nil, err + return nil, keyspan.Span{}, err } smallest = kv.K.Clone() } if err := iter.Error(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } if kv := iter.Last(); kv != nil { if err := ingestValidateKey(opts, &kv.K); err != nil { - return nil, err + return nil, keyspan.Span{}, err } meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, kv.K.Clone()) } if err := iter.Error(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } } iter, err := r.NewRawRangeDelIter(ctx, sstable.NoFragmentTransforms) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } if iter != nil { defer iter.Close() var smallest InternalKey if s, err := iter.First(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { key := s.SmallestKey() if err := ingestValidateKey(opts, &key); err != nil { - return nil, err + return nil, keyspan.Span{}, err } smallest = key.Clone() } if s, err := iter.Last(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { k := s.SmallestKey() if err := ingestValidateKey(opts, &k); err != nil { - return nil, err + return nil, keyspan.Span{}, err } largest := s.LargestKey().Clone() meta.ExtendPointKeyBounds(opts.Comparer.Compare, smallest, largest) @@ -356,45 +363,66 @@ func ingestLoad1( { iter, err := r.NewRawRangeKeyIter(ctx, sstable.NoFragmentTransforms) if err != nil { - return nil, err + return nil, keyspan.Span{}, err } if iter != nil { defer iter.Close() var smallest InternalKey if s, err := iter.First(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { key := s.SmallestKey() if err := ingestValidateKey(opts, &key); err != nil { - return nil, err + return nil, keyspan.Span{}, err } smallest = key.Clone() + // Range keys need some additional validation as we need to ensure they + // defragment cleanly with the lastRangeKey from the previous file. + if !disableRangeKeyChecks { + if lastRangeKey.Valid() { + if opts.Comparer.Split.HasSuffix(lastRangeKey.End) && (!opts.Comparer.Equal(lastRangeKey.End, s.Start) || + !keyspan.DefragmentInternal.ShouldDefragment(opts.Comparer.CompareRangeSuffixes, &lastRangeKey, s)) { + // The last range key has a suffix, and it doesn't defragment cleanly with this range key. + return nil, keyspan.Span{}, errors.AssertionFailedf("pebble: ingest sstable has suffixed range key that won't defragment: %s", + opts.Comparer.FormatKey(lastRangeKey.End)) + } + } else if opts.Comparer.Split.HasSuffix(s.Start) { + return nil, keyspan.Span{}, errors.Newf("pebble: ingest sstable has suffixed range key start that won't defragment: %s", + opts.Comparer.FormatKey(s.Start)) + } + } } if s, err := iter.Last(); err != nil { - return nil, err + return nil, keyspan.Span{}, err } else if s != nil { k := s.SmallestKey() if err := ingestValidateKey(opts, &k); err != nil { - return nil, err + return nil, keyspan.Span{}, err } // As range keys are fragmented, the end key of the last range key in // the table provides the upper bound for the table. largest := s.LargestKey().Clone() meta.ExtendRangeKeyBounds(opts.Comparer.Compare, smallest, largest) + lastRangeKey = s.Clone() + } else { + // s == nil. + lastRangeKey = keyspan.Span{} } + } else { + lastRangeKey = keyspan.Span{} } } if !meta.HasPointKeys && !meta.HasRangeKeys { - return nil, nil + return nil, keyspan.Span{}, nil } // Sanity check that the various bounds on the file were set consistently. if err := meta.Validate(opts.Comparer.Compare, opts.Comparer.FormatKey); err != nil { - return nil, err + return nil, keyspan.Span{}, err } - return meta, nil + return meta, lastRangeKey, nil } type ingestLoadResult struct { @@ -445,6 +473,7 @@ func ingestLoad( var result ingestLoadResult result.local = make([]ingestLocalMeta, 0, len(paths)) + var lastRangeKey keyspan.Span for i := range paths { f, err := opts.FS.Open(paths[i]) if err != nil { @@ -455,7 +484,12 @@ func ingestLoad( if err != nil { return ingestLoadResult{}, err } - m, err := ingestLoad1(ctx, opts, fmv, readable, cacheID, localFileNums[i]) + var m *fileMetadata + // NB: we disable range key boundary assertions if we have shared or external files + // present in this ingestion. This is because a suffixed range key in a local file + // can possibly defragment with a suffixed range key in a shared or external file. + disableRangeKeyChecks := len(shared) > 0 || len(external) > 0 + m, lastRangeKey, err = ingestLoad1(ctx, opts, fmv, readable, cacheID, localFileNums[i], lastRangeKey, disableRangeKeyChecks) if err != nil { return ingestLoadResult{}, err } @@ -466,6 +500,10 @@ func ingestLoad( }) } } + if lastRangeKey.Valid() && opts.Comparer.Split.HasSuffix(lastRangeKey.End) { + return ingestLoadResult{}, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s", + opts.Comparer.FormatKey(lastRangeKey.End)) + } // Sort the shared files according to level. sort.Sort(sharedByLevel(shared)) diff --git a/internal/base/comparer.go b/internal/base/comparer.go index 78a7aadd37..c734052de0 100644 --- a/internal/base/comparer.go +++ b/internal/base/comparer.go @@ -186,6 +186,13 @@ func (s Split) Prefix(k []byte) []byte { return k[:i:i] } +// HasSuffix returns true if the key k has a suffix remaining after +// Split is called on it. For keys where the entirety of the key is +// returned by Split, HasSuffix will return false. +func (s Split) HasSuffix(k []byte) bool { + return s(k) < len(k) +} + // DefaultSplit is a trivial implementation of Split which always returns the // full key. var DefaultSplit Split = func(key []byte) int { return len(key) } diff --git a/iterator_histories_test.go b/iterator_histories_test.go index e22fa18e0b..6b72c1f13d 100644 --- a/iterator_histories_test.go +++ b/iterator_histories_test.go @@ -221,6 +221,16 @@ func TestIterHistories(t *testing.T) { } } return buf.String() + case "build": + if err := runBuildCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" + case "ingest-existing": + if err := runIngestCmd(td, d, d.opts.FS); err != nil { + return err.Error() + } + return "" case "ingest": if err := runBuildCmd(td, d, d.opts.FS); err != nil { return err.Error() diff --git a/open.go b/open.go index 2610e3451f..94ac3e78ba 100644 --- a/open.go +++ b/open.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/pebble/internal/cache" "github.com/cockroachdb/pebble/internal/constants" "github.com/cockroachdb/pebble/internal/invariants" + "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/manual" "github.com/cockroachdb/pebble/objstorage" @@ -807,17 +808,23 @@ func (d *DB) replayIngestedFlushable( } meta := make([]*fileMetadata, len(fileNums)) + var lastRangeKey keyspan.Span for i, n := range fileNums { readable, err := d.objProvider.OpenForReading(context.TODO(), fileTypeTable, n, objstorage.OpenOptions{MustExist: true}) if err != nil { return nil, errors.Wrap(err, "pebble: error when opening flushable ingest files") } // NB: ingestLoad1 will close readable. - meta[i], err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(), readable, d.cacheID, base.PhysicalTableFileNum(n)) + meta[i], lastRangeKey, err = ingestLoad1(context.TODO(), d.opts, d.FormatMajorVersion(), + readable, d.cacheID, base.PhysicalTableFileNum(n), lastRangeKey, false /* disableRangeKeyChecks */) if err != nil { return nil, errors.Wrap(err, "pebble: error when loading flushable ingest files") } } + if lastRangeKey.Valid() && d.opts.Comparer.Split.HasSuffix(lastRangeKey.End) { + return nil, errors.AssertionFailedf("pebble: last ingest sstable has suffixed range key end %s", + d.opts.Comparer.FormatKey(lastRangeKey.End)) + } numFiles := len(meta) if exciseSpan.Valid() { diff --git a/testdata/iter_histories/prefix_iteration b/testdata/iter_histories/prefix_iteration index a5484b51e4..728b504ca8 100644 --- a/testdata/iter_histories/prefix_iteration +++ b/testdata/iter_histories/prefix_iteration @@ -140,8 +140,14 @@ ingest ext1 range-key-set a c@8 @1 bar set c@9 c@9 ---- +pebble: last ingest sstable has suffixed range key end c@8 -ingest ext2 +build ext1 +range-key-set a c@8 @1 bar +set c@9 c@9 +---- + +build ext2 range-key-set c@8 e @1 bar set c@8 c@8 set c@7 c@7 @@ -152,6 +158,9 @@ set c@3 c@3 set c@2 c@2 ---- +ingest-existing ext1 ext2 +---- + ingest ext2 range-key-set y z @1 foo set z z @@ -160,9 +169,9 @@ set z z lsm ---- L6: - 000004:[a#10,RANGEKEYSET-c@8#inf,RANGEKEYSET] - 000005:[c@8#11,RANGEKEYSET-e#inf,RANGEKEYSET] - 000006:[y#12,RANGEKEYSET-z#12,SET] + 000005:[a#10,RANGEKEYSET-c@8#inf,RANGEKEYSET] + 000006:[c@8#11,RANGEKEYSET-e#inf,RANGEKEYSET] + 000007:[y#12,RANGEKEYSET-z#12,SET] # The first seek-prefix-ge y@1 converts the iterator from lazy combined iterator