diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt index a84d13adfc82..06811321d3f6 100644 --- a/docs/generated/settings/settings-for-tenants.txt +++ b/docs/generated/settings/settings-for-tenants.txt @@ -385,6 +385,7 @@ sql.ttl.job.enabled boolean true whether the TTL job is enabled application sql.txn.read_committed_isolation.enabled boolean true set to true to allow transactions to use the READ COMMITTED isolation level if specified by BEGIN/SET commands application sql.txn.repeatable_read_isolation.enabled (alias: sql.txn.snapshot_isolation.enabled) boolean false set to true to allow transactions to use the REPEATABLE READ isolation level if specified by BEGIN/SET commands application sql.txn_fingerprint_id_cache.capacity integer 100 the maximum number of txn fingerprint IDs stored application +storage.columnar_blocks.enabled boolean false set to true to enable columnar-blocks to store KVs in a columnar format system-visible storage.ingestion.value_blocks.enabled boolean true set to true to enable writing of value blocks in ingestion sstables application storage.max_sync_duration duration 20s maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash system-visible storage.max_sync_duration.fatal.enabled boolean true if true, fatal the process when a disk operation exceeds storage.max_sync_duration application diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 6ae8f81d3a1d..84e79c96fd07 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -339,6 +339,7 @@
sql.txn.read_committed_isolation.enabled
booleantrueset to true to allow transactions to use the READ COMMITTED isolation level if specified by BEGIN/SET commandsServerless/Dedicated/Self-Hosted
sql.txn.repeatable_read_isolation.enabled
(alias: sql.txn.snapshot_isolation.enabled)
booleanfalseset to true to allow transactions to use the REPEATABLE READ isolation level if specified by BEGIN/SET commandsServerless/Dedicated/Self-Hosted
sql.txn_fingerprint_id_cache.capacity
integer100the maximum number of txn fingerprint IDs storedServerless/Dedicated/Self-Hosted +
storage.columnar_blocks.enabled
booleanfalseset to true to enable columnar-blocks to store KVs in a columnar formatDedicated/Self-hosted (read-write); Serverless (read-only)
storage.experimental.eventually_file_only_snapshots.enabled
booleantrueset to false to disable eventually-file-only-snapshots (kv.snapshot_receiver.excise.enabled must also be false)Dedicated/Self-Hosted
storage.ingest_split.enabled
booleantrueset to false to disable ingest-time splitting that lowers write-amplificationDedicated/Self-Hosted
storage.ingestion.value_blocks.enabled
booleantrueset to true to enable writing of value blocks in ingestion sstablesServerless/Dedicated/Self-Hosted diff --git a/go.mod b/go.mod index df35da8c1d65..fe9967ff3b4b 100644 --- a/go.mod +++ b/go.mod @@ -128,6 +128,7 @@ require ( github.com/cockroachdb/cmux v0.0.0-20170110192607-30d10be49292 github.com/cockroachdb/cockroach-go/v2 v2.3.7 github.com/cockroachdb/crlfmt v0.0.0-20221214225007-b2fc5c302548 + github.com/cockroachdb/crlib v0.0.0-20241015224233-894974b3ad94 github.com/cockroachdb/datadriven v1.0.3-0.20240530155848-7682d40af056 github.com/cockroachdb/errors v1.11.3 github.com/cockroachdb/fifo v0.0.0-20240606204812-0bbfbd93a7ce @@ -306,7 +307,6 @@ require ( github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/charmbracelet/bubbletea v0.23.1 // indirect github.com/charmbracelet/lipgloss v0.6.0 // indirect - github.com/cockroachdb/crlib v0.0.0-20241015224233-894974b3ad94 // indirect github.com/cockroachdb/swiss v0.0.0-20240612210725-f4de07ae6964 // indirect github.com/danieljoos/wincred v1.1.2 // indirect github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 // indirect diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel index ffbc4d912c38..51be20522921 100644 --- a/pkg/storage/BUILD.bazel +++ b/pkg/storage/BUILD.bazel @@ -28,6 +28,7 @@ go_library( "pebble.go", "pebble_batch.go", "pebble_iterator.go", + "pebble_key_schema.go", "pebble_logger_and_tracer.go", "pebble_merge.go", "pebble_mvcc_scanner.go", @@ -86,6 +87,7 @@ go_library( "//pkg/util/timeutil", "//pkg/util/tracing", "//pkg/util/uuid", + "@com_github_cockroachdb_crlib//crbytes", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_errors//oserror", "@com_github_cockroachdb_fifo//:fifo", @@ -100,6 +102,7 @@ go_library( "@com_github_cockroachdb_pebble//replay", "@com_github_cockroachdb_pebble//sstable", "@com_github_cockroachdb_pebble//sstable/block", + "@com_github_cockroachdb_pebble//sstable/colblk", "@com_github_cockroachdb_pebble//vfs", 
"@com_github_cockroachdb_pebble//wal", "@com_github_cockroachdb_redact//:redact", @@ -139,6 +142,7 @@ go_test( "mvcc_value_test.go", "open_test.go", "pebble_iterator_test.go", + "pebble_key_schema_test.go", "pebble_mvcc_scanner_test.go", "pebble_test.go", "read_as_of_iterator_test.go", @@ -198,6 +202,8 @@ go_test( "//pkg/util/timeutil", "//pkg/util/uint128", "//pkg/util/uuid", + "@com_github_cockroachdb_crlib//crbytes", + "@com_github_cockroachdb_crlib//crstrings", "@com_github_cockroachdb_datadriven//:datadriven", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_errors//oserror", @@ -206,9 +212,11 @@ go_test( "@com_github_cockroachdb_pebble//objstorage/objstorageprovider", "@com_github_cockroachdb_pebble//sstable", "@com_github_cockroachdb_pebble//sstable/block", + "@com_github_cockroachdb_pebble//sstable/colblk", "@com_github_cockroachdb_pebble//vfs", "@com_github_cockroachdb_redact//:redact", "@com_github_kr_pretty//:pretty", + "@com_github_olekukonko_tablewriter//:tablewriter", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@org_golang_google_protobuf//proto", diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 505910772fbb..da114f9d5cb6 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -102,6 +102,15 @@ var IngestSplitEnabled = settings.RegisterBoolSetting( settings.WithPublic, ) +// columnarBlocksEnabled controls whether columnar-blocks are enabled in Pebble. +var columnarBlocksEnabled = settings.RegisterBoolSetting( + settings.SystemVisible, + "storage.columnar_blocks.enabled", + "set to true to enable columnar-blocks to store KVs in a columnar format", + false, // TODO(jackson): Metamorphicize this. 
+ settings.WithPublic, +) + // IngestAsFlushable controls whether ingested sstables that overlap the // memtable may be lazily ingested: written to the WAL and enqueued in the list // of flushables (eg, memtables, large batches and now lazily-ingested @@ -407,6 +416,30 @@ func EngineSuffixCompare(a, b []byte) int { return bytes.Compare(b[:len(b)-1], a[:len(a)-1]) } +// EnginePointSuffixCompare compares suffixes of Cockroach point keys (which are +// composed of the version and a trailing version-length byte); the version can +// be an MVCC timestamp or a lock key. EnginePointSuffixCompare differs from +// EngineSuffixCompare, because EnginePointSuffixCompare normalizes the +// suffixes. Ideally we'd have one function that implemented the semantics of +// EnginePointSuffixCompare, but due to historical reasons, range key suffix +// comparisons must not perform normalization. +// +// See https://github.com/cockroachdb/cockroach/issues/130533 +func EnginePointSuffixCompare(a, b []byte) int { + // NB: For performance, this routine manually splits the key into the + // user-key and version components rather than using DecodeEngineKey. In + // most situations, use DecodeEngineKey or GetKeyPartFromEngineKey or + // SplitMVCCKey instead of doing this. + if len(a) == 0 || len(b) == 0 { + // Empty suffixes sort before non-empty suffixes. + return cmp.Compare(len(a), len(b)) + } + return bytes.Compare( + normalizeEngineSuffixForCompare(b), + normalizeEngineSuffixForCompare(a), + ) +} + func checkEngineKey(k []byte) { if len(k) == 0 { panic(errors.AssertionFailedf("empty key")) @@ -792,8 +825,9 @@ const MinimumSupportedFormatVersion = pebble.FormatSyntheticPrefixSuffix // DefaultPebbleOptions returns the default pebble options. func DefaultPebbleOptions() *pebble.Options { opts := &pebble.Options{ - Comparer: EngineComparer, - FS: vfs.Default, + Comparer: EngineComparer, + FS: vfs.Default, + KeySchema: keySchema, // A value of 2 triggers a compaction when there is 1 sub-level. 
L0CompactionThreshold: 2, L0StopWritesThreshold: 1000, @@ -1190,6 +1224,9 @@ func newPebble(ctx context.Context, cfg engineConfig) (p *Pebble, err error) { cfg.opts.Experimental.IngestSplit = func() bool { return IngestSplitEnabled.Get(&cfg.settings.SV) } + cfg.opts.Experimental.EnableColumnarBlocks = func() bool { + return columnarBlocksEnabled.Get(&cfg.settings.SV) + } auxDir := cfg.opts.FS.PathJoin(cfg.env.Dir, base.AuxiliaryDir) if !cfg.env.IsReadOnly() { @@ -2512,7 +2549,7 @@ func (p *Pebble) CreateCheckpoint(dir string, spans []roachpb.Span) error { // version associated with it, since they did so during the fence version. var pebbleFormatVersionMap = map[clusterversion.Key]pebble.FormatMajorVersion{ clusterversion.V24_1: pebble.FormatSyntheticPrefixSuffix, - clusterversion.V24_3: pebble.FormatFlushableIngestExcises, + clusterversion.V24_3: pebble.FormatColumnarBlocks, } // pebbleFormatVersionKeys contains the keys in the map above, in descending order. diff --git a/pkg/storage/pebble_key_schema.go b/pkg/storage/pebble_key_schema.go new file mode 100644 index 000000000000..13adefb9e0b7 --- /dev/null +++ b/pkg/storage/pebble_key_schema.go @@ -0,0 +1,469 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package storage + +import ( + "bytes" + "cmp" + "encoding/binary" + "fmt" + "io" + "sync" + "unsafe" + + "github.com/cockroachdb/cockroach/pkg/util/buildutil" + "github.com/cockroachdb/crlib/crbytes" + "github.com/cockroachdb/errors" + "github.com/cockroachdb/pebble/sstable/colblk" +) + +const ( + // cockroachColRoachKey is a roachpb.Key user key. It does NOT include the + // 0x00 terminator byte that a serialized engine key includes. + cockroachColRoachKey int = iota + // cockroachColMVCCWallTime is the wall time component of a MVCC timestamp, + // or zero if not an MVCC key. 
+ cockroachColMVCCWallTime + // cockroachColMVCCLogical is the logical time component of a MVCC + // timestamp, or zero if not an MVCC key. + cockroachColMVCCLogical + // cockroachColUntypedVersion holds any non-empty, non-MVCC version. It does + // NOT include the 0x00 separator byte that delimits the prefix and suffix + // in a serialized engine key. In practice, this column is used to store the + // version of lock-table keys. + cockroachColUntypedVersion + cockroachColCount +) + +var keySchema = colblk.KeySchema{ + ColumnTypes: []colblk.DataType{ + cockroachColRoachKey: colblk.DataTypePrefixBytes, + cockroachColMVCCWallTime: colblk.DataTypeUint, + cockroachColMVCCLogical: colblk.DataTypeUint, + cockroachColUntypedVersion: colblk.DataTypeBytes, + }, + NewKeyWriter: func() colblk.KeyWriter { + kw := &cockroachKeyWriter{} + kw.roachKeys.Init(16) + kw.wallTimes.Init() + kw.logicalTimes.InitWithDefault() + kw.untypedVersions.Init() + return kw + }, + NewKeySeeker: func() colblk.KeySeeker { + return &cockroachKeySeeker{} + }, +} + +type cockroachKeyWriter struct { + roachKeys colblk.PrefixBytesBuilder + wallTimes colblk.UintBuilder + logicalTimes colblk.UintBuilder + untypedVersions colblk.RawBytesBuilder + prevSuffix []byte +} + +// Assert *cockroachKeyWriter implements colblk.KeyWriter. +var _ colblk.KeyWriter = (*cockroachKeyWriter)(nil) + +func (kw *cockroachKeyWriter) ComparePrev(key []byte) colblk.KeyComparison { + var cmpv colblk.KeyComparison + cmpv.PrefixLen = int32(EngineKeySplit(key)) // TODO(jackson): Inline + if kw.roachKeys.Rows() == 0 { + cmpv.UserKeyComparison = 1 + return cmpv + } + lp := kw.roachKeys.UnsafeGet(kw.roachKeys.Rows() - 1) + cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen-1])) + if cmpv.CommonPrefixLen == cmpv.PrefixLen-1 { + // Adjust CommonPrefixLen to include the sentinel byte. 
+ cmpv.CommonPrefixLen = cmpv.PrefixLen + cmpv.UserKeyComparison = int32(EnginePointSuffixCompare(key[cmpv.PrefixLen:], kw.prevSuffix)) + return cmpv + } + // The keys have different MVCC prefixes. We haven't determined which is + // greater, but we know the index at which they diverge. The base.Comparer + // contract dictates that prefixes must be lexicographically ordered. + if len(lp) == int(cmpv.CommonPrefixLen) { + // cmpv.PrefixLen > cmpv.PrefixLenShared; key is greater. + cmpv.UserKeyComparison = +1 + } else { + // Both keys have at least 1 additional byte at which they diverge. + // Compare the diverging byte. + cmpv.UserKeyComparison = int32(cmp.Compare(key[cmpv.CommonPrefixLen], lp[cmpv.CommonPrefixLen])) + } + return cmpv +} + +func (kw *cockroachKeyWriter) WriteKey( + row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32, +) { + if len(key) == 0 { + panic(errors.AssertionFailedf("empty key")) + } + // Last byte is the version length + 1 when there is a version, + // else it is 0. + versionLen := int(key[len(key)-1]) + if (len(key)-versionLen) != int(keyPrefixLen) || key[keyPrefixLen-1] != 0x00 { + panic(errors.AssertionFailedf("invalid %d-byte key with %d-byte prefix (%q)", + len(key), keyPrefixLen, key)) + } + // TODO(jackson): Avoid copying the previous suffix. + kw.prevSuffix = append(kw.prevSuffix[:0], key[keyPrefixLen:]...) + + // When the roach key is the same, keyPrefixLenSharedWithPrev includes the + // separator byte. + kw.roachKeys.Put(key[:keyPrefixLen-1], min(int(keyPrefixLenSharedWithPrev), int(keyPrefixLen)-1)) + + // NB: The w.logicalTimes builder was initialized with InitWithDefault, so + // if we don't set a value, the column value is implicitly zero. We only + // need to Set anything for non-zero values. + var wallTime uint64 + var untypedVersion []byte + switch versionLen { + case 0: + // No-op. 
+ case 9: + wallTime = binary.BigEndian.Uint64(key[keyPrefixLen : keyPrefixLen+8]) + case 13, 14: + wallTime = binary.BigEndian.Uint64(key[keyPrefixLen : keyPrefixLen+8]) + kw.logicalTimes.Set(row, uint64(binary.BigEndian.Uint32(key[keyPrefixLen+8:keyPrefixLen+12]))) + // NOTE: byte 13 used to store the timestamp's synthetic bit, but this is no + // longer consulted and can be ignored during decoding. + default: + // Not a MVCC timestamp. + untypedVersion = key[keyPrefixLen:] + } + kw.wallTimes.Set(row, wallTime) + kw.untypedVersions.Put(untypedVersion) +} + +func (kw *cockroachKeyWriter) MaterializeKey(dst []byte, i int) []byte { + dst = append(dst, kw.roachKeys.UnsafeGet(i)...) + // Append separator byte. + dst = append(dst, 0) + if untypedVersion := kw.untypedVersions.UnsafeGet(i); len(untypedVersion) > 0 { + dst = append(dst, untypedVersion...) + return dst + } + wall := kw.wallTimes.Get(i) + logical := uint32(kw.logicalTimes.Get(i)) + if logical == 0 { + if wall == 0 { + return dst + } + dst = append(dst, make([]byte, 9)...) + binary.BigEndian.PutUint64(dst[len(dst)-9:], wall) + dst[len(dst)-1] = 9 // Version length byte + return dst + } + dst = append(dst, make([]byte, 13)...) 
+ binary.BigEndian.PutUint64(dst[len(dst)-13:], wall) + binary.BigEndian.PutUint32(dst[len(dst)-5:], logical) + dst[len(dst)-1] = 13 // Version length byte + return dst +} + +func (kw *cockroachKeyWriter) Reset() { + kw.roachKeys.Reset() + kw.wallTimes.Reset() + kw.logicalTimes.Reset() + kw.untypedVersions.Reset() +} + +func (kw *cockroachKeyWriter) WriteDebug(dst io.Writer, rows int) { + fmt.Fprint(dst, "prefixes: ") + kw.roachKeys.WriteDebug(dst, rows) + fmt.Fprintln(dst) + fmt.Fprint(dst, "wall times: ") + kw.wallTimes.WriteDebug(dst, rows) + fmt.Fprintln(dst) + fmt.Fprint(dst, "logical times: ") + kw.logicalTimes.WriteDebug(dst, rows) + fmt.Fprintln(dst) + fmt.Fprint(dst, "untyped suffixes: ") + kw.untypedVersions.WriteDebug(dst, rows) + fmt.Fprintln(dst) +} + +func (kw *cockroachKeyWriter) NumColumns() int { + return cockroachColCount +} + +func (kw *cockroachKeyWriter) DataType(col int) colblk.DataType { + return keySchema.ColumnTypes[col] +} + +func (kw *cockroachKeyWriter) Size(rows int, offset uint32) uint32 { + offset = kw.roachKeys.Size(rows, offset) + offset = kw.wallTimes.Size(rows, offset) + offset = kw.logicalTimes.Size(rows, offset) + offset = kw.untypedVersions.Size(rows, offset) + return offset +} + +func (kw *cockroachKeyWriter) Finish( + col int, rows int, offset uint32, buf []byte, +) (endOffset uint32) { + switch col { + case cockroachColRoachKey: + return kw.roachKeys.Finish(0, rows, offset, buf) + case cockroachColMVCCWallTime: + return kw.wallTimes.Finish(0, rows, offset, buf) + case cockroachColMVCCLogical: + return kw.logicalTimes.Finish(0, rows, offset, buf) + case cockroachColUntypedVersion: + return kw.untypedVersions.Finish(0, rows, offset, buf) + default: + panic(fmt.Sprintf("unknown default key column: %d", col)) + } +} + +var cockroachKeySeekerPool = sync.Pool{ + New: func() interface{} { return &cockroachKeySeeker{} }, +} + +type cockroachKeySeeker struct { + roachKeys colblk.PrefixBytes + roachKeyChanged colblk.Bitmap + 
mvccWallTimes colblk.UnsafeUints + mvccLogical colblk.UnsafeUints + untypedVersions colblk.RawBytes +} + +var _ colblk.KeySeeker = (*cockroachKeySeeker)(nil) + +// Init is part of the KeySeeker interface. +func (ks *cockroachKeySeeker) Init(d *colblk.DataBlockDecoder) error { + bd := d.BlockDecoder() + ks.roachKeys = bd.PrefixBytes(cockroachColRoachKey) + ks.roachKeyChanged = d.PrefixChanged() + ks.mvccWallTimes = bd.Uints(cockroachColMVCCWallTime) + ks.mvccLogical = bd.Uints(cockroachColMVCCLogical) + ks.untypedVersions = bd.RawBytes(cockroachColUntypedVersion) + return nil +} + +// IsLowerBound compares the provided key to the first user key +// contained within the data block. It's equivalent to performing +// +// Compare(firstUserKey, k) >= 0 +func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool { + ek, ok := DecodeEngineKey(k) + if !ok { + panic(errors.AssertionFailedf("invalid key %q", k)) + } + if v := bytes.Compare(ks.roachKeys.UnsafeFirstSlice(), ek.Key); v != 0 { + return v > 0 + } + // If there's a synthetic suffix, we ignore the block's suffix columns and + // compare the key's suffix to the synthetic suffix. + if len(syntheticSuffix) > 0 { + return EnginePointSuffixCompare(syntheticSuffix, k[len(ek.Key)+1:]) >= 0 + } + var wallTime uint64 + var logicalTime uint32 + switch len(ek.Version) { + case engineKeyNoVersion: + case engineKeyVersionWallTimeLen: + wallTime = binary.BigEndian.Uint64(ek.Version[:8]) + case engineKeyVersionWallAndLogicalTimeLen, engineKeyVersionWallLogicalAndSyntheticTimeLen: + wallTime = binary.BigEndian.Uint64(ek.Version[:8]) + logicalTime = binary.BigEndian.Uint32(ek.Version[8:12]) + default: + // The provided key `k` is not a MVCC key. Assert that the first key in + // the block is also not an MVCC key. If it were, that would mean there + // exists both a MVCC key and a non-MVCC key with the same prefix. 
+ // + // TODO(jackson): Double check that we'll never produce index separators + // that are invalid version lengths. + if buildutil.CrdbTestBuild && ks.mvccWallTimes.At(0) != 0 { + panic("comparing timestamp with untyped suffix") + } + return EnginePointSuffixCompare(ks.untypedVersions.At(0), ek.Version) >= 0 + } + + // NB: The sign comparison is inverted because suffixes are sorted such that + // the largest timestamps are "smaller" in the lexicographical ordering. + if v := cmp.Compare(ks.mvccWallTimes.At(0), wallTime); v != 0 { + return v < 0 + } + return cmp.Compare(uint32(ks.mvccLogical.At(0)), logicalTime) <= 0 +} + +// SeekGE is part of the KeySeeker interface. +func (ks *cockroachKeySeeker) SeekGE( + key []byte, boundRow int, searchDir int8, +) (row int, equalPrefix bool) { + // TODO(jackson): Inline EngineKeySplit. + si := EngineKeySplit(key) + row, eq := ks.roachKeys.Search(key[:si-1]) + if eq { + return ks.seekGEOnSuffix(row, key[si:]), true + } + return row, false +} + +// seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix +// exactly matches a row. seekGEOnSuffix finds the first row at index or later +// with the same prefix as index and a suffix greater than or equal to [suffix], +// or if no such row exists, the next row with a different prefix. +func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) { + // The search key's prefix exactly matches the prefix of the row at index. + const withWall = 9 + const withLogical = withWall + 4 + const withSynthetic = withLogical + 1 + var seekWallTime uint64 + var seekLogicalTime uint32 + switch len(seekSuffix) { + case 0: + // The search key has no suffix, so it's the smallest possible key with + // its prefix. Return the row. This is a common case where the user is + // seeking to the most-recent row and just wants the smallest key with + // the prefix. 
+ return index + case withLogical, withSynthetic: + seekWallTime = binary.BigEndian.Uint64(seekSuffix) + seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:]) + case withWall: + seekWallTime = binary.BigEndian.Uint64(seekSuffix) + default: + // The suffix is untyped. Compare the untyped suffixes. + // Binary search between [index, prefixChanged.SeekSetBitGE(index+1)]. + // + // Define f(i) = true iff key at i is >= seek key. + // Invariant: f(l-1) == false, f(u) == true. + l := index + u := ks.roachKeyChanged.SeekSetBitGE(index + 1) + for l < u { + h := int(uint(l+u) >> 1) // avoid overflow when computing h + // l ≤ h < u + if bytes.Compare(ks.untypedVersions.At(h), seekSuffix) >= 0 { + u = h // preserves f(u) == true + } else { + l = h + 1 // preserves f(l-1) == false + } + } + return l + } + // Seeking among MVCC versions using a MVCC timestamp. + + // TODO(jackson): What if the row has an untyped suffix? + + // First check the suffix at index, because querying for the latest value is + // the most common case. + if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime || + (latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) { + return index + } + + // Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)]. + // + // Define f(i) = true iff key at i is >= seek key. + // Invariant: f(l-1) == false, f(u) == true. + l := index + 1 + u := ks.roachKeyChanged.SeekSetBitGE(index + 1) + for l < u { + h := int(uint(l+u) >> 1) // avoid overflow when computing h + // l ≤ h < u + hWallTime := ks.mvccWallTimes.At(h) + if hWallTime < seekWallTime || + (hWallTime == seekWallTime && uint32(ks.mvccLogical.At(h)) <= seekLogicalTime) { + u = h // preserves f(u) = true + } else { + l = h + 1 // preserves f(l-1) = false + } + } + return l +} + +// MaterializeUserKey is part of the KeySeeker interface. 
+func (ks *cockroachKeySeeker) MaterializeUserKey( + ki *colblk.PrefixBytesIter, prevRow, row int, +) []byte { + if prevRow+1 == row && prevRow >= 0 { + ks.roachKeys.SetNext(ki) + } else { + ks.roachKeys.SetAt(ki, row) + } + + roachKeyLen := len(ki.Buf) + ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(roachKeyLen)) + mvccWall := ks.mvccWallTimes.At(row) + mvccLogical := uint32(ks.mvccLogical.At(row)) + if mvccWall == 0 && mvccLogical == 0 { + // This is not an MVCC key. Use the untyped suffix. + untypedVersion := ks.untypedVersions.At(row) + if len(untypedVersion) == 0 { + res := ki.Buf[:roachKeyLen+1] + res[roachKeyLen] = 0 + return res + } + // Slice first, to check that the capacity is sufficient. + res := ki.Buf[:roachKeyLen+1+len(untypedVersion)] + *(*byte)(ptr) = 0 + memmove( + unsafe.Pointer(uintptr(ptr)+1), + unsafe.Pointer(unsafe.SliceData(untypedVersion)), + uintptr(len(untypedVersion)), + ) + return res + } + + // Inline binary.BigEndian.PutUint64. Note that this code is converted into + // word-size instructions by the compiler. + *(*byte)(ptr) = 0 + *(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccWall >> 56) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccWall >> 48) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccWall >> 40) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = byte(mvccWall >> 32) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 5)) = byte(mvccWall >> 24) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 6)) = byte(mvccWall >> 16) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 7)) = byte(mvccWall >> 8) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 8)) = byte(mvccWall) + + ptr = unsafe.Pointer(uintptr(ptr) + 9) + // This is an MVCC key. + if mvccLogical == 0 { + *(*byte)(ptr) = 9 + return ki.Buf[:len(ki.Buf)+10] + } + + // Inline binary.BigEndian.PutUint32. 
+ *(*byte)(ptr) = byte(mvccWall >> 24) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccWall >> 16) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccWall >> 8) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccWall) + *(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = 13 + return ki.Buf[:len(ki.Buf)+14] +} + +// MaterializeUserKeyWithSyntheticSuffix is part of the KeySeeker interface. +func (ks *cockroachKeySeeker) MaterializeUserKeyWithSyntheticSuffix( + ki *colblk.PrefixBytesIter, suffix []byte, prevRow, row int, +) []byte { + if prevRow+1 == row && prevRow >= 0 { + ks.roachKeys.SetNext(ki) + } else { + ks.roachKeys.SetAt(ki, row) + } + + // Slice first, to check that the capacity is sufficient. + res := ki.Buf[:len(ki.Buf)+1+len(suffix)] + ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(len(ki.Buf))) + *(*byte)(ptr) = 0 + memmove(unsafe.Pointer(uintptr(ptr)+1), unsafe.Pointer(unsafe.SliceData(suffix)), uintptr(len(suffix))) + return res +} + +// Release is part of the KeySeeker interface. +func (ks *cockroachKeySeeker) Release() { + *ks = cockroachKeySeeker{} + cockroachKeySeekerPool.Put(ks) +} + +//go:linkname memmove runtime.memmove +func memmove(to, from unsafe.Pointer, n uintptr) diff --git a/pkg/storage/pebble_key_schema_test.go b/pkg/storage/pebble_key_schema_test.go new file mode 100644 index 000000000000..57c917d3a801 --- /dev/null +++ b/pkg/storage/pebble_key_schema_test.go @@ -0,0 +1,314 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. 
// TestKeySchema_KeyWriter is a datadriven test of the colblk.KeyWriter
// produced by keySchema. The testdata file "key_schema_key_writer" drives it
// with three commands:
//
//   - init: start a new block, either by resetting the current writer or by
//     obtaining a new one (chosen at random);
//   - write: parse each input line as a test key, feed it through
//     ComparePrev and WriteKey, and check that MaterializeKey returns an
//     equal key;
//   - finish: serialize every key column and print the decoded columns as a
//     table.
func TestKeySchema_KeyWriter(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	// kw is the writer under test; row is the number of keys written to it
	// since the last "init". keyBuf holds the most recently materialized key
	// and doubles as the "previous key" for ordering checks.
	var kw colblk.KeyWriter
	var row int
	var buf bytes.Buffer
	var keyBuf []byte
	datadriven.RunTest(t, datapathutils.TestDataPath(t, "key_schema_key_writer"), func(t *testing.T, td *datadriven.TestData) string {
		buf.Reset()
		switch td.Cmd {
		case "init":
			// Exercise both resetting and retrieving a new writer.
			if kw != nil && rand.Intn(2) == 1 {
				kw.Reset()
			} else {
				kw = keySchema.NewKeyWriter()
			}
			row = 0
			keyBuf = keyBuf[:0]
			return ""
		case "write":
			for i, line := range crstrings.Lines(td.Input) {
				k, err := parseTestKey(line)
				if err != nil {
					t.Fatalf("bad test key %q on line %d: %s", line, i, err)
				}
				fmt.Fprintf(&buf, "Parse(%q) = hex:%x\n", line, k)
				kcmp := kw.ComparePrev(k)
				// Keys must be written in non-decreasing order, and the
				// writer's comparison against the previous key must agree
				// with EngineKeyCompare.
				if v := EngineKeyCompare(k, keyBuf); v < 0 {
					t.Fatalf("line %d: EngineKeyCompare(%q, hex:%x) = %d", i, line, keyBuf, v)
				} else if v != int(kcmp.UserKeyComparison) {
					t.Fatalf("line %d: EngineKeyCompare(%q, hex:%x) = %d; kcmp.UserKeyComparison = %d",
						i, line, keyBuf, v, kcmp.UserKeyComparison)
				}

				fmt.Fprintf(&buf, "%02d: ComparePrev(%q): PrefixLen=%d; CommonPrefixLen=%d; UserKeyComparison=%d\n",
					i, line, kcmp.PrefixLen, kcmp.CommonPrefixLen, kcmp.UserKeyComparison)
				kw.WriteKey(row, k, kcmp.PrefixLen, kcmp.CommonPrefixLen)
				fmt.Fprintf(&buf, "%02d: WriteKey(%d, %q, PrefixLen=%d, CommonPrefixLen=%d)\n",
					i, row, line, kcmp.PrefixLen, kcmp.CommonPrefixLen)

				// Round-trip: the key reconstructed from the columns must be
				// equal to the key that was written. The check uses the
				// engine's equality and comparison functions rather than a
				// bytewise comparison.
				keyBuf = kw.MaterializeKey(keyBuf[:0], row)
				if !EngineKeyEqual(k, keyBuf) {
					t.Fatalf("line %d: EngineKeyEqual(hex:%x, hex:%x) == false", i, k, keyBuf)
				}
				if v := EngineKeyCompare(k, keyBuf); v != 0 {
					t.Fatalf("line %d: EngineKeyCompare(hex:%x, hex:%x) = %d", i, k, keyBuf, v)
				}

				fmt.Fprintf(&buf, "%02d: MaterializeKey(_, %d) = hex:%x\n", i, row, keyBuf)
				row++
			}
			return buf.String()
		case "finish":
			// Serialize the columns back-to-back into one buffer; offs[i] is
			// the offset at which column i starts and offs[i+1] the offset
			// returned by Finish for it.
			b := crbytes.AllocAligned(int(kw.Size(row, 0) + 1))
			offs := make([]uint32, kw.NumColumns()+1)
			for i := 0; i < kw.NumColumns(); i++ {
				offs[i+1] = kw.Finish(i, row, offs[i], b)
			}
			// Decode each column from its start offset. The second return
			// value of each decoder is not needed here (presumably the end
			// offset — verify against colblk).
			roachKeys, _ := colblk.DecodePrefixBytes(b, offs[cockroachColRoachKey], row)
			mvccWallTimes, _ := colblk.DecodeUnsafeUints(b, offs[cockroachColMVCCWallTime], row)
			mvccLogicalTimes, _ := colblk.DecodeUnsafeUints(b, offs[cockroachColMVCCLogical], row)
			untypedVersions, _ := colblk.DecodeRawBytes(b, offs[cockroachColUntypedVersion], row)
			tbl := tablewriter.NewWriter(&buf)
			tbl.SetHeader([]string{"Key", "Wall", "Logical", "Untyped"})
			for i := 0; i < row; i++ {
				tbl.Append([]string{
					asciiOrHex(roachKeys.At(i)),
					fmt.Sprintf("%d", mvccWallTimes.At(i)),
					fmt.Sprintf("%d", mvccLogicalTimes.At(i)),
					fmt.Sprintf("%x", untypedVersions.At(i)),
				})
			}
			tbl.Render()
			return buf.String()
		default:
			panic(fmt.Sprintf("unrecognized command %q", td.Cmd))
		}
	})
}
t.Fatalf("bad test key %q: %s", line, err) + } + got := ks.IsLowerBound(k, syntheticSuffix) + fmt.Fprintf(&buf, "IsLowerBound(%q, %q) = %t\n", line, syntheticSuffixStr, got) + } + return buf.String() + case "seek-ge": + initKeySeeker() + for _, line := range crstrings.Lines(td.Input) { + fields := strings.Fields(line) + k, err := parseTestKey(fields[0]) + if err != nil { + t.Fatalf("bad test key %q: %s", fields[0], err) + } + boundRow := -1 + searchDir := 0 + if len(fields) == 3 { + boundRow, err = strconv.Atoi(fields[1]) + if err != nil { + t.Fatalf("bad bound row %q: %s", fields[1], err) + } + switch fields[2] { + case "fwd": + searchDir = +1 + case "bwd": + searchDir = -1 + default: + t.Fatalf("bad search direction %q", fields[2]) + } + } + row, equalPrefix := ks.SeekGE(k, boundRow, int8(searchDir)) + + fmt.Fprintf(&buf, "SeekGE(%q, boundRow=%d, searchDir=%d) = (row=%d, equalPrefix=%t)", + line, boundRow, searchDir, row, equalPrefix) + if row >= 0 && row < dec.BlockDecoder().Rows() { + var kiter colblk.PrefixBytesIter + kiter.Buf = make([]byte, maxKeyLen+1) + key := ks.MaterializeUserKey(&kiter, -1, row) + fmt.Fprintf(&buf, " [hex:%x]", key) + } + fmt.Fprintln(&buf) + } + return buf.String() + case "materialize-user-key": + initKeySeeker() + syntheticSuffix, syntheticSuffixStr, syntheticSuffixOk := getSyntheticSuffix(t, td) + + var kiter colblk.PrefixBytesIter + kiter.Buf = make([]byte, maxKeyLen+len(syntheticSuffix)+1) + prevRow := -1 + for _, line := range crstrings.Lines(td.Input) { + row, err := strconv.Atoi(line) + if err != nil { + t.Fatalf("bad row number %q: %s", line, err) + } + if syntheticSuffixOk { + key := ks.MaterializeUserKeyWithSyntheticSuffix(&kiter, syntheticSuffix, prevRow, row) + fmt.Fprintf(&buf, "MaterializeUserKeyWithSyntheticSuffix(%d, %d, %s) = hex:%x\n", prevRow, row, syntheticSuffixStr, key) + } else { + key := ks.MaterializeUserKey(&kiter, prevRow, row) + fmt.Fprintf(&buf, "MaterializeUserKey(%d, %d) = hex:%x\n", prevRow, row, key) + 
}
+				prevRow = row
+			}
+			return buf.String()
+		default:
+			panic(fmt.Sprintf("unrecognized command %q", td.Cmd))
+		}
+	})
+
+}
+
+// getSyntheticSuffix reads the optional "synthetic-suffix" argument of the
+// datadriven command td.
+//
+// It returns the parsed suffix with its leading separator byte removed, the
+// argument text exactly as written, and whether the argument was present.
+// When the argument is absent the results are nil, "" and false. A value that
+// parseTestKey rejects, or that encodes to nothing, fails the test.
+func getSyntheticSuffix(t *testing.T, td *datadriven.TestData) ([]byte, string, bool) {
+	t.Helper()
+	cmdArg, ok := td.Arg("synthetic-suffix")
+	if !ok {
+		return nil, "", false
+	}
+	syntheticSuffixStr := cmdArg.SingleVal(t)
+	syntheticSuffix, err := parseTestKey(syntheticSuffixStr)
+	if err != nil {
+		t.Fatalf("parsing synthetic suffix %q: %s", syntheticSuffixStr, err)
+	}
+	if len(syntheticSuffix) == 0 {
+		// An empty encoding (e.g. "hex:") has no separator byte to trim;
+		// slicing it below would panic rather than report a bad argument.
+		t.Fatalf("synthetic suffix %q encodes to an empty key", syntheticSuffixStr)
+	}
+	return syntheticSuffix[1:], syntheticSuffixStr, true // Trim the separator byte.
+}
+
+// asciiOrHex formats b for test output. b is returned as a plain string when
+// every rune in it lies in the printable ASCII range ' ' through '~';
+// otherwise the result is "hex:" followed by the hexadecimal encoding of b.
+func asciiOrHex(b []byte) string {
+	if bytes.ContainsFunc(b, func(r rune) bool { return r < ' ' || r > '~' }) {
+		return fmt.Sprintf("hex:%x", b)
+	}
+	return string(b)
+}
+
+// parseTestKey converts the textual key notation used by the datadriven tests
+// into encoded key bytes. The accepted forms are:
+//
+//   - "hex:<digits>": the hexadecimal digits are decoded and returned as is.
+//   - "<key>" (no '@'): the key bytes followed by a 0x00 sentinel byte.
+//   - "<key>@<strength>,<uuid>": <strength> is Shared, Exclusive or Intent;
+//     the result is the encoded lock table key for <key>.
+//   - "<key>@<timestamp>": any other suffix is handed to hlc.ParseTimestamp
+//     and the result is the encoded MVCC key.
+//
+// An error is returned for an invalid hex literal, an empty suffix after '@',
+// an unparsable transaction UUID, or an unparsable timestamp.
+func parseTestKey(s string) ([]byte, error) {
+	if strings.HasPrefix(s, "hex:") {
+		b, err := hex.DecodeString(strings.TrimPrefix(s, "hex:"))
+		if err != nil {
+			return nil, errors.Wrap(err, "parsing hexadecimal literal key")
+		}
+		return b, nil
+	}
+	i := strings.IndexByte(s, '@')
+	if i == -1 {
+		// Return just the roachpb key with the sentinel byte.
+		return append([]byte(s), 0x00), nil
+	}
+	version := s[i+1:]
+	if len(version) == 0 {
+		return nil, errors.Newf("key %q has empty suffix", s)
+	}
+	// Split at the first comma without assuming there is one. Slicing with the
+	// -1 that IndexByte returns for a comma-less suffix would panic; with Cut,
+	// such a suffix is simply carried whole in head and falls through to the
+	// timestamp parser (or to the UUID parser, which rejects the empty rest).
+	head, rest, _ := strings.Cut(version, ",")
+	switch head {
+	case "Shared", "Exclusive", "Intent":
+		// This is the lock strength. Parse as a lock table key.
+		strength := lock.Intent
+		switch head {
+		case "Shared":
+			strength = lock.Shared
+		case "Exclusive":
+			strength = lock.Exclusive
+		}
+		txnUUID, err := uuid.FromString(rest)
+		if err != nil {
+			return nil, errors.Wrap(err, "parsing lock table transaction UUID")
+		}
+		ltk := LockTableKey{
+			Key:      []byte(s[:i]),
+			Strength: strength,
+			TxnUUID:  txnUUID,
+		}
+		// NOTE(review): the second result is discarded exactly as before;
+		// presumably it is a reusable buffer rather than an error - confirm.
+		ek, _ := ltk.ToEngineKey(nil)
+		return ek.Encode(), nil
+	default:
+		// Parse as an MVCC key. The whole suffix, comma included, is the
+		// timestamp text.
+		ts, err := hlc.ParseTimestamp(version)
+		if err != nil {
+			return nil, errors.Wrap(err, "parsing MVCC timestamp")
+		}
+		return EncodeMVCCKey(MVCCKey{
+			Key:       []byte(s[:i]),
+			Timestamp: ts,
+		}), nil
+	}
+}
diff --git a/pkg/storage/testdata/key_schema_key_seeker b/pkg/storage/testdata/key_schema_key_seeker
new file mode 100644
index 000000000000..e05eeb8394c6
--- /dev/null
+++ b/pkg/storage/testdata/key_schema_key_seeker
@@ -0,0 +1,149 @@
+define-block
+foo@3.000000000,1
+foo@3.000000000,0
+----
+Parse("foo@3.000000000,1") = hex:666f6f0000000000b2d05e00000000010d
+Parse("foo@3.000000000,0") = hex:666f6f0000000000b2d05e0009
+
+is-lower-bound
+abc
+fax@3.000000000,1
+foo@3.000000000,2
+foo@3.000000000,1
+foo@3.000000000,0
+zoo@9.100000000,2
+----
+IsLowerBound("abc", "") = true
+IsLowerBound("fax@3.000000000,1", "") = true
+IsLowerBound("foo@3.000000000,2", "") = true
+IsLowerBound("foo@3.000000000,1", "") = true
+IsLowerBound("foo@3.000000000,0", "") = false
+IsLowerBound("zoo@9.100000000,2", "") = false
+
+seek-ge
+fax@9.000000000,0
+foo@3.000000000,1
+foo@3.000000000,0
+foo@3.000000000,2
+zoo@9.000000000,0
+----
+SeekGE("fax@9.000000000,0", boundRow=-1, searchDir=0) = (row=0, equalPrefix=false) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,1", boundRow=-1, searchDir=0) = (row=0, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,0", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:666f6f0000000000b2d05e0009]
+SeekGE("foo@3.000000000,2", boundRow=-1, searchDir=0) = (row=0, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("zoo@9.000000000,0", boundRow=-1, searchDir=0) = (row=2, equalPrefix=false)
+
+define-block
+bar@3.000000000,1
+bax@3.000000000,1
+foo@3.000000000,1
+moo@3.000000000,1
+----
+Parse("bar@3.000000000,1") = hex:6261720000000000b2d05e00000000010d
+Parse("bax@3.000000000,1") = hex:6261780000000000b2d05e00000000010d
+Parse("foo@3.000000000,1") =
hex:666f6f0000000000b2d05e00000000010d +Parse("moo@3.000000000,1") = hex:6d6f6f0000000000b2d05e00000000010d + +is-lower-bound +bar@9.000000000,2 +bar@8.000000000,2 +bar@8.000000000,1 +bar@8.000000000,0 +bar@7.000000000,9 +bar@3.000000000,2 +bar@3.000000000,1 +bar@3.000000000,0 +---- +IsLowerBound("bar@9.000000000,2", "") = true +IsLowerBound("bar@8.000000000,2", "") = true +IsLowerBound("bar@8.000000000,1", "") = true +IsLowerBound("bar@8.000000000,0", "") = true +IsLowerBound("bar@7.000000000,9", "") = true +IsLowerBound("bar@3.000000000,2", "") = true +IsLowerBound("bar@3.000000000,1", "") = true +IsLowerBound("bar@3.000000000,0", "") = false + +is-lower-bound synthetic-suffix=@8.000000000,1 +bar@9.000000000,2 +bar@8.000000000,2 +bar@8.000000000,1 +bar@8.000000000,0 +bar@7.000000000,9 +bar@3.000000000,2 +bar@3.000000000,1 +bar@3.000000000,0 +---- +IsLowerBound("bar@9.000000000,2", "@8.000000000,1") = true +IsLowerBound("bar@8.000000000,2", "@8.000000000,1") = true +IsLowerBound("bar@8.000000000,1", "@8.000000000,1") = true +IsLowerBound("bar@8.000000000,0", "@8.000000000,1") = false +IsLowerBound("bar@7.000000000,9", "@8.000000000,1") = false +IsLowerBound("bar@3.000000000,2", "@8.000000000,1") = false +IsLowerBound("bar@3.000000000,1", "@8.000000000,1") = false +IsLowerBound("bar@3.000000000,0", "@8.000000000,1") = false + +seek-ge +apple@2.000000000,0 +bar@4.000000000,0 +bar@3.000000000,0 +bar@2.000000000,0 +bax@3.000000000,1 +bax@3.000000000,0 +fax@9.000000000,0 +foo@3.000000000,2 +foo@3.000000000,1 +foo@3.000000000,0 +moo@3.000000001,0 +moo@3.000000000,2 +moo@3.000000000,1 +moo@3.000000000,0 +zoo@9.000000000,0 +---- +SeekGE("apple@2.000000000,0", boundRow=-1, searchDir=0) = (row=0, equalPrefix=false) [hex:6261720000000000b2d05e00b2d05e000d] +SeekGE("bar@4.000000000,0", boundRow=-1, searchDir=0) = (row=0, equalPrefix=true) [hex:6261720000000000b2d05e00b2d05e000d] +SeekGE("bar@3.000000000,0", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) 
[hex:6261780000000000b2d05e00b2d05e000d] +SeekGE("bar@2.000000000,0", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:6261780000000000b2d05e00b2d05e000d] +SeekGE("bax@3.000000000,1", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:6261780000000000b2d05e00b2d05e000d] +SeekGE("bax@3.000000000,0", boundRow=-1, searchDir=0) = (row=2, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d] +SeekGE("fax@9.000000000,0", boundRow=-1, searchDir=0) = (row=2, equalPrefix=false) [hex:666f6f0000000000b2d05e00b2d05e000d] +SeekGE("foo@3.000000000,2", boundRow=-1, searchDir=0) = (row=2, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d] +SeekGE("foo@3.000000000,1", boundRow=-1, searchDir=0) = (row=2, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d] +SeekGE("foo@3.000000000,0", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d] +SeekGE("moo@3.000000001,0", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d] +SeekGE("moo@3.000000000,2", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d] +SeekGE("moo@3.000000000,1", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d] +SeekGE("moo@3.000000000,0", boundRow=-1, searchDir=0) = (row=4, equalPrefix=true) +SeekGE("zoo@9.000000000,0", boundRow=-1, searchDir=0) = (row=4, equalPrefix=false) + +materialize-user-key +0 +1 +2 +3 +---- +MaterializeUserKey(-1, 0) = hex:6261720000000000b2d05e00b2d05e000d +MaterializeUserKey(0, 1) = hex:6261780000000000b2d05e00b2d05e000d +MaterializeUserKey(1, 2) = hex:666f6f0000000000b2d05e00b2d05e000d +MaterializeUserKey(2, 3) = hex:6d6f6f0000000000b2d05e00b2d05e000d + +materialize-user-key synthetic-suffix=@8.000000000,9 +0 +1 +2 +3 +---- +MaterializeUserKeyWithSyntheticSuffix(-1, 0, @8.000000000,9) = hex:6261720000000001dcd65000000000090d +MaterializeUserKeyWithSyntheticSuffix(0, 1, @8.000000000,9) = 
hex:6261780000000001dcd65000000000090d +MaterializeUserKeyWithSyntheticSuffix(1, 2, @8.000000000,9) = hex:666f6f0000000001dcd65000000000090d +MaterializeUserKeyWithSyntheticSuffix(2, 3, @8.000000000,9) = hex:6d6f6f0000000001dcd65000000000090d + +materialize-user-key +3 +2 +0 +1 +---- +MaterializeUserKey(-1, 3) = hex:6d6f6f0000000000b2d05e00b2d05e000d +MaterializeUserKey(3, 2) = hex:666f6f0000000000b2d05e00b2d05e000d +MaterializeUserKey(2, 0) = hex:6261720000000000b2d05e00b2d05e000d +MaterializeUserKey(0, 1) = hex:6261780000000000b2d05e00b2d05e000d diff --git a/pkg/storage/testdata/key_schema_key_writer b/pkg/storage/testdata/key_schema_key_writer new file mode 100644 index 000000000000..535c20c771a5 --- /dev/null +++ b/pkg/storage/testdata/key_schema_key_writer @@ -0,0 +1,107 @@ +init +---- + +# Add a "MVCC" key with no version. + +write +foo +---- +Parse("foo") = hex:666f6f00 +00: ComparePrev("foo"): PrefixLen=4; CommonPrefixLen=0; UserKeyComparison=1 +00: WriteKey(0, "foo", PrefixLen=4, CommonPrefixLen=0) +00: MaterializeKey(_, 0) = hex:666f6f00 + +# Test writing two MVCC keys that are equal except for the logical time. The +# PrefixLen and CommonPrefixLen should be 4 (inclusive of the 0x00 separator +# byte). + +write +foo@3.000000000,1 +foo@3.000000000,0 +---- +Parse("foo@3.000000000,1") = hex:666f6f0000000000b2d05e00000000010d +00: ComparePrev("foo@3.000000000,1"): PrefixLen=4; CommonPrefixLen=4; UserKeyComparison=1 +00: WriteKey(1, "foo@3.000000000,1", PrefixLen=4, CommonPrefixLen=4) +00: MaterializeKey(_, 1) = hex:666f6f0000000000b2d05e00000000010d +Parse("foo@3.000000000,0") = hex:666f6f0000000000b2d05e0009 +01: ComparePrev("foo@3.000000000,0"): PrefixLen=4; CommonPrefixLen=4; UserKeyComparison=1 +01: WriteKey(2, "foo@3.000000000,0", PrefixLen=4, CommonPrefixLen=4) +01: MaterializeKey(_, 2) = hex:666f6f0000000000b2d05e0009 + +# Write a longer key that has the previous key's roachpb.Key as a prefix. 
The +# CommonPrefixLen should be 3 (exclusive of the 0x00 separator byte). + +write +food@9.000000000,0 +---- +Parse("food@9.000000000,0") = hex:666f6f64000000000218711a0009 +00: ComparePrev("food@9.000000000,0"): PrefixLen=5; CommonPrefixLen=3; UserKeyComparison=1 +00: WriteKey(3, "food@9.000000000,0", PrefixLen=5, CommonPrefixLen=3) +00: MaterializeKey(_, 3) = hex:666f6f64000000000218711a0009 + +# Write the same key again. This is possible internally within Pebble (eg, a DEL +# and a SET separated by a snapshot). UserKeyComparison should be zero. +# CommonPrefixLen should be the entirety of the prefix. + +write +food@9.000000000,0 +---- +Parse("food@9.000000000,0") = hex:666f6f64000000000218711a0009 +00: ComparePrev("food@9.000000000,0"): PrefixLen=5; CommonPrefixLen=5; UserKeyComparison=0 +00: WriteKey(4, "food@9.000000000,0", PrefixLen=5, CommonPrefixLen=5) +00: MaterializeKey(_, 4) = hex:666f6f64000000000218711a0009 + +finish +---- ++------+------------+---------+---------+ +| KEY | WALL | LOGICAL | UNTYPED | ++------+------------+---------+---------+ +| foo | 0 | 0 | | +| foo | 3000000000 | 1 | | +| foo | 3000000000 | 0 | | +| food | 9000000000 | 0 | | +| food | 9000000000 | 0 | | ++------+------------+---------+---------+ + +init +---- + +# Write a lock table key. Its suffix will be considered 'untyped'. + +write +poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c +---- +Parse("poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c") = hex:017a6b12706f69000100022a84b329b76b4616ac151047f0a3fe9c12 +00: ComparePrev("poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c"): PrefixLen=10; CommonPrefixLen=0; UserKeyComparison=1 +00: WriteKey(0, "poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c", PrefixLen=10, CommonPrefixLen=0) +00: MaterializeKey(_, 0) = hex:017a6b12706f69000100022a84b329b76b4616ac151047f0a3fe9c12 + +# Write a lock table key with a later uuid. 
+ +write +poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e +---- +Parse("poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e") = hex:017a6b12706f6900010002073a83c45688420eaf97824255790f1e12 +00: ComparePrev("poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e"): PrefixLen=10; CommonPrefixLen=10; UserKeyComparison=1 +00: WriteKey(1, "poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e", PrefixLen=10, CommonPrefixLen=10) +00: MaterializeKey(_, 1) = hex:017a6b12706f6900010002073a83c45688420eaf97824255790f1e12 + +# Write a MVCC key to the same block. This is okay. + +write +/MVCC/poi@1.000000000,3 +---- +Parse("/MVCC/poi@1.000000000,3") = hex:2f4d5643432f706f6900000000003b9aca00000000030d +00: ComparePrev("/MVCC/poi@1.000000000,3"): PrefixLen=10; CommonPrefixLen=0; UserKeyComparison=1 +00: WriteKey(2, "/MVCC/poi@1.000000000,3", PrefixLen=10, CommonPrefixLen=0) +00: MaterializeKey(_, 2) = hex:2f4d5643432f706f6900000000003b9aca00000000030d + +finish +---- ++------------------------+------------+---------+--------------------------------------+ +| KEY | WALL | LOGICAL | UNTYPED | ++------------------------+------------+---------+--------------------------------------+ +| hex:017a6b12706f690001 | 0 | 0 | 022a84b329b76b4616ac151047f0a3fe9c12 | +| hex:017a6b12706f690001 | 0 | 0 | 02073a83c45688420eaf97824255790f1e12 | +| /MVCC/poi | 1000000000 | 3 | | ++------------------------+------------+---------+--------------------------------------+