diff --git a/docs/generated/settings/settings-for-tenants.txt b/docs/generated/settings/settings-for-tenants.txt
index a84d13adfc82..06811321d3f6 100644
--- a/docs/generated/settings/settings-for-tenants.txt
+++ b/docs/generated/settings/settings-for-tenants.txt
@@ -385,6 +385,7 @@ sql.ttl.job.enabled boolean true whether the TTL job is enabled application
sql.txn.read_committed_isolation.enabled boolean true set to true to allow transactions to use the READ COMMITTED isolation level if specified by BEGIN/SET commands application
sql.txn.repeatable_read_isolation.enabled (alias: sql.txn.snapshot_isolation.enabled) boolean false set to true to allow transactions to use the REPEATABLE READ isolation level if specified by BEGIN/SET commands application
sql.txn_fingerprint_id_cache.capacity integer 100 the maximum number of txn fingerprint IDs stored application
+storage.columnar_blocks.enabled boolean false set to true to enable columnar-blocks to store KVs in a columnar format system-visible
storage.ingestion.value_blocks.enabled boolean true set to true to enable writing of value blocks in ingestion sstables application
storage.max_sync_duration duration 20s maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash system-visible
storage.max_sync_duration.fatal.enabled boolean true if true, fatal the process when a disk operation exceeds storage.max_sync_duration application
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
index 6ae8f81d3a1d..84e79c96fd07 100644
--- a/docs/generated/settings/settings.html
+++ b/docs/generated/settings/settings.html
@@ -339,6 +339,7 @@
sql.txn.read_committed_isolation.enabled
| boolean | true | set to true to allow transactions to use the READ COMMITTED isolation level if specified by BEGIN/SET commands | Serverless/Dedicated/Self-Hosted |
sql.txn.repeatable_read_isolation.enabled (alias: sql.txn.snapshot_isolation.enabled)
| boolean | false | set to true to allow transactions to use the REPEATABLE READ isolation level if specified by BEGIN/SET commands | Serverless/Dedicated/Self-Hosted |
sql.txn_fingerprint_id_cache.capacity
| integer | 100 | the maximum number of txn fingerprint IDs stored | Serverless/Dedicated/Self-Hosted |
+storage.columnar_blocks.enabled
| boolean | false | set to true to enable columnar-blocks to store KVs in a columnar format | Dedicated/Self-hosted (read-write); Serverless (read-only) |
storage.experimental.eventually_file_only_snapshots.enabled
| boolean | true | set to false to disable eventually-file-only-snapshots (kv.snapshot_receiver.excise.enabled must also be false) | Dedicated/Self-Hosted |
storage.ingest_split.enabled
| boolean | true | set to false to disable ingest-time splitting that lowers write-amplification | Dedicated/Self-Hosted |
storage.ingestion.value_blocks.enabled
| boolean | true | set to true to enable writing of value blocks in ingestion sstables | Serverless/Dedicated/Self-Hosted |
diff --git a/go.mod b/go.mod
index df35da8c1d65..fe9967ff3b4b 100644
--- a/go.mod
+++ b/go.mod
@@ -128,6 +128,7 @@ require (
github.com/cockroachdb/cmux v0.0.0-20170110192607-30d10be49292
github.com/cockroachdb/cockroach-go/v2 v2.3.7
github.com/cockroachdb/crlfmt v0.0.0-20221214225007-b2fc5c302548
+ github.com/cockroachdb/crlib v0.0.0-20241015224233-894974b3ad94
github.com/cockroachdb/datadriven v1.0.3-0.20240530155848-7682d40af056
github.com/cockroachdb/errors v1.11.3
github.com/cockroachdb/fifo v0.0.0-20240606204812-0bbfbd93a7ce
@@ -306,7 +307,6 @@ require (
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/charmbracelet/bubbletea v0.23.1 // indirect
github.com/charmbracelet/lipgloss v0.6.0 // indirect
- github.com/cockroachdb/crlib v0.0.0-20241015224233-894974b3ad94 // indirect
github.com/cockroachdb/swiss v0.0.0-20240612210725-f4de07ae6964 // indirect
github.com/danieljoos/wincred v1.1.2 // indirect
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 // indirect
diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel
index ffbc4d912c38..51be20522921 100644
--- a/pkg/storage/BUILD.bazel
+++ b/pkg/storage/BUILD.bazel
@@ -28,6 +28,7 @@ go_library(
"pebble.go",
"pebble_batch.go",
"pebble_iterator.go",
+ "pebble_key_schema.go",
"pebble_logger_and_tracer.go",
"pebble_merge.go",
"pebble_mvcc_scanner.go",
@@ -86,6 +87,7 @@ go_library(
"//pkg/util/timeutil",
"//pkg/util/tracing",
"//pkg/util/uuid",
+ "@com_github_cockroachdb_crlib//crbytes",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//oserror",
"@com_github_cockroachdb_fifo//:fifo",
@@ -100,6 +102,7 @@ go_library(
"@com_github_cockroachdb_pebble//replay",
"@com_github_cockroachdb_pebble//sstable",
"@com_github_cockroachdb_pebble//sstable/block",
+ "@com_github_cockroachdb_pebble//sstable/colblk",
"@com_github_cockroachdb_pebble//vfs",
"@com_github_cockroachdb_pebble//wal",
"@com_github_cockroachdb_redact//:redact",
@@ -139,6 +142,7 @@ go_test(
"mvcc_value_test.go",
"open_test.go",
"pebble_iterator_test.go",
+ "pebble_key_schema_test.go",
"pebble_mvcc_scanner_test.go",
"pebble_test.go",
"read_as_of_iterator_test.go",
@@ -198,6 +202,8 @@ go_test(
"//pkg/util/timeutil",
"//pkg/util/uint128",
"//pkg/util/uuid",
+ "@com_github_cockroachdb_crlib//crbytes",
+ "@com_github_cockroachdb_crlib//crstrings",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//oserror",
@@ -206,9 +212,11 @@ go_test(
"@com_github_cockroachdb_pebble//objstorage/objstorageprovider",
"@com_github_cockroachdb_pebble//sstable",
"@com_github_cockroachdb_pebble//sstable/block",
+ "@com_github_cockroachdb_pebble//sstable/colblk",
"@com_github_cockroachdb_pebble//vfs",
"@com_github_cockroachdb_redact//:redact",
"@com_github_kr_pretty//:pretty",
+ "@com_github_olekukonko_tablewriter//:tablewriter",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_google_protobuf//proto",
diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go
index 505910772fbb..da114f9d5cb6 100644
--- a/pkg/storage/pebble.go
+++ b/pkg/storage/pebble.go
@@ -102,6 +102,15 @@ var IngestSplitEnabled = settings.RegisterBoolSetting(
settings.WithPublic,
)
+// columnarBlocksEnabled is the public cluster setting (default false) that gates columnar blocks in Pebble.
+var columnarBlocksEnabled = settings.RegisterBoolSetting(
+	settings.SystemVisible,
+	"storage.columnar_blocks.enabled",
+	"set to true to enable columnar-blocks to store KVs in a columnar format",
+	false, // TODO(jackson): Metamorphicize this.
+	settings.WithPublic,
+)
+
// IngestAsFlushable controls whether ingested sstables that overlap the
// memtable may be lazily ingested: written to the WAL and enqueued in the list
// of flushables (eg, memtables, large batches and now lazily-ingested
@@ -407,6 +416,30 @@ func EngineSuffixCompare(a, b []byte) int {
return bytes.Compare(b[:len(b)-1], a[:len(a)-1])
}
+// EnginePointSuffixCompare compares suffixes of Cockroach point keys (which are
+// composed of the version and a trailing version-length byte); the version can
+// be an MVCC timestamp or a lock key. EnginePointSuffixCompare differs from
+// EngineSuffixCompare, because EnginePointSuffixCompare normalizes the
+// suffixes. Ideally we'd have one function that implemented the semantics of
+// EnginePointSuffixCompare, but due to historical reasons, range key suffix
+// comparisons must not perform normalization.
+//
+// See https://github.com/cockroachdb/cockroach/issues/130533
+func EnginePointSuffixCompare(a, b []byte) int {
+	// NB: The operands of bytes.Compare below are deliberately swapped (b
+	// before a): non-empty suffixes are ordered in reverse byte order, so a
+	// suffix with larger bytes compares as smaller. Only the empty suffix is
+	// special-cased, because it must sort before every non-empty suffix.
+	if len(a) == 0 || len(b) == 0 {
+		// Empty suffixes sort before non-empty suffixes.
+		return cmp.Compare(len(a), len(b))
+	}
+	return bytes.Compare(
+		normalizeEngineSuffixForCompare(b),
+		normalizeEngineSuffixForCompare(a),
+	)
+}
+
func checkEngineKey(k []byte) {
if len(k) == 0 {
panic(errors.AssertionFailedf("empty key"))
@@ -792,8 +825,9 @@ const MinimumSupportedFormatVersion = pebble.FormatSyntheticPrefixSuffix
// DefaultPebbleOptions returns the default pebble options.
func DefaultPebbleOptions() *pebble.Options {
opts := &pebble.Options{
- Comparer: EngineComparer,
- FS: vfs.Default,
+ Comparer: EngineComparer,
+ FS: vfs.Default,
+ KeySchema: keySchema,
// A value of 2 triggers a compaction when there is 1 sub-level.
L0CompactionThreshold: 2,
L0StopWritesThreshold: 1000,
@@ -1190,6 +1224,9 @@ func newPebble(ctx context.Context, cfg engineConfig) (p *Pebble, err error) {
cfg.opts.Experimental.IngestSplit = func() bool {
return IngestSplitEnabled.Get(&cfg.settings.SV)
}
+ cfg.opts.Experimental.EnableColumnarBlocks = func() bool {
+ return columnarBlocksEnabled.Get(&cfg.settings.SV)
+ }
auxDir := cfg.opts.FS.PathJoin(cfg.env.Dir, base.AuxiliaryDir)
if !cfg.env.IsReadOnly() {
@@ -2512,7 +2549,7 @@ func (p *Pebble) CreateCheckpoint(dir string, spans []roachpb.Span) error {
// version associated with it, since they did so during the fence version.
var pebbleFormatVersionMap = map[clusterversion.Key]pebble.FormatMajorVersion{
clusterversion.V24_1: pebble.FormatSyntheticPrefixSuffix,
- clusterversion.V24_3: pebble.FormatFlushableIngestExcises,
+ clusterversion.V24_3: pebble.FormatColumnarBlocks,
}
// pebbleFormatVersionKeys contains the keys in the map above, in descending order.
diff --git a/pkg/storage/pebble_key_schema.go b/pkg/storage/pebble_key_schema.go
new file mode 100644
index 000000000000..13adefb9e0b7
--- /dev/null
+++ b/pkg/storage/pebble_key_schema.go
@@ -0,0 +1,469 @@
+// Copyright 2024 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+package storage
+
+import (
+ "bytes"
+ "cmp"
+ "encoding/binary"
+ "fmt"
+ "io"
+ "sync"
+ "unsafe"
+
+ "github.com/cockroachdb/cockroach/pkg/util/buildutil"
+ "github.com/cockroachdb/crlib/crbytes"
+ "github.com/cockroachdb/errors"
+ "github.com/cockroachdb/pebble/sstable/colblk"
+)
+
+const (
+	// cockroachColRoachKey is a roachpb.Key user key. It does NOT include the
+	// 0x00 terminator byte that a serialized engine key includes.
+	cockroachColRoachKey int = iota
+	// cockroachColMVCCWallTime is the wall time component of a MVCC timestamp,
+	// or zero if not an MVCC key.
+	cockroachColMVCCWallTime
+	// cockroachColMVCCLogical is the logical time component of a MVCC
+	// timestamp, or zero if not an MVCC key.
+	cockroachColMVCCLogical
+	// cockroachColUntypedVersion holds any non-empty, non-MVCC version, stored
+	// with its trailing version-length byte but without the 0x00 separator
+	// byte that delimits the prefix and suffix in a serialized engine key. In
+	// practice, this column is used to store the version of lock-table keys.
+	cockroachColUntypedVersion
+	cockroachColCount // number of key columns; must remain the last entry
+)
+
+// keySchema is the colblk.KeySchema for Cockroach engine keys. Key seekers are
+// drawn from cockroachKeySeekerPool, which Release refills with zeroed seekers.
+var keySchema = colblk.KeySchema{
+	ColumnTypes: []colblk.DataType{
+		cockroachColRoachKey:       colblk.DataTypePrefixBytes,
+		cockroachColMVCCWallTime:   colblk.DataTypeUint,
+		cockroachColMVCCLogical:    colblk.DataTypeUint,
+		cockroachColUntypedVersion: colblk.DataTypeBytes,
+	},
+	NewKeyWriter: func() colblk.KeyWriter {
+		kw := &cockroachKeyWriter{}
+		kw.roachKeys.Init(16)
+		kw.wallTimes.Init()
+		kw.logicalTimes.InitWithDefault()
+		kw.untypedVersions.Init()
+		return kw
+	},
+	NewKeySeeker: func() colblk.KeySeeker { return cockroachKeySeekerPool.Get().(*cockroachKeySeeker) },
+}
+
+type cockroachKeyWriter struct {
+	roachKeys       colblk.PrefixBytesBuilder // roach key per row, without the 0x00 sentinel byte
+	wallTimes       colblk.UintBuilder        // MVCC wall time per row; zero for non-MVCC keys
+	logicalTimes    colblk.UintBuilder        // MVCC logical time per row; rows never Set read as zero
+	untypedVersions colblk.RawBytesBuilder    // non-MVCC version per row; empty otherwise
+	prevSuffix      []byte                    // copy of the most recently written key's suffix, read by ComparePrev
+}
+
+// Assert *cockroachKeyWriter implements colblk.KeyWriter.
+var _ colblk.KeyWriter = (*cockroachKeyWriter)(nil)
+
+func (kw *cockroachKeyWriter) ComparePrev(key []byte) colblk.KeyComparison {
+	var cmpv colblk.KeyComparison
+	cmpv.PrefixLen = int32(EngineKeySplit(key)) // TODO(jackson): Inline
+	if kw.roachKeys.Rows() == 0 {
+		cmpv.UserKeyComparison = 1 // no previous row: report key as greater
+		return cmpv
+	}
+	lp := kw.roachKeys.UnsafeGet(kw.roachKeys.Rows() - 1)
+	cmpv.CommonPrefixLen = int32(crbytes.CommonPrefix(lp, key[:cmpv.PrefixLen-1]))
+	if cmpv.CommonPrefixLen == cmpv.PrefixLen-1 {
+		// All of key's roach key is shared; count the sentinel byte as shared too.
+		cmpv.CommonPrefixLen = cmpv.PrefixLen
+		cmpv.UserKeyComparison = int32(EnginePointSuffixCompare(key[cmpv.PrefixLen:], kw.prevSuffix))
+		return cmpv
+	}
+	// The keys have different MVCC prefixes. We haven't determined which is
+	// greater, but we know the index at which they diverge. The base.Comparer
+	// contract dictates that prefixes must be lexicographically ordered.
+	if len(lp) == int(cmpv.CommonPrefixLen) {
+		// lp is a strict prefix of key's roach key (CommonPrefixLen < PrefixLen-1); key is greater.
+		cmpv.UserKeyComparison = +1
+	} else {
+		// Both keys have at least 1 additional byte at which they diverge.
+		// Compare the diverging byte.
+		cmpv.UserKeyComparison = int32(cmp.Compare(key[cmpv.CommonPrefixLen], lp[cmpv.CommonPrefixLen]))
+	}
+	return cmpv
+}
+
+func (kw *cockroachKeyWriter) WriteKey(
+	row int, key []byte, keyPrefixLen, keyPrefixLenSharedWithPrev int32,
+) {
+	if len(key) == 0 {
+		panic(errors.AssertionFailedf("empty key"))
+	}
+	// Last byte is the version length + 1 when there is a version,
+	// else it is 0.
+	versionLen := int(key[len(key)-1])
+	if (len(key)-versionLen) != int(keyPrefixLen) || key[keyPrefixLen-1] != 0x00 {
+		panic(errors.AssertionFailedf("invalid %d-byte key with %d-byte prefix (%q)",
+			len(key), keyPrefixLen, key))
+	}
+	// TODO(jackson): Avoid copying the previous suffix.
+	kw.prevSuffix = append(kw.prevSuffix[:0], key[keyPrefixLen:]...)
+
+	// When the roach key is the same, keyPrefixLenSharedWithPrev includes the
+	// separator byte, which is not stored; min clamps it to the roach key length.
+	kw.roachKeys.Put(key[:keyPrefixLen-1], min(int(keyPrefixLenSharedWithPrev), int(keyPrefixLen)-1))
+
+	// NB: The kw.logicalTimes builder was initialized with InitWithDefault, so
+	// if we don't set a value, the column value is implicitly zero. We only
+	// need to Set anything for non-zero values.
+	var wallTime uint64
+	var untypedVersion []byte
+	switch versionLen {
+	case 0:
+		// No-op.
+	case 9:
+		wallTime = binary.BigEndian.Uint64(key[keyPrefixLen : keyPrefixLen+8])
+	case 13, 14:
+		wallTime = binary.BigEndian.Uint64(key[keyPrefixLen : keyPrefixLen+8])
+		kw.logicalTimes.Set(row, uint64(binary.BigEndian.Uint32(key[keyPrefixLen+8:keyPrefixLen+12])))
+		// NOTE: byte 13 used to store the timestamp's synthetic bit, but this is no
+		// longer consulted and can be ignored during decoding.
+	default:
+		// Not a MVCC timestamp. Keep the whole suffix, length byte included.
+		untypedVersion = key[keyPrefixLen:]
+	}
+	kw.wallTimes.Set(row, wallTime)
+	kw.untypedVersions.Put(untypedVersion)
+}
+
+func (kw *cockroachKeyWriter) MaterializeKey(dst []byte, i int) []byte {
+	dst = append(dst, kw.roachKeys.UnsafeGet(i)...)
+	// Append the 0x00 separator byte, which is not stored with the roach key.
+	dst = append(dst, 0)
+	if untypedVersion := kw.untypedVersions.UnsafeGet(i); len(untypedVersion) > 0 {
+		dst = append(dst, untypedVersion...)
+		return dst // the untyped version already ends in its length byte
+	}
+	wall := kw.wallTimes.Get(i)
+	logical := uint32(kw.logicalTimes.Get(i))
+	if logical == 0 {
+		if wall == 0 {
+			return dst // no version: the key ends at the separator byte
+		}
+		dst = append(dst, make([]byte, 9)...)
+		binary.BigEndian.PutUint64(dst[len(dst)-9:], wall)
+		dst[len(dst)-1] = 9 // Version length byte: 8-byte wall time plus itself.
+		return dst
+	}
+	dst = append(dst, make([]byte, 13)...)
+	binary.BigEndian.PutUint64(dst[len(dst)-13:], wall)
+	binary.BigEndian.PutUint32(dst[len(dst)-5:], logical)
+	dst[len(dst)-1] = 13 // Version length byte: wall time, 4-byte logical time, plus itself.
+	return dst
+}
+
+func (kw *cockroachKeyWriter) Reset() {
+	kw.roachKeys.Reset() // prevSuffix is left as-is; ComparePrev does not read it while Rows() == 0
+	kw.wallTimes.Reset()
+	kw.logicalTimes.Reset()
+	kw.untypedVersions.Reset()
+}
+
+func (kw *cockroachKeyWriter) WriteDebug(dst io.Writer, rows int) {
+	fmt.Fprint(dst, "prefixes: ") // one labelled line per key column, in column order
+	kw.roachKeys.WriteDebug(dst, rows)
+	fmt.Fprintln(dst)
+	fmt.Fprint(dst, "wall times: ")
+	kw.wallTimes.WriteDebug(dst, rows)
+	fmt.Fprintln(dst)
+	fmt.Fprint(dst, "logical times: ")
+	kw.logicalTimes.WriteDebug(dst, rows)
+	fmt.Fprintln(dst)
+	fmt.Fprint(dst, "untyped suffixes: ")
+	kw.untypedVersions.WriteDebug(dst, rows)
+	fmt.Fprintln(dst)
+}
+
+func (kw *cockroachKeyWriter) NumColumns() int {
+	return cockroachColCount // roach key, wall time, logical time, untyped version
+}
+
+func (kw *cockroachKeyWriter) DataType(col int) colblk.DataType {
+	return keySchema.ColumnTypes[col] // col is one of the cockroachCol* indices
+}
+
+func (kw *cockroachKeyWriter) Size(rows int, offset uint32) uint32 {
+	// Thread the running offset through the four builders in column order.
+	afterKeys := kw.roachKeys.Size(rows, offset)
+	afterWall := kw.wallTimes.Size(rows, afterKeys)
+	afterLogical := kw.logicalTimes.Size(rows, afterWall)
+	return kw.untypedVersions.Size(rows, afterLogical)
+}
+
+func (kw *cockroachKeyWriter) Finish(
+	col int, rows int, offset uint32, buf []byte,
+) (endOffset uint32) {
+	switch col { // dispatch to the builder backing col; each builder is finished as its own column 0
+	case cockroachColRoachKey:
+		return kw.roachKeys.Finish(0, rows, offset, buf)
+	case cockroachColMVCCWallTime:
+		return kw.wallTimes.Finish(0, rows, offset, buf)
+	case cockroachColMVCCLogical:
+		return kw.logicalTimes.Finish(0, rows, offset, buf)
+	case cockroachColUntypedVersion:
+		return kw.untypedVersions.Finish(0, rows, offset, buf)
+	default:
+		panic(fmt.Sprintf("unknown default key column: %d", col))
+	}
+}
+
+var cockroachKeySeekerPool = sync.Pool{
+	New: func() any { return new(cockroachKeySeeker) },
+}
+
+type cockroachKeySeeker struct {
+	roachKeys       colblk.PrefixBytes // roach key column of the block
+	roachKeyChanged colblk.Bitmap      // DataBlockDecoder.PrefixChanged(); bounds the rows sharing a roach key
+	mvccWallTimes   colblk.UnsafeUints // MVCC wall time column
+	mvccLogical     colblk.UnsafeUints // MVCC logical time column
+	untypedVersions colblk.RawBytes    // non-MVCC version column
+}
+
+var _ colblk.KeySeeker = (*cockroachKeySeeker)(nil)
+
+// Init is part of the KeySeeker interface. It caches the key columns of d on ks.
+func (ks *cockroachKeySeeker) Init(d *colblk.DataBlockDecoder) error {
+	bd := d.BlockDecoder()
+	ks.roachKeys = bd.PrefixBytes(cockroachColRoachKey)
+	ks.roachKeyChanged = d.PrefixChanged()
+	ks.mvccWallTimes = bd.Uints(cockroachColMVCCWallTime)
+	ks.mvccLogical = bd.Uints(cockroachColMVCCLogical)
+	ks.untypedVersions = bd.RawBytes(cockroachColUntypedVersion)
+	return nil // this implementation never fails
+}
+
+// IsLowerBound is part of the KeySeeker interface. It reports whether the
+// block's first user key (with its suffix replaced by syntheticSuffix when one
+// is provided) is >= k, i.e. whether Compare(firstUserKey, k) >= 0. It panics
+// if k cannot be decoded as an engine key.
+func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) bool {
+	ek, ok := DecodeEngineKey(k)
+	if !ok {
+		panic(errors.AssertionFailedf("invalid key %q", k))
+	}
+	if v := bytes.Compare(ks.roachKeys.UnsafeFirstSlice(), ek.Key); v != 0 {
+		return v > 0
+	}
+	// If there's a synthetic suffix, we ignore the block's suffix columns and
+	// compare the key's suffix to the synthetic suffix.
+	if len(syntheticSuffix) > 0 {
+		return EnginePointSuffixCompare(syntheticSuffix, k[len(ek.Key)+1:]) >= 0
+	}
+	var wallTime uint64
+	var logicalTime uint32
+	switch len(ek.Version) {
+	case engineKeyNoVersion: // no version: wallTime and logicalTime stay zero
+	case engineKeyVersionWallTimeLen:
+		wallTime = binary.BigEndian.Uint64(ek.Version[:8])
+	case engineKeyVersionWallAndLogicalTimeLen, engineKeyVersionWallLogicalAndSyntheticTimeLen:
+		wallTime = binary.BigEndian.Uint64(ek.Version[:8])
+		logicalTime = binary.BigEndian.Uint32(ek.Version[8:12])
+	default:
+		// The provided key `k` is not a MVCC key. Assert that the first key in
+		// the block is also not an MVCC key. If it were, that would mean there
+		// exists both a MVCC key and a non-MVCC key with the same prefix.
+		//
+		// TODO(jackson): Double check that we'll never produce index separators
+		// that are invalid version lengths.
+		if buildutil.CrdbTestBuild && ks.mvccWallTimes.At(0) != 0 {
+			panic("comparing timestamp with untyped suffix")
+		}
+		return EnginePointSuffixCompare(ks.untypedVersions.At(0), ek.Version) >= 0
+	}
+
+	// NB: The sign comparison is inverted because suffixes are sorted such that
+	// the largest timestamps are "smaller" in the lexicographical ordering.
+	if v := cmp.Compare(ks.mvccWallTimes.At(0), wallTime); v != 0 {
+		return v < 0
+	}
+	return cmp.Compare(uint32(ks.mvccLogical.At(0)), logicalTime) <= 0
+}
+
+// SeekGE is part of the KeySeeker interface; boundRow and searchDir are unused here.
+func (ks *cockroachKeySeeker) SeekGE(
+	key []byte, boundRow int, searchDir int8,
+) (row int, equalPrefix bool) {
+	// TODO(jackson): Inline EngineKeySplit.
+	si := EngineKeySplit(key)
+	row, eq := ks.roachKeys.Search(key[:si-1]) // search on the roach key, without its trailing separator byte
+	if eq {
+		return ks.seekGEOnSuffix(row, key[si:]), true
+	}
+	return row, false
+}
+
+// seekGEOnSuffix is a helper function for SeekGE when a seek key's prefix
+// exactly matches a row. seekGEOnSuffix finds the first row at index or later
+// with the same prefix as index and a suffix greater than or equal to
+// seekSuffix, or if no such row exists, the next row with a different prefix.
+func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) {
+	// The search key's prefix exactly matches the prefix of the row at index.
+	const withWall = 9
+	const withLogical = withWall + 4
+	const withSynthetic = withLogical + 1
+	var seekWallTime uint64
+	var seekLogicalTime uint32
+	switch len(seekSuffix) {
+	case 0:
+		// The search key has no suffix, so it's the smallest possible key with
+		// its prefix. Return the row. This is a common case where the user is
+		// seeking to the most-recent row and just wants the smallest key with
+		// the prefix.
+		return index
+	case withLogical, withSynthetic:
+		seekWallTime = binary.BigEndian.Uint64(seekSuffix)
+		seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
+	case withWall:
+		seekWallTime = binary.BigEndian.Uint64(seekSuffix)
+	default:
+		// The suffix is untyped. Binary search the untyped suffixes between
+		// [index, roachKeyChanged.SeekSetBitGE(index+1)]. Suffixes are ordered
+		// by EnginePointSuffixCompare (not by raw byte order), so use it here.
+		// Define f(i) = true iff key at i is >= seek key.
+		// Invariant: f(l-1) == false, f(u) == true.
+		l := index
+		u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
+		for l < u {
+			h := int(uint(l+u) >> 1) // avoid overflow when computing h
+			// l ≤ h < u
+			if EnginePointSuffixCompare(ks.untypedVersions.At(h), seekSuffix) >= 0 {
+				u = h // preserves f(u) == true
+			} else {
+				l = h + 1 // preserves f(l-1) == false
+			}
+		}
+		return l
+	}
+	// Seeking among MVCC versions using a MVCC timestamp. Larger timestamps sort first.
+
+	// TODO(jackson): What if the row has an untyped suffix?
+
+	// First check the suffix at index, because querying for the latest value is
+	// the most common case.
+	if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
+		(latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
+		return index
+	}
+
+	// Binary search between [index+1, roachKeyChanged.SeekSetBitGE(index+1)].
+	//
+	// Define f(i) = true iff key at i is >= seek key.
+	// Invariant: f(l-1) == false, f(u) == true.
+	l := index + 1
+	u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
+	for l < u {
+		h := int(uint(l+u) >> 1) // avoid overflow when computing h
+		// l ≤ h < u
+		hWallTime := ks.mvccWallTimes.At(h)
+		if hWallTime < seekWallTime ||
+			(hWallTime == seekWallTime && uint32(ks.mvccLogical.At(h)) <= seekLogicalTime) {
+			u = h // preserves f(u) = true
+		} else {
+			l = h + 1 // preserves f(l-1) = false
+		}
+	}
+	return l
+}
+
+// MaterializeUserKey is part of the KeySeeker interface. It rebuilds the engine
+// key of row in ki.Buf (roach key, 0x00 separator, then the version, if any) and
+// returns it. It relies on ki.Buf having spare capacity for separator and version.
+func (ks *cockroachKeySeeker) MaterializeUserKey(
+	ki *colblk.PrefixBytesIter, prevRow, row int,
+) []byte {
+	if prevRow+1 == row && prevRow >= 0 {
+		ks.roachKeys.SetNext(ki)
+	} else {
+		ks.roachKeys.SetAt(ki, row)
+	}
+
+	roachKeyLen := len(ki.Buf)
+	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(roachKeyLen))
+	mvccWall := ks.mvccWallTimes.At(row)
+	mvccLogical := uint32(ks.mvccLogical.At(row))
+	if mvccWall == 0 && mvccLogical == 0 {
+		// This is not an MVCC key. Use the untyped suffix.
+		untypedVersion := ks.untypedVersions.At(row)
+		if len(untypedVersion) == 0 {
+			res := ki.Buf[:roachKeyLen+1]
+			res[roachKeyLen] = 0
+			return res
+		}
+		// Slice first, to check that the capacity is sufficient.
+		res := ki.Buf[:roachKeyLen+1+len(untypedVersion)]
+		*(*byte)(ptr) = 0
+		memmove(
+			unsafe.Pointer(uintptr(ptr)+1),
+			unsafe.Pointer(unsafe.SliceData(untypedVersion)),
+			uintptr(len(untypedVersion)),
+		)
+		return res
+	}
+
+	// This is an MVCC key. Inline binary.BigEndian.PutUint64 (compiled to word-size stores).
+	*(*byte)(ptr) = 0
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccWall >> 56)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccWall >> 48)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccWall >> 40)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = byte(mvccWall >> 32)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 5)) = byte(mvccWall >> 24)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 6)) = byte(mvccWall >> 16)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 7)) = byte(mvccWall >> 8)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 8)) = byte(mvccWall)
+
+	ptr = unsafe.Pointer(uintptr(ptr) + 9)
+	if mvccLogical == 0 {
+		*(*byte)(ptr) = 9
+		return ki.Buf[:len(ki.Buf)+10]
+	}
+
+	// Inline binary.BigEndian.PutUint32 of the logical time (not the wall time).
+	*(*byte)(ptr) = byte(mvccLogical >> 24)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 1)) = byte(mvccLogical >> 16)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 2)) = byte(mvccLogical >> 8)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 3)) = byte(mvccLogical)
+	*(*byte)(unsafe.Pointer(uintptr(ptr) + 4)) = 13
+	return ki.Buf[:len(ki.Buf)+14]
+}
+
+// MaterializeUserKeyWithSyntheticSuffix is part of the KeySeeker interface. It ignores the row's own version columns.
+func (ks *cockroachKeySeeker) MaterializeUserKeyWithSyntheticSuffix(
+	ki *colblk.PrefixBytesIter, suffix []byte, prevRow, row int,
+) []byte {
+	if prevRow+1 == row && prevRow >= 0 {
+		ks.roachKeys.SetNext(ki)
+	} else {
+		ks.roachKeys.SetAt(ki, row)
+	}
+
+	// Slice first, to check that the capacity covers the separator byte and suffix.
+	res := ki.Buf[:len(ki.Buf)+1+len(suffix)]
+	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(unsafe.SliceData(ki.Buf))) + uintptr(len(ki.Buf)))
+	*(*byte)(ptr) = 0
+	memmove(unsafe.Pointer(uintptr(ptr)+1), unsafe.Pointer(unsafe.SliceData(suffix)), uintptr(len(suffix)))
+	return res
+}
+
+// Release is part of the KeySeeker interface. It zeroes ks and puts it into cockroachKeySeekerPool.
+func (ks *cockroachKeySeeker) Release() {
+	*ks = cockroachKeySeeker{}
+	cockroachKeySeekerPool.Put(ks)
+}
+
+//go:linkname memmove runtime.memmove
+func memmove(to, from unsafe.Pointer, n uintptr) // bodyless: bound to runtime.memmove by the directive above
diff --git a/pkg/storage/pebble_key_schema_test.go b/pkg/storage/pebble_key_schema_test.go
new file mode 100644
index 000000000000..57c917d3a801
--- /dev/null
+++ b/pkg/storage/pebble_key_schema_test.go
@@ -0,0 +1,314 @@
+// Copyright 2024 The Cockroach Authors.
+//
+// Use of this software is governed by the CockroachDB Software License
+// included in the /LICENSE file.
+
+package storage
+
+import (
+ "bytes"
+ "encoding/hex"
+ "fmt"
+ "math/rand"
+ "strconv"
+ "strings"
+ "testing"
+
+ "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
+ "github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
+ "github.com/cockroachdb/cockroach/pkg/util/hlc"
+ "github.com/cockroachdb/cockroach/pkg/util/leaktest"
+ "github.com/cockroachdb/cockroach/pkg/util/log"
+ "github.com/cockroachdb/cockroach/pkg/util/uuid"
+ "github.com/cockroachdb/crlib/crbytes"
+ "github.com/cockroachdb/crlib/crstrings"
+ "github.com/cockroachdb/datadriven"
+ "github.com/cockroachdb/errors"
+ "github.com/cockroachdb/pebble"
+ "github.com/cockroachdb/pebble/sstable/block"
+ "github.com/cockroachdb/pebble/sstable/colblk"
+ "github.com/olekukonko/tablewriter"
+)
+
+func TestKeySchema_KeyWriter(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	var kw colblk.KeyWriter
+	var row int
+	var buf bytes.Buffer
+	var keyBuf []byte
+	datadriven.RunTest(t, datapathutils.TestDataPath(t, "key_schema_key_writer"), func(t *testing.T, td *datadriven.TestData) string {
+		buf.Reset()
+		switch td.Cmd {
+		case "init":
+			// Randomly exercise either resetting the existing writer or retrieving a new one.
+			if kw != nil && rand.Intn(2) == 1 {
+				kw.Reset()
+			} else {
+				kw = keySchema.NewKeyWriter()
+			}
+			row = 0
+			keyBuf = keyBuf[:0]
+			return ""
+		case "write":
+			for i, line := range crstrings.Lines(td.Input) {
+				k, err := parseTestKey(line)
+				if err != nil {
+					t.Fatalf("bad test key %q on line %d: %s", line, i, err)
+				}
+				fmt.Fprintf(&buf, "Parse(%q) = hex:%x\n", line, k)
+				kcmp := kw.ComparePrev(k)
+				if v := EngineKeyCompare(k, keyBuf); v < 0 { // keyBuf holds the previously written key
+					t.Fatalf("line %d: EngineKeyCompare(%q, hex:%x) = %d", i, line, keyBuf, v)
+				} else if v != int(kcmp.UserKeyComparison) {
+					t.Fatalf("line %d: EngineKeyCompare(%q, hex:%x) = %d; kcmp.UserKeyComparison = %d",
+						i, line, keyBuf, v, kcmp.UserKeyComparison)
+				}
+
+				fmt.Fprintf(&buf, "%02d: ComparePrev(%q): PrefixLen=%d; CommonPrefixLen=%d; UserKeyComparison=%d\n",
+					i, line, kcmp.PrefixLen, kcmp.CommonPrefixLen, kcmp.UserKeyComparison)
+				kw.WriteKey(row, k, kcmp.PrefixLen, kcmp.CommonPrefixLen)
+				fmt.Fprintf(&buf, "%02d: WriteKey(%d, %q, PrefixLen=%d, CommonPrefixLen=%d)\n",
+					i, row, line, kcmp.PrefixLen, kcmp.CommonPrefixLen)
+
+				keyBuf = kw.MaterializeKey(keyBuf[:0], row) // round-trip: must equal k under the engine comparer
+				if !EngineKeyEqual(k, keyBuf) {
+					t.Fatalf("line %d: EngineKeyEqual(hex:%x, hex:%x) == false", i, k, keyBuf)
+				}
+				if v := EngineKeyCompare(k, keyBuf); v != 0 {
+					t.Fatalf("line %d: EngineKeyCompare(hex:%x, hex:%x) = %d", i, k, keyBuf, v)
+				}
+
+				fmt.Fprintf(&buf, "%02d: MaterializeKey(_, %d) = hex:%x\n", i, row, keyBuf)
+				row++
+			}
+			return buf.String()
+		case "finish":
+			b := crbytes.AllocAligned(int(kw.Size(row, 0) + 1)) // NOTE(review): the +1 presumably leaves a trailing padding byte — confirm
+			offs := make([]uint32, kw.NumColumns()+1)           // offs[i] is where column i starts, offs[i+1] where it ends
+			for i := 0; i < kw.NumColumns(); i++ {
+				offs[i+1] = kw.Finish(i, row, offs[i], b)
+			}
+			roachKeys, _ := colblk.DecodePrefixBytes(b, offs[cockroachColRoachKey], row)
+			mvccWallTimes, _ := colblk.DecodeUnsafeUints(b, offs[cockroachColMVCCWallTime], row)
+			mvccLogicalTimes, _ := colblk.DecodeUnsafeUints(b, offs[cockroachColMVCCLogical], row)
+			untypedVersions, _ := colblk.DecodeRawBytes(b, offs[cockroachColUntypedVersion], row)
+			tbl := tablewriter.NewWriter(&buf)
+			tbl.SetHeader([]string{"Key", "Wall", "Logical", "Untyped"})
+			for i := 0; i < row; i++ {
+				tbl.Append([]string{
+					asciiOrHex(roachKeys.At(i)),
+					fmt.Sprintf("%d", mvccWallTimes.At(i)),
+					fmt.Sprintf("%d", mvccLogicalTimes.At(i)),
+					fmt.Sprintf("%x", untypedVersions.At(i)),
+				})
+			}
+			tbl.Render()
+			return buf.String()
+		default:
+			panic(fmt.Sprintf("unrecognized command %q", td.Cmd))
+		}
+	})
+}
+
+func TestKeySchema_KeySeeker(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	var buf bytes.Buffer
+	var enc colblk.DataBlockEncoder
+	var dec colblk.DataBlockDecoder
+	var ks colblk.KeySeeker
+	var maxKeyLen int // longest key in the current block; sizes the materialization buffers
+	enc.Init(keySchema)
+
+	initKeySeeker := func() {
+		if ks == nil || rand.Intn(2) == 1 { // randomly exercise both reusing a seeker and replacing it
+			if ks != nil {
+				ks.Release()
+			}
+			ks = keySchema.NewKeySeeker()
+		}
+		if err := ks.Init(&dec); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	datadriven.RunTest(t, datapathutils.TestDataPath(t, "key_schema_key_seeker"), func(t *testing.T, td *datadriven.TestData) string {
+		buf.Reset()
+		switch td.Cmd {
+		case "define-block": // encode the input keys into a block and point dec at it
+			enc.Reset()
+			maxKeyLen = 0
+			var rows int
+			for i, line := range crstrings.Lines(td.Input) {
+				k, err := parseTestKey(line)
+				if err != nil {
+					t.Fatalf("bad test key %q on line %d: %s", line, i, err)
+				}
+				fmt.Fprintf(&buf, "Parse(%q) = hex:%x\n", line, k)
+				maxKeyLen = max(maxKeyLen, len(k))
+				kcmp := enc.KeyWriter.ComparePrev(k)
+				ikey := pebble.InternalKey{
+					UserKey: k,
+					Trailer: pebble.MakeInternalKeyTrailer(0, pebble.InternalKeyKindSet),
+				}
+				enc.Add(ikey, k, block.InPlaceValuePrefix(false), kcmp, false /* isObsolete */)
+				rows++
+			}
+			blk, _ := enc.Finish(rows, enc.Size())
+			dec.Init(keySchema, blk)
+			return buf.String()
+		case "is-lower-bound":
+			initKeySeeker()
+			syntheticSuffix, syntheticSuffixStr, _ := getSyntheticSuffix(t, td)
+			for _, line := range crstrings.Lines(td.Input) {
+				k, err := parseTestKey(line)
+				if err != nil {
+					t.Fatalf("bad test key %q: %s", line, err)
+				}
+				got := ks.IsLowerBound(k, syntheticSuffix)
+				fmt.Fprintf(&buf, "IsLowerBound(%q, %q) = %t\n", line, syntheticSuffixStr, got)
+			}
+			return buf.String()
+		case "seek-ge": // each line: <key> [<boundRow> fwd|bwd]
+			initKeySeeker()
+			for _, line := range crstrings.Lines(td.Input) {
+				fields := strings.Fields(line)
+				k, err := parseTestKey(fields[0])
+				if err != nil {
+					t.Fatalf("bad test key %q: %s", fields[0], err)
+				}
+				boundRow := -1
+				searchDir := 0
+				if len(fields) == 3 {
+					boundRow, err = strconv.Atoi(fields[1])
+					if err != nil {
+						t.Fatalf("bad bound row %q: %s", fields[1], err)
+					}
+					switch fields[2] {
+					case "fwd":
+						searchDir = +1
+					case "bwd":
+						searchDir = -1
+					default:
+						t.Fatalf("bad search direction %q", fields[2])
+					}
+				}
+				row, equalPrefix := ks.SeekGE(k, boundRow, int8(searchDir))
+
+				fmt.Fprintf(&buf, "SeekGE(%q, boundRow=%d, searchDir=%d) = (row=%d, equalPrefix=%t)",
+					line, boundRow, searchDir, row, equalPrefix)
+				if row >= 0 && row < dec.BlockDecoder().Rows() { // only materialize rows that exist in the block
+					var kiter colblk.PrefixBytesIter
+					kiter.Buf = make([]byte, maxKeyLen+1)
+					key := ks.MaterializeUserKey(&kiter, -1, row)
+					fmt.Fprintf(&buf, " [hex:%x]", key)
+				}
+				fmt.Fprintln(&buf)
+			}
+			return buf.String()
+		case "materialize-user-key": // each line is a row number; prevRow carries over between lines
+			initKeySeeker()
+			syntheticSuffix, syntheticSuffixStr, syntheticSuffixOk := getSyntheticSuffix(t, td)
+
+			var kiter colblk.PrefixBytesIter
+			kiter.Buf = make([]byte, maxKeyLen+len(syntheticSuffix)+1)
+			prevRow := -1
+			for _, line := range crstrings.Lines(td.Input) {
+				row, err := strconv.Atoi(line)
+				if err != nil {
+					t.Fatalf("bad row number %q: %s", line, err)
+				}
+				if syntheticSuffixOk {
+					key := ks.MaterializeUserKeyWithSyntheticSuffix(&kiter, syntheticSuffix, prevRow, row)
+					fmt.Fprintf(&buf, "MaterializeUserKeyWithSyntheticSuffix(%d, %d, %s) = hex:%x\n", prevRow, row, syntheticSuffixStr, key)
+				} else {
+					key := ks.MaterializeUserKey(&kiter, prevRow, row)
+					fmt.Fprintf(&buf, "MaterializeUserKey(%d, %d) = hex:%x\n", prevRow, row, key)
+				}
+				prevRow = row
+			}
+			return buf.String()
+		default:
+			panic(fmt.Sprintf("unrecognized command %q", td.Cmd))
+		}
+	})
+
+}
+
+func getSyntheticSuffix(t *testing.T, td *datadriven.TestData) ([]byte, string, bool) {
+ var syntheticSuffix []byte
+ var syntheticSuffixStr string
+ cmdArg, ok := td.Arg("synthetic-suffix")
+ if ok {
+ syntheticSuffixStr = cmdArg.SingleVal(t)
+ var err error
+ syntheticSuffix, err = parseTestKey(syntheticSuffixStr)
+ if err != nil {
+ t.Fatalf("parsing synthetic suffix %q: %s", syntheticSuffixStr, err)
+ }
+ syntheticSuffix = syntheticSuffix[1:] // Trim the separator byte.
+ }
+ return syntheticSuffix, syntheticSuffixStr, ok
+}
+
+// asciiOrHex formats b for display in test output. If every rune of b lies in
+// the printable ASCII range (' ' through '~') the bytes are returned verbatim
+// as a string; otherwise the result is "hex:" followed by the lowercase
+// hexadecimal encoding of b.
+func asciiOrHex(b []byte) string {
+	unprintable := func(r rune) bool { return r < ' ' || r > '~' }
+	if bytes.IndexFunc(b, unprintable) == -1 {
+		return string(b)
+	}
+	return fmt.Sprintf("hex:%x", b)
+}
+
+// parseTestKey converts the textual key notation used by the datadriven tests
+// into an encoded key. Three forms are accepted:
+//
+//   - "hex:<digits>": the hexadecimal digits are decoded and returned as-is.
+//   - "<key>" (no '@'): the key bytes followed by a 0x00 sentinel byte.
+//   - "<key>@<version>": if <version> starts with "Shared,", "Exclusive," or
+//     "Intent,", the text after the comma is parsed as a transaction UUID and
+//     the result is the encoded lock table key; otherwise <version> is parsed
+//     as an MVCC timestamp and the result is the encoded MVCC key.
+//
+// An error is returned for malformed hexadecimal input, an empty version, or
+// a UUID or timestamp that fails to parse.
+func parseTestKey(s string) ([]byte, error) {
+	if digits, isHex := strings.CutPrefix(s, "hex:"); isHex {
+		b, err := hex.DecodeString(digits)
+		if err != nil {
+			return nil, errors.Wrap(err, "parsing hexadecimal literal key")
+		}
+		return b, nil
+	}
+	key, version, hasVersion := strings.Cut(s, "@")
+	if !hasVersion {
+		// Return just the roachpb key with the sentinel byte.
+		return append([]byte(s), 0x00), nil
+	}
+	if version == "" {
+		return nil, errors.Newf("key %q has empty suffix", s)
+	}
+	// Split with Cut rather than slicing at the comma's index: a version with
+	// no comma (e.g. "3.000000000") has no such index, and must reach the
+	// timestamp parser below instead of triggering a slice-bounds panic.
+	kind, rest, _ := strings.Cut(version, ",")
+	switch kind {
+	case "Shared", "Exclusive", "Intent":
+		// This is the lock strength. Parse as a lock table key.
+		strength := lock.Intent
+		switch kind {
+		case "Shared":
+			strength = lock.Shared
+		case "Exclusive":
+			strength = lock.Exclusive
+		}
+		txnUUID, err := uuid.FromString(rest)
+		if err != nil {
+			return nil, errors.Wrap(err, "parsing lock table transaction UUID")
+		}
+		ltk := LockTableKey{
+			Key:      []byte(key),
+			Strength: strength,
+			TxnUUID:  txnUUID,
+		}
+		ek, _ := ltk.ToEngineKey(nil)
+		return ek.Encode(), nil
+	default:
+		// Parse as an MVCC key.
+		ts, err := hlc.ParseTimestamp(version)
+		if err != nil {
+			return nil, errors.Wrap(err, "parsing MVCC timestamp")
+		}
+		return EncodeMVCCKey(MVCCKey{
+			Key:       []byte(key),
+			Timestamp: ts,
+		}), nil
+	}
+}
diff --git a/pkg/storage/testdata/key_schema_key_seeker b/pkg/storage/testdata/key_schema_key_seeker
new file mode 100644
index 000000000000..e05eeb8394c6
--- /dev/null
+++ b/pkg/storage/testdata/key_schema_key_seeker
@@ -0,0 +1,149 @@
+define-block
+foo@3.000000000,1
+foo@3.000000000,0
+----
+Parse("foo@3.000000000,1") = hex:666f6f0000000000b2d05e00000000010d
+Parse("foo@3.000000000,0") = hex:666f6f0000000000b2d05e0009
+
+is-lower-bound
+abc
+fax@3.000000000,1
+foo@3.000000000,2
+foo@3.000000000,1
+foo@3.000000000,0
+zoo@9.100000000,2
+----
+IsLowerBound("abc", "") = true
+IsLowerBound("fax@3.000000000,1", "") = true
+IsLowerBound("foo@3.000000000,2", "") = true
+IsLowerBound("foo@3.000000000,1", "") = true
+IsLowerBound("foo@3.000000000,0", "") = false
+IsLowerBound("zoo@9.100000000,2", "") = false
+
+seek-ge
+fax@9.000000000,0
+foo@3.000000000,1
+foo@3.000000000,0
+foo@3.000000000,2
+zoo@9.000000000,0
+----
+SeekGE("fax@9.000000000,0", boundRow=-1, searchDir=0) = (row=0, equalPrefix=false) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,1", boundRow=-1, searchDir=0) = (row=0, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,0", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:666f6f0000000000b2d05e0009]
+SeekGE("foo@3.000000000,2", boundRow=-1, searchDir=0) = (row=0, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("zoo@9.000000000,0", boundRow=-1, searchDir=0) = (row=2, equalPrefix=false)
+
+define-block
+bar@3.000000000,1
+bax@3.000000000,1
+foo@3.000000000,1
+moo@3.000000000,1
+----
+Parse("bar@3.000000000,1") = hex:6261720000000000b2d05e00000000010d
+Parse("bax@3.000000000,1") = hex:6261780000000000b2d05e00000000010d
+Parse("foo@3.000000000,1") = hex:666f6f0000000000b2d05e00000000010d
+Parse("moo@3.000000000,1") = hex:6d6f6f0000000000b2d05e00000000010d
+
+is-lower-bound
+bar@9.000000000,2
+bar@8.000000000,2
+bar@8.000000000,1
+bar@8.000000000,0
+bar@7.000000000,9
+bar@3.000000000,2
+bar@3.000000000,1
+bar@3.000000000,0
+----
+IsLowerBound("bar@9.000000000,2", "") = true
+IsLowerBound("bar@8.000000000,2", "") = true
+IsLowerBound("bar@8.000000000,1", "") = true
+IsLowerBound("bar@8.000000000,0", "") = true
+IsLowerBound("bar@7.000000000,9", "") = true
+IsLowerBound("bar@3.000000000,2", "") = true
+IsLowerBound("bar@3.000000000,1", "") = true
+IsLowerBound("bar@3.000000000,0", "") = false
+
+is-lower-bound synthetic-suffix=@8.000000000,1
+bar@9.000000000,2
+bar@8.000000000,2
+bar@8.000000000,1
+bar@8.000000000,0
+bar@7.000000000,9
+bar@3.000000000,2
+bar@3.000000000,1
+bar@3.000000000,0
+----
+IsLowerBound("bar@9.000000000,2", "@8.000000000,1") = true
+IsLowerBound("bar@8.000000000,2", "@8.000000000,1") = true
+IsLowerBound("bar@8.000000000,1", "@8.000000000,1") = true
+IsLowerBound("bar@8.000000000,0", "@8.000000000,1") = false
+IsLowerBound("bar@7.000000000,9", "@8.000000000,1") = false
+IsLowerBound("bar@3.000000000,2", "@8.000000000,1") = false
+IsLowerBound("bar@3.000000000,1", "@8.000000000,1") = false
+IsLowerBound("bar@3.000000000,0", "@8.000000000,1") = false
+
+seek-ge
+apple@2.000000000,0
+bar@4.000000000,0
+bar@3.000000000,0
+bar@2.000000000,0
+bax@3.000000000,1
+bax@3.000000000,0
+fax@9.000000000,0
+foo@3.000000000,2
+foo@3.000000000,1
+foo@3.000000000,0
+moo@3.000000001,0
+moo@3.000000000,2
+moo@3.000000000,1
+moo@3.000000000,0
+zoo@9.000000000,0
+----
+SeekGE("apple@2.000000000,0", boundRow=-1, searchDir=0) = (row=0, equalPrefix=false) [hex:6261720000000000b2d05e00b2d05e000d]
+SeekGE("bar@4.000000000,0", boundRow=-1, searchDir=0) = (row=0, equalPrefix=true) [hex:6261720000000000b2d05e00b2d05e000d]
+SeekGE("bar@3.000000000,0", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:6261780000000000b2d05e00b2d05e000d]
+SeekGE("bar@2.000000000,0", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:6261780000000000b2d05e00b2d05e000d]
+SeekGE("bax@3.000000000,1", boundRow=-1, searchDir=0) = (row=1, equalPrefix=true) [hex:6261780000000000b2d05e00b2d05e000d]
+SeekGE("bax@3.000000000,0", boundRow=-1, searchDir=0) = (row=2, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("fax@9.000000000,0", boundRow=-1, searchDir=0) = (row=2, equalPrefix=false) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,2", boundRow=-1, searchDir=0) = (row=2, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,1", boundRow=-1, searchDir=0) = (row=2, equalPrefix=true) [hex:666f6f0000000000b2d05e00b2d05e000d]
+SeekGE("foo@3.000000000,0", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d]
+SeekGE("moo@3.000000001,0", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d]
+SeekGE("moo@3.000000000,2", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d]
+SeekGE("moo@3.000000000,1", boundRow=-1, searchDir=0) = (row=3, equalPrefix=true) [hex:6d6f6f0000000000b2d05e00b2d05e000d]
+SeekGE("moo@3.000000000,0", boundRow=-1, searchDir=0) = (row=4, equalPrefix=true)
+SeekGE("zoo@9.000000000,0", boundRow=-1, searchDir=0) = (row=4, equalPrefix=false)
+
+materialize-user-key
+0
+1
+2
+3
+----
+MaterializeUserKey(-1, 0) = hex:6261720000000000b2d05e00b2d05e000d
+MaterializeUserKey(0, 1) = hex:6261780000000000b2d05e00b2d05e000d
+MaterializeUserKey(1, 2) = hex:666f6f0000000000b2d05e00b2d05e000d
+MaterializeUserKey(2, 3) = hex:6d6f6f0000000000b2d05e00b2d05e000d
+
+materialize-user-key synthetic-suffix=@8.000000000,9
+0
+1
+2
+3
+----
+MaterializeUserKeyWithSyntheticSuffix(-1, 0, @8.000000000,9) = hex:6261720000000001dcd65000000000090d
+MaterializeUserKeyWithSyntheticSuffix(0, 1, @8.000000000,9) = hex:6261780000000001dcd65000000000090d
+MaterializeUserKeyWithSyntheticSuffix(1, 2, @8.000000000,9) = hex:666f6f0000000001dcd65000000000090d
+MaterializeUserKeyWithSyntheticSuffix(2, 3, @8.000000000,9) = hex:6d6f6f0000000001dcd65000000000090d
+
+materialize-user-key
+3
+2
+0
+1
+----
+MaterializeUserKey(-1, 3) = hex:6d6f6f0000000000b2d05e00b2d05e000d
+MaterializeUserKey(3, 2) = hex:666f6f0000000000b2d05e00b2d05e000d
+MaterializeUserKey(2, 0) = hex:6261720000000000b2d05e00b2d05e000d
+MaterializeUserKey(0, 1) = hex:6261780000000000b2d05e00b2d05e000d
diff --git a/pkg/storage/testdata/key_schema_key_writer b/pkg/storage/testdata/key_schema_key_writer
new file mode 100644
index 000000000000..535c20c771a5
--- /dev/null
+++ b/pkg/storage/testdata/key_schema_key_writer
@@ -0,0 +1,107 @@
+init
+----
+
+# Add a "MVCC" key with no version.
+
+write
+foo
+----
+Parse("foo") = hex:666f6f00
+00: ComparePrev("foo"): PrefixLen=4; CommonPrefixLen=0; UserKeyComparison=1
+00: WriteKey(0, "foo", PrefixLen=4, CommonPrefixLen=0)
+00: MaterializeKey(_, 0) = hex:666f6f00
+
+# Test writing two MVCC keys that are equal except for the logical time. The
+# PrefixLen and CommonPrefixLen should be 4 (inclusive of the 0x00 separator
+# byte).
+
+write
+foo@3.000000000,1
+foo@3.000000000,0
+----
+Parse("foo@3.000000000,1") = hex:666f6f0000000000b2d05e00000000010d
+00: ComparePrev("foo@3.000000000,1"): PrefixLen=4; CommonPrefixLen=4; UserKeyComparison=1
+00: WriteKey(1, "foo@3.000000000,1", PrefixLen=4, CommonPrefixLen=4)
+00: MaterializeKey(_, 1) = hex:666f6f0000000000b2d05e00000000010d
+Parse("foo@3.000000000,0") = hex:666f6f0000000000b2d05e0009
+01: ComparePrev("foo@3.000000000,0"): PrefixLen=4; CommonPrefixLen=4; UserKeyComparison=1
+01: WriteKey(2, "foo@3.000000000,0", PrefixLen=4, CommonPrefixLen=4)
+01: MaterializeKey(_, 2) = hex:666f6f0000000000b2d05e0009
+
+# Write a longer key that has the previous key's roachpb.Key as a prefix. The
+# CommonPrefixLen should be 3 (exclusive of the 0x00 separator byte).
+
+write
+food@9.000000000,0
+----
+Parse("food@9.000000000,0") = hex:666f6f64000000000218711a0009
+00: ComparePrev("food@9.000000000,0"): PrefixLen=5; CommonPrefixLen=3; UserKeyComparison=1
+00: WriteKey(3, "food@9.000000000,0", PrefixLen=5, CommonPrefixLen=3)
+00: MaterializeKey(_, 3) = hex:666f6f64000000000218711a0009
+
+# Write the same key again. This is possible internally within Pebble (eg, a DEL
+# and a SET separated by a snapshot). UserKeyComparison should be zero.
+# CommonPrefixLen should be the entirety of the prefix.
+
+write
+food@9.000000000,0
+----
+Parse("food@9.000000000,0") = hex:666f6f64000000000218711a0009
+00: ComparePrev("food@9.000000000,0"): PrefixLen=5; CommonPrefixLen=5; UserKeyComparison=0
+00: WriteKey(4, "food@9.000000000,0", PrefixLen=5, CommonPrefixLen=5)
+00: MaterializeKey(_, 4) = hex:666f6f64000000000218711a0009
+
+finish
+----
++------+------------+---------+---------+
+| KEY | WALL | LOGICAL | UNTYPED |
++------+------------+---------+---------+
+| foo | 0 | 0 | |
+| foo | 3000000000 | 1 | |
+| foo | 3000000000 | 0 | |
+| food | 9000000000 | 0 | |
+| food | 9000000000 | 0 | |
++------+------------+---------+---------+
+
+init
+----
+
+# Write a lock table key. Its suffix will be considered 'untyped'.
+
+write
+poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c
+----
+Parse("poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c") = hex:017a6b12706f69000100022a84b329b76b4616ac151047f0a3fe9c12
+00: ComparePrev("poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c"): PrefixLen=10; CommonPrefixLen=0; UserKeyComparison=1
+00: WriteKey(0, "poi@Exclusive,2a84b329-b76b-4616-ac15-1047f0a3fe9c", PrefixLen=10, CommonPrefixLen=0)
+00: MaterializeKey(_, 0) = hex:017a6b12706f69000100022a84b329b76b4616ac151047f0a3fe9c12
+
+# Write a lock table key with a later uuid.
+
+write
+poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e
+----
+Parse("poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e") = hex:017a6b12706f6900010002073a83c45688420eaf97824255790f1e12
+00: ComparePrev("poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e"): PrefixLen=10; CommonPrefixLen=10; UserKeyComparison=1
+00: WriteKey(1, "poi@Exclusive,073a83c4-5688-420e-af97-824255790f1e", PrefixLen=10, CommonPrefixLen=10)
+00: MaterializeKey(_, 1) = hex:017a6b12706f6900010002073a83c45688420eaf97824255790f1e12
+
+# Write a MVCC key to the same block. This is okay.
+
+write
+/MVCC/poi@1.000000000,3
+----
+Parse("/MVCC/poi@1.000000000,3") = hex:2f4d5643432f706f6900000000003b9aca00000000030d
+00: ComparePrev("/MVCC/poi@1.000000000,3"): PrefixLen=10; CommonPrefixLen=0; UserKeyComparison=1
+00: WriteKey(2, "/MVCC/poi@1.000000000,3", PrefixLen=10, CommonPrefixLen=0)
+00: MaterializeKey(_, 2) = hex:2f4d5643432f706f6900000000003b9aca00000000030d
+
+finish
+----
++------------------------+------------+---------+--------------------------------------+
+| KEY | WALL | LOGICAL | UNTYPED |
++------------------------+------------+---------+--------------------------------------+
+| hex:017a6b12706f690001 | 0 | 0 | 022a84b329b76b4616ac151047f0a3fe9c12 |
+| hex:017a6b12706f690001 | 0 | 0 | 02073a83c45688420eaf97824255790f1e12 |
+| /MVCC/poi | 1000000000 | 3 | |
++------------------------+------------+---------+--------------------------------------+