crdb: better seek, fix untyped versions bug

The untyped versions should not have the terminator byte but we were writing it out. The new code relies on the `suffixTypes` value for a fast path that assumes everything is MVCC. The rest of the cases are handled by a general path that compares suffixes by byte. The performance is slightly better than the old code and we now support all cases. ``` name old time/op new time/op delta CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/Next-10 11.5ns ± 3% 11.2ns ± 0% -1.91% (p=0.001 n=7+7) CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/SeekGE-10 119ns ± 2% 116ns ± 0% -2.24% (p=0.000 n=8+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/SeekGELatest-10 106ns ± 0% 105ns ± 0% -0.97% (p=0.000 n=7+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/Next-10 10.7ns ± 1% 10.6ns ± 0% -1.26% (p=0.000 n=8+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/SeekGE-10 93.9ns ± 1% 92.4ns ± 0% -1.56% (p=0.000 n=8+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/SeekGELatest-10 85.8ns ± 1% 83.3ns ± 0% -2.96% (p=0.000 n=8+7) ```
cockroachdb · Oct 24, 2024 · efa5401 · efa5401
1 parent d2480d7
commit efa5401
Show file tree

Hide file tree

Showing 4 changed files with 136 additions and 110 deletions.
diff --git a/internal/crdbtest/crdb.go b/internal/crdbtest/crdb.go
@@ -203,28 +203,9 @@ func DecodeEngineKey(b []byte) (roachKey, version []byte, ok bool) {
 	return roachKey, version, true
 }
 
-// DecodeSuffix decodes the suffix of a key. If the suffix is a timestamp,
-// wallTime and logicalTime are returned; otherwise untypedVersion contains the
-// version (which is the suffix without the terminator length byte).
-//
-//gcassert:inline
-func DecodeSuffix(suffix []byte) (untypedVersion []byte, wallTime uint64, logicalTime uint32) {
-	if invariants.Enabled && len(suffix) > 0 && len(suffix) != int(suffix[len(suffix)-1]) {
-		panic("invalid length byte")
-	}
-	switch len(suffix) {
-	case 0:
-		return nil, 0, 0
-	case 9:
-		return nil, binary.BigEndian.Uint64(suffix[:8]), 0
-	case 13, 14:
-		return nil, binary.BigEndian.Uint64(suffix[:8]), binary.BigEndian.Uint32(suffix[8:12])
-	default:
-		return suffix[:len(suffix)-1], 0, 0
-	}
-}
-
 // Split implements base.Split for CockroachDB keys.
+//
+//go:inline
 func Split(key []byte) int {
 	if len(key) == 0 {
 		return 0
@@ -540,7 +521,7 @@ func (kw *cockroachKeyWriter) WriteKey(
 	default:
 		// Not a MVCC timestamp.
 		kw.suffixTypes |= hasNonMVCCSuffixes
-		untypedVersion = key[keyPrefixLen:]
+		untypedVersion = key[keyPrefixLen : len(key)-1]
 	}
 	kw.wallTimes.Set(row, wallTime)
 	kw.untypedVersions.Put(untypedVersion)
@@ -688,7 +669,6 @@ func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) boo
 func (ks *cockroachKeySeeker) SeekGE(
 	key []byte, boundRow int, searchDir int8,
 ) (row int, equalPrefix bool) {
-	// TODO(jackson): Inline Split.
 	si := Split(key)
 	row, eq := ks.roachKeys.Search(key[:si-1])
 	if eq {
@@ -702,69 +682,117 @@ func (ks *cockroachKeySeeker) SeekGE(
 // with the same prefix as index and a suffix greater than or equal to [suffix],
 // or if no such row exists, the next row with a different prefix.
 func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) {
-	// The search key's prefix exactly matches the prefix of the row at index.
+	// We have three common cases:
+	// 1. The seek key has no suffix.
+	// 2. We are seeking to an MVCC timestamp in a block where all keys have
+	//    MVCC timestamps (e.g. SQL table data).
+	// 3. We are seeking to a non-MVCC timestamp in a block where no keys have
+	//    MVCC timestamps (e.g. lock keys).
+
+	if len(seekSuffix) == 0 {
+		// The search key has no suffix, so it's the smallest possible key with its
+		// prefix. Return the row. This is a common case where the user is seeking
+		// to the most-recent row and just wants the smallest key with the prefix.
+		return index
+	}
+
 	const withWall = 9
 	const withLogical = withWall + 4
 	const withSynthetic = withLogical + 1
-	var seekWallTime uint64
-	var seekLogicalTime uint32
-	switch len(seekSuffix) {
-	case 0:
-		// The search key has no suffix, so it's the smallest possible key with
-		// its prefix. Return the row. This is a common case where the user is
-		// seeking to the most-recent row and just wants the smallest key with
-		// the prefix.
-		return index
-	case withLogical, withSynthetic:
-		seekWallTime = binary.BigEndian.Uint64(seekSuffix)
-		seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
-	case withWall:
-		seekWallTime = binary.BigEndian.Uint64(seekSuffix)
-	default:
-		// The suffix is untyped. Compare the untyped suffixes.
-		// Binary search between [index, prefixChanged.SeekSetBitGE(index+1)].
+
+	// If suffixTypes == hasMVCCSuffixes, all keys in the block have MVCC
+	// suffixes. Note that blocks that contain both MVCC and non-MVCC should be
+	// very rare, so it's ok to use the more general path below in that case.
+	if ks.suffixTypes == hasMVCCSuffixes && (len(seekSuffix) == withWall || len(seekSuffix) == withLogical || len(seekSuffix) == withSynthetic) {
+		// Fast path: seeking among MVCC versions using a MVCC timestamp.
+		seekWallTime := binary.BigEndian.Uint64(seekSuffix)
+		var seekLogicalTime uint32
+		if len(seekSuffix) >= withLogical {
+			seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
+		}
+
+		// First check the suffix at index, because querying for the latest value is
+		// the most common case.
+		if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
+			(latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
+			return index
+		}
+
+		// Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
 		//
 		// Define f(i) = true iff key at i is >= seek key.
 		// Invariant: f(l-1) == false, f(u) == true.
-		l := index
+		l := index + 1
 		u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
+
 		for l < u {
-			h := int(uint(l+u) >> 1) // avoid overflow when computing h
-			// l ≤ h < u
-			if bytes.Compare(ks.untypedVersions.At(h), seekSuffix) <= 0 {
-				u = h // preserves f(u) == true
+			m := int(uint(l+u) >> 1) // avoid overflow when computing m
+			// l ≤ m < u
+			mWallTime := ks.mvccWallTimes.At(m)
+			if mWallTime < seekWallTime ||
+				(mWallTime == seekWallTime && uint32(ks.mvccLogical.At(m)) <= seekLogicalTime) {
+				u = m // preserves f(u) = true
 			} else {
-				l = h + 1 // preserves f(l-1) == false
+				l = m + 1 // preserves f(l-1) = false
 			}
 		}
 		return l
 	}
-	// Seeking among MVCC versions using a MVCC timestamp.
-
-	// TODO(jackson): What if the row has an untyped suffix?
 
-	// First check the suffix at index, because querying for the latest value is
-	// the most common case.
-	if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
-		(latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
-		return index
+	// Remove the terminator byte, which we know is equal to len(seekSuffix)
+	// because we obtained the suffix by splitting the seek key.
+	version := seekSuffix[:len(seekSuffix)-1]
+	if invariants.Enabled && seekSuffix[len(version)] != byte(len(seekSuffix)) {
+		panic(errors.AssertionFailedf("invalid seek suffix %x", seekSuffix))
 	}
 
-	// Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
+	// Binary search for version between [index, prefixChanged.SeekSetBitGE(index+1)].
 	//
-	// Define f(i) = true iff key at i is >= seek key.
+	// Define f(i) = true iff key at i is >= seek key (i.e. suffix at i is <= seek suffix).
 	// Invariant: f(l-1) == false, f(u) == true.
-	l := index + 1
+	l := index
 	u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
+	if ks.suffixTypes&hasEmptySuffixes != 0 {
+		// Check if the key at index has an empty suffix. Since empty suffixes sort
+		// first, this is the only key in the range [index, u) which could have an
+		// empty suffix.
+		if len(ks.untypedVersions.At(index)) == 0 && ks.mvccWallTimes.At(index) == 0 && ks.mvccLogical.At(index) == 0 {
+			// Our seek suffix is not empty, so it must come after the empty suffix.
+			l = index + 1
+		}
+	}
+
 	for l < u {
-		h := int(uint(l+u) >> 1) // avoid overflow when computing h
-		// l ≤ h < u
-		hWallTime := ks.mvccWallTimes.At(h)
-		if hWallTime < seekWallTime ||
-			(hWallTime == seekWallTime && uint32(ks.mvccLogical.At(h)) <= seekLogicalTime) {
-			u = h // preserves f(u) = true
+		m := int(uint(l+u) >> 1) // avoid overflow when computing m
+		// l ≤ m < u
+		mVer := ks.untypedVersions.At(m)
+		if len(mVer) == 0 {
+			wallTime := ks.mvccWallTimes.At(m)
+			logicalTime := uint32(ks.mvccLogical.At(m))
+			if invariants.Enabled && wallTime == 0 && logicalTime == 0 {
+				// This can only happen for row at index.
+				panic(errors.AssertionFailedf("unexpected empty suffix at %d (l=%d, u=%d)", m, l, u))
+			}
+
+			// Note: this path is not very performance sensitive: blocks that mix MVCC
+			// suffixes with non-MVCC suffixes should be rare.
+
+			//gcassert:noescape
+			var buf [12]byte
+			//gcassert:inline
+			binary.BigEndian.PutUint64(buf[:], wallTime)
+			if logicalTime == 0 {
+				mVer = buf[:8]
+			} else {
+				//gcassert:inline
+				binary.BigEndian.PutUint32(buf[8:], logicalTime)
+				mVer = buf[:12]
+			}
+		}
+		if bytes.Compare(mVer, version) <= 0 {
+			u = m // preserves f(u) == true
 		} else {
-			l = h + 1 // preserves f(l-1) = false
+			l = m + 1 // preserves f(l-1) == false
 		}
 	}
 	return l
@@ -796,13 +824,14 @@ func (ks *cockroachKeySeeker) MaterializeUserKey(
 			return res
 		}
 		// Slice first, to check that the capacity is sufficient.
-		res := ki.Buf[:roachKeyLen+1+len(untypedVersion)]
+		res := ki.Buf[:roachKeyLen+2+len(untypedVersion)]
 		*(*byte)(ptr) = 0
 		memmove(
 			unsafe.Pointer(uintptr(ptr)+1),
 			unsafe.Pointer(unsafe.SliceData(untypedVersion)),
 			uintptr(len(untypedVersion)),
 		)
+		*(*byte)(unsafe.Pointer(uintptr(ptr) + uintptr(len(untypedVersion)+1))) = byte(len(untypedVersion) + 1)
 		return res
 	}
 

diff --git a/internal/crdbtest/testdata/block_encoding b/internal/crdbtest/testdata/block_encoding
@@ -51,9 +51,9 @@ data block header
  │    ├── 027-028: b 00000011 # col 3: bytes
  │    ├── 028-032: x ec000000 # col 3: page start 236
  │    ├── 032-033: b 00000010 # col 4: uint
- │    ├── 033-037: x 0c010000 # col 4: page start 268
+ │    ├── 033-037: x 0b010000 # col 4: page start 267
  │    ├── 037-038: b 00000001 # col 5: bool
- │    ├── 038-042: x 26010000 # col 5: page start 294
+ │    ├── 038-042: x 24010000 # col 5: page start 292
  │    ├── 042-043: b 00000011 # col 6: bytes
  │    ├── 043-047: x 38010000 # col 6: page start 312
  │    ├── 047-048: b 00000001 # col 7: bool
@@ -130,46 +130,45 @@ data block header
  │    │    ├── 238-239: x 00 # data[1] = 0 [250 overall]
  │    │    ├── 239-240: x 00 # data[2] = 0 [250 overall]
  │    │    ├── 240-241: x 00 # data[3] = 0 [250 overall]
- │    │    ├── 241-242: x 12 # data[4] = 18 [268 overall]
- │    │    ├── 242-243: x 12 # data[5] = 18 [268 overall]
- │    │    ├── 243-244: x 12 # data[6] = 18 [268 overall]
- │    │    ├── 244-245: x 12 # data[7] = 18 [268 overall]
- │    │    ├── 245-246: x 12 # data[8] = 18 [268 overall]
- │    │    ├── 246-247: x 12 # data[9] = 18 [268 overall]
- │    │    ├── 247-248: x 12 # data[10] = 18 [268 overall]
- │    │    ├── 248-249: x 12 # data[11] = 18 [268 overall]
- │    │    └── 249-250: x 12 # data[12] = 18 [268 overall]
+ │    │    ├── 241-242: x 11 # data[4] = 17 [267 overall]
+ │    │    ├── 242-243: x 11 # data[5] = 17 [267 overall]
+ │    │    ├── 243-244: x 11 # data[6] = 17 [267 overall]
+ │    │    ├── 244-245: x 11 # data[7] = 17 [267 overall]
+ │    │    ├── 245-246: x 11 # data[8] = 17 [267 overall]
+ │    │    ├── 246-247: x 11 # data[9] = 17 [267 overall]
+ │    │    ├── 247-248: x 11 # data[10] = 17 [267 overall]
+ │    │    ├── 248-249: x 11 # data[11] = 17 [267 overall]
+ │    │    └── 249-250: x 11 # data[12] = 17 [267 overall]
  │    └── data
- │         ├── 250-250: x                                      # data[0]:
- │         ├── 250-250: x                                      # data[1]:
- │         ├── 250-250: x                                      # data[2]:
- │         ├── 250-268: x 010203040506070809101112131415161712 # data[3]: "\x01\x02\x03\x04\x05\x06\a\b\t\x10\x11\x12\x13\x14\x15\x16\x17\x12"
- │         ├── 268-268: x                                      # data[4]:
- │         ├── 268-268: x                                      # data[5]:
- │         ├── 268-268: x                                      # data[6]:
- │         ├── 268-268: x                                      # data[7]:
- │         ├── 268-268: x                                      # data[8]:
- │         ├── 268-268: x                                      # data[9]:
- │         ├── 268-268: x                                      # data[10]:
- │         └── 268-268: x                                      # data[11]:
+ │         ├── 250-250: x                                    # data[0]:
+ │         ├── 250-250: x                                    # data[1]:
+ │         ├── 250-250: x                                    # data[2]:
+ │         ├── 250-267: x 0102030405060708091011121314151617 # data[3]: "\x01\x02\x03\x04\x05\x06\a\b\t\x10\x11\x12\x13\x14\x15\x16\x17"
+ │         ├── 267-267: x                                    # data[4]:
+ │         ├── 267-267: x                                    # data[5]:
+ │         ├── 267-267: x                                    # data[6]:
+ │         ├── 267-267: x                                    # data[7]:
+ │         ├── 267-267: x                                    # data[8]:
+ │         ├── 267-267: x                                    # data[9]:
+ │         ├── 267-267: x                                    # data[10]:
+ │         └── 267-267: x                                    # data[11]:
  ├── data for column 4 (uint)
- │    ├── 268-269: x 02   # encoding: 2b
- │    ├── 269-270: x 00   # padding (aligning to 16-bit boundary)
- │    ├── 270-272: x 0103 # data[0] = 769
- │    ├── 272-274: x 0102 # data[1] = 513
- │    ├── 274-276: x 0101 # data[2] = 257
- │    ├── 276-278: x 0101 # data[3] = 257
- │    ├── 278-280: x 0102 # data[4] = 513
- │    ├── 280-282: x 0101 # data[5] = 257
- │    ├── 282-284: x 0104 # data[6] = 1025
- │    ├── 284-286: x 0103 # data[7] = 769
- │    ├── 286-288: x 0102 # data[8] = 513
- │    ├── 288-290: x 0101 # data[9] = 257
- │    ├── 290-292: x 0101 # data[10] = 257
- │    └── 292-294: x 0101 # data[11] = 257
+ │    ├── 267-268: x 02   # encoding: 2b
+ │    ├── 268-270: x 0103 # data[0] = 769
+ │    ├── 270-272: x 0102 # data[1] = 513
+ │    ├── 272-274: x 0101 # data[2] = 257
+ │    ├── 274-276: x 0101 # data[3] = 257
+ │    ├── 276-278: x 0102 # data[4] = 513
+ │    ├── 278-280: x 0101 # data[5] = 257
+ │    ├── 280-282: x 0104 # data[6] = 1025
+ │    ├── 282-284: x 0103 # data[7] = 769
+ │    ├── 284-286: x 0102 # data[8] = 513
+ │    ├── 286-288: x 0101 # data[9] = 257
+ │    ├── 288-290: x 0101 # data[10] = 257
+ │    └── 290-292: x 0101 # data[11] = 257
  ├── data for column 5 (bool)
- │    ├── 294-295: x 00                                                               # default bitmap encoding
- │    ├── 295-296: x 00                                                               # padding to align to 64-bit boundary
+ │    ├── 292-293: x 00                                                               # default bitmap encoding
+ │    ├── 293-296: x 000000                                                           # padding to align to 64-bit boundary
  │    ├── 296-304: b 0000000100001100000000000000000000000000000000000000000000000000 # bitmap word 0
  │    └── 304-312: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
  ├── data for column 6 (bytes)

diff --git a/internal/crdbtest/testdata/seek b/internal/crdbtest/testdata/seek
@@ -13,14 +13,12 @@ fop @ 0102030405060708 #1,SET = row6
 seek
 fo
 fo @ 05
-fo @ 05
 fo @ 0102030405060709 #1,SET = v0
 fo @ 0102030405060708 #1,SET = v0
 fo @ 0102030405060707 #1,SET = v0
 ----
 fo: fo @ 0102030405060708 #1,SET = row0
 fo @ 05: fo @ 0102030405060708 #1,SET = row0
-fo @ 05: fo @ 0102030405060708 #1,SET = row0
 fo @ 0102030405060709: fo @ 0102030405060708 #1,SET = row0
 fo @ 0102030405060708: fo @ 0102030405060708 #1,SET = row0
 fo @ 0102030405060707: foo @ 0102030405060708 #1,SET = row1
@@ -94,7 +92,7 @@ foo @ 0000000000000000000000000000000002 #1,SET = row3
 foo @ 0000000000000000000000000000000001 #1,SET = row4
 z @ 0000000000000000000000000000000001 #1,SET = row5
 ----
-6 rows, total size 259B
+6 rows, total size 251B
 
 seek
 a

diff --git a/internal/crdbtest/testdata/suffix_types b/internal/crdbtest/testdata/suffix_types
@@ -53,7 +53,7 @@ init
 foo #1,SET = bar
 foo @ 0102030405060708091011121314151617 #1,SET = bar
 ----
-2 rows, total size 133B
+2 rows, total size 125B
 
 suffix-types
 ----
@@ -64,7 +64,7 @@ foo #1,SET = bar
 foo @ 0102030405060708 #1,SET = bar
 foo @ 0102030405060708091011121314151617 #1,SET = bar
 ----
-3 rows, total size 169B
+3 rows, total size 161B
 
 suffix-types
 ----