From efa54018daf8846c837e6ab754194184d27d9665 Mon Sep 17 00:00:00 2001 From: Radu Berinde Date: Thu, 24 Oct 2024 09:50:46 -0700 Subject: [PATCH] crdb: better seek, fix untyped versions bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The untyped versions should not have the terminator byte but we were writing it out. The new code relies on the `suffixTypes` value for a fast path that assumes everything is MVCC. The rest of the cases are handled by a general path that compares suffixes by byte. The performance is slightly better than the old code and we now support all cases. ``` name old time/op new time/op delta CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/Next-10 11.5ns ± 3% 11.2ns ± 0% -1.91% (p=0.001 n=7+7) CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/SeekGE-10 119ns ± 2% 116ns ± 0% -2.24% (p=0.000 n=8+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/SeekGELatest-10 106ns ± 0% 105ns ± 0% -0.97% (p=0.000 n=7+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/Next-10 10.7ns ± 1% 10.6ns ± 0% -1.26% (p=0.000 n=8+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/SeekGE-10 93.9ns ± 1% 92.4ns ± 0% -1.56% (p=0.000 n=8+8) CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/SeekGELatest-10 85.8ns ± 1% 83.3ns ± 0% -2.96% (p=0.000 n=8+7) ``` --- internal/crdbtest/crdb.go | 161 +++++++++++++--------- internal/crdbtest/testdata/block_encoding | 77 +++++------ internal/crdbtest/testdata/seek | 4 +- internal/crdbtest/testdata/suffix_types | 4 +- 4 files changed, 136 insertions(+), 110 deletions(-) diff --git a/internal/crdbtest/crdb.go b/internal/crdbtest/crdb.go index 8dd99020ed..360da03209 100644 --- a/internal/crdbtest/crdb.go +++ b/internal/crdbtest/crdb.go @@ -203,28 +203,9 @@ func DecodeEngineKey(b []byte) (roachKey, version []byte, ok bool) { return roachKey, version, true } -// DecodeSuffix decodes the suffix of a key. If the suffix is a timestamp, -// wallTime and logicalTime are returned; otherwise untypedVersion contains the -// version (which is the suffix without the terminator length byte). -// -//gcassert:inline -func DecodeSuffix(suffix []byte) (untypedVersion []byte, wallTime uint64, logicalTime uint32) { - if invariants.Enabled && len(suffix) > 0 && len(suffix) != int(suffix[len(suffix)-1]) { - panic("invalid length byte") - } - switch len(suffix) { - case 0: - return nil, 0, 0 - case 9: - return nil, binary.BigEndian.Uint64(suffix[:8]), 0 - case 13, 14: - return nil, binary.BigEndian.Uint64(suffix[:8]), binary.BigEndian.Uint32(suffix[8:12]) - default: - return suffix[:len(suffix)-1], 0, 0 - } -} - // Split implements base.Split for CockroachDB keys. +// +//go:inline func Split(key []byte) int { if len(key) == 0 { return 0 @@ -540,7 +521,7 @@ func (kw *cockroachKeyWriter) WriteKey( default: // Not a MVCC timestamp. kw.suffixTypes |= hasNonMVCCSuffixes - untypedVersion = key[keyPrefixLen:] + untypedVersion = key[keyPrefixLen : len(key)-1] } kw.wallTimes.Set(row, wallTime) kw.untypedVersions.Put(untypedVersion) @@ -688,7 +669,6 @@ func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) boo func (ks *cockroachKeySeeker) SeekGE( key []byte, boundRow int, searchDir int8, ) (row int, equalPrefix bool) { - // TODO(jackson): Inline Split. si := Split(key) row, eq := ks.roachKeys.Search(key[:si-1]) if eq { @@ -702,69 +682,117 @@ func (ks *cockroachKeySeeker) SeekGE( // with the same prefix as index and a suffix greater than or equal to [suffix], // or if no such row exists, the next row with a different prefix. func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) { - // The search key's prefix exactly matches the prefix of the row at index. + // We have three common cases: + // 1. The seek key has no suffix. + // 2. We are seeking to an MVCC timestamp in a block where all keys have + // MVCC timestamps (e.g. SQL table data). + // 3. We are seeking to a non-MVCC timestamp in a block where no keys have + // MVCC timestamps (e.g. lock keys). + + if len(seekSuffix) == 0 { + // The search key has no suffix, so it's the smallest possible key with its + // prefix. Return the row. This is a common case where the user is seeking + // to the most-recent row and just wants the smallest key with the prefix. + return index + } + const withWall = 9 const withLogical = withWall + 4 const withSynthetic = withLogical + 1 - var seekWallTime uint64 - var seekLogicalTime uint32 - switch len(seekSuffix) { - case 0: - // The search key has no suffix, so it's the smallest possible key with - // its prefix. Return the row. This is a common case where the user is - // seeking to the most-recent row and just wants the smallest key with - // the prefix. - return index - case withLogical, withSynthetic: - seekWallTime = binary.BigEndian.Uint64(seekSuffix) - seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:]) - case withWall: - seekWallTime = binary.BigEndian.Uint64(seekSuffix) - default: - // The suffix is untyped. Compare the untyped suffixes. - // Binary search between [index, prefixChanged.SeekSetBitGE(index+1)]. + + // If suffixTypes == hasMVCCSuffixes, all keys in the block have MVCC + // suffixes. Note that blocks that contain both MVCC and non-MVCC should be + // very rare, so it's ok to use the more general path below in that case. + if ks.suffixTypes == hasMVCCSuffixes && (len(seekSuffix) == withWall || len(seekSuffix) == withLogical || len(seekSuffix) == withSynthetic) { + // Fast path: seeking among MVCC versions using a MVCC timestamp. + seekWallTime := binary.BigEndian.Uint64(seekSuffix) + var seekLogicalTime uint32 + if len(seekSuffix) >= withLogical { + seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:]) + } + + // First check the suffix at index, because querying for the latest value is + // the most common case. + if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime || + (latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) { + return index + } + + // Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)]. // // Define f(i) = true iff key at i is >= seek key. // Invariant: f(l-1) == false, f(u) == true. - l := index + l := index + 1 u := ks.roachKeyChanged.SeekSetBitGE(index + 1) + for l < u { - h := int(uint(l+u) >> 1) // avoid overflow when computing h - // l ≤ h < u - if bytes.Compare(ks.untypedVersions.At(h), seekSuffix) <= 0 { - u = h // preserves f(u) == true + m := int(uint(l+u) >> 1) // avoid overflow when computing m + // l ≤ m < u + mWallTime := ks.mvccWallTimes.At(m) + if mWallTime < seekWallTime || + (mWallTime == seekWallTime && uint32(ks.mvccLogical.At(m)) <= seekLogicalTime) { + u = m // preserves f(u) = true } else { - l = h + 1 // preserves f(l-1) == false + l = m + 1 // preserves f(l-1) = false } } return l } - // Seeking among MVCC versions using a MVCC timestamp. - - // TODO(jackson): What if the row has an untyped suffix? - // First check the suffix at index, because querying for the latest value is - // the most common case. - if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime || - (latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) { - return index + // Remove the terminator byte, which we know is equal to len(seekSuffix) + // because we obtained the suffix by splitting the seek key. + version := seekSuffix[:len(seekSuffix)-1] + if invariants.Enabled && seekSuffix[len(version)] != byte(len(seekSuffix)) { + panic(errors.AssertionFailedf("invalid seek suffix %x", seekSuffix)) } - // Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)]. + // Binary search for version between [index, prefixChanged.SeekSetBitGE(index+1)]. // - // Define f(i) = true iff key at i is >= seek key. + // Define f(i) = true iff key at i is >= seek key (i.e. suffix at i is <= seek suffix). // Invariant: f(l-1) == false, f(u) == true. - l := index + 1 + l := index u := ks.roachKeyChanged.SeekSetBitGE(index + 1) + if ks.suffixTypes&hasEmptySuffixes != 0 { + // Check if the key at index has an empty suffix. Since empty suffixes sort + // first, this is the only key in the range [index, u) which could have an + // empty suffix. + if len(ks.untypedVersions.At(index)) == 0 && ks.mvccWallTimes.At(index) == 0 && ks.mvccLogical.At(index) == 0 { + // Our seek suffix is not empty, so it must come after the empty suffix. + l = index + 1 + } + } + for l < u { - h := int(uint(l+u) >> 1) // avoid overflow when computing h - // l ≤ h < u - hWallTime := ks.mvccWallTimes.At(h) - if hWallTime < seekWallTime || - (hWallTime == seekWallTime && uint32(ks.mvccLogical.At(h)) <= seekLogicalTime) { - u = h // preserves f(u) = true + m := int(uint(l+u) >> 1) // avoid overflow when computing m + // l ≤ m < u + mVer := ks.untypedVersions.At(m) + if len(mVer) == 0 { + wallTime := ks.mvccWallTimes.At(m) + logicalTime := uint32(ks.mvccLogical.At(m)) + if invariants.Enabled && wallTime == 0 && logicalTime == 0 { + // This can only happen for row at index. + panic(errors.AssertionFailedf("unexpected empty suffix at %d (l=%d, u=%d)", m, l, u)) + } + + // Note: this path is not very performance sensitive: blocks that mix MVCC + // suffixes with non-MVCC suffixes should be rare. + + //gcassert:noescape + var buf [12]byte + //gcassert:inline + binary.BigEndian.PutUint64(buf[:], wallTime) + if logicalTime == 0 { + mVer = buf[:8] + } else { + //gcassert:inline + binary.BigEndian.PutUint32(buf[8:], logicalTime) + mVer = buf[:12] + } + } + if bytes.Compare(mVer, version) <= 0 { + u = m // preserves f(u) == true } else { - l = h + 1 // preserves f(l-1) = false + l = m + 1 // preserves f(l-1) == false } } return l @@ -796,13 +824,14 @@ func (ks *cockroachKeySeeker) MaterializeUserKey( return res } // Slice first, to check that the capacity is sufficient. - res := ki.Buf[:roachKeyLen+1+len(untypedVersion)] + res := ki.Buf[:roachKeyLen+2+len(untypedVersion)] *(*byte)(ptr) = 0 memmove( unsafe.Pointer(uintptr(ptr)+1), unsafe.Pointer(unsafe.SliceData(untypedVersion)), uintptr(len(untypedVersion)), ) + *(*byte)(unsafe.Pointer(uintptr(ptr) + uintptr(len(untypedVersion)+1))) = byte(len(untypedVersion) + 1) return res } diff --git a/internal/crdbtest/testdata/block_encoding b/internal/crdbtest/testdata/block_encoding index 95323170e3..0a71e663bd 100644 --- a/internal/crdbtest/testdata/block_encoding +++ b/internal/crdbtest/testdata/block_encoding @@ -51,9 +51,9 @@ data block header │ ├── 027-028: b 00000011 # col 3: bytes │ ├── 028-032: x ec000000 # col 3: page start 236 │ ├── 032-033: b 00000010 # col 4: uint - │ ├── 033-037: x 0c010000 # col 4: page start 268 + │ ├── 033-037: x 0b010000 # col 4: page start 267 │ ├── 037-038: b 00000001 # col 5: bool - │ ├── 038-042: x 26010000 # col 5: page start 294 + │ ├── 038-042: x 24010000 # col 5: page start 292 │ ├── 042-043: b 00000011 # col 6: bytes │ ├── 043-047: x 38010000 # col 6: page start 312 │ ├── 047-048: b 00000001 # col 7: bool @@ -130,46 +130,45 @@ data block header │ │ ├── 238-239: x 00 # data[1] = 0 [250 overall] │ │ ├── 239-240: x 00 # data[2] = 0 [250 overall] │ │ ├── 240-241: x 00 # data[3] = 0 [250 overall] - │ │ ├── 241-242: x 12 # data[4] = 18 [268 overall] - │ │ ├── 242-243: x 12 # data[5] = 18 [268 overall] - │ │ ├── 243-244: x 12 # data[6] = 18 [268 overall] - │ │ ├── 244-245: x 12 # data[7] = 18 [268 overall] - │ │ ├── 245-246: x 12 # data[8] = 18 [268 overall] - │ │ ├── 246-247: x 12 # data[9] = 18 [268 overall] - │ │ ├── 247-248: x 12 # data[10] = 18 [268 overall] - │ │ ├── 248-249: x 12 # data[11] = 18 [268 overall] - │ │ └── 249-250: x 12 # data[12] = 18 [268 overall] + │ │ ├── 241-242: x 11 # data[4] = 17 [267 overall] + │ │ ├── 242-243: x 11 # data[5] = 17 [267 overall] + │ │ ├── 243-244: x 11 # data[6] = 17 [267 overall] + │ │ ├── 244-245: x 11 # data[7] = 17 [267 overall] + │ │ ├── 245-246: x 11 # data[8] = 17 [267 overall] + │ │ ├── 246-247: x 11 # data[9] = 17 [267 overall] + │ │ ├── 247-248: x 11 # data[10] = 17 [267 overall] + │ │ ├── 248-249: x 11 # data[11] = 17 [267 overall] + │ │ └── 249-250: x 11 # data[12] = 17 [267 overall] │ └── data - │ ├── 250-250: x # data[0]: - │ ├── 250-250: x # data[1]: - │ ├── 250-250: x # data[2]: - │ ├── 250-268: x 010203040506070809101112131415161712 # data[3]: "\x01\x02\x03\x04\x05\x06\a\b\t\x10\x11\x12\x13\x14\x15\x16\x17\x12" - │ ├── 268-268: x # data[4]: - │ ├── 268-268: x # data[5]: - │ ├── 268-268: x # data[6]: - │ ├── 268-268: x # data[7]: - │ ├── 268-268: x # data[8]: - │ ├── 268-268: x # data[9]: - │ ├── 268-268: x # data[10]: - │ └── 268-268: x # data[11]: + │ ├── 250-250: x # data[0]: + │ ├── 250-250: x # data[1]: + │ ├── 250-250: x # data[2]: + │ ├── 250-267: x 0102030405060708091011121314151617 # data[3]: "\x01\x02\x03\x04\x05\x06\a\b\t\x10\x11\x12\x13\x14\x15\x16\x17" + │ ├── 267-267: x # data[4]: + │ ├── 267-267: x # data[5]: + │ ├── 267-267: x # data[6]: + │ ├── 267-267: x # data[7]: + │ ├── 267-267: x # data[8]: + │ ├── 267-267: x # data[9]: + │ ├── 267-267: x # data[10]: + │ └── 267-267: x # data[11]: ├── data for column 4 (uint) - │ ├── 268-269: x 02 # encoding: 2b - │ ├── 269-270: x 00 # padding (aligning to 16-bit boundary) - │ ├── 270-272: x 0103 # data[0] = 769 - │ ├── 272-274: x 0102 # data[1] = 513 - │ ├── 274-276: x 0101 # data[2] = 257 - │ ├── 276-278: x 0101 # data[3] = 257 - │ ├── 278-280: x 0102 # data[4] = 513 - │ ├── 280-282: x 0101 # data[5] = 257 - │ ├── 282-284: x 0104 # data[6] = 1025 - │ ├── 284-286: x 0103 # data[7] = 769 - │ ├── 286-288: x 0102 # data[8] = 513 - │ ├── 288-290: x 0101 # data[9] = 257 - │ ├── 290-292: x 0101 # data[10] = 257 - │ └── 292-294: x 0101 # data[11] = 257 + │ ├── 267-268: x 02 # encoding: 2b + │ ├── 268-270: x 0103 # data[0] = 769 + │ ├── 270-272: x 0102 # data[1] = 513 + │ ├── 272-274: x 0101 # data[2] = 257 + │ ├── 274-276: x 0101 # data[3] = 257 + │ ├── 276-278: x 0102 # data[4] = 513 + │ ├── 278-280: x 0101 # data[5] = 257 + │ ├── 280-282: x 0104 # data[6] = 1025 + │ ├── 282-284: x 0103 # data[7] = 769 + │ ├── 284-286: x 0102 # data[8] = 513 + │ ├── 286-288: x 0101 # data[9] = 257 + │ ├── 288-290: x 0101 # data[10] = 257 + │ └── 290-292: x 0101 # data[11] = 257 ├── data for column 5 (bool) - │ ├── 294-295: x 00 # default bitmap encoding - │ ├── 295-296: x 00 # padding to align to 64-bit boundary + │ ├── 292-293: x 00 # default bitmap encoding + │ ├── 293-296: x 000000 # padding to align to 64-bit boundary │ ├── 296-304: b 0000000100001100000000000000000000000000000000000000000000000000 # bitmap word 0 │ └── 304-312: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63 ├── data for column 6 (bytes) diff --git a/internal/crdbtest/testdata/seek b/internal/crdbtest/testdata/seek index eb0f0e8b13..4b26dd9cc1 100644 --- a/internal/crdbtest/testdata/seek +++ b/internal/crdbtest/testdata/seek @@ -13,14 +13,12 @@ fop @ 0102030405060708 #1,SET = row6 seek fo fo @ 05 -fo @ 05 fo @ 0102030405060709 #1,SET = v0 fo @ 0102030405060708 #1,SET = v0 fo @ 0102030405060707 #1,SET = v0 ---- fo: fo @ 0102030405060708 #1,SET = row0 fo @ 05: fo @ 0102030405060708 #1,SET = row0 -fo @ 05: fo @ 0102030405060708 #1,SET = row0 fo @ 0102030405060709: fo @ 0102030405060708 #1,SET = row0 fo @ 0102030405060708: fo @ 0102030405060708 #1,SET = row0 fo @ 0102030405060707: foo @ 0102030405060708 #1,SET = row1 @@ -94,7 +92,7 @@ foo @ 0000000000000000000000000000000002 #1,SET = row3 foo @ 0000000000000000000000000000000001 #1,SET = row4 z @ 0000000000000000000000000000000001 #1,SET = row5 ---- -6 rows, total size 259B +6 rows, total size 251B seek a diff --git a/internal/crdbtest/testdata/suffix_types b/internal/crdbtest/testdata/suffix_types index 004ea03fc8..f0120e5ef3 100644 --- a/internal/crdbtest/testdata/suffix_types +++ b/internal/crdbtest/testdata/suffix_types @@ -53,7 +53,7 @@ init foo #1,SET = bar foo @ 0102030405060708091011121314151617 #1,SET = bar ---- -2 rows, total size 133B +2 rows, total size 125B suffix-types ---- @@ -64,7 +64,7 @@ foo #1,SET = bar foo @ 0102030405060708 #1,SET = bar foo @ 0102030405060708091011121314151617 #1,SET = bar ---- -3 rows, total size 169B +3 rows, total size 161B suffix-types ----