Skip to content

Commit

Permalink
crdb: better seek, fix untyped versions bug
Browse files Browse the repository at this point in the history
The untyped versions should not have the terminator byte but we were
writing it out.

The new code relies on the `suffixTypes` value for a fast path that
assumes everything is MVCC. The rest of the cases are handled by a
general path that compares suffixes by byte. The performance is
slightly better than the old code and we now support all cases.

```
name                                                                                                                 old time/op    new time/op    delta
CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/Next-10                 11.5ns ± 3%    11.2ns ± 0%  -1.91%  (p=0.001 n=7+7)
CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/SeekGE-10                119ns ± 2%     116ns ± 0%  -2.24%  (p=0.000 n=8+8)
CockroachDataBlockIterShort/AlphaLen=8,Prefix=8,Shared=4,KeysPerPrefix=4,Logical=10,ValueLen=8/SeekGELatest-10          106ns ± 0%     105ns ± 0%  -0.97%  (p=0.000 n=7+8)
CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/Next-10            10.7ns ± 1%    10.6ns ± 0%  -1.26%  (p=0.000 n=8+8)
CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/SeekGE-10          93.9ns ± 1%    92.4ns ± 0%  -1.56%  (p=0.000 n=8+8)
CockroachDataBlockIterShort/AlphaLen=8,Prefix=128,Shared=64,KeysPerPrefix=4,Logical=10,ValueLen=128/SeekGELatest-10    85.8ns ± 1%    83.3ns ± 0%  -2.96%  (p=0.000 n=8+7)
```
  • Loading branch information
RaduBerinde committed Oct 24, 2024
1 parent d2480d7 commit efa5401
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 110 deletions.
161 changes: 95 additions & 66 deletions internal/crdbtest/crdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,28 +203,9 @@ func DecodeEngineKey(b []byte) (roachKey, version []byte, ok bool) {
return roachKey, version, true
}

// DecodeSuffix decodes the suffix of a key. If the suffix is a timestamp,
// wallTime and logicalTime are returned; otherwise untypedVersion contains the
// version (which is the suffix without the terminator length byte).
//
//gcassert:inline
func DecodeSuffix(suffix []byte) (untypedVersion []byte, wallTime uint64, logicalTime uint32) {
if invariants.Enabled && len(suffix) > 0 && len(suffix) != int(suffix[len(suffix)-1]) {
panic("invalid length byte")
}
switch len(suffix) {
case 0:
return nil, 0, 0
case 9:
return nil, binary.BigEndian.Uint64(suffix[:8]), 0
case 13, 14:
return nil, binary.BigEndian.Uint64(suffix[:8]), binary.BigEndian.Uint32(suffix[8:12])
default:
return suffix[:len(suffix)-1], 0, 0
}
}

// Split implements base.Split for CockroachDB keys.
//
//go:inline
func Split(key []byte) int {
if len(key) == 0 {
return 0
Expand Down Expand Up @@ -540,7 +521,7 @@ func (kw *cockroachKeyWriter) WriteKey(
default:
// Not a MVCC timestamp.
kw.suffixTypes |= hasNonMVCCSuffixes
untypedVersion = key[keyPrefixLen:]
untypedVersion = key[keyPrefixLen : len(key)-1]
}
kw.wallTimes.Set(row, wallTime)
kw.untypedVersions.Put(untypedVersion)
Expand Down Expand Up @@ -688,7 +669,6 @@ func (ks *cockroachKeySeeker) IsLowerBound(k []byte, syntheticSuffix []byte) boo
func (ks *cockroachKeySeeker) SeekGE(
key []byte, boundRow int, searchDir int8,
) (row int, equalPrefix bool) {
// TODO(jackson): Inline Split.
si := Split(key)
row, eq := ks.roachKeys.Search(key[:si-1])
if eq {
Expand All @@ -702,69 +682,117 @@ func (ks *cockroachKeySeeker) SeekGE(
// with the same prefix as index and a suffix greater than or equal to [suffix],
// or if no such row exists, the next row with a different prefix.
func (ks *cockroachKeySeeker) seekGEOnSuffix(index int, seekSuffix []byte) (row int) {
// The search key's prefix exactly matches the prefix of the row at index.
// We have three common cases:
// 1. The seek key has no suffix.
// 2. We are seeking to an MVCC timestamp in a block where all keys have
// MVCC timestamps (e.g. SQL table data).
// 3. We are seeking to a non-MVCC timestamp in a block where no keys have
// MVCC timestamps (e.g. lock keys).

if len(seekSuffix) == 0 {
// The search key has no suffix, so it's the smallest possible key with its
// prefix. Return the row. This is a common case where the user is seeking
// to the most-recent row and just wants the smallest key with the prefix.
return index
}

const withWall = 9
const withLogical = withWall + 4
const withSynthetic = withLogical + 1
var seekWallTime uint64
var seekLogicalTime uint32
switch len(seekSuffix) {
case 0:
// The search key has no suffix, so it's the smallest possible key with
// its prefix. Return the row. This is a common case where the user is
// seeking to the most-recent row and just wants the smallest key with
// the prefix.
return index
case withLogical, withSynthetic:
seekWallTime = binary.BigEndian.Uint64(seekSuffix)
seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
case withWall:
seekWallTime = binary.BigEndian.Uint64(seekSuffix)
default:
// The suffix is untyped. Compare the untyped suffixes.
// Binary search between [index, prefixChanged.SeekSetBitGE(index+1)].

// If suffixTypes == hasMVCCSuffixes, all keys in the block have MVCC
// suffixes. Note that blocks that contain both MVCC and non-MVCC should be
// very rare, so it's ok to use the more general path below in that case.
if ks.suffixTypes == hasMVCCSuffixes && (len(seekSuffix) == withWall || len(seekSuffix) == withLogical || len(seekSuffix) == withSynthetic) {
// Fast path: seeking among MVCC versions using a MVCC timestamp.
seekWallTime := binary.BigEndian.Uint64(seekSuffix)
var seekLogicalTime uint32
if len(seekSuffix) >= withLogical {
seekLogicalTime = binary.BigEndian.Uint32(seekSuffix[8:])
}

// First check the suffix at index, because querying for the latest value is
// the most common case.
if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
(latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
return index
}

// Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
//
// Define f(i) = true iff key at i is >= seek key.
// Invariant: f(l-1) == false, f(u) == true.
l := index
l := index + 1
u := ks.roachKeyChanged.SeekSetBitGE(index + 1)

for l < u {
h := int(uint(l+u) >> 1) // avoid overflow when computing h
// l ≤ h < u
if bytes.Compare(ks.untypedVersions.At(h), seekSuffix) <= 0 {
u = h // preserves f(u) == true
m := int(uint(l+u) >> 1) // avoid overflow when computing m
// l ≤ m < u
mWallTime := ks.mvccWallTimes.At(m)
if mWallTime < seekWallTime ||
(mWallTime == seekWallTime && uint32(ks.mvccLogical.At(m)) <= seekLogicalTime) {
u = m // preserves f(u) = true
} else {
l = h + 1 // preserves f(l-1) == false
l = m + 1 // preserves f(l-1) = false
}
}
return l
}
// Seeking among MVCC versions using a MVCC timestamp.

// TODO(jackson): What if the row has an untyped suffix?

// First check the suffix at index, because querying for the latest value is
// the most common case.
if latestWallTime := ks.mvccWallTimes.At(index); latestWallTime < seekWallTime ||
(latestWallTime == seekWallTime && uint32(ks.mvccLogical.At(index)) <= seekLogicalTime) {
return index
// Remove the terminator byte, which we know is equal to len(seekSuffix)
// because we obtained the suffix by splitting the seek key.
version := seekSuffix[:len(seekSuffix)-1]
if invariants.Enabled && seekSuffix[len(version)] != byte(len(seekSuffix)) {
panic(errors.AssertionFailedf("invalid seek suffix %x", seekSuffix))
}

// Binary search between [index+1, prefixChanged.SeekSetBitGE(index+1)].
// Binary search for version between [index, prefixChanged.SeekSetBitGE(index+1)].
//
// Define f(i) = true iff key at i is >= seek key.
// Define f(i) = true iff key at i is >= seek key (i.e. suffix at i is <= seek suffix).
// Invariant: f(l-1) == false, f(u) == true.
l := index + 1
l := index
u := ks.roachKeyChanged.SeekSetBitGE(index + 1)
if ks.suffixTypes&hasEmptySuffixes != 0 {
// Check if the key at index has an empty suffix. Since empty suffixes sort
// first, this is the only key in the range [index, u) which could have an
// empty suffix.
if len(ks.untypedVersions.At(index)) == 0 && ks.mvccWallTimes.At(index) == 0 && ks.mvccLogical.At(index) == 0 {
// Our seek suffix is not empty, so it must come after the empty suffix.
l = index + 1
}
}

for l < u {
h := int(uint(l+u) >> 1) // avoid overflow when computing h
// l ≤ h < u
hWallTime := ks.mvccWallTimes.At(h)
if hWallTime < seekWallTime ||
(hWallTime == seekWallTime && uint32(ks.mvccLogical.At(h)) <= seekLogicalTime) {
u = h // preserves f(u) = true
m := int(uint(l+u) >> 1) // avoid overflow when computing m
// l ≤ m < u
mVer := ks.untypedVersions.At(m)
if len(mVer) == 0 {
wallTime := ks.mvccWallTimes.At(m)
logicalTime := uint32(ks.mvccLogical.At(m))
if invariants.Enabled && wallTime == 0 && logicalTime == 0 {
// This can only happen for row at index.
panic(errors.AssertionFailedf("unexpected empty suffix at %d (l=%d, u=%d)", m, l, u))
}

// Note: this path is not very performance sensitive: blocks that mix MVCC
// suffixes with non-MVCC suffixes should be rare.

//gcassert:noescape
var buf [12]byte
//gcassert:inline
binary.BigEndian.PutUint64(buf[:], wallTime)
if logicalTime == 0 {
mVer = buf[:8]
} else {
//gcassert:inline
binary.BigEndian.PutUint32(buf[8:], logicalTime)
mVer = buf[:12]
}
}
if bytes.Compare(mVer, version) <= 0 {
u = m // preserves f(u) == true
} else {
l = h + 1 // preserves f(l-1) = false
l = m + 1 // preserves f(l-1) == false
}
}
return l
Expand Down Expand Up @@ -796,13 +824,14 @@ func (ks *cockroachKeySeeker) MaterializeUserKey(
return res
}
// Slice first, to check that the capacity is sufficient.
res := ki.Buf[:roachKeyLen+1+len(untypedVersion)]
res := ki.Buf[:roachKeyLen+2+len(untypedVersion)]
*(*byte)(ptr) = 0
memmove(
unsafe.Pointer(uintptr(ptr)+1),
unsafe.Pointer(unsafe.SliceData(untypedVersion)),
uintptr(len(untypedVersion)),
)
*(*byte)(unsafe.Pointer(uintptr(ptr) + uintptr(len(untypedVersion)+1))) = byte(len(untypedVersion) + 1)
return res
}

Expand Down
77 changes: 38 additions & 39 deletions internal/crdbtest/testdata/block_encoding
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ data block header
│ ├── 027-028: b 00000011 # col 3: bytes
│ ├── 028-032: x ec000000 # col 3: page start 236
│ ├── 032-033: b 00000010 # col 4: uint
│ ├── 033-037: x 0c010000 # col 4: page start 268
│ ├── 033-037: x 0b010000 # col 4: page start 267
│ ├── 037-038: b 00000001 # col 5: bool
│ ├── 038-042: x 26010000 # col 5: page start 294
│ ├── 038-042: x 24010000 # col 5: page start 292
│ ├── 042-043: b 00000011 # col 6: bytes
│ ├── 043-047: x 38010000 # col 6: page start 312
│ ├── 047-048: b 00000001 # col 7: bool
Expand Down Expand Up @@ -130,46 +130,45 @@ data block header
│ │ ├── 238-239: x 00 # data[1] = 0 [250 overall]
│ │ ├── 239-240: x 00 # data[2] = 0 [250 overall]
│ │ ├── 240-241: x 00 # data[3] = 0 [250 overall]
│ │ ├── 241-242: x 12 # data[4] = 18 [268 overall]
│ │ ├── 242-243: x 12 # data[5] = 18 [268 overall]
│ │ ├── 243-244: x 12 # data[6] = 18 [268 overall]
│ │ ├── 244-245: x 12 # data[7] = 18 [268 overall]
│ │ ├── 245-246: x 12 # data[8] = 18 [268 overall]
│ │ ├── 246-247: x 12 # data[9] = 18 [268 overall]
│ │ ├── 247-248: x 12 # data[10] = 18 [268 overall]
│ │ ├── 248-249: x 12 # data[11] = 18 [268 overall]
│ │ └── 249-250: x 12 # data[12] = 18 [268 overall]
│ │ ├── 241-242: x 11 # data[4] = 17 [267 overall]
│ │ ├── 242-243: x 11 # data[5] = 17 [267 overall]
│ │ ├── 243-244: x 11 # data[6] = 17 [267 overall]
│ │ ├── 244-245: x 11 # data[7] = 17 [267 overall]
│ │ ├── 245-246: x 11 # data[8] = 17 [267 overall]
│ │ ├── 246-247: x 11 # data[9] = 17 [267 overall]
│ │ ├── 247-248: x 11 # data[10] = 17 [267 overall]
│ │ ├── 248-249: x 11 # data[11] = 17 [267 overall]
│ │ └── 249-250: x 11 # data[12] = 17 [267 overall]
│ └── data
│ ├── 250-250: x # data[0]:
│ ├── 250-250: x # data[1]:
│ ├── 250-250: x # data[2]:
│ ├── 250-268: x 010203040506070809101112131415161712 # data[3]: "\x01\x02\x03\x04\x05\x06\a\b\t\x10\x11\x12\x13\x14\x15\x16\x17\x12"
│ ├── 268-268: x # data[4]:
│ ├── 268-268: x # data[5]:
│ ├── 268-268: x # data[6]:
│ ├── 268-268: x # data[7]:
│ ├── 268-268: x # data[8]:
│ ├── 268-268: x # data[9]:
│ ├── 268-268: x # data[10]:
│ └── 268-268: x # data[11]:
│ ├── 250-250: x # data[0]:
│ ├── 250-250: x # data[1]:
│ ├── 250-250: x # data[2]:
│ ├── 250-267: x 0102030405060708091011121314151617 # data[3]: "\x01\x02\x03\x04\x05\x06\a\b\t\x10\x11\x12\x13\x14\x15\x16\x17"
│ ├── 267-267: x # data[4]:
│ ├── 267-267: x # data[5]:
│ ├── 267-267: x # data[6]:
│ ├── 267-267: x # data[7]:
│ ├── 267-267: x # data[8]:
│ ├── 267-267: x # data[9]:
│ ├── 267-267: x # data[10]:
│ └── 267-267: x # data[11]:
├── data for column 4 (uint)
│ ├── 268-269: x 02 # encoding: 2b
│ ├── 269-270: x 00 # padding (aligning to 16-bit boundary)
│ ├── 270-272: x 0103 # data[0] = 769
│ ├── 272-274: x 0102 # data[1] = 513
│ ├── 274-276: x 0101 # data[2] = 257
│ ├── 276-278: x 0101 # data[3] = 257
│ ├── 278-280: x 0102 # data[4] = 513
│ ├── 280-282: x 0101 # data[5] = 257
│ ├── 282-284: x 0104 # data[6] = 1025
│ ├── 284-286: x 0103 # data[7] = 769
│ ├── 286-288: x 0102 # data[8] = 513
│ ├── 288-290: x 0101 # data[9] = 257
│ ├── 290-292: x 0101 # data[10] = 257
│ └── 292-294: x 0101 # data[11] = 257
│ ├── 267-268: x 02 # encoding: 2b
│ ├── 268-270: x 0103 # data[0] = 769
│ ├── 270-272: x 0102 # data[1] = 513
│ ├── 272-274: x 0101 # data[2] = 257
│ ├── 274-276: x 0101 # data[3] = 257
│ ├── 276-278: x 0102 # data[4] = 513
│ ├── 278-280: x 0101 # data[5] = 257
│ ├── 280-282: x 0104 # data[6] = 1025
│ ├── 282-284: x 0103 # data[7] = 769
│ ├── 284-286: x 0102 # data[8] = 513
│ ├── 286-288: x 0101 # data[9] = 257
│ ├── 288-290: x 0101 # data[10] = 257
│ └── 290-292: x 0101 # data[11] = 257
├── data for column 5 (bool)
│ ├── 294-295: x 00 # default bitmap encoding
│ ├── 295-296: x 00 # padding to align to 64-bit boundary
│ ├── 292-293: x 00 # default bitmap encoding
│ ├── 293-296: x 000000 # padding to align to 64-bit boundary
│ ├── 296-304: b 0000000100001100000000000000000000000000000000000000000000000000 # bitmap word 0
│ └── 304-312: b 0000000100000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
├── data for column 6 (bytes)
Expand Down
4 changes: 1 addition & 3 deletions internal/crdbtest/testdata/seek
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,12 @@ fop @ 0102030405060708 #1,SET = row6
seek
fo
fo @ 05
fo @ 05
fo @ 0102030405060709 #1,SET = v0
fo @ 0102030405060708 #1,SET = v0
fo @ 0102030405060707 #1,SET = v0
----
fo: fo @ 0102030405060708 #1,SET = row0
fo @ 05: fo @ 0102030405060708 #1,SET = row0
fo @ 05: fo @ 0102030405060708 #1,SET = row0
fo @ 0102030405060709: fo @ 0102030405060708 #1,SET = row0
fo @ 0102030405060708: fo @ 0102030405060708 #1,SET = row0
fo @ 0102030405060707: foo @ 0102030405060708 #1,SET = row1
Expand Down Expand Up @@ -94,7 +92,7 @@ foo @ 0000000000000000000000000000000002 #1,SET = row3
foo @ 0000000000000000000000000000000001 #1,SET = row4
z @ 0000000000000000000000000000000001 #1,SET = row5
----
6 rows, total size 259B
6 rows, total size 251B

seek
a
Expand Down
4 changes: 2 additions & 2 deletions internal/crdbtest/testdata/suffix_types
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ init
foo #1,SET = bar
foo @ 0102030405060708091011121314151617 #1,SET = bar
----
2 rows, total size 133B
2 rows, total size 125B

suffix-types
----
Expand All @@ -64,7 +64,7 @@ foo #1,SET = bar
foo @ 0102030405060708 #1,SET = bar
foo @ 0102030405060708091011121314151617 #1,SET = bar
----
3 rows, total size 169B
3 rows, total size 161B

suffix-types
----
Expand Down

0 comments on commit efa5401

Please sign in to comment.