Skip to content

Commit

Permalink
colblk: ensure BitmapBuilder Size and Finish agree on encoding
Browse files Browse the repository at this point in the history
Previously, if a bitmap builder switched from the all-zero encoding to the
default encoding, a subsequent call to Finish with a row count that should use
the all-zero encoding would erroneously write the default encoding (attempting
to write more bytes than were allocated). This commit updates the BitmapBuilder
to keep track of the first set bit of the bitmap. During Finish, this is used
to determine the appropriate encoding. Invert continues to force use of the
default encoding.
  • Loading branch information
jbowens committed Oct 1, 2024
1 parent 1168228 commit b56d1b9
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 6 deletions.
24 changes: 20 additions & 4 deletions sstable/colblk/bitmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,10 @@ func (b Bitmap) String() string {
// BitmapBuilder constructs a Bitmap. Bits default to false.
type BitmapBuilder struct {
// words holds the bitmap's bits, 64 per word: bit i is stored in words[i>>6].
// Set appends zeroed words as needed to reach the word containing a bit.
words []uint64
// minNonZeroRowCount is the smallest row count at which the bitmap must
// use the defaultBitmapEncoding (as opposed to the zeroBitmapEncoding).
// A value of zero means no bit has been set and Invert has not been
// called. Set assigns it i+1 when bit i is the first bit set or precedes
// every previously set bit, Invert forces it to 1, and Reset returns it
// to zero.
minNonZeroRowCount int
}

type bitmapEncoding uint8
Expand All @@ -268,6 +272,11 @@ func bitmapRequiredSize(total int) int {

// Set sets the bit at position i to true.
func (b *BitmapBuilder) Set(i int) {
// Update minNonZeroRowCount if necessary. This is used to determine whether
// the bitmap should be encoded using the all-zeros encoding.
if b.isZero(i + 1) {
b.minNonZeroRowCount = i + 1
}
w := i >> 6 // divide by 64
for len(b.words) <= w {
b.words = append(b.words, 0)
Expand All @@ -276,14 +285,17 @@ func (b *BitmapBuilder) Set(i int) {
}

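// isZero returns true if a bitmap of the given number of rows would have no
// bits set: either rows is less than minNonZeroRowCount, or no bit has been set
// and Invert was not called.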
func (b *BitmapBuilder) isZero() bool {
return len(b.words) == 0
//
//gcassert:inline
func (b *BitmapBuilder) isZero(rows int) bool {
return b.minNonZeroRowCount == 0 || rows < b.minNonZeroRowCount
}

// Reset returns the builder to its empty state so that it can be reused: the
// first-set-bit tracking is forgotten and every word is zeroed before the
// slice is truncated, which retains the backing storage for later Set calls.
func (b *BitmapBuilder) Reset() {
	b.minNonZeroRowCount = 0
	// Zero the words before truncating; clear only touches elements within
	// the slice's current length.
	clear(b.words)
	b.words = b.words[:0]
}

// NumColumns implements the ColumnWriter interface.
Expand All @@ -296,7 +308,7 @@ func (b *BitmapBuilder) DataType(int) DataType { return DataTypeBool }
func (b *BitmapBuilder) Size(rows int, offset uint32) uint32 {
// First byte will be the encoding type.
offset++
if b.isZero() {
if b.isZero(rows) {
return offset
}
offset = align(offset, align64)
Expand All @@ -320,6 +332,10 @@ func (b *BitmapBuilder) InvertedSize(rows int, offset uint32) uint32 {
// Note that Invert can affect the Size of the bitmap. Use InvertedSize() if you
// intend to invert the bitmap before finishing.
func (b *BitmapBuilder) Invert(nRows int) {
// Inverted bitmaps never use the all-zero encoding, so we set
// minNonZeroRowCount to 1 so that as long as the bitmap is finished encoding
// any rows at all, it uses the default encoding.
b.minNonZeroRowCount = 1
// If the tail of b is sparse, fill in zeroes before inverting.
nBitmapWords := (nRows + 63) >> 6
b.words = slices.Grow(b.words, nBitmapWords-len(b.words))[:nBitmapWords]
Expand All @@ -331,7 +347,7 @@ func (b *BitmapBuilder) Invert(nRows int) {
// Finish finalizes the bitmap, computing the per-word summary bitmap and
// writing the resulting data to buf at offset.
func (b *BitmapBuilder) Finish(col, nRows int, offset uint32, buf []byte) uint32 {
if b.isZero() {
if b.isZero(nRows) {
buf[offset] = byte(zeroBitmapEncoding)
return offset + 1
}
Expand Down
27 changes: 25 additions & 2 deletions sstable/colblk/testdata/bitmap
Original file line number Diff line number Diff line change
Expand Up @@ -372,22 +372,45 @@ Binary representation:
40-48: b 0000000000000000000000000000000000000000000000000000000010000000 # bitmap word 4
48-56: b 0001000000000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63

# Write out one fewer row than is set, which should result in the all-zeroes
# encoding.

build rows=319
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000001
----
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000
Binary representation:
0-1: x 01 # bitmap encoding

# Write out fewer rows than we set. The bitmap summary should reflect the
# truncated view of the bitmap.

build rows=192
build rows=260
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000111000000000000000000000000000000000000000000000000000
0000000000111000000000000000000000000000000000000000000000000000
----
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000
0000000000111000000000000000000000000000000000000000000000000000
0000
Binary representation:
00-01: x 00 # bitmap encoding
01-08: x 00000000000000 # padding to align to 64-bit boundary
08-16: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 0
16-24: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 1
24-32: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 2
32-40: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63
32-40: b 0000000000011100000000000000000000000000000000000000000000000000 # bitmap word 3
40-48: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 4
48-56: b 0000100000000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63

0 comments on commit b56d1b9

Please sign in to comment.