From b56d1b9e56e239fee11787a989e2834fe469cae0 Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Mon, 30 Sep 2024 11:50:57 -0400 Subject: [PATCH] colblk: ensure BitmapBuilder Size and Finish agree on encoding Previously, if a bitmap builder switched from the all-zero encoding to the default encoding, a subsequent call to Finish with a row count that should use the all-zero encoding would erroneously write the default encoding (attempting to write more bytes than were allocated). This commit updates the BitmapBuilder to keep track of the first set bit of the bitmap. During Finish, this is used to determine the appropriate encoding. Invert continues to force use of the default encoding. --- sstable/colblk/bitmap.go | 24 ++++++++++++++++++++---- sstable/colblk/testdata/bitmap | 27 +++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/sstable/colblk/bitmap.go b/sstable/colblk/bitmap.go index 5a8ac94f47..dde6e797e0 100644 --- a/sstable/colblk/bitmap.go +++ b/sstable/colblk/bitmap.go @@ -243,6 +243,10 @@ func (b Bitmap) String() string { // BitmapBuilder constructs a Bitmap. Bits default to false. type BitmapBuilder struct { words []uint64 + // minNonZeroRowCount is the row count at which the bitmap should begin to + // use the defaultBitmapEncoding (as opposed to the zeroBitmapEncoding). + // It's updated on the first call to Set and defaults to zero. + minNonZeroRowCount int } type bitmapEncoding uint8 @@ -268,6 +272,11 @@ func bitmapRequiredSize(total int) int { // Set sets the bit at position i to true. func (b *BitmapBuilder) Set(i int) { + // Update minNonZeroRowCount if necessary. This is used to determine whether + // the bitmap should be encoded using the all-zeros encoding. 
+ if b.isZero(i + 1) { + b.minNonZeroRowCount = i + 1 + } w := i >> 6 // divide by 64 for len(b.words) <= w { b.words = append(b.words, 0) @@ -276,14 +285,17 @@ func (b *BitmapBuilder) Set(i int) { } // isZero returns true if no bits are set and Invert was not called. -func (b *BitmapBuilder) isZero() bool { - return len(b.words) == 0 +// +//gcassert:inline +func (b *BitmapBuilder) isZero(rows int) bool { + return b.minNonZeroRowCount == 0 || rows < b.minNonZeroRowCount } // Reset resets the bitmap to the empty state. func (b *BitmapBuilder) Reset() { clear(b.words) b.words = b.words[:0] + b.minNonZeroRowCount = 0 } // NumColumns implements the ColumnWriter interface. @@ -296,7 +308,7 @@ func (b *BitmapBuilder) DataType(int) DataType { return DataTypeBool } func (b *BitmapBuilder) Size(rows int, offset uint32) uint32 { // First byte will be the encoding type. offset++ - if b.isZero() { + if b.isZero(rows) { return offset } offset = align(offset, align64) @@ -320,6 +332,10 @@ func (b *BitmapBuilder) InvertedSize(rows int, offset uint32) uint32 { // Note that Invert can affect the Size of the bitmap. Use InvertedSize() if you // intend to invert the bitmap before finishing. func (b *BitmapBuilder) Invert(nRows int) { + // Inverted bitmaps never use the all-zero encoding, so we set + // minNonZeroRowCount to 1 so that as long as the bitmap is + // finished encoding any rows at all, it uses the default encoding. + b.minNonZeroRowCount = 1 // If the tail of b is sparse, fill in zeroes before inverting. nBitmapWords := (nRows + 63) >> 6 b.words = slices.Grow(b.words, nBitmapWords-len(b.words))[:nBitmapWords] @@ -331,7 +347,7 @@ func (b *BitmapBuilder) Invert(nRows int) { // Finish finalizes the bitmap, computing the per-word summary bitmap and // writing the resulting data to buf at offset. 
func (b *BitmapBuilder) Finish(col, nRows int, offset uint32, buf []byte) uint32 { - if b.isZero() { + if b.isZero(nRows) { buf[offset] = byte(zeroBitmapEncoding) return offset + 1 } diff --git a/sstable/colblk/testdata/bitmap b/sstable/colblk/testdata/bitmap index f0979c9f20..074984106c 100644 --- a/sstable/colblk/testdata/bitmap +++ b/sstable/colblk/testdata/bitmap @@ -372,22 +372,45 @@ Binary representation: 40-48: b 0000000000000000000000000000000000000000000000000000000010000000 # bitmap word 4 48-56: b 0001000000000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63 +# Write out one fewer row than is set, which should result in the all-zeroes +# encoding. + +build rows=319 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000001 +---- +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +0000000000000000000000000000000000000000000000000000000000000000 +000000000000000000000000000000000000000000000000000000000000000 +Binary representation: +0-1: x 01 # bitmap encoding + # Write out fewer rows than we set. The bitmap summary should reflect the # truncated view of the bitmap. 
-build rows=192 +build rows=260 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 0000000000111000000000000000000000000000000000000000000000000000 +0000000000111000000000000000000000000000000000000000000000000000 ---- 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 +0000000000111000000000000000000000000000000000000000000000000000 +0000 Binary representation: 00-01: x 00 # bitmap encoding 01-08: x 00000000000000 # padding to align to 64-bit boundary 08-16: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 0 16-24: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 1 24-32: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 2 -32-40: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63 +32-40: b 0000000000011100000000000000000000000000000000000000000000000000 # bitmap word 3 +40-48: b 0000000000000000000000000000000000000000000000000000000000000000 # bitmap word 4 +48-56: b 0000100000000000000000000000000000000000000000000000000000000000 # bitmap summary word 0-63