colblk: rework Uint encodings
Currently we have four uint column types; each one can be encoded with
either 0, 1, 2, 4, or 8 bytes per value. The encoding contains a delta
that has the same width as the column type. This leads to a lot of
possible combinations of data formats (each one requiring at least a
bit of specific code, even if generated through generics).

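For illustration only, here is a minimal Go sketch (not the actual
colblk code) of how a width-plus-delta scheme can pick an encoding:
find the minimum and maximum value, use the minimum as the delta, and
let the spread decide whether 0, 1, 2, 4, or 8 bytes per value are
needed. The helper name chooseUintEncoding is made up for this example.
```go
// Hypothetical sketch: pick the smallest supported width that can
// represent every value once a constant delta (the minimum) is
// subtracted out. Not the colblk implementation.
package main

import "fmt"

// chooseUintEncoding returns the per-value byte width and the delta
// under which all values fit in that width. Width 0 means every value
// equals the delta, so only the delta itself needs to be stored.
func chooseUintEncoding(values []uint64) (width int, delta uint64) {
	if len(values) == 0 {
		return 0, 0
	}
	minV, maxV := values[0], values[0]
	for _, v := range values[1:] {
		if v < minV {
			minV = v
		}
		if v > maxV {
			maxV = v
		}
	}
	switch spread := maxV - minV; {
	case spread == 0:
		return 0, minV
	case spread < 1<<8:
		return 1, minV
	case spread < 1<<16:
		return 2, minV
	case spread < 1<<32:
		return 4, minV
	default:
		return 8, minV
	}
}

func main() {
	// Raw values need 8 bytes each, but only a 2-byte spread remains
	// once the minimum is used as the delta.
	vals := []uint64{5_000_000_100, 5_000_000_200, 5_000_050_000}
	w, d := chooseUintEncoding(vals)
	fmt.Printf("width=%d bytes, delta=%d\n", w, d) // width=2 bytes, delta=5000000100
}
```
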
Given that the encoding supports smaller widths transparently, there
is no real advantage to declaring column types for smaller integers.
There is also a disadvantage: it would prevent us from using larger
integers in the future without changing the column type.

This change replaces the four uint column types with a single type.
We also add UnsafeOffsets, a specialization of the decoding type that
is optimized for offset columns.

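As a rough illustration of why a dedicated offsets decoder can be
cheaper than the general one, here is a hypothetical Go sketch; it is
not pebble's UnsafeOffsets. It assumes offset columns never carry a
delta and fit in at most 4 bytes per value, so element access is a
small width switch with no 64-bit delta addition.
```go
// Hypothetical sketch of an offsets-specialized decoder; not pebble's
// UnsafeOffsets. Offsets are assumed to be stored without a delta and
// with a width of at most 4 bytes.
package main

import (
	"encoding/binary"
	"fmt"
)

type offsetsDecoder struct {
	width int    // assumed 0, 1, 2, or 4 bytes per offset
	data  []byte // width*n bytes of little-endian values
}

// At returns the i'th offset. A general uint decoder would also add a
// 64-bit delta and handle an 8-byte width; this specialization skips both.
func (d offsetsDecoder) At(i int) uint32 {
	switch d.width {
	case 0:
		return 0
	case 1:
		return uint32(d.data[i])
	case 2:
		return uint32(binary.LittleEndian.Uint16(d.data[2*i:]))
	default:
		return binary.LittleEndian.Uint32(d.data[4*i:])
	}
}

func main() {
	// Three offsets (5, 300, 70000) encoded with a 4-byte width.
	buf := make([]byte, 12)
	for i, v := range []uint32{5, 300, 70000} {
		binary.LittleEndian.PutUint32(buf[4*i:], v)
	}
	d := offsetsDecoder{width: 4, data: buf}
	fmt.Println(d.At(0), d.At(1), d.At(2)) // 5 300 70000
}
```
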
Benchmarks:
```
name                     time/op
UnsafeUints/const-10     0.59ns ± 0%
UnsafeUints/1b-10        0.70ns ± 0%
UnsafeUints/1b,delta-10  0.70ns ± 1%
UnsafeUints/2b-10        0.74ns ± 0%
UnsafeUints/2b,delta-10  0.74ns ± 0%
UnsafeUints/4b-10        0.94ns ± 0%
UnsafeUints/4b,delta-10  0.94ns ± 0%
UnsafeUints/8b-10        1.25ns ± 0%
UnsafeOffsets/2b-10      0.51ns ± 0%
UnsafeOffsets/4b-10      0.94ns ± 0%
```
RaduBerinde committed Aug 15, 2024
1 parent fd3044b commit 1e5fd68
Showing 22 changed files with 2,362 additions and 2,864 deletions.
internal/binfmt/binfmt.go: 2 changes (1 addition & 1 deletion)
@@ -18,7 +18,7 @@ import (

// New constructs a new binary formatter.
func New(data []byte) *Formatter {
- offsetWidth := strconv.Itoa(int(math.Log10(float64(len(data)-1))) + 1)
+ offsetWidth := strconv.Itoa(max(int(math.Log10(float64(len(data)-1)))+1, 1))
return &Formatter{
data: data,
lineWidth: 40,
sstable/colblk/block.go: 40 changes (11 additions & 29 deletions)
@@ -126,10 +126,10 @@
// [s,e) have the corresponding bounds.
//
// Both range deletions and range keys are encoded with the same schema. Range
- // deletion keyspan.Keys never contain suffixes or values. When encoded, the
- // RawBytes encoding uses the UintDeltaEncodingConstant encoding to avoid
- // materializing encoding N offsets. Each of these empty columns is encoded in
- // just ~5 bytes of column data.
+ // deletion keyspan.Keys never contain suffixes or values. When one of these
+ // columns is encoded, the RawBytes encoding uses uintEncodingAllZero to avoid
+ // encoding N offsets. Each of these empty columns is encoded in just 1 byte of
+ // column data.
package colblk

import (
@@ -357,28 +357,10 @@ func (r *BlockReader) PrefixBytes(col int) PrefixBytes {
return DecodeColumn(r, col, int(r.header.Rows), DataTypePrefixBytes, DecodePrefixBytes)
}

- // Uint8s retrieves the col'th column as a column of uint8s. The column must be
- // of type DataTypeUint8.
- func (r *BlockReader) Uint8s(col int) UnsafeUint8s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint8, DecodeUnsafeIntegerSlice[uint8])
- }
-
- // Uint16s retrieves the col'th column as a column of uint8s. The column must be
- // of type DataTypeUint16.
- func (r *BlockReader) Uint16s(col int) UnsafeUint16s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint16, DecodeUnsafeIntegerSlice[uint16])
- }
-
- // Uint32s retrieves the col'th column as a column of uint32s. The column must be
- // of type DataTypeUint32.
- func (r *BlockReader) Uint32s(col int) UnsafeUint32s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint32, DecodeUnsafeIntegerSlice[uint32])
- }
-
- // Uint64s retrieves the col'th column as a column of uint64s. The column must be
- // of type DataTypeUint64.
- func (r *BlockReader) Uint64s(col int) UnsafeUint64s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint64, DecodeUnsafeIntegerSlice[uint64])
+ // Uints retrieves the col'th column as a column of uints. The column must be
+ // of type DataTypeUint.
+ func (r *BlockReader) Uints(col int) UnsafeUints {
+ return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint, DecodeUnsafeUints)
}

func (r *BlockReader) pageStart(col int) uint32 {
@@ -387,7 +369,7 @@ func (r *BlockReader) pageStart(col int) uint32 {
return uint32(len(r.data) - 1)
}
return binary.LittleEndian.Uint32(
- unsafe.Slice((*byte)(unsafe.Pointer(r.pointer(r.customHeaderSize+uint32(blockHeaderBaseSize+columnHeaderSize*col+1)))), 4))
+ unsafe.Slice((*byte)(r.pointer(r.customHeaderSize+uint32(blockHeaderBaseSize+columnHeaderSize*col+1))), 4))
}

func (r *BlockReader) pointer(offset uint32) unsafe.Pointer {
@@ -443,8 +425,8 @@ func (r *BlockReader) columnToBinFormatter(f *binfmt.Formatter, col, rows int) {
switch dataType {
case DataTypeBool:
bitmapToBinFormatter(f, rows)
- case DataTypeUint8, DataTypeUint16, DataTypeUint32, DataTypeUint64:
- uintsToBinFormatter(f, rows, dataType, nil)
+ case DataTypeUint:
+ uintsToBinFormatter(f, rows, nil)
case DataTypePrefixBytes:
prefixBytesToBinFormatter(f, rows, nil)
case DataTypeBytes:
sstable/colblk/block_test.go: 173 changes (72 additions & 101 deletions)
@@ -7,6 +7,7 @@ package colblk
import (
"bytes"
"fmt"
"math"
"reflect"
"slices"
"strconv"
@@ -18,9 +19,58 @@ import (
"golang.org/x/exp/rand"
)

- type ColumnSpec struct {
+ type testColumnSpec struct {
DataType
- BundleSize int // Only used for DataTypePrefixBytes
+ IntRange intRange // Only used for DataTypeUint
+ BundleSize int // Only used for DataTypePrefixBytes
}

+ type intRange struct {
+ Min, Max uint64
+ ExpectedEncoding UintEncoding
+ }
+
+ func (ir intRange) Rand(rng *rand.Rand) uint64 {
+ v := rng.Uint64()
+ if ir.Min == 0 && ir.Max == math.MaxUint64 {
+ return v
+ }
+ return ir.Min + v%(ir.Max-ir.Min+1)
+ }
+
+ var interestingIntRanges = []intRange{
+ // zero
+ {Min: 0, Max: 0, ExpectedEncoding: makeUintEncoding(0, false)},
+ // const
+ {Min: 1, Max: 1, ExpectedEncoding: makeUintEncoding(0, true)},
+ {Min: math.MaxUint32, Max: math.MaxUint32, ExpectedEncoding: makeUintEncoding(0, true)},
+ {Min: math.MaxUint64, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(0, true)},
+ // 1b
+ {Min: 10, Max: 200, ExpectedEncoding: makeUintEncoding(1, false)},
+ {Min: 0, Max: math.MaxUint8, ExpectedEncoding: makeUintEncoding(1, false)},
+ // 1b,delta
+ {Min: 100, Max: 300, ExpectedEncoding: makeUintEncoding(1, true)},
+ {Min: math.MaxUint32 + 100, Max: math.MaxUint32 + 300, ExpectedEncoding: makeUintEncoding(1, true)},
+ {Min: math.MaxUint64 - 1, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(1, true)},
+ // 2b
+ {Min: 10, Max: 20_000, ExpectedEncoding: makeUintEncoding(2, false)},
+ {Min: 0, Max: math.MaxUint8 + 1, ExpectedEncoding: makeUintEncoding(2, false)},
+ {Min: 0, Max: math.MaxUint16, ExpectedEncoding: makeUintEncoding(2, false)},
+ // 2b,delta
+ {Min: 20_000, Max: 80_000, ExpectedEncoding: makeUintEncoding(2, true)},
+ {Min: math.MaxUint32, Max: math.MaxUint32 + 50_000, ExpectedEncoding: makeUintEncoding(2, true)},
+ // 4b
+ {Min: 10, Max: 20_000_000, ExpectedEncoding: makeUintEncoding(4, false)},
+ {Min: 0, Max: math.MaxUint16 + 1, ExpectedEncoding: makeUintEncoding(4, false)},
+ {Min: 0, Max: math.MaxUint32, ExpectedEncoding: makeUintEncoding(4, false)},
+ // 4b,delta
+ {Min: 100_000, Max: math.MaxUint32 + 10, ExpectedEncoding: makeUintEncoding(4, true)},
+ {Min: math.MaxUint32, Max: math.MaxUint32 + 20_000_000, ExpectedEncoding: makeUintEncoding(4, true)},
+ // 8b
+ {Min: 10, Max: math.MaxUint32 + 100, ExpectedEncoding: makeUintEncoding(8, false)},
+ {Min: 0, Max: math.MaxUint32 + 1, ExpectedEncoding: makeUintEncoding(8, false)},
+ {Min: 0, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(8, false)},
+ {Min: math.MaxUint64 - math.MaxUint32 - 1, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(8, false)},
+ }

func TestBlockWriter(t *testing.T) {
@@ -48,20 +98,8 @@ func TestBlockWriter(t *testing.T) {
switch colDataTypes[i] {
case DataTypeBool:
colWriters[i] = &BitmapBuilder{}
- case DataTypeUint8:
- b := &UintBuilder[uint8]{}
- b.Init()
- colWriters[i] = b
- case DataTypeUint16:
- b := &UintBuilder[uint16]{}
- b.Init()
- colWriters[i] = b
- case DataTypeUint32:
- b := &UintBuilder[uint32]{}
- b.Init()
- colWriters[i] = b
- case DataTypeUint64:
- b := &UintBuilder[uint64]{}
+ case DataTypeUint:
+ b := &UintBuilder{}
b.Init()
colWriters[i] = b
case DataTypeBytes:
@@ -93,29 +131,8 @@ func TestBlockWriter(t *testing.T) {
panicIfErr(dataType, lineFields[r][c], err)
bb.Set(r, v)
}
- case DataTypeUint8:
- b := colWriters[c].(*UintBuilder[uint8])
- for r := range lineFields {
- v, err := strconv.ParseUint(lineFields[r][c], 10, 8)
- panicIfErr(dataType, lineFields[r][c], err)
- b.Set(r, uint8(v))
- }
- case DataTypeUint16:
- b := colWriters[c].(*UintBuilder[uint16])
- for r := range lineFields {
- v, err := strconv.ParseUint(lineFields[r][c], 10, 16)
- panicIfErr(dataType, lineFields[r][c], err)
- b.Set(r, uint16(v))
- }
- case DataTypeUint32:
- b := colWriters[c].(*UintBuilder[uint32])
- for r := range lineFields {
- v, err := strconv.ParseUint(lineFields[r][c], 10, 32)
- panicIfErr(dataType, lineFields[r][c], err)
- b.Set(r, uint32(v))
- }
- case DataTypeUint64:
- b := colWriters[c].(*UintBuilder[uint64])
+ case DataTypeUint:
+ b := colWriters[c].(*UintBuilder)
for r := range lineFields {
v, err := strconv.ParseUint(lineFields[r][c], 10, 64)
panicIfErr(dataType, lineFields[r][c], err)
@@ -156,7 +173,7 @@ func dataTypeFromName(name string) DataType {
// returns the serialized raw block and a []interface{} slice containing the
// generated data. The type of each element of the slice is dependent on the
// corresponding column's type.
- func randBlock(rng *rand.Rand, rows int, schema []ColumnSpec) ([]byte, []interface{}) {
+ func randBlock(rng *rand.Rand, rows int, schema []testColumnSpec) ([]byte, []interface{}) {
data := make([]interface{}, len(schema))
for col := range data {
switch schema[col].DataType {
@@ -166,28 +183,10 @@ func randBlock(rng *rand.Rand, rows int, schema []ColumnSpec) ([]byte, []interfa
v[row] = (rng.Int31() % 2) == 0
}
data[col] = v
- case DataTypeUint8:
- v := make([]uint8, rows)
- for row := 0; row < rows; row++ {
- v[row] = uint8(rng.Uint32())
- }
- data[col] = v
- case DataTypeUint16:
- v := make([]uint16, rows)
- for row := 0; row < rows; row++ {
- v[row] = uint16(rng.Uint32())
- }
- data[col] = v
- case DataTypeUint32:
- v := make([]uint32, rows)
- for row := 0; row < rows; row++ {
- v[row] = rng.Uint32()
- }
- data[col] = v
- case DataTypeUint64:
+ case DataTypeUint:
v := make([]uint64, rows)
for row := 0; row < rows; row++ {
- v[row] = rng.Uint64()
+ v[row] = schema[col].IntRange.Rand(rng)
}
data[col] = v
case DataTypeBytes:
@@ -213,7 +212,7 @@ func randBlock(rng *rand.Rand, rows int, schema []ColumnSpec) ([]byte, []interfa
return buf, data
}

- func buildBlock(schema []ColumnSpec, rows int, data []interface{}) []byte {
+ func buildBlock(schema []testColumnSpec, rows int, data []interface{}) []byte {
cw := make([]ColumnWriter, len(schema))
for col := range schema {
switch schema[col].DataType {
@@ -224,29 +223,8 @@ func buildBlock(schema []ColumnSpec, rows int, data []interface{}) []byte {
bb.Set(row, v)
}
cw[col] = &bb
- case DataTypeUint8:
- var b UintBuilder[uint8]
- b.Init()
- for row, v := range data[col].([]uint8) {
- b.Set(row, v)
- }
- cw[col] = &b
- case DataTypeUint16:
- var b UintBuilder[uint16]
- b.Init()
- for row, v := range data[col].([]uint16) {
- b.Set(row, v)
- }
- cw[col] = &b
- case DataTypeUint32:
- var b UintBuilder[uint32]
- b.Init()
- for row, v := range data[col].([]uint32) {
- b.Set(row, v)
- }
- cw[col] = &b
- case DataTypeUint64:
- var b UintBuilder[uint64]
+ case DataTypeUint:
+ var b UintBuilder
b.Init()
for row, v := range data[col].([]uint64) {
b.Set(row, v)
@@ -276,7 +254,7 @@ func buildBlock(schema []ColumnSpec, rows int, data []interface{}) []byte {
return FinishBlock(rows, cw)
}

- func testRandomBlock(t *testing.T, rng *rand.Rand, rows int, schema []ColumnSpec) {
+ func testRandomBlock(t *testing.T, rng *rand.Rand, rows int, schema []testColumnSpec) {
var sb strings.Builder
for i := range schema {
if i > 0 {
@@ -306,14 +284,8 @@ func testRandomBlock(t *testing.T, rng *rand.Rand, rows int, schema []ColumnSpec
switch spec.DataType {
case DataTypeBool:
got = Clone(r.Bitmap(col), rows)
- case DataTypeUint8:
- got = Clone(r.Uint8s(col), rows)
- case DataTypeUint16:
- got = Clone(r.Uint16s(col), rows)
- case DataTypeUint32:
- got = Clone(r.Uint32s(col), rows)
- case DataTypeUint64:
- got = Clone(r.Uint64s(col), rows)
+ case DataTypeUint:
+ got = Clone(r.Uints(col), rows)
case DataTypeBytes:
got = Clone(r.RawBytes(col), rows)
case DataTypePrefixBytes:
@@ -334,16 +306,15 @@ func TestBlockWriterRandomized(t *testing.T) {
randInt := func(lo, hi int) int {
return lo + rng.Intn(hi-lo)
}
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeBool}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint8}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint16}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint32}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint64}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeBytes}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypePrefixBytes, BundleSize: 1 << randInt(0, 6)}})
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypeBool}})
+ for _, r := range interestingIntRanges {
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypeUint, IntRange: r}})
+ }
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypeBytes}})
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypePrefixBytes, BundleSize: 1 << randInt(0, 6)}})

for i := 0; i < 100; i++ {
- schema := make([]ColumnSpec, 2+rng.Intn(8))
+ schema := make([]testColumnSpec, 2+rng.Intn(8))
for j := range schema {
schema[j].DataType = DataType(randInt(1, int(dataTypesCount)))
if schema[j].DataType == DataTypePrefixBytes {
(Diffs for the remaining 19 changed files are not shown.)
