colblk: rework Uint encodings
Currently we have four uint column types; each one can be encoded with
either 0, 1, 2, 4, or 8 bytes per value. The encoding contains a delta
that has the same width as the column type. This leads to a lot of
possible combinations of data formats (each one requiring at least a
bit of specific code, even if generated through generics).

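For illustration only, here is a minimal Go sketch (not the actual
colblk code) of how a width-plus-delta scheme can pick an encoding:
find the minimum and maximum value, use the minimum as the delta, and
let the spread decide whether 0, 1, 2, 4, or 8 bytes per value are
needed. The helper name chooseUintEncoding is made up for this example.
```go
// Hypothetical sketch: pick the smallest supported width that can
// represent every value once a constant delta (the minimum) is
// subtracted out. Not the colblk implementation.
package main

import "fmt"

// chooseUintEncoding returns the per-value byte width and the delta
// under which all values fit in that width. Width 0 means every value
// equals the delta, so only the delta itself needs to be stored.
func chooseUintEncoding(values []uint64) (width int, delta uint64) {
	if len(values) == 0 {
		return 0, 0
	}
	minV, maxV := values[0], values[0]
	for _, v := range values[1:] {
		if v < minV {
			minV = v
		}
		if v > maxV {
			maxV = v
		}
	}
	switch spread := maxV - minV; {
	case spread == 0:
		return 0, minV
	case spread < 1<<8:
		return 1, minV
	case spread < 1<<16:
		return 2, minV
	case spread < 1<<32:
		return 4, minV
	default:
		return 8, minV
	}
}

func main() {
	// Raw values need 8 bytes each, but only a 2-byte spread remains
	// once the minimum is used as the delta.
	vals := []uint64{5_000_000_100, 5_000_000_200, 5_000_050_000}
	w, d := chooseUintEncoding(vals)
	fmt.Printf("width=%d bytes, delta=%d\n", w, d) // width=2 bytes, delta=5000000100
}
```
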
Given that the encoding supports smaller widths transparently, there
is no real advantage to declaring column types for smaller integers.
There is also a disadvantage: it would prevent us from using larger
integers in the future without changing the column type.

This change replaces the four uint column types with a single type.
We also add UnsafeOffsets, a specialization of the decoding type that
is optimized for offset columns.

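As a rough illustration of why a dedicated offsets decoder can be
cheaper than the general one, here is a hypothetical Go sketch; it is
not pebble's UnsafeOffsets. It assumes offset columns never carry a
delta and fit in at most 4 bytes per value, so element access is a
small width switch with no 64-bit delta addition.
```go
// Hypothetical sketch of an offsets-specialized decoder; not pebble's
// UnsafeOffsets. Offsets are assumed to be stored without a delta and
// with a width of at most 4 bytes.
package main

import (
	"encoding/binary"
	"fmt"
)

type offsetsDecoder struct {
	width int    // assumed 0, 1, 2, or 4 bytes per offset
	data  []byte // width*n bytes of little-endian values
}

// At returns the i'th offset. A general uint decoder would also add a
// 64-bit delta and handle an 8-byte width; this specialization skips both.
func (d offsetsDecoder) At(i int) uint32 {
	switch d.width {
	case 0:
		return 0
	case 1:
		return uint32(d.data[i])
	case 2:
		return uint32(binary.LittleEndian.Uint16(d.data[2*i:]))
	default:
		return binary.LittleEndian.Uint32(d.data[4*i:])
	}
}

func main() {
	// Three offsets (5, 300, 70000) encoded with a 4-byte width.
	buf := make([]byte, 12)
	for i, v := range []uint32{5, 300, 70000} {
		binary.LittleEndian.PutUint32(buf[4*i:], v)
	}
	d := offsetsDecoder{width: 4, data: buf}
	fmt.Println(d.At(0), d.At(1), d.At(2)) // 5 300 70000
}
```
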
Benchmarks:
```
name                     time/op
UnsafeUints/const-10     0.59ns ± 0%
UnsafeUints/1b-10        0.70ns ± 0%
UnsafeUints/1b,delta-10  0.70ns ± 1%
UnsafeUints/2b-10        0.74ns ± 0%
UnsafeUints/2b,delta-10  0.74ns ± 0%
UnsafeUints/4b-10        0.94ns ± 0%
UnsafeUints/4b,delta-10  0.94ns ± 0%
UnsafeUints/8b-10        1.25ns ± 0%
UnsafeOffsets/2b-10      0.51ns ± 0%
UnsafeOffsets/4b-10      0.94ns ± 0%
```
RaduBerinde committed Aug 15, 2024
1 parent fd3044b commit 1e5fd68
Showing 22 changed files with 2,362 additions and 2,864 deletions.
internal/binfmt/binfmt.go: 2 changes (1 addition & 1 deletion)
@@ -18,7 +18,7 @@ import (

// New constructs a new binary formatter.
func New(data []byte) *Formatter {
- offsetWidth := strconv.Itoa(int(math.Log10(float64(len(data)-1))) + 1)
+ offsetWidth := strconv.Itoa(max(int(math.Log10(float64(len(data)-1)))+1, 1))
return &Formatter{
data: data,
lineWidth: 40,
sstable/colblk/block.go: 40 changes (11 additions & 29 deletions)
@@ -126,10 +126,10 @@
// [s,e) have the corresponding bounds.
//
// Both range deletions and range keys are encoded with the same schema. Range
- // deletion keyspan.Keys never contain suffixes or values. When encoded, the
- // RawBytes encoding uses the UintDeltaEncodingConstant encoding to avoid
- // materializing encoding N offsets. Each of these empty columns is encoded in
- // just ~5 bytes of column data.
+ // deletion keyspan.Keys never contain suffixes or values. When one of these
+ // columns is encoded, the RawBytes encoding uses uintEncodingAllZero to avoid
+ // encoding N offsets. Each of these empty columns is encoded in just 1 byte of
+ // column data.
package colblk

import (
@@ -357,28 +357,10 @@ func (r *BlockReader) PrefixBytes(col int) PrefixBytes {
return DecodeColumn(r, col, int(r.header.Rows), DataTypePrefixBytes, DecodePrefixBytes)
}

- // Uint8s retrieves the col'th column as a column of uint8s. The column must be
- // of type DataTypeUint8.
- func (r *BlockReader) Uint8s(col int) UnsafeUint8s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint8, DecodeUnsafeIntegerSlice[uint8])
- }
-
- // Uint16s retrieves the col'th column as a column of uint8s. The column must be
- // of type DataTypeUint16.
- func (r *BlockReader) Uint16s(col int) UnsafeUint16s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint16, DecodeUnsafeIntegerSlice[uint16])
- }
-
- // Uint32s retrieves the col'th column as a column of uint32s. The column must be
- // of type DataTypeUint32.
- func (r *BlockReader) Uint32s(col int) UnsafeUint32s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint32, DecodeUnsafeIntegerSlice[uint32])
- }
-
- // Uint64s retrieves the col'th column as a column of uint64s. The column must be
- // of type DataTypeUint64.
- func (r *BlockReader) Uint64s(col int) UnsafeUint64s {
- return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint64, DecodeUnsafeIntegerSlice[uint64])
+ // Uints retrieves the col'th column as a column of uints. The column must be
+ // of type DataTypeUint.
+ func (r *BlockReader) Uints(col int) UnsafeUints {
+ return DecodeColumn(r, col, int(r.header.Rows), DataTypeUint, DecodeUnsafeUints)
}

func (r *BlockReader) pageStart(col int) uint32 {
@@ -387,7 +369,7 @@ func (r *BlockReader) pageStart(col int) uint32 {
return uint32(len(r.data) - 1)
}
return binary.LittleEndian.Uint32(
- unsafe.Slice((*byte)(unsafe.Pointer(r.pointer(r.customHeaderSize+uint32(blockHeaderBaseSize+columnHeaderSize*col+1)))), 4))
+ unsafe.Slice((*byte)(r.pointer(r.customHeaderSize+uint32(blockHeaderBaseSize+columnHeaderSize*col+1))), 4))
}

func (r *BlockReader) pointer(offset uint32) unsafe.Pointer {
@@ -443,8 +425,8 @@ func (r *BlockReader) columnToBinFormatter(f *binfmt.Formatter, col, rows int) {
switch dataType {
case DataTypeBool:
bitmapToBinFormatter(f, rows)
- case DataTypeUint8, DataTypeUint16, DataTypeUint32, DataTypeUint64:
- uintsToBinFormatter(f, rows, dataType, nil)
+ case DataTypeUint:
+ uintsToBinFormatter(f, rows, nil)
case DataTypePrefixBytes:
prefixBytesToBinFormatter(f, rows, nil)
case DataTypeBytes:
sstable/colblk/block_test.go: 173 changes (72 additions & 101 deletions)
@@ -7,6 +7,7 @@ package colblk
import (
"bytes"
"fmt"
"math"
"reflect"
"slices"
"strconv"
@@ -18,9 +19,58 @@ import (
"golang.org/x/exp/rand"
)

- type ColumnSpec struct {
+ type testColumnSpec struct {
DataType
- BundleSize int // Only used for DataTypePrefixBytes
+ IntRange intRange // Only used for DataTypeUint
+ BundleSize int // Only used for DataTypePrefixBytes
}

+ type intRange struct {
+ Min, Max uint64
+ ExpectedEncoding UintEncoding
+ }
+
+ func (ir intRange) Rand(rng *rand.Rand) uint64 {
+ v := rng.Uint64()
+ if ir.Min == 0 && ir.Max == math.MaxUint64 {
+ return v
+ }
+ return ir.Min + v%(ir.Max-ir.Min+1)
+ }
+
+ var interestingIntRanges = []intRange{
+ // zero
+ {Min: 0, Max: 0, ExpectedEncoding: makeUintEncoding(0, false)},
+ // const
+ {Min: 1, Max: 1, ExpectedEncoding: makeUintEncoding(0, true)},
+ {Min: math.MaxUint32, Max: math.MaxUint32, ExpectedEncoding: makeUintEncoding(0, true)},
+ {Min: math.MaxUint64, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(0, true)},
+ // 1b
+ {Min: 10, Max: 200, ExpectedEncoding: makeUintEncoding(1, false)},
+ {Min: 0, Max: math.MaxUint8, ExpectedEncoding: makeUintEncoding(1, false)},
+ // 1b,delta
+ {Min: 100, Max: 300, ExpectedEncoding: makeUintEncoding(1, true)},
+ {Min: math.MaxUint32 + 100, Max: math.MaxUint32 + 300, ExpectedEncoding: makeUintEncoding(1, true)},
+ {Min: math.MaxUint64 - 1, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(1, true)},
+ // 2b
+ {Min: 10, Max: 20_000, ExpectedEncoding: makeUintEncoding(2, false)},
+ {Min: 0, Max: math.MaxUint8 + 1, ExpectedEncoding: makeUintEncoding(2, false)},
+ {Min: 0, Max: math.MaxUint16, ExpectedEncoding: makeUintEncoding(2, false)},
+ // 2b,delta
+ {Min: 20_000, Max: 80_000, ExpectedEncoding: makeUintEncoding(2, true)},
+ {Min: math.MaxUint32, Max: math.MaxUint32 + 50_000, ExpectedEncoding: makeUintEncoding(2, true)},
+ // 4b
+ {Min: 10, Max: 20_000_000, ExpectedEncoding: makeUintEncoding(4, false)},
+ {Min: 0, Max: math.MaxUint16 + 1, ExpectedEncoding: makeUintEncoding(4, false)},
+ {Min: 0, Max: math.MaxUint32, ExpectedEncoding: makeUintEncoding(4, false)},
+ // 4b,delta
+ {Min: 100_000, Max: math.MaxUint32 + 10, ExpectedEncoding: makeUintEncoding(4, true)},
+ {Min: math.MaxUint32, Max: math.MaxUint32 + 20_000_000, ExpectedEncoding: makeUintEncoding(4, true)},
+ // 8b
+ {Min: 10, Max: math.MaxUint32 + 100, ExpectedEncoding: makeUintEncoding(8, false)},
+ {Min: 0, Max: math.MaxUint32 + 1, ExpectedEncoding: makeUintEncoding(8, false)},
+ {Min: 0, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(8, false)},
+ {Min: math.MaxUint64 - math.MaxUint32 - 1, Max: math.MaxUint64, ExpectedEncoding: makeUintEncoding(8, false)},
+ }

func TestBlockWriter(t *testing.T) {
@@ -48,20 +98,8 @@ func TestBlockWriter(t *testing.T) {
switch colDataTypes[i] {
case DataTypeBool:
colWriters[i] = &BitmapBuilder{}
- case DataTypeUint8:
- b := &UintBuilder[uint8]{}
- b.Init()
- colWriters[i] = b
- case DataTypeUint16:
- b := &UintBuilder[uint16]{}
- b.Init()
- colWriters[i] = b
- case DataTypeUint32:
- b := &UintBuilder[uint32]{}
- b.Init()
- colWriters[i] = b
- case DataTypeUint64:
- b := &UintBuilder[uint64]{}
+ case DataTypeUint:
+ b := &UintBuilder{}
b.Init()
colWriters[i] = b
case DataTypeBytes:
@@ -93,29 +131,8 @@ func TestBlockWriter(t *testing.T) {
panicIfErr(dataType, lineFields[r][c], err)
bb.Set(r, v)
}
- case DataTypeUint8:
- b := colWriters[c].(*UintBuilder[uint8])
- for r := range lineFields {
- v, err := strconv.ParseUint(lineFields[r][c], 10, 8)
- panicIfErr(dataType, lineFields[r][c], err)
- b.Set(r, uint8(v))
- }
- case DataTypeUint16:
- b := colWriters[c].(*UintBuilder[uint16])
- for r := range lineFields {
- v, err := strconv.ParseUint(lineFields[r][c], 10, 16)
- panicIfErr(dataType, lineFields[r][c], err)
- b.Set(r, uint16(v))
- }
- case DataTypeUint32:
- b := colWriters[c].(*UintBuilder[uint32])
- for r := range lineFields {
- v, err := strconv.ParseUint(lineFields[r][c], 10, 32)
- panicIfErr(dataType, lineFields[r][c], err)
- b.Set(r, uint32(v))
- }
- case DataTypeUint64:
- b := colWriters[c].(*UintBuilder[uint64])
+ case DataTypeUint:
+ b := colWriters[c].(*UintBuilder)
for r := range lineFields {
v, err := strconv.ParseUint(lineFields[r][c], 10, 64)
panicIfErr(dataType, lineFields[r][c], err)
@@ -156,7 +173,7 @@ func dataTypeFromName(name string) DataType {
// returns the serialized raw block and a []interface{} slice containing the
// generated data. The type of each element of the slice is dependent on the
// corresponding column's type.
- func randBlock(rng *rand.Rand, rows int, schema []ColumnSpec) ([]byte, []interface{}) {
+ func randBlock(rng *rand.Rand, rows int, schema []testColumnSpec) ([]byte, []interface{}) {
data := make([]interface{}, len(schema))
for col := range data {
switch schema[col].DataType {
@@ -166,28 +183,10 @@ func randBlock(rng *rand.Rand, rows int, schema []ColumnSpec) ([]byte, []interfa
v[row] = (rng.Int31() % 2) == 0
}
data[col] = v
- case DataTypeUint8:
- v := make([]uint8, rows)
- for row := 0; row < rows; row++ {
- v[row] = uint8(rng.Uint32())
- }
- data[col] = v
- case DataTypeUint16:
- v := make([]uint16, rows)
- for row := 0; row < rows; row++ {
- v[row] = uint16(rng.Uint32())
- }
- data[col] = v
- case DataTypeUint32:
- v := make([]uint32, rows)
- for row := 0; row < rows; row++ {
- v[row] = rng.Uint32()
- }
- data[col] = v
- case DataTypeUint64:
+ case DataTypeUint:
v := make([]uint64, rows)
for row := 0; row < rows; row++ {
- v[row] = rng.Uint64()
+ v[row] = schema[col].IntRange.Rand(rng)
}
data[col] = v
case DataTypeBytes:
@@ -213,7 +212,7 @@ func randBlock(rng *rand.Rand, rows int, schema []ColumnSpec) ([]byte, []interfa
return buf, data
}

- func buildBlock(schema []ColumnSpec, rows int, data []interface{}) []byte {
+ func buildBlock(schema []testColumnSpec, rows int, data []interface{}) []byte {
cw := make([]ColumnWriter, len(schema))
for col := range schema {
switch schema[col].DataType {
@@ -224,29 +223,8 @@ func buildBlock(schema []ColumnSpec, rows int, data []interface{}) []byte {
bb.Set(row, v)
}
cw[col] = &bb
- case DataTypeUint8:
- var b UintBuilder[uint8]
- b.Init()
- for row, v := range data[col].([]uint8) {
- b.Set(row, v)
- }
- cw[col] = &b
- case DataTypeUint16:
- var b UintBuilder[uint16]
- b.Init()
- for row, v := range data[col].([]uint16) {
- b.Set(row, v)
- }
- cw[col] = &b
- case DataTypeUint32:
- var b UintBuilder[uint32]
- b.Init()
- for row, v := range data[col].([]uint32) {
- b.Set(row, v)
- }
- cw[col] = &b
- case DataTypeUint64:
- var b UintBuilder[uint64]
+ case DataTypeUint:
+ var b UintBuilder
b.Init()
for row, v := range data[col].([]uint64) {
b.Set(row, v)
@@ -276,7 +254,7 @@ func buildBlock(schema []ColumnSpec, rows int, data []interface{}) []byte {
return FinishBlock(rows, cw)
}

- func testRandomBlock(t *testing.T, rng *rand.Rand, rows int, schema []ColumnSpec) {
+ func testRandomBlock(t *testing.T, rng *rand.Rand, rows int, schema []testColumnSpec) {
var sb strings.Builder
for i := range schema {
if i > 0 {
@@ -306,14 +284,8 @@ func testRandomBlock(t *testing.T, rng *rand.Rand, rows int, schema []ColumnSpec
switch spec.DataType {
case DataTypeBool:
got = Clone(r.Bitmap(col), rows)
- case DataTypeUint8:
- got = Clone(r.Uint8s(col), rows)
- case DataTypeUint16:
- got = Clone(r.Uint16s(col), rows)
- case DataTypeUint32:
- got = Clone(r.Uint32s(col), rows)
- case DataTypeUint64:
- got = Clone(r.Uint64s(col), rows)
+ case DataTypeUint:
+ got = Clone(r.Uints(col), rows)
case DataTypeBytes:
got = Clone(r.RawBytes(col), rows)
case DataTypePrefixBytes:
@@ -334,16 +306,15 @@ func TestBlockWriterRandomized(t *testing.T) {
randInt := func(lo, hi int) int {
return lo + rng.Intn(hi-lo)
}
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeBool}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint8}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint16}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint32}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeUint64}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypeBytes}})
- testRandomBlock(t, rng, randInt(1, 100), []ColumnSpec{{DataType: DataTypePrefixBytes, BundleSize: 1 << randInt(0, 6)}})
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypeBool}})
+ for _, r := range interestingIntRanges {
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypeUint, IntRange: r}})
+ }
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypeBytes}})
+ testRandomBlock(t, rng, randInt(1, 100), []testColumnSpec{{DataType: DataTypePrefixBytes, BundleSize: 1 << randInt(0, 6)}})

for i := 0; i < 100; i++ {
- schema := make([]ColumnSpec, 2+rng.Intn(8))
+ schema := make([]testColumnSpec, 2+rng.Intn(8))
for j := range schema {
schema[j].DataType = DataType(randInt(1, int(dataTypesCount)))
if schema[j].DataType == DataTypePrefixBytes {
(Diffs for the remaining 19 changed files are not shown.)
