Skip to content

Commit

Permalink
storage: integrate columnar blocks, disabled by default
Browse files Browse the repository at this point in the history
This commit defines a colblk.KeySchema for use with CockroachDB keys, bumps the
format major version in 24.3 to FormatColumnarBlocks and gates use of columnar
blocks behind a new `storage.columnar_blocks.enabled` cluster setting.

This commit is a port of code that's been iterated on within Pebble:
https://github.com/cockroachdb/pebble/blob/9522f39482c31a6bc956b438b92d94eb1b74ec79/internal/crdbtest/crdbtest.go

Epic: none
Release note: none
  • Loading branch information
jbowens committed Oct 16, 2024
1 parent 7f2a743 commit d56ed5c
Show file tree
Hide file tree
Showing 9 changed files with 1,090 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings-for-tenants.txt
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ sql.ttl.job.enabled boolean true whether the TTL job is enabled application
sql.txn.read_committed_isolation.enabled boolean true set to true to allow transactions to use the READ COMMITTED isolation level if specified by BEGIN/SET commands application
sql.txn.repeatable_read_isolation.enabled (alias: sql.txn.snapshot_isolation.enabled) boolean false set to true to allow transactions to use the REPEATABLE READ isolation level if specified by BEGIN/SET commands application
sql.txn_fingerprint_id_cache.capacity integer 100 the maximum number of txn fingerprint IDs stored application
storage.columnar_blocks.enabled boolean false set to true to enable columnar-blocks to store KVs in a columnar format system-visible
storage.ingestion.value_blocks.enabled boolean true set to true to enable writing of value blocks in ingestion sstables application
storage.max_sync_duration duration 20s maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash system-visible
storage.max_sync_duration.fatal.enabled boolean true if true, fatal the process when a disk operation exceeds storage.max_sync_duration application
Expand Down
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@
<tr><td><div id="setting-sql-txn-read-committed-isolation-enabled" class="anchored"><code>sql.txn.read_committed_isolation.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to allow transactions to use the READ COMMITTED isolation level if specified by BEGIN/SET commands</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-txn-snapshot-isolation-enabled" class="anchored"><code>sql.txn.repeatable_read_isolation.enabled<br />(alias: sql.txn.snapshot_isolation.enabled)</code></div></td><td>boolean</td><td><code>false</code></td><td>set to true to allow transactions to use the REPEATABLE READ isolation level if specified by BEGIN/SET commands</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-sql-txn-fingerprint-id-cache-capacity" class="anchored"><code>sql.txn_fingerprint_id_cache.capacity</code></div></td><td>integer</td><td><code>100</code></td><td>the maximum number of txn fingerprint IDs stored</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-columnar-blocks-enabled" class="anchored"><code>storage.columnar_blocks.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>set to true to enable columnar-blocks to store KVs in a columnar format</td><td>Dedicated/Self-hosted (read-write); Serverless (read-only)</td></tr>
<tr><td><div id="setting-storage-experimental-eventually-file-only-snapshots-enabled" class="anchored"><code>storage.experimental.eventually_file_only_snapshots.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to false to disable eventually-file-only-snapshots (kv.snapshot_receiver.excise.enabled must also be false)</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-ingest-split-enabled" class="anchored"><code>storage.ingest_split.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to false to disable ingest-time splitting that lowers write-amplification</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-ingestion-value-blocks-enabled" class="anchored"><code>storage.ingestion.value_blocks.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to enable writing of value blocks in ingestion sstables</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ require (
github.com/cockroachdb/cmux v0.0.0-20170110192607-30d10be49292
github.com/cockroachdb/cockroach-go/v2 v2.3.7
github.com/cockroachdb/crlfmt v0.0.0-20221214225007-b2fc5c302548
github.com/cockroachdb/crlib v0.0.0-20241015224233-894974b3ad94
github.com/cockroachdb/datadriven v1.0.3-0.20240530155848-7682d40af056
github.com/cockroachdb/errors v1.11.3
github.com/cockroachdb/fifo v0.0.0-20240606204812-0bbfbd93a7ce
Expand Down Expand Up @@ -306,7 +307,6 @@ require (
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/charmbracelet/bubbletea v0.23.1 // indirect
github.com/charmbracelet/lipgloss v0.6.0 // indirect
github.com/cockroachdb/crlib v0.0.0-20241015224233-894974b3ad94 // indirect
github.com/cockroachdb/swiss v0.0.0-20240612210725-f4de07ae6964 // indirect
github.com/danieljoos/wincred v1.1.2 // indirect
github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions pkg/storage/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ go_library(
"pebble.go",
"pebble_batch.go",
"pebble_iterator.go",
"pebble_key_schema.go",
"pebble_logger_and_tracer.go",
"pebble_merge.go",
"pebble_mvcc_scanner.go",
Expand Down Expand Up @@ -86,6 +87,7 @@ go_library(
"//pkg/util/timeutil",
"//pkg/util/tracing",
"//pkg/util/uuid",
"@com_github_cockroachdb_crlib//crbytes",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//oserror",
"@com_github_cockroachdb_fifo//:fifo",
Expand All @@ -100,6 +102,7 @@ go_library(
"@com_github_cockroachdb_pebble//replay",
"@com_github_cockroachdb_pebble//sstable",
"@com_github_cockroachdb_pebble//sstable/block",
"@com_github_cockroachdb_pebble//sstable/colblk",
"@com_github_cockroachdb_pebble//vfs",
"@com_github_cockroachdb_pebble//wal",
"@com_github_cockroachdb_redact//:redact",
Expand Down Expand Up @@ -139,6 +142,7 @@ go_test(
"mvcc_value_test.go",
"open_test.go",
"pebble_iterator_test.go",
"pebble_key_schema_test.go",
"pebble_mvcc_scanner_test.go",
"pebble_test.go",
"read_as_of_iterator_test.go",
Expand Down Expand Up @@ -198,6 +202,8 @@ go_test(
"//pkg/util/timeutil",
"//pkg/util/uint128",
"//pkg/util/uuid",
"@com_github_cockroachdb_crlib//crbytes",
"@com_github_cockroachdb_crlib//crstrings",
"@com_github_cockroachdb_datadriven//:datadriven",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_errors//oserror",
Expand All @@ -206,9 +212,11 @@ go_test(
"@com_github_cockroachdb_pebble//objstorage/objstorageprovider",
"@com_github_cockroachdb_pebble//sstable",
"@com_github_cockroachdb_pebble//sstable/block",
"@com_github_cockroachdb_pebble//sstable/colblk",
"@com_github_cockroachdb_pebble//vfs",
"@com_github_cockroachdb_redact//:redact",
"@com_github_kr_pretty//:pretty",
"@com_github_olekukonko_tablewriter//:tablewriter",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@org_golang_google_protobuf//proto",
Expand Down
43 changes: 40 additions & 3 deletions pkg/storage/pebble.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,15 @@ var IngestSplitEnabled = settings.RegisterBoolSetting(
settings.WithPublic,
)

// columnarBlocksEnabled controls whether columnar-blocks are enabled in Pebble.
// It is registered as the public, system-visible boolean cluster setting
// `storage.columnar_blocks.enabled` and defaults to false, so columnar blocks
// stay off unless the setting is explicitly turned on.
var columnarBlocksEnabled = settings.RegisterBoolSetting(
settings.SystemVisible,
"storage.columnar_blocks.enabled",
"set to true to enable columnar-blocks to store KVs in a columnar format",
false, // TODO(jackson): Metamorphicize this.
settings.WithPublic,
)

// IngestAsFlushable controls whether ingested sstables that overlap the
// memtable may be lazily ingested: written to the WAL and enqueued in the list
// of flushables (eg, memtables, large batches and now lazily-ingested
Expand Down Expand Up @@ -407,6 +416,30 @@ func EngineSuffixCompare(a, b []byte) int {
return bytes.Compare(b[:len(b)-1], a[:len(a)-1])
}

// EnginePointSuffixCompare orders the suffixes of Cockroach point keys. A
// suffix consists of the version followed by a trailing version-length byte,
// where the version is either an MVCC timestamp or a lock key.
//
// Unlike EngineSuffixCompare, this function normalizes both suffixes before
// comparing them. A single function with these semantics would be preferable,
// but for historical reasons range key suffix comparisons must not perform
// normalization, so the two functions coexist.
//
// See https://github.com/cockroachdb/cockroach/issues/130533
func EnginePointSuffixCompare(a, b []byte) int {
	// NB: a and b are already suffixes; they are handed to the normalization
	// helper as-is and no key splitting or decoding happens here.
	if len(a) != 0 && len(b) != 0 {
		// The operands are deliberately swapped: the suffix whose normalized
		// bytes are larger compares as the smaller one.
		normB := normalizeEngineSuffixForCompare(b)
		normA := normalizeEngineSuffixForCompare(a)
		return bytes.Compare(normB, normA)
	}
	// At least one suffix is empty. Empty suffixes sort before non-empty
	// suffixes, and two empty suffixes compare equal.
	return cmp.Compare(len(a), len(b))
}

func checkEngineKey(k []byte) {
if len(k) == 0 {
panic(errors.AssertionFailedf("empty key"))
Expand Down Expand Up @@ -792,8 +825,9 @@ const MinimumSupportedFormatVersion = pebble.FormatSyntheticPrefixSuffix
// DefaultPebbleOptions returns the default pebble options.
func DefaultPebbleOptions() *pebble.Options {
opts := &pebble.Options{
Comparer: EngineComparer,
FS: vfs.Default,
Comparer: EngineComparer,
FS: vfs.Default,
KeySchema: keySchema,
// A value of 2 triggers a compaction when there is 1 sub-level.
L0CompactionThreshold: 2,
L0StopWritesThreshold: 1000,
Expand Down Expand Up @@ -1190,6 +1224,9 @@ func newPebble(ctx context.Context, cfg engineConfig) (p *Pebble, err error) {
cfg.opts.Experimental.IngestSplit = func() bool {
return IngestSplitEnabled.Get(&cfg.settings.SV)
}
cfg.opts.Experimental.EnableColumnarBlocks = func() bool {
return columnarBlocksEnabled.Get(&cfg.settings.SV)
}

auxDir := cfg.opts.FS.PathJoin(cfg.env.Dir, base.AuxiliaryDir)
if !cfg.env.IsReadOnly() {
Expand Down Expand Up @@ -2512,7 +2549,7 @@ func (p *Pebble) CreateCheckpoint(dir string, spans []roachpb.Span) error {
// version associated with it, since they did so during the fence version.
var pebbleFormatVersionMap = map[clusterversion.Key]pebble.FormatMajorVersion{
clusterversion.V24_1: pebble.FormatSyntheticPrefixSuffix,
clusterversion.V24_3: pebble.FormatFlushableIngestExcises,
clusterversion.V24_3: pebble.FormatColumnarBlocks,
}

// pebbleFormatVersionKeys contains the keys in the map above, in descending order.
Expand Down
Loading

0 comments on commit d56ed5c

Please sign in to comment.