compact: add point tombstone density compaction heuristic
This change adds a heuristic to compact point tombstones based on
their density across the LSM. We add a new table property called
`TombstoneDenseBlocksRatio` and a corresponding field in `TableStats` that
tracks the ratio of data blocks in each table which are considered
tombstone-dense. This value is calculated on the fly while tables are being
written, so no extra I/O is required later on to compute it.

A data block is considered tombstone-dense if it fulfills either of the
following criteria:
1. The block contains at least `options.Experimental.NumDeletionsThreshold`
point tombstones. The default value is `100`.
2. The ratio of the uncompressed size of point tombstones to the uncompressed
size of the block is at least `options.Experimental.DeletionSizeRatioThreshold`.
For example, with the default value of `0.5`, a 4KB data block is considered
tombstone-dense if it contains at least 2KB of point tombstones (see the
sketch below).
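
As a rough illustration of these two criteria, here is a minimal sketch of the
per-block check; the function and parameter names (`isTombstoneDense`,
`tombstoneSize`, `blockSize`) are hypothetical stand-ins for the counters the
sstable writer maintains while a block is built, not Pebble's actual internals:

```go
package main

import "fmt"

// isTombstoneDense reports whether a single data block should be counted as
// tombstone-dense under the two criteria described above.
func isTombstoneDense(
	numPointTombstones int, // point tombstones accumulated in the block
	tombstoneSize, blockSize uint64, // uncompressed sizes, in bytes
	numDeletionsThreshold int, // options.Experimental.NumDeletionsThreshold (default 100)
	deletionSizeRatioThreshold float32, // options.Experimental.DeletionSizeRatioThreshold (default 0.5)
) bool {
	// Criterion 1: the block contains too many point tombstones.
	if numPointTombstones >= numDeletionsThreshold {
		return true
	}
	// Criterion 2: point tombstones account for too large a fraction of the block.
	return float32(tombstoneSize) >= deletionSizeRatioThreshold*float32(blockSize)
}

func main() {
	// A 4KB block holding 2KB of point tombstones trips the size-based criterion.
	fmt.Println(isTombstoneDense(50, 2<<10, 4<<10, 100, 0.5)) // true
}
```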

The intuition for these criteria is best described in
[this discussion](#918 (comment)),
which highlights that dense clusters are bad because they a) waste CPU when
skipping over tombstones, and b) waste I/O because we end up loading more
blocks per live key. The two criteria above are meant to tackle these two
issues respectively; the count-based threshold prevents CPU waste,
and the size-based threshold prevents I/O waste.

A table is eligible for the new tombstone density compaction type if its ratio
of tombstone-dense blocks is at least
`options.Experimental.TombstoneDenseCompactionThreshold`. The default value is
`0.05`. We use an Annotator, in the same way as elision-only compactions, to
prioritize compacting the table with the highest ratio of tombstone-dense
blocks when multiple tables are eligible.
The default was chosen through experimentation on CockroachDB KV workloads:
lower values caused us to compact too aggressively, leading to very high write
amplification, while higher values produced few noticeable performance
improvements.
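
As a usage sketch (not part of this diff), the new knobs are set through
`Options.Experimental` when opening a database, and the new compaction kind is
visible in the metrics. The option and metric names below come from this
change; the surrounding setup, import path, and store directory are
illustrative only:

```go
package main

import (
	"fmt"
	"log"

	"github.com/cockroachdb/pebble"
)

func main() {
	opts := &pebble.Options{}
	// A data block is flagged as tombstone-dense once it holds at least 100
	// point tombstones...
	opts.Experimental.NumDeletionsThreshold = 100
	// ...or once point tombstones make up at least half of its uncompressed size.
	opts.Experimental.DeletionSizeRatioThreshold = 0.5
	// A table becomes eligible for a tombstone density compaction once 5% of
	// its data blocks are tombstone-dense. Set to -1 to disable the heuristic.
	opts.Experimental.TombstoneDenseCompactionThreshold = 0.05

	db, err := pebble.Open("demo", opts)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Once the heuristic has had a chance to run, the new compaction kind is
	// reported alongside the existing per-kind counters.
	m := db.Metrics()
	fmt.Println("tombstone-density compactions:", m.Compact.TombstoneDensityCount)
}
```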

Fixes: #918
anish-shanbhag committed Aug 12, 2024
1 parent cda4471 commit 1cdb2e4
Showing 25 changed files with 363 additions and 148 deletions.
3 changes: 3 additions & 0 deletions compaction.go
@@ -135,6 +135,7 @@ const (
compactionKindDeleteOnly
compactionKindElisionOnly
compactionKindRead
compactionKindTombstoneDensity
compactionKindRewrite
compactionKindIngestedFlushable
)
@@ -153,6 +154,8 @@ func (k compactionKind) String() string {
return "elision-only"
case compactionKindRead:
return "read"
case compactionKindTombstoneDensity:
return "tombstone-density"
case compactionKindRewrite:
return "rewrite"
case compactionKindIngestedFlushable:
138 changes: 94 additions & 44 deletions compaction_picker.go
@@ -592,6 +592,7 @@ func newCompactionPickerByScore(
virtualBackings: virtualBackings,
}
p.initLevelMaxBytes(inProgressCompactions)
p.initTombstoneDensityAnnotator(opts)
return p
}

@@ -672,6 +673,11 @@ type compactionPickerByScore struct {
// levelMaxBytes holds the dynamically adjusted max bytes setting for each
// level.
levelMaxBytes [numLevels]int64
// tombstoneDensityAnnotator holds the annotator for choosing tombstone
// density compactions.
// NB: This is declared here rather than globally because
// options.Experimental.TombstoneDenseCompactionThreshold is not known until runtime.
tombstoneDensityAnnotator *manifest.Annotator[fileMetadata]
}

var _ compactionPicker = &compactionPickerByScore{}
@@ -1287,6 +1293,13 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact
}
}

// Check for files which contain excessive point tombstones that could slow
// down reads. Unlike elision-only compactions, these compactions may select
// a file at any level rather than only the lowest level.
if pc := p.pickTombstoneDensityCompaction(env); pc != nil {
return pc
}

// Check for L6 files with tombstones that may be elided. These files may
// exist if a snapshot prevented the elision of a tombstone or because of
// a move compaction. These are low-priority compactions because they
@@ -1415,6 +1428,38 @@ var markedForCompactionAnnotator = &manifest.Annotator[fileMetadata]{
},
}

// pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata
// with various checks to ensure that the file still exists in the expected level
// and isn't already being compacted.
func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
) *pickedCompaction {
if candidate == nil || candidate.IsCompacting() {
return nil
}

inputs := p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
if inputs.Empty() {
panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel))
}

pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
pc.kind = kind
pc.startLevel.files = inputs
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())

// Fail-safe to protect against compacting the same sstable concurrently.
if inputRangeAlreadyCompacting(env, pc) {
return nil
}

if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
return nil
}

return pc
}

// pickElisionOnlyCompaction looks for compactions of sstables in the
// bottommost level containing obsolete records that may now be dropped.
func (p *compactionPickerByScore) pickElisionOnlyCompaction(
@@ -1427,28 +1472,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
if candidate == nil {
return nil
}
if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
return nil
}
lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
if lf.Empty() {
panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
}

// Construct a picked compaction of the elision candidate's atomic
// compaction unit.
pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
pc.kind = compactionKindElisionOnly
pc.startLevel.files = lf
if anyTablesCompacting(lf) {
return nil
}
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
// Fail-safe to protect against compacting the same sstable concurrently.
if !inputRangeAlreadyCompacting(env, pc) {
return pc
}
return nil
return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
}

// pickRewriteCompaction attempts to construct a compaction that
@@ -1463,36 +1490,59 @@ func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *
// Try the next level.
continue
}
if candidate.IsCompacting() {
// Try the next level.
continue
}
lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
if lf.Empty() {
panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
if pc != nil {
return pc
}
}
return nil
}

inputs := lf
if anyTablesCompacting(inputs) {
// Try the next level.
continue
}
func (p *compactionPickerByScore) initTombstoneDensityAnnotator(opts *Options) {
p.tombstoneDensityAnnotator = &manifest.Annotator[fileMetadata]{
Aggregator: manifest.PickFileAggregator{
Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) {
if f.IsCompacting() {
return false, true
}
if !f.StatsValid() {
return false, false
}
return f.Stats.TombstoneDenseBlocksRatio > opts.Experimental.TombstoneDenseCompactionThreshold, true
},
Compare: func(a, b *fileMetadata) bool {
return a.Stats.TombstoneDenseBlocksRatio > b.Stats.TombstoneDenseBlocksRatio
},
},
}
}

pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
pc.outputLevel.level = l
pc.kind = compactionKindRewrite
pc.startLevel.files = inputs
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
// pickTombstoneDensityCompaction looks for a compaction that eliminates
// regions of extremely high point tombstone density. For each level, it picks
// a file where the ratio of tombstone-dense blocks is at least
// options.Experimental.TombstoneDenseCompactionThreshold, prioritizing compaction of
// files with higher ratios of tombstone-dense blocks.
func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
env compactionEnv,
) (pc *pickedCompaction) {
if p.opts.Experimental.TombstoneDenseCompactionThreshold == -1 {
// Tombstone density compactions are disabled.
return nil
}

// Fail-safe to protect against compacting the same sstable concurrently.
if !inputRangeAlreadyCompacting(env, pc) {
if pc.startLevel.level == 0 {
pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
}
return pc
var candidate *fileMetadata
var level int
// NB: We don't consider L0 or the lowest level.
for l := 1; l < numLevels-1; l++ {
f := p.tombstoneDensityAnnotator.LevelAnnotation(p.vers.Levels[l])
newCandidate := p.tombstoneDensityAnnotator.Aggregator.Merge(f, candidate)
if newCandidate != candidate {
candidate = newCandidate
level = l
}
}
return nil

return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity)
}

// pickAutoLPositive picks an automatic compaction for the candidate
1 change: 1 addition & 0 deletions compaction_picker_test.go
@@ -517,6 +517,7 @@ func TestCompactionPickerL0(t *testing.T) {
}
vs.picker = picker
picker.initLevelMaxBytes(inProgressCompactions)
picker.initTombstoneDensityAnnotator(opts)

var buf bytes.Buffer
fmt.Fprint(&buf, version.String())
10 changes: 10 additions & 0 deletions internal/manifest/version.go
@@ -75,6 +75,16 @@ type TableStats struct {
ValueBlocksSize uint64
// CompressionType is the compression type of the table.
CompressionType block.Compression
// TombstoneDenseBlocksRatio is the ratio of data blocks in this table that
// fulfills at least one of the following:
// 1. The block contains at least options.Experimental.NumDeletionsThreshold
// point tombstones.
// 2. The ratio of the uncompressed size of point tombstones to the
// uncompressed size of the block is at least
// options.Experimental.DeletionSizeRatioThreshold.
// This statistic is used to determine eligibility for a tombstone density
// compaction.
TombstoneDenseBlocksRatio float64
}

// boundType represents the type of key (point or range) present as the smallest
24 changes: 13 additions & 11 deletions metrics.go
@@ -153,16 +153,17 @@ type Metrics struct {

Compact struct {
// The total number of compactions, and per-compaction type counts.
Count int64
DefaultCount int64
DeleteOnlyCount int64
ElisionOnlyCount int64
CopyCount int64
MoveCount int64
ReadCount int64
RewriteCount int64
MultiLevelCount int64
CounterLevelCount int64
Count int64
DefaultCount int64
DeleteOnlyCount int64
ElisionOnlyCount int64
CopyCount int64
MoveCount int64
ReadCount int64
TombstoneDensityCount int64
RewriteCount int64
MultiLevelCount int64
CounterLevelCount int64
// An estimate of the number of bytes that need to be compacted for the LSM
// to reach a stable state.
EstimatedDebt uint64
@@ -580,12 +581,13 @@ func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) {
redact.Safe(m.Compact.NumInProgress),
humanize.Bytes.Int64(m.Compact.InProgressBytes))

w.Printf(" default: %d delete: %d elision: %d move: %d read: %d rewrite: %d copy: %d multi-level: %d\n",
w.Printf(" default: %d delete: %d elision: %d move: %d read: %d tombstone-density: %d rewrite: %d copy: %d multi-level: %d\n",
redact.Safe(m.Compact.DefaultCount),
redact.Safe(m.Compact.DeleteOnlyCount),
redact.Safe(m.Compact.ElisionOnlyCount),
redact.Safe(m.Compact.MoveCount),
redact.Safe(m.Compact.ReadCount),
redact.Safe(m.Compact.TombstoneDensityCount),
redact.Safe(m.Compact.RewriteCount),
redact.Safe(m.Compact.CopyCount),
redact.Safe(m.Compact.MultiLevelCount))
1 change: 1 addition & 0 deletions metrics_test.go
@@ -39,6 +39,7 @@ func exampleMetrics() Metrics {
m.Compact.ElisionOnlyCount = 29
m.Compact.MoveCount = 30
m.Compact.ReadCount = 31
m.Compact.TombstoneDensityCount = 16
m.Compact.RewriteCount = 32
m.Compact.CopyCount = 33
m.Compact.MultiLevelCount = 34
54 changes: 54 additions & 0 deletions options.go
@@ -599,6 +599,38 @@ type Options struct {
// gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB).
ReadSamplingMultiplier int64

// NumDeletionsThreshold defines the minimum number of point tombstones
// that must be present in a single data block for that block to be
// considered tombstone-dense for the purposes of triggering a
// tombstone density compaction. Data blocks may also be considered
// tombstone-dense if they meet the criteria defined by
// DeletionSizeRatioThreshold below. Tombstone-dense blocks are identified
// when sstables are written, and so this is effectively an option for
// sstable writers. The default value is 100.
NumDeletionsThreshold int

// DeletionSizeRatioThreshold defines the minimum ratio of the size of
// point tombstones to the size of a data block that must be reached
// for that block to be considered tombstone-dense for the purposes of
// triggering a tombstone density compaction. Data blocks may also be
// considered tombstone-dense if they meet the criteria defined by
// NumDeletionsThreshold above. Tombstone-dense blocks are identified
// when sstables are written, and so this is effectively an option for
// sstable writers. The default value is 0.5.
DeletionSizeRatioThreshold float32

// TombstoneDenseCompactionThreshold is the minimum percent of data
// blocks in a table that must be tombstone-dense for that table to be
// eligible for a tombstone density compaction. It should be defined as a
// ratio out of 1. The default value is 0.05.
//
// If multiple tables are eligible for a tombstone density compaction, then
// tables with a higher percent of tombstone-dense blocks are still
// prioritized for compaction.
//
// A value of -1 disables tombstone density compactions.
TombstoneDenseCompactionThreshold float64

// TableCacheShards is the number of shards per table cache.
// Reducing the value can reduce the number of idle goroutines per DB
// instance which can be useful in scenarios with a lot of DB instances
@@ -1268,6 +1300,15 @@ func (o *Options) EnsureDefaults() *Options {
if o.Experimental.ReadSamplingMultiplier == 0 {
o.Experimental.ReadSamplingMultiplier = 1 << 4
}
if o.Experimental.NumDeletionsThreshold == 0 {
o.Experimental.NumDeletionsThreshold = sstable.DefaultNumDeletionsThreshold
}
if o.Experimental.DeletionSizeRatioThreshold == 0 {
o.Experimental.DeletionSizeRatioThreshold = sstable.DefaultDeletionSizeRatioThreshold
}
if o.Experimental.TombstoneDenseCompactionThreshold == 0 {
o.Experimental.TombstoneDenseCompactionThreshold = 0.05
}
if o.Experimental.TableCacheShards <= 0 {
o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0)
}
@@ -1395,6 +1436,9 @@ func (o *Options) String() string {
}
fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate)
fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier)
fmt.Fprintf(&buf, " num_deletions_threshold=%d\n", o.Experimental.NumDeletionsThreshold)
fmt.Fprintf(&buf, " deletion_size_ratio_threshold=%f\n", o.Experimental.DeletionSizeRatioThreshold)
fmt.Fprintf(&buf, " tombstone_dense_compaction_threshold=%f\n", o.Experimental.TombstoneDenseCompactionThreshold)
// We no longer care about strict_wal_tail, but set it to true in case an
// older version reads the options.
fmt.Fprintf(&buf, " strict_wal_tail=%t\n", true)
@@ -1711,6 +1755,14 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error {
o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64)
case "read_sampling_multiplier":
o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64)
case "num_deletions_threshold":
o.Experimental.NumDeletionsThreshold, err = strconv.Atoi(value)
case "deletion_size_ratio_threshold":
val, parseErr := strconv.ParseFloat(value, 32)
o.Experimental.DeletionSizeRatioThreshold = float32(val)
err = parseErr
case "tombstone_dense_compaction_threshold":
o.Experimental.TombstoneDenseCompactionThreshold, err = strconv.ParseFloat(value, 64)
case "table_cache_shards":
o.Experimental.TableCacheShards, err = strconv.Atoi(value)
case "table_format":
@@ -1985,6 +2037,8 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab
writerOpts.FilterType = levelOpts.FilterType
writerOpts.IndexBlockSize = levelOpts.IndexBlockSize
writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses
writerOpts.NumDeletionsThreshold = o.Experimental.NumDeletionsThreshold
writerOpts.DeletionSizeRatioThreshold = o.Experimental.DeletionSizeRatioThreshold
return writerOpts
}

6 changes: 6 additions & 0 deletions options_test.go
@@ -101,6 +101,9 @@ func TestOptionsString(t *testing.T) {
multilevel_compaction_heuristic=wamp(0.00, false)
read_compaction_rate=16000
read_sampling_multiplier=16
num_deletions_threshold=100
deletion_size_ratio_threshold=0.500000
tombstone_dense_compaction_threshold=0.050000
strict_wal_tail=true
table_cache_shards=8
validate_on_ingest=false
@@ -285,6 +288,9 @@ func TestOptionsParse(t *testing.T) {
}
opts.Experimental.ReadCompactionRate = 300
opts.Experimental.ReadSamplingMultiplier = 400
opts.Experimental.NumDeletionsThreshold = 500
opts.Experimental.DeletionSizeRatioThreshold = 0.7
opts.Experimental.TombstoneDenseCompactionThreshold = 0.2
opts.Experimental.TableCacheShards = 500
opts.Experimental.MaxWriterConcurrency = 1
opts.Experimental.ForceWriterParallelism = true