Skip to content

Commit

Permalink
compact: add point tombstone density compaction heuristic
Browse files Browse the repository at this point in the history
This change adds a heuristic to compact point tombstones based on
their density across the LSM. We add a new table property called
`NumTombstoneDenseBlocks` and a corresponding field in `TableStats` that
tracks the number of data blocks in each table which are considered
tombstone-dense. This value is calculated on the fly while tables are being
written, so no extra I/O is required later on to compute it.

A data block is considered tombstone-dense if it fulfills either of the
following criteria:
1. The block contains at least `options.Experimental.NumDeletionsThreshold`
point tombstones. The default value is `100`.
2. The ratio of the uncompressed size of point tombstones to the uncompressed
size of the block is at least `options.Experimental.DeletionSizeRatioThreshold`.
For example, with the default value of `0.5`, a data block of size 4KB
would be considered tombstone-dense if it contains at least 2KB of point
tombstones.

The intuition here is that as described [here](#918 (comment)),
dense clusters are bad because they a) waste CPU when skipping over tombstones,
and b) waste I/O because we end up loading more blocks per live key. The
two criteria above are meant to tackle these two issues respectively; the
the count-based threshold prevents CPU waste, and the size-based threshold
prevents I/O waste.

A table is considered eligible for the new tombstone compaction type if
it contains at least `options.Experimental.MinTombstoneDenseBlocks`
tombstone-dense data blocks. The default value is `20`. We use an Annotator
in a similar way to elision-only compactions in order to prioritize compacting
the table with the most tombstone-dense blocks if there are multiple
eligible tables. The default here was chosen through experimentation on
CockroachDB KV workloads; with a lower value we were compacting too
aggressively leading to very high write amplification, but lower values
led to very few noticeable performance improvements.
  • Loading branch information
anish-shanbhag committed Jul 24, 2024
1 parent 89e5644 commit 9772206
Show file tree
Hide file tree
Showing 20 changed files with 467 additions and 122 deletions.
3 changes: 3 additions & 0 deletions compaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ const (
compactionKindDeleteOnly
compactionKindElisionOnly
compactionKindRead
compactionKindTombstoneDensity
compactionKindRewrite
compactionKindIngestedFlushable
)
Expand All @@ -153,6 +154,8 @@ func (k compactionKind) String() string {
return "elision-only"
case compactionKindRead:
return "read"
case compactionKindTombstoneDensity:
return "tombstone-density"
case compactionKindRewrite:
return "rewrite"
case compactionKindIngestedFlushable:
Expand Down
141 changes: 99 additions & 42 deletions compaction_picker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,10 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact
}
}

if pc := p.pickTombstoneDensityCompaction(env); pc != nil {
return pc
}

// Check for L6 files with tombstones that may be elided. These files may
// exist if a snapshot prevented the elision of a tombstone or because of
// a move compaction. These are low-priority compactions because they
Expand Down Expand Up @@ -1498,6 +1502,83 @@ func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) {
return dst, true
}

// TODO: replace this with the updated annotator interface when it's complete.
type tombstoneDensityAnnotator struct{}

var _ manifest.Annotator = tombstoneDensityAnnotator{}

func (a tombstoneDensityAnnotator) Zero(interface{}) interface{} {
return nil
}

func (a tombstoneDensityAnnotator) Accumulate(
f *fileMetadata, dst interface{},
) (interface{}, bool) {
if !f.StatsValid() || f.IsCompacting() {
return dst, false
}

// TODO: once the new annotator interface is in place, the threshold below
// should be defined by the MinTombstoneDenseBlocks option. For this reason,
// the annotator shouldn't be created until the option value is known.
if f.Stats.NumTombstoneDenseBlocks > 20 {
switch {
case dst == nil:
return f, true
case f.Stats.NumTombstoneDenseBlocks > dst.(*fileMetadata).Stats.NumTombstoneDenseBlocks:
return f, true
default:
return dst, true
}
}
return dst, true
}

func (a tombstoneDensityAnnotator) Merge(src interface{}, dst interface{}) interface{} {
switch {
case src == nil:
return dst
case dst == nil:
return src
case src.(*fileMetadata).Stats.NumTombstoneDenseBlocks > dst.(*fileMetadata).Stats.NumTombstoneDenseBlocks:
return src
default:
return dst
}
}

// pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata
// with various checks to ensure that the file still exists in the expected level
// and isn't already being compacted.
func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
) *pickedCompaction {
if candidate == nil || candidate.IsCompacting() {
return nil
}

inputs := p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
if inputs.Empty() {
panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel))
}

pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
pc.kind = kind
pc.startLevel.files = inputs
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())

// Fail-safe to protect against compacting the same sstable concurrently.
if inputRangeAlreadyCompacting(env, pc) {
return nil
}

if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
return nil
}

return pc
}

// pickElisionOnlyCompaction looks for compactions of sstables in the
// bottommost level containing obsolete records that may now be dropped.
func (p *compactionPickerByScore) pickElisionOnlyCompaction(
Expand All @@ -1511,28 +1592,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
return nil
}
candidate := v.(*fileMetadata)
if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
return nil
}
lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
if lf.Empty() {
panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
}

// Construct a picked compaction of the elision candidate's atomic
// compaction unit.
pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
pc.kind = compactionKindElisionOnly
pc.startLevel.files = lf
if anyTablesCompacting(lf) {
return nil
}
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
// Fail-safe to protect against compacting the same sstable concurrently.
if !inputRangeAlreadyCompacting(env, pc) {
return pc
}
return nil
return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
}

// pickRewriteCompaction attempts to construct a compaction that
Expand All @@ -1548,32 +1611,26 @@ func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *
continue
}
candidate := v.(*fileMetadata)
if candidate.IsCompacting() {
// Try the next level.
continue
}
lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
if lf.Empty() {
panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
if pc != nil {
return pc
}
}
return nil
}

inputs := lf
if anyTablesCompacting(inputs) {
// Try the next level.
// TODO
func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
env compactionEnv,
) (pc *pickedCompaction) {
for l := 0; l < numLevels; l++ {
v := p.vers.Levels[l].Annotation(tombstoneDensityAnnotator{})
if v == nil {
continue
}

pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
pc.outputLevel.level = l
pc.kind = compactionKindRewrite
pc.startLevel.files = inputs
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())

// Fail-safe to protect against compacting the same sstable concurrently.
if !inputRangeAlreadyCompacting(env, pc) {
if pc.startLevel.level == 0 {
pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
}
candidate := v.(*fileMetadata)
pc := p.pickedCompactionFromCandidateFile(candidate, env, l, defaultOutputLevel(l, p.baseLevel), compactionKindTombstoneDensity)
if pc != nil {
return pc
}
}
Expand Down
10 changes: 6 additions & 4 deletions internal/base/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ package base

// SSTable block defaults.
const (
DefaultBlockRestartInterval = 16
DefaultBlockSize = 4096
DefaultBlockSizeThreshold = 90
SizeClassAwareBlockSizeThreshold = 60
DefaultBlockRestartInterval = 16
DefaultBlockSize = 4096
DefaultBlockSizeThreshold = 90
SizeClassAwareBlockSizeThreshold = 60
DefaultNumDeletionsThreshold = 100
DefaultDeletionSizeRatioThreshold = 0.5
)

// FilterType is the level at which to apply a filter: block or table.
Expand Down
2 changes: 2 additions & 0 deletions internal/manifest/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ type TableStats struct {
ValueBlocksSize uint64
// CompressionType is the compression type of the table.
CompressionType sstable.Compression
// NumTombstoneDenseBlocks is the number of tombstone-dense data blocks in this table.
NumTombstoneDenseBlocks uint64
}

// boundType represents the type of key (point or range) present as the smallest
Expand Down
4 changes: 4 additions & 0 deletions internal/testkeys/testkeys_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ func TestKeyCount(t *testing.T) {
}
testCases := map[params]int64{
{26, 1}: 26,
{26, 2}: 702,
{26, 3}: 18278,
{26, 4}: 475254,
{26, 5}: 12356630,
{52, 1}: 52,
{2, 2}: 6,
{2, 3}: 14,
Expand Down
Loading

0 comments on commit 9772206

Please sign in to comment.