From 977220677a7f7cd7d91d40bcdfbb459c2166d8e4 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag
Date: Tue, 9 Jul 2024 13:48:16 -0400
Subject: [PATCH] compact: add point tombstone density compaction heuristic

This change adds a heuristic to compact point tombstones based on their
density across the LSM. We add a new table property called
`NumTombstoneDenseBlocks` and a corresponding field in `TableStats` that
tracks the number of data blocks in each table which are considered
tombstone-dense. This value is calculated on the fly while tables are
being written, so no extra I/O is required later on to compute it.

A data block is considered tombstone-dense if it fulfills either of the
following criteria:
1. The block contains at least `options.Experimental.NumDeletionsThreshold`
point tombstones. The default value is `100`.
2. The ratio of the uncompressed size of point tombstones to the
uncompressed size of the block is at least
`options.Experimental.DeletionSizeRatioThreshold`. For example, with the
default value of `0.5`, a data block of size 4KB would be considered
tombstone-dense if it contains at least 2KB of point tombstones.

The intuition here is that, as described
[here](https://github.com/cockroachdb/pebble/issues/918#issuecomment-1564714073),
dense tombstone clusters are bad because they a) waste CPU when skipping
over tombstones, and b) waste I/O because we end up loading more blocks
per live key. The two criteria above are meant to tackle these two issues
respectively: the count-based threshold prevents CPU waste, and the
size-based threshold prevents I/O waste.

A table is considered eligible for the new tombstone compaction type if
it contains at least `options.Experimental.MinTombstoneDenseBlocks`
tombstone-dense data blocks. The default value is `20`. We use an
Annotator in a similar way to elision-only compactions in order to
prioritize compacting the table with the most tombstone-dense blocks when
there are multiple eligible tables. The default here was chosen through
experimentation on CockroachDB KV workloads; with a lower value we were
compacting too aggressively, leading to very high write amplification,
while higher values yielded few noticeable performance improvements.
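To make the two criteria concrete, the per-block check amounts to the
following sketch (illustrative only: the function name is made up, and the
patch actually tracks these counters incrementally in `sstable/writer.go`
as keys are added rather than calling a helper like this):

```go
// isTombstoneDense reports whether a finished data block counts toward
// NumTombstoneDenseBlocks. numDeletions and deletionSize are the count and
// total uncompressed key size of point tombstones in the block;
// uncompressedLen is the uncompressed size of the whole block.
func isTombstoneDense(numDeletions, deletionSize, uncompressedLen int) bool {
	const numDeletionsThreshold = 100      // default NumDeletionsThreshold
	const deletionSizeRatioThreshold = 0.5 // default DeletionSizeRatioThreshold
	return numDeletions >= numDeletionsThreshold ||
		float32(deletionSize) >= deletionSizeRatioThreshold*float32(uncompressedLen)
}
```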
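Table selection can similarly be pictured as the hypothetical helper below;
the real implementation reuses the `manifest.Annotator` machinery (as
elision-only compactions do) so that this maximum is maintained
incrementally rather than recomputed by scanning every table:

```go
// pickTombstoneDenseTable returns the table with the most tombstone-dense
// blocks among those meeting the MinTombstoneDenseBlocks eligibility bar,
// or nil if no table in the level qualifies.
func pickTombstoneDenseTable(tables []*fileMetadata, minBlocks uint64) *fileMetadata {
	var best *fileMetadata
	for _, t := range tables {
		if t.Stats.NumTombstoneDenseBlocks < minBlocks {
			continue // too few tombstone-dense blocks to be eligible
		}
		if best == nil || t.Stats.NumTombstoneDenseBlocks > best.Stats.NumTombstoneDenseBlocks {
			best = t
		}
	}
	return best
}
```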
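All three thresholds are exposed under `Options.Experimental`; a minimal
usage sketch with the defaults spelled out explicitly (the `"demo"` path is
an arbitrary example):

```go
package main

import "github.com/cockroachdb/pebble"

func main() {
	opts := &pebble.Options{}
	opts.Experimental.NumDeletionsThreshold = 100      // per-block tombstone count
	opts.Experimental.DeletionSizeRatioThreshold = 0.5 // per-block tombstone size ratio
	opts.Experimental.MinTombstoneDenseBlocks = 20     // per-table eligibility bar
	db, err := pebble.Open("demo", opts)
	if err != nil {
		panic(err)
	}
	defer db.Close()
}
```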
--- compaction.go | 3 + compaction_picker.go | 141 ++++++++++++++------- internal/base/options.go | 10 +- internal/manifest/version.go | 2 + internal/testkeys/testkeys_test.go | 4 + iterator_test.go | 196 ++++++++++++++++++++++++++--- metrics.go | 24 ++-- metrics_test.go | 1 + options.go | 41 ++++++ options_test.go | 6 + replay/replay.go | 20 +-- sstable/options.go | 13 ++ sstable/properties.go | 5 + sstable/reader_virtual.go | 1 + sstable/writer.go | 39 +++++- table_stats.go | 13 +- testdata/event_listener | 4 +- testdata/ingest | 4 +- testdata/metrics | 58 ++++----- version_set.go | 4 + 20 files changed, 467 insertions(+), 122 deletions(-) diff --git a/compaction.go b/compaction.go index 71186e86d0..5b8d602a9c 100644 --- a/compaction.go +++ b/compaction.go @@ -135,6 +135,7 @@ const ( compactionKindDeleteOnly compactionKindElisionOnly compactionKindRead + compactionKindTombstoneDensity compactionKindRewrite compactionKindIngestedFlushable ) @@ -153,6 +154,8 @@ func (k compactionKind) String() string { return "elision-only" case compactionKindRead: return "read" + case compactionKindTombstoneDensity: + return "tombstone-density" case compactionKindRewrite: return "rewrite" case compactionKindIngestedFlushable: diff --git a/compaction_picker.go b/compaction_picker.go index 68cb9c5511..e2993bf73b 100644 --- a/compaction_picker.go +++ b/compaction_picker.go @@ -1317,6 +1317,10 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact } } + if pc := p.pickTombstoneDensityCompaction(env); pc != nil { + return pc + } + // Check for L6 files with tombstones that may be elided. These files may // exist if a snapshot prevented the elision of a tombstone or because of // a move compaction. These are low-priority compactions because they @@ -1498,6 +1502,83 @@ func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) { return dst, true } +// TODO: replace this with the updated annotator interface when it's complete. +type tombstoneDensityAnnotator struct{} + +var _ manifest.Annotator = tombstoneDensityAnnotator{} + +func (a tombstoneDensityAnnotator) Zero(interface{}) interface{} { + return nil +} + +func (a tombstoneDensityAnnotator) Accumulate( + f *fileMetadata, dst interface{}, +) (interface{}, bool) { + if !f.StatsValid() || f.IsCompacting() { + return dst, false + } + + // TODO: once the new annotator interface is in place, the threshold below + // should be defined by the MinTombstoneDenseBlocks option. For this reason, + // the annotator shouldn't be created until the option value is known. + if f.Stats.NumTombstoneDenseBlocks > 20 { + switch { + case dst == nil: + return f, true + case f.Stats.NumTombstoneDenseBlocks > dst.(*fileMetadata).Stats.NumTombstoneDenseBlocks: + return f, true + default: + return dst, true + } + } + return dst, true +} + +func (a tombstoneDensityAnnotator) Merge(src interface{}, dst interface{}) interface{} { + switch { + case src == nil: + return dst + case dst == nil: + return src + case src.(*fileMetadata).Stats.NumTombstoneDenseBlocks > dst.(*fileMetadata).Stats.NumTombstoneDenseBlocks: + return src + default: + return dst + } +} + +// pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata +// with various checks to ensure that the file still exists in the expected level +// and isn't already being compacted. 
+func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
+	candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
+) *pickedCompaction {
+	if candidate == nil || candidate.IsCompacting() {
+		return nil
+	}
+
+	inputs := p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
+	if inputs.Empty() {
+		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel))
+	}
+
+	pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
+	pc.kind = kind
+	pc.startLevel.files = inputs
+	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
+
+	// Fail-safe to protect against compacting the same sstable concurrently.
+	if inputRangeAlreadyCompacting(env, pc) {
+		return nil
+	}
+
+	if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
+		return nil
+	}
+
+	// Preserve the L0 sublevel metadata for the inputs, as the inlined
+	// rewrite-compaction code previously did.
+	if pc.startLevel.level == 0 {
+		pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
+	}
+
+	return pc
+}
+
 // pickElisionOnlyCompaction looks for compactions of sstables in the
 // bottommost level containing obsolete records that may now be dropped.
 func (p *compactionPickerByScore) pickElisionOnlyCompaction(
@@ -1511,28 +1592,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
 		return nil
 	}
 	candidate := v.(*fileMetadata)
-	if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
+	if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
 		return nil
 	}
-	lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
-	if lf.Empty() {
-		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
-	}
-
-	// Construct a picked compaction of the elision candidate's atomic
-	// compaction unit.
-	pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
-	pc.kind = compactionKindElisionOnly
-	pc.startLevel.files = lf
-	if anyTablesCompacting(lf) {
-		return nil
-	}
-	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
-	// Fail-safe to protect against compacting the same sstable concurrently.
-	if !inputRangeAlreadyCompacting(env, pc) {
-		return pc
-	}
-	return nil
+	return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
 }
 
 // pickRewriteCompaction attempts to construct a compaction that
@@ -1548,32 +1611,26 @@ func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *
 			continue
 		}
 		candidate := v.(*fileMetadata)
-		if candidate.IsCompacting() {
-			// Try the next level.
-			continue
-		}
-		lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
-		if lf.Empty() {
-			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
+		pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
+		if pc != nil {
+			return pc
 		}
+	}
+	return nil
+}
 
-		inputs := lf
-		if anyTablesCompacting(inputs) {
-			// Try the next level.
+// pickTombstoneDensityCompaction looks for a compaction of the table with
+// the most tombstone-dense data blocks, as tracked by the
+// NumTombstoneDenseBlocks stat and surfaced per level by
+// tombstoneDensityAnnotator.
+func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
+	env compactionEnv,
+) (pc *pickedCompaction) {
+	for l := 0; l < numLevels; l++ {
+		v := p.vers.Levels[l].Annotation(tombstoneDensityAnnotator{})
+		if v == nil {
 			continue
 		}
-
-		pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
-		pc.outputLevel.level = l
-		pc.kind = compactionKindRewrite
-		pc.startLevel.files = inputs
-		pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
-
-		// Fail-safe to protect against compacting the same sstable concurrently.
- if !inputRangeAlreadyCompacting(env, pc) { - if pc.startLevel.level == 0 { - pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files) - } + candidate := v.(*fileMetadata) + pc := p.pickedCompactionFromCandidateFile(candidate, env, l, defaultOutputLevel(l, p.baseLevel), compactionKindTombstoneDensity) + if pc != nil { return pc } } diff --git a/internal/base/options.go b/internal/base/options.go index f5f127c5ca..d8c030acd5 100644 --- a/internal/base/options.go +++ b/internal/base/options.go @@ -6,10 +6,12 @@ package base // SSTable block defaults. const ( - DefaultBlockRestartInterval = 16 - DefaultBlockSize = 4096 - DefaultBlockSizeThreshold = 90 - SizeClassAwareBlockSizeThreshold = 60 + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4096 + DefaultBlockSizeThreshold = 90 + SizeClassAwareBlockSizeThreshold = 60 + DefaultNumDeletionsThreshold = 100 + DefaultDeletionSizeRatioThreshold = 0.5 ) // FilterType is the level at which to apply a filter: block or table. diff --git a/internal/manifest/version.go b/internal/manifest/version.go index a90717e836..ee77c1dc5b 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -74,6 +74,8 @@ type TableStats struct { ValueBlocksSize uint64 // CompressionType is the compression type of the table. CompressionType sstable.Compression + // NumTombstoneDenseBlocks is the number of tombstone-dense data blocks in this table. + NumTombstoneDenseBlocks uint64 } // boundType represents the type of key (point or range) present as the smallest diff --git a/internal/testkeys/testkeys_test.go b/internal/testkeys/testkeys_test.go index 2312088c89..7573eeb75f 100644 --- a/internal/testkeys/testkeys_test.go +++ b/internal/testkeys/testkeys_test.go @@ -65,6 +65,10 @@ func TestKeyCount(t *testing.T) { } testCases := map[params]int64{ {26, 1}: 26, + {26, 2}: 702, + {26, 3}: 18278, + {26, 4}: 475254, + {26, 5}: 12356630, {52, 1}: 52, {2, 2}: 6, {2, 3}: 14, diff --git a/iterator_test.go b/iterator_test.go index 651ef2081b..ee283e639b 100644 --- a/iterator_test.go +++ b/iterator_test.go @@ -15,6 +15,7 @@ import ( "sort" "strconv" "strings" + "sync/atomic" "testing" "time" @@ -30,6 +31,7 @@ import ( "github.com/cockroachdb/pebble/vfs" "github.com/stretchr/testify/require" "golang.org/x/exp/rand" + "golang.org/x/sync/errgroup" ) var testKeyValuePairs = []string{ @@ -2722,6 +2724,182 @@ func BenchmarkSeekPrefixTombstones(b *testing.B) { } } +func waitForCompactions(d *DB) { + d.mu.Lock() + // NB: Wait for table stats because some compaction types rely + // on table stats to be collected. + d.waitTableStats() + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + d.waitTableStats() + } + d.mu.Unlock() +} + +// BenchmarkPointDeletedSwath benchmarks iterator operations on large-ish +// (hundreds of MBs) databases containing broad swaths of keys removed by point +// tombstones. 
+func BenchmarkPointDeletedSwath(b *testing.B) {
+	const maxKeyLen = 5
+	ks := testkeys.Alpha(maxKeyLen)
+
+	opts := func() *Options {
+		return (&Options{
+			DisableWAL:         true,
+			FS:                 vfs.NewMem(),
+			Comparer:           testkeys.Comparer,
+			FormatMajorVersion: FormatNewest,
+		}).EnsureDefaults()
+	}
+	type iteratorOp struct {
+		name string
+		fn   func(*DB, testkeys.Keyspace, *rand.Rand) error
+	}
+	var iterKeyBuf [maxKeyLen]byte
+
+	iterOps := []iteratorOp{
+		{
+			name: "prefix point lookup", fn: func(d *DB, ks testkeys.Keyspace, rng *rand.Rand) error {
+				n := testkeys.WriteKey(iterKeyBuf[:], ks, int64(rng.Intn(int(ks.Count()))))
+				iter, _ := d.NewIter(nil)
+				_ = iter.SeekPrefixGE(iterKeyBuf[:n])
+				return iter.Close()
+			},
+		},
+		{
+			name: "non-prefix point seek", fn: func(d *DB, ks testkeys.Keyspace, rng *rand.Rand) error {
+				n := testkeys.WriteKey(iterKeyBuf[:], ks, int64(rng.Intn(int(ks.Count()))))
+				iter, _ := d.NewIter(nil)
+				_ = iter.SeekGE(iterKeyBuf[:n])
+				return iter.Close()
+			},
+		},
+		{
+			name: "full scan", fn: func(d *DB, ks testkeys.Keyspace, rng *rand.Rand) error {
+				iter, _ := d.NewIter(nil)
+				for valid := iter.First(); valid; valid = iter.Next() {
+				}
+				return iter.Close()
+			},
+		},
+	}
+
+	// Populate an initial database with point keys at every key in the `ks`
+	// keyspace.
+	populated := withStateSetup(b, vfs.NewMem(), opts(), populateKeyspaceSetup(ks))
+	for _, gapLength := range []int{100, 1_000, 10_000, 100_000, 200_000, 400_000, 5_000_000, 10_000_000} {
+		b.Run(fmt.Sprintf("gap=%d", gapLength), func(b *testing.B) {
+			// Extend the `populated` initial database with DELs deleting all
+			// the middle keys in the keyspace in a contiguous swath of
+			// `gapLength` keys.
+			gapDeleted := withStateSetup(b, populated, opts(), deleteGapSetup(ks, gapLength))
+
+			for _, op := range iterOps {
+				b.Run(op.name, func(b *testing.B) {
+					// Run each instance of the test in a fresh DB constructed
+					// from `gapDeleted`. This ensures background compactions
+					// from one iterator operation don't affect another
+					// iterator operation.
+					withStateSetup(b, gapDeleted, opts(), func(_ testing.TB, d *DB) {
+						rng := rand.New(rand.NewSource(1 /* fixed seed */))
+						b.ResetTimer()
+						for i := 0; i < b.N; i++ {
+							if err := op.fn(d, ks, rng); err != nil {
+								b.Fatal(err)
+							}
+						}
+						b.StopTimer()
+					})
+				})
+			}
+		})
+	}
+}
+
+func withStateSetup(
+	t testing.TB, initial vfs.FS, opts *Options, setup func(testing.TB, *DB),
+) vfs.FS {
+	ok, err := vfs.Clone(initial, opts.FS, "", "", vfs.CloneSync)
+	require.NoError(t, err)
+	require.True(t, ok)
+	d, err := Open("", opts)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, d.Close()) }()
+	setup(t, d)
+	return opts.FS
+}
+
+func populateKeyspaceSetup(ks testkeys.Keyspace) func(testing.TB, *DB) {
+	const valSize = 256
+	return func(t testing.TB, d *DB) {
+		t.Logf("Populating keyspace with %d keys, each with %d-byte values", ks.Count(), valSize)
+		// Parallelize population by divvying up the keyspace.
+ var grp errgroup.Group + loadKeyspaces := testkeys.Divvy(ks, 20) + var progress atomic.Uint64 + for l := 0; l < len(loadKeyspaces); l++ { + l := l + grp.Go(func() error { + rng := rand.New(rand.NewSource(1)) + batch := d.NewBatch() + key := make([]byte, ks.MaxLen()) + var val [valSize]byte + for i := int64(0); i < loadKeyspaces[l].Count(); i++ { + rng.Read(val[:]) + n := testkeys.WriteKey(key[:], loadKeyspaces[l], i) + if err := batch.Set(key[:n], val[:], nil); err != nil { + return err + } + if batch.Len() >= 10<<10 /* 10 kib */ { + count := batch.Count() + require.NoError(t, batch.Commit(NoSync)) + if newTotal := progress.Add(uint64(count)); (newTotal / (uint64(ks.Count()) / 100)) != (newTotal-uint64(count))/uint64(ks.Count()/100) { + t.Logf("%.1f%% populated", 100.0*(float64(newTotal)/float64(ks.Count()))) + } + batch = d.NewBatch() + d.AsyncFlush() + } + } + if !batch.Empty() { + return batch.Commit(NoSync) + } + return nil + }) + } + require.NoError(t, grp.Wait()) + } +} + +func deleteGapSetup(ks testkeys.Keyspace, gapLength int) func(testing.TB, *DB) { + return func(t testing.TB, d *DB) { + midpoint := ks.Count() / 2 + gapStart := midpoint - int64(gapLength/2) + gapEnd := midpoint + int64(gapLength/2+(gapLength%2)) + + batch := d.NewBatch() + key := make([]byte, ks.MaxLen()) + for i := gapStart; i <= gapEnd; i++ { + n := testkeys.WriteKey(key[:], ks, i) + if err := batch.Delete(key[:n], nil); err != nil { + t.Fatal(err) + } + if batch.Len() >= 10<<10 /* 10 kib */ { + if err := batch.Commit(NoSync); err != nil { + t.Fatal(err) + } + batch = d.NewBatch() + } + } + if err := batch.Commit(NoSync); err != nil { + t.Fatal(err) + } + if err := d.Flush(); err != nil { + t.Fatal(err) + } + waitForCompactions(d) + } +} + func runBenchmarkQueueWorkload(b *testing.B, deleteRatio float32, initOps int, valueSize int) { const queueCount = 8 // These should be large enough to assign a unique key to each item in the @@ -2802,19 +2980,7 @@ func runBenchmarkQueueWorkload(b *testing.B, deleteRatio float32, initOps int, v _, err = d.AsyncFlush() require.NoError(b, err) - waitForCompactions := func() { - d.mu.Lock() - // NB: Wait for table stats because some compaction types rely - // on table stats to be collected. - d.waitTableStats() - for d.mu.compact.compactingCount > 0 { - d.mu.compact.cond.Wait() - d.waitTableStats() - } - d.mu.Unlock() - } - - waitForCompactions() + waitForCompactions(d) // Log the number of tombstones and live keys in each level after // background compactions are complete. @@ -2863,13 +3029,13 @@ func runBenchmarkQueueWorkload(b *testing.B, deleteRatio float32, initOps int, v // for more information. func BenchmarkQueueWorkload(b *testing.B) { // The portion of processing ops that will be deletes for each subbenchmark. - var deleteRatios = []float32{0.1, 0.3, 0.5} + var deleteRatios = []float32{0.1} // The number of times queues will be processed before running each // subbenchmark. var initOps = []int{400_000, 800_000, 1_200_000, 2_000_000, 3_500_000, 5_000_000, 7_500_000, 10_000_000, 50_000_000} // We vary the value size to identify how compaction behaves when the // relative sizes of tombstones and the keys they delete are different. 
- var valueSizes = []int{128, 2048} + var valueSizes = []int{2048} for _, deleteRatio := range deleteRatios { for _, valueSize := range valueSizes { diff --git a/metrics.go b/metrics.go index 6d2d60ce1c..05678f42b2 100644 --- a/metrics.go +++ b/metrics.go @@ -153,16 +153,17 @@ type Metrics struct { Compact struct { // The total number of compactions, and per-compaction type counts. - Count int64 - DefaultCount int64 - DeleteOnlyCount int64 - ElisionOnlyCount int64 - CopyCount int64 - MoveCount int64 - ReadCount int64 - RewriteCount int64 - MultiLevelCount int64 - CounterLevelCount int64 + Count int64 + DefaultCount int64 + DeleteOnlyCount int64 + ElisionOnlyCount int64 + CopyCount int64 + MoveCount int64 + ReadCount int64 + TombstoneDensityCount int64 + RewriteCount int64 + MultiLevelCount int64 + CounterLevelCount int64 // An estimate of the number of bytes that need to be compacted for the LSM // to reach a stable state. EstimatedDebt uint64 @@ -580,12 +581,13 @@ func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) { redact.Safe(m.Compact.NumInProgress), humanize.Bytes.Int64(m.Compact.InProgressBytes)) - w.Printf(" default: %d delete: %d elision: %d move: %d read: %d rewrite: %d copy: %d multi-level: %d\n", + w.Printf(" default: %d delete: %d elision: %d move: %d read: %d tombstone-density: %d rewrite: %d copy: %d multi-level: %d\n", redact.Safe(m.Compact.DefaultCount), redact.Safe(m.Compact.DeleteOnlyCount), redact.Safe(m.Compact.ElisionOnlyCount), redact.Safe(m.Compact.MoveCount), redact.Safe(m.Compact.ReadCount), + redact.Safe(m.Compact.TombstoneDensityCount), redact.Safe(m.Compact.RewriteCount), redact.Safe(m.Compact.CopyCount), redact.Safe(m.Compact.MultiLevelCount)) diff --git a/metrics_test.go b/metrics_test.go index 9b1d3a17a4..4e34afd906 100644 --- a/metrics_test.go +++ b/metrics_test.go @@ -39,6 +39,7 @@ func exampleMetrics() Metrics { m.Compact.ElisionOnlyCount = 29 m.Compact.MoveCount = 30 m.Compact.ReadCount = 31 + m.Compact.TombstoneDensityCount = 16 m.Compact.RewriteCount = 32 m.Compact.CopyCount = 33 m.Compact.MultiLevelCount = 34 diff --git a/options.go b/options.go index a7542755a3..c278cfb973 100644 --- a/options.go +++ b/options.go @@ -603,6 +603,25 @@ type Options struct { // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). ReadSamplingMultiplier int64 + // NumDeletionsThreshold defines the minimum number of point tombstones + // that must be present in a single data block for that block to be + // considered tombstone-dense for the purposes of triggering a + // tombstone density compaction. The default value is 100. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold defines the minimum ratio of the size of + // point tombstones to the size of the data block that must be reached + // for that block to be considered tombstone-dense for the purposes of + // triggering a tombstone density compaction. The default value is 0.5. + DeletionSizeRatioThreshold float32 + + // MinTombstoneDenseBlocks is the minimum number of tombstone-dense + // data blocks that must be present in a single table for it to be + // eligible for a tombstone density compaction. The default value is 20. + // Tables with a higher number of tombstone-dense blocks are still + // prioritized for compaction. + MinTombstoneDenseBlocks int + // TableCacheShards is the number of shards per table cache. 
// Reducing the value can reduce the number of idle goroutines per DB // instance which can be useful in scenarios with a lot of DB instances @@ -1272,6 +1291,15 @@ func (o *Options) EnsureDefaults() *Options { if o.Experimental.ReadSamplingMultiplier == 0 { o.Experimental.ReadSamplingMultiplier = 1 << 4 } + if o.Experimental.NumDeletionsThreshold == 0 { + o.Experimental.NumDeletionsThreshold = base.DefaultNumDeletionsThreshold + } + if o.Experimental.DeletionSizeRatioThreshold == 0 { + o.Experimental.DeletionSizeRatioThreshold = base.DefaultDeletionSizeRatioThreshold + } + if o.Experimental.MinTombstoneDenseBlocks == 0 { + o.Experimental.MinTombstoneDenseBlocks = 20 + } if o.Experimental.TableCacheShards <= 0 { o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0) } @@ -1399,6 +1427,9 @@ func (o *Options) String() string { } fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate) fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier) + fmt.Fprintf(&buf, " num_deletions_threshold=%d\n", o.Experimental.NumDeletionsThreshold) + fmt.Fprintf(&buf, " deletion_size_ratio_threshold=%f\n", o.Experimental.DeletionSizeRatioThreshold) + fmt.Fprintf(&buf, " min_tombstone_dense_blocks=%d\n", o.Experimental.MinTombstoneDenseBlocks) // We no longer care about strict_wal_tail, but set it to true in case an // older version reads the options. fmt.Fprintf(&buf, " strict_wal_tail=%t\n", true) @@ -1715,6 +1746,14 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64) case "read_sampling_multiplier": o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64) + case "num_deletions_threshold": + o.Experimental.NumDeletionsThreshold, err = strconv.Atoi(value) + case "deletion_size_ratio_threshold": + val, parseErr := strconv.ParseFloat(value, 32) + o.Experimental.DeletionSizeRatioThreshold = float32(val) + err = parseErr + case "min_tombstone_dense_blocks": + o.Experimental.MinTombstoneDenseBlocks, err = strconv.Atoi(value) case "table_cache_shards": o.Experimental.TableCacheShards, err = strconv.Atoi(value) case "table_format": @@ -1989,6 +2028,8 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab writerOpts.FilterType = levelOpts.FilterType writerOpts.IndexBlockSize = levelOpts.IndexBlockSize writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses + writerOpts.NumDeletionsThreshold = o.Experimental.NumDeletionsThreshold + writerOpts.DeletionSizeRatioThreshold = o.Experimental.DeletionSizeRatioThreshold return writerOpts } diff --git a/options_test.go b/options_test.go index 2476340d79..3f4878119e 100644 --- a/options_test.go +++ b/options_test.go @@ -101,6 +101,9 @@ func TestOptionsString(t *testing.T) { multilevel_compaction_heuristic=wamp(0.00, false) read_compaction_rate=16000 read_sampling_multiplier=16 + num_deletions_threshold=100 + deletion_size_ratio_threshold=0.500000 + min_tombstone_dense_blocks=20 strict_wal_tail=true table_cache_shards=8 validate_on_ingest=false @@ -285,6 +288,9 @@ func TestOptionsParse(t *testing.T) { } opts.Experimental.ReadCompactionRate = 300 opts.Experimental.ReadSamplingMultiplier = 400 + opts.Experimental.NumDeletionsThreshold = 500 + opts.Experimental.DeletionSizeRatioThreshold = 0.7 + opts.Experimental.MinTombstoneDenseBlocks = 4 opts.Experimental.TableCacheShards = 500 opts.Experimental.MaxWriterConcurrency = 1 opts.Experimental.ForceWriterParallelism = true diff --git 
a/replay/replay.go b/replay/replay.go index 6a1aef3204..35f7ce6b5c 100644 --- a/replay/replay.go +++ b/replay/replay.go @@ -97,15 +97,16 @@ func (pra PaceByFixedReadAmp) pace(r *Runner, _ workloadStep) time.Duration { // Metrics holds the various statistics on a replay run and its performance. type Metrics struct { CompactionCounts struct { - Total int64 - Default int64 - DeleteOnly int64 - ElisionOnly int64 - Move int64 - Read int64 - Rewrite int64 - Copy int64 - MultiLevel int64 + Total int64 + Default int64 + DeleteOnly int64 + ElisionOnly int64 + Move int64 + Read int64 + TombstoneDensity int64 + Rewrite int64 + Copy int64 + MultiLevel int64 } EstimatedDebt SampledMetric Final *pebble.Metrics @@ -556,6 +557,7 @@ func (r *Runner) Wait() (Metrics, error) { m.CompactionCounts.ElisionOnly = pm.Compact.ElisionOnlyCount m.CompactionCounts.Move = pm.Compact.MoveCount m.CompactionCounts.Read = pm.Compact.ReadCount + m.CompactionCounts.TombstoneDensity = pm.Compact.TombstoneDensityCount m.CompactionCounts.Rewrite = pm.Compact.RewriteCount m.CompactionCounts.Copy = pm.Compact.CopyCount m.CompactionCounts.MultiLevel = pm.Compact.MultiLevelCount diff --git a/sstable/options.go b/sstable/options.go index 37c6e19daa..e478be38ca 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -288,6 +288,13 @@ type WriterOptions struct { // internal options can only be used from within the pebble package. internal sstableinternal.WriterOptions + + // NumDeletionsThreshold mirrors Options.Experimental.NumDeletionsThreshold. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold mirrors + // Options.Experimental.DeletionSizeRatioThreshold. + DeletionSizeRatioThreshold float32 } // SetInternal sets the internal writer options. Note that even though this @@ -330,5 +337,11 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.TableFormat == TableFormatUnspecified { o.TableFormat = TableFormatMinSupported } + if o.NumDeletionsThreshold == 0 { + o.NumDeletionsThreshold = base.DefaultNumDeletionsThreshold + } + if o.DeletionSizeRatioThreshold == 0 { + o.DeletionSizeRatioThreshold = base.DefaultDeletionSizeRatioThreshold + } return o } diff --git a/sstable/properties.go b/sstable/properties.go index 8b597d11c6..bb6cffa9e6 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -100,6 +100,8 @@ type CommonProperties struct { NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"` // Total size of value blocks and value index block. Only serialized if > 0. ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"` + // The number of tombstone-dense data blocks in this table. + NumTombstoneDenseBlocks uint64 `prop:"pebble.num.tombstone-dense-blocks"` // The compression algorithm used to compress blocks. CompressionName string `prop:"rocksdb.compression"` // The compression options used to compress blocks. 
@@ -401,6 +403,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) { if p.ValueBlocksSize > 0 { p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize) } + if p.NumTombstoneDenseBlocks > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumTombstoneDenseBlocks), p.NumTombstoneDenseBlocks) + } if tblFormat < TableFormatPebblev1 { m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32) diff --git a/sstable/reader_virtual.go b/sstable/reader_virtual.go index 911bef4321..9cbe690913 100644 --- a/sstable/reader_virtual.go +++ b/sstable/reader_virtual.go @@ -76,6 +76,7 @@ func MakeVirtualReader(reader *Reader, p VirtualReaderParams) VirtualReader { v.Properties.NumDeletions = scale(reader.Properties.NumDeletions) v.Properties.NumRangeDeletions = scale(reader.Properties.NumRangeDeletions) v.Properties.NumRangeKeyDels = scale(reader.Properties.NumRangeKeyDels) + v.Properties.NumTombstoneDenseBlocks = scale(reader.Properties.NumTombstoneDenseBlocks) // Note that we rely on NumRangeKeySets for correctness. If the sstable may // contain range keys, then NumRangeKeySets must be > 0. ceilDiv works because diff --git a/sstable/writer.go b/sstable/writer.go index 2fe0544454..0ed3e20166 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -213,6 +213,9 @@ type Writer struct { valueBlockWriter *valueBlockWriter allocatorSizeClasses []int + + numDeletionsThreshold int + deletionSizeRatioThreshold float32 } type pointKeyInfo struct { @@ -599,6 +602,15 @@ type dataBlockBuf struct { // sepScratch is reusable scratch space for computing separator keys. sepScratch []byte + + // numDeletions stores the count of point tombstones in this data block. + // It's used to determine if this data block is considered tombstone-dense + // for the purposes of compaction. + numDeletions int + // deletionSize stores the raw size of point tombstones in this data block. + // It's used to determine if this data block is considered tombstone-dense + // for the purposes of compaction. + deletionSize int } func (d *dataBlockBuf) clear() { @@ -1003,7 +1015,9 @@ func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) err switch key.Kind() { case InternalKeyKindDelete, InternalKeyKindSingleDelete: w.props.NumDeletions++ + w.dataBlockBuf.numDeletions++ w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlockBuf.deletionSize += len(key.UserKey) case InternalKeyKindDeleteSized: var size uint64 if len(value) > 0 { @@ -1017,7 +1031,9 @@ func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) err } w.props.NumDeletions++ w.props.NumSizedDeletions++ + w.dataBlockBuf.numDeletions++ w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlockBuf.deletionSize += len(key.UserKey) w.props.RawPointTombstoneValueSize += size case InternalKeyKindMerge: w.props.NumMergeOperands++ @@ -1317,6 +1333,20 @@ func (w *Writer) maybeAddToFilter(key []byte) { } } +// incrementTombstoneDenseBlocks increments the number of tombstone dense +// blocks if the number of deletions in the data block exceeds a threshold or +// the deletion size exceeds a threshold. It should be called after the +// data block has been finished. +// Invariant: w.dataBlockBuf.uncompressed must already be populated. 
+func (w *Writer) incrementTombstoneDenseBlocks() { + minSize := w.deletionSizeRatioThreshold * float32(len(w.dataBlockBuf.uncompressed)) + if w.dataBlockBuf.numDeletions > w.numDeletionsThreshold || float32(w.dataBlockBuf.deletionSize) > minSize { + w.props.NumTombstoneDenseBlocks++ + } + w.dataBlockBuf.numDeletions = 0 + w.dataBlockBuf.deletionSize = 0 +} + func (w *Writer) flush(key InternalKey) error { // We're finishing a data block. err := w.finishDataBlockProps(w.dataBlockBuf) @@ -1324,6 +1354,7 @@ func (w *Writer) flush(key InternalKey) error { return err } w.dataBlockBuf.finish() + w.incrementTombstoneDenseBlocks() w.dataBlockBuf.compressAndChecksum(w.compression) // Since dataBlockEstimates.addInflightDataBlock was never called, the // inflightSize is set to 0. @@ -1897,7 +1928,9 @@ func (w *Writer) Close() (err error) { // Finish the last data block, or force an empty data block if there // aren't any data blocks at all. if w.dataBlockBuf.dataBlock.EntryCount() > 0 || w.indexBlock.block.EntryCount() == 0 { - bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.dataBlock.Finish(), &w.dataBlockBuf.blockBuf) + w.dataBlockBuf.finish() + w.incrementTombstoneDenseBlocks() + bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.uncompressed, &w.dataBlockBuf.blockBuf) if err != nil { return err } @@ -2119,7 +2152,9 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write Cmp: o.Comparer.Compare, Format: o.Comparer.FormatKey, }, - allocatorSizeClasses: o.AllocatorSizeClasses, + allocatorSizeClasses: o.AllocatorSizeClasses, + numDeletionsThreshold: o.NumDeletionsThreshold, + deletionSizeRatioThreshold: o.DeletionSizeRatioThreshold, } if w.tableFormat >= TableFormatPebblev3 { w.shortAttributeExtractor = o.ShortAttributeExtractor diff --git a/table_stats.go b/table_stats.go index bc1af0bd0b..6b034fe63d 100644 --- a/table_stats.go +++ b/table_stats.go @@ -132,6 +132,7 @@ func (d *DB) collectTableStats() bool { maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0 c.fileMetadata.StatsMarkValid() } + d.mu.tableStats.cond.Broadcast() d.maybeCollectTableStatsLocked() if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions { @@ -307,6 +308,11 @@ func (d *DB) loadTableStats( props := r.CommonProperties() stats.NumEntries = props.NumEntries stats.NumDeletions = props.NumDeletions + stats.NumRangeKeySets = props.NumRangeKeySets + stats.ValueBlocksSize = props.ValueBlocksSize + stats.CompressionType = sstable.CompressionFromString(props.CompressionName) + stats.NumTombstoneDenseBlocks = props.NumTombstoneDenseBlocks + if props.NumPointDeletions() > 0 { if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil { return @@ -319,12 +325,6 @@ func (d *DB) loadTableStats( return } } - // TODO(travers): Once we have real-world data, consider collecting - // additional stats that may provide improved heuristics for compaction - // picking. 
- stats.NumRangeKeySets = props.NumRangeKeySets - stats.ValueBlocksSize = props.ValueBlocksSize - stats.CompressionType = sstable.CompressionFromString(props.CompressionName) return }) if err != nil { @@ -680,6 +680,7 @@ func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) b meta.Stats.RangeDeletionsBytesEstimate = 0 meta.Stats.ValueBlocksSize = props.ValueBlocksSize meta.Stats.CompressionType = sstable.CompressionFromString(props.CompressionName) + meta.Stats.NumTombstoneDenseBlocks = props.NumTombstoneDenseBlocks meta.StatsMarkValid() return true } diff --git a/testdata/event_listener b/testdata/event_listener index 34b61403a0..58687697d0 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -216,7 +216,7 @@ total | 3 1.7KB 0B 0 | - | 671B | 1 590B | 0 0 WAL: 1 files (0B) in: 48B written: 81B (69% overhead) Flushes: 3 Compactions: 1 estimated debt: 1.7KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -317,7 +317,7 @@ total | 6 3.5KB 0B 0 | - | 1.8KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 82B written: 108B (32% overhead) Flushes: 6 Compactions: 1 estimated debt: 3.5KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (512KB) zombie: 1 (512KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/testdata/ingest b/testdata/ingest index 354e50e978..b4f6eedf21 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -46,7 +46,7 @@ total | 1 569B 0B 0 | - | 569B | 1 569B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -54,7 +54,7 @@ Virtual tables: 0 (0B) Local tables size: 569B Compression types: snappy: 1 Block cache: 6 entries (945B) hit rate: 30.8% -Table cache: 1 entries (728B) hit rate: 50.0% +Table cache: 1 entries (736B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/metrics b/testdata/metrics index d583567c6b..2877bbf151 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -15,7 +15,7 @@ total | 2.8K 2.7KB 0B 2.8K | - | 2.8KB | 2.9K 2.8KB | 2.9K 2.8K WAL: 22 files (24B) in: 25B written: 26B (4% overhead) Flushes: 8 Compactions: 5 estimated debt: 6B in progress: 2 (7B) - default: 27 delete: 28 elision: 29 move: 30 read: 31 rewrite: 32 copy: 33 multi-level: 34 + default: 27 delete: 28 elision: 29 move: 30 read: 31 tombstone-density: 16 rewrite: 32 copy: 33 multi-level: 34 MemTables: 12 (11B) zombie: 14 (13B) Zombie tables: 16 (15B, local: 30B) Backing tables: 1 (2.0MB) @@ -67,7 +67,7 @@ total | 1 589B 0B 0 | - | 28B | 0 0B | 0 0 WAL: 1 files (0B) in: 17B written: 28B (65% overhead) Flushes: 1 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 
read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -75,7 +75,7 @@ Virtual tables: 0 (0B) Local tables size: 589B Compression types: snappy: 1 Block cache: 3 entries (484B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 0.0% +Table cache: 1 entries (736B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -84,7 +84,7 @@ Ingestions: 0 as flushable: 0 (0B in 0 tables) disk-usage ---- -1.9KB +2.0KB batch set b 2 @@ -123,7 +123,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.2KB, local: 1.2KB) Backing tables: 0 (0B) @@ -142,7 +142,7 @@ Iter category stats: disk-usage ---- -3.2KB +3.3KB # Closing iter a will release one of the zombie memtables. @@ -166,7 +166,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.2KB, local: 1.2KB) Backing tables: 0 (0B) @@ -206,7 +206,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 1 (589B, local: 589B) Backing tables: 0 (0B) @@ -214,7 +214,7 @@ Virtual tables: 0 (0B) Local tables size: 595B Compression types: snappy: 1 Block cache: 3 entries (484B) hit rate: 33.3% -Table cache: 1 entries (728B) hit rate: 66.7% +Table cache: 1 entries (736B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -226,7 +226,7 @@ Iter category stats: disk-usage ---- -2.6KB +2.7KB # Closing iter b will release the last zombie sstable and the last zombie memtable. 
@@ -250,7 +250,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -271,7 +271,7 @@ Iter category stats: disk-usage ---- -2.0KB +2.1KB additional-metrics ---- @@ -321,7 +321,7 @@ total | 4 2.6KB 38B 0 | - | 149B | 0 0B | 0 0 WAL: 1 files (0B) in: 116B written: 149B (28% overhead) Flushes: 3 Compactions: 1 estimated debt: 2.6KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -376,7 +376,7 @@ total | 3 2.0KB 41B 0 | - | 149B | 0 0B | 0 0 WAL: 1 files (0B) in: 116B written: 149B (28% overhead) Flushes: 3 Compactions: 2 estimated debt: 0B in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -480,7 +480,7 @@ total | 7 4.3KB 41B 0 | - | 1.9KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 176B written: 187B (6% overhead) Flushes: 8 Compactions: 2 estimated debt: 4.3KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -488,7 +488,7 @@ Virtual tables: 0 (0B) Local tables size: 4.3KB Compression types: snappy: 7 Block cache: 12 entries (1.9KB) hit rate: 9.1% -Table cache: 1 entries (728B) hit rate: 53.8% +Table cache: 1 entries (736B) hit rate: 53.8% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -543,7 +543,7 @@ total | 10 6.1KB 41B 0 | - | 2.0KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 2 estimated debt: 6.1KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -551,7 +551,7 @@ Virtual tables: 0 (0B) Local tables size: 6.1KB Compression types: snappy: 10 Block cache: 12 entries (1.9KB) hit rate: 9.1% -Table cache: 1 entries (728B) hit rate: 53.8% +Table cache: 1 entries (736B) hit rate: 53.8% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -620,7 +620,7 @@ total | 11 5.6KB 41B 2 | - | 2.5KB | 4 2.3KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 2 estimated debt: 5.6KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) 
Zombie tables: 0 (0B, local: 0B) Backing tables: 2 (1.2KB) @@ -722,7 +722,7 @@ total | 6 3.8KB 41B 0 | - | 3.1KB | 5 2.9KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 3 estimated debt: 0B in progress: 0 (0B) - default: 3 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 3 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -776,7 +776,7 @@ total | 1 604B 0B 0 | - | 38B | 0 0B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -814,7 +814,7 @@ total | 1 604B 0B 0 | - | 38B | 0 0B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -822,7 +822,7 @@ Virtual tables: 0 (0B) Local tables size: 0B Compression types: snappy: 1 Block cache: 1 entries (440B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 0.0% +Table cache: 1 entries (736B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -861,7 +861,7 @@ total | 2 1.2KB 0B 0 | - | 627B | 1 589B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 1 estimated debt: 1.2KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -869,7 +869,7 @@ Virtual tables: 0 (0B) Local tables size: 0B Compression types: snappy: 2 Block cache: 6 entries (996B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 50.0% +Table cache: 1 entries (736B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -909,7 +909,7 @@ total | 3 1.7KB 0B 0 | - | 655B | 1 589B | 0 0 WAL: 1 files (0B) in: 44B written: 66B (50% overhead) Flushes: 2 Compactions: 1 estimated debt: 1.7KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -917,7 +917,7 @@ Virtual tables: 0 (0B) Local tables size: 589B Compression types: snappy: 3 Block cache: 6 entries (996B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 50.0% +Table cache: 1 entries (736B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -948,7 +948,7 @@ total | 3 1.7KB 0B 0 | - | 0B | 0 0B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 
1.7KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -986,7 +986,7 @@ total | 1 603B 0B 0 | - | 0B | 0 0B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/version_set.go b/version_set.go index c0ec1758fb..2762aed001 100644 --- a/version_set.go +++ b/version_set.go @@ -842,6 +842,10 @@ func (vs *versionSet) incrementCompactions( vs.metrics.Compact.Count++ vs.metrics.Compact.ReadCount++ + case compactionKindTombstoneDensity: + vs.metrics.Compact.Count++ + vs.metrics.Compact.TombstoneDensityCount++ + case compactionKindRewrite: vs.metrics.Compact.Count++ vs.metrics.Compact.RewriteCount++