From 977220677a7f7cd7d91d40bcdfbb459c2166d8e4 Mon Sep 17 00:00:00 2001
From: Anish Shanbhag
Date: Tue, 9 Jul 2024 13:48:16 -0400
Subject: [PATCH] compact: add point tombstone density compaction heuristic

This change adds a heuristic to compact point tombstones based on their
density across the LSM. We add a new table property called
`NumTombstoneDenseBlocks` and a corresponding field in `TableStats` that
tracks the number of data blocks in each table which are considered
tombstone-dense. This value is calculated on the fly while tables are
being written, so no extra I/O is required later on to compute it.

A data block is considered tombstone-dense if it fulfills either of the
following criteria:
1. The block contains at least `options.Experimental.NumDeletionsThreshold`
point tombstones. The default value is `100`.
2. The ratio of the uncompressed size of point tombstones to the
uncompressed size of the block is at least
`options.Experimental.DeletionSizeRatioThreshold`. For example, with the
default value of `0.5`, a data block of size 4KB would be considered
tombstone-dense if it contains at least 2KB of point tombstones.

The intuition here is that, as described
[here](https://github.com/cockroachdb/pebble/issues/918#issuecomment-1564714073),
dense tombstone clusters are bad because they a) waste CPU when skipping
over tombstones, and b) waste I/O because we end up loading more blocks
per live key. The two criteria above are meant to tackle these two issues
respectively: the count-based threshold prevents CPU waste, and the
size-based threshold prevents I/O waste.

A table is considered eligible for the new tombstone compaction type if
it contains at least `options.Experimental.MinTombstoneDenseBlocks`
tombstone-dense data blocks. The default value is `20`. We use an
Annotator in a similar way to elision-only compactions in order to
prioritize compacting the table with the most tombstone-dense blocks when
there are multiple eligible tables. The default here was chosen through
experimentation on CockroachDB KV workloads; with a lower value we were
compacting too aggressively, leading to very high write amplification,
while higher values yielded few noticeable performance improvements.
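To make the two criteria concrete, the per-block check amounts to the
following sketch (illustrative only: the function name is made up, and the
patch actually tracks these counters incrementally in `sstable/writer.go`
as keys are added rather than calling a helper like this):

```go
// isTombstoneDense reports whether a finished data block counts toward
// NumTombstoneDenseBlocks. numDeletions and deletionSize are the count and
// total uncompressed key size of point tombstones in the block;
// uncompressedLen is the uncompressed size of the whole block.
func isTombstoneDense(numDeletions, deletionSize, uncompressedLen int) bool {
	const numDeletionsThreshold = 100      // default NumDeletionsThreshold
	const deletionSizeRatioThreshold = 0.5 // default DeletionSizeRatioThreshold
	return numDeletions >= numDeletionsThreshold ||
		float32(deletionSize) >= deletionSizeRatioThreshold*float32(uncompressedLen)
}
```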
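Table selection can similarly be pictured as the hypothetical helper below;
the real implementation reuses the `manifest.Annotator` machinery (as
elision-only compactions do) so that this maximum is maintained
incrementally rather than recomputed by scanning every table:

```go
// pickTombstoneDenseTable returns the table with the most tombstone-dense
// blocks among those meeting the MinTombstoneDenseBlocks eligibility bar,
// or nil if no table in the level qualifies.
func pickTombstoneDenseTable(tables []*fileMetadata, minBlocks uint64) *fileMetadata {
	var best *fileMetadata
	for _, t := range tables {
		if t.Stats.NumTombstoneDenseBlocks < minBlocks {
			continue // too few tombstone-dense blocks to be eligible
		}
		if best == nil || t.Stats.NumTombstoneDenseBlocks > best.Stats.NumTombstoneDenseBlocks {
			best = t
		}
	}
	return best
}
```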
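All three thresholds are exposed under `Options.Experimental`; a minimal
usage sketch with the defaults spelled out explicitly (the `"demo"` path is
an arbitrary example):

```go
package main

import "github.com/cockroachdb/pebble"

func main() {
	opts := &pebble.Options{}
	opts.Experimental.NumDeletionsThreshold = 100      // per-block tombstone count
	opts.Experimental.DeletionSizeRatioThreshold = 0.5 // per-block tombstone size ratio
	opts.Experimental.MinTombstoneDenseBlocks = 20     // per-table eligibility bar
	db, err := pebble.Open("demo", opts)
	if err != nil {
		panic(err)
	}
	defer db.Close()
}
```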
--- compaction.go | 3 + compaction_picker.go | 141 ++++++++++++++------- internal/base/options.go | 10 +- internal/manifest/version.go | 2 + internal/testkeys/testkeys_test.go | 4 + iterator_test.go | 196 ++++++++++++++++++++++++++--- metrics.go | 24 ++-- metrics_test.go | 1 + options.go | 41 ++++++ options_test.go | 6 + replay/replay.go | 20 +-- sstable/options.go | 13 ++ sstable/properties.go | 5 + sstable/reader_virtual.go | 1 + sstable/writer.go | 39 +++++- table_stats.go | 13 +- testdata/event_listener | 4 +- testdata/ingest | 4 +- testdata/metrics | 58 ++++----- version_set.go | 4 + 20 files changed, 467 insertions(+), 122 deletions(-) diff --git a/compaction.go b/compaction.go index 71186e86d0..5b8d602a9c 100644 --- a/compaction.go +++ b/compaction.go @@ -135,6 +135,7 @@ const ( compactionKindDeleteOnly compactionKindElisionOnly compactionKindRead + compactionKindTombstoneDensity compactionKindRewrite compactionKindIngestedFlushable ) @@ -153,6 +154,8 @@ func (k compactionKind) String() string { return "elision-only" case compactionKindRead: return "read" + case compactionKindTombstoneDensity: + return "tombstone-density" case compactionKindRewrite: return "rewrite" case compactionKindIngestedFlushable: diff --git a/compaction_picker.go b/compaction_picker.go index 68cb9c5511..e2993bf73b 100644 --- a/compaction_picker.go +++ b/compaction_picker.go @@ -1317,6 +1317,10 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact } } + if pc := p.pickTombstoneDensityCompaction(env); pc != nil { + return pc + } + // Check for L6 files with tombstones that may be elided. These files may // exist if a snapshot prevented the elision of a tombstone or because of // a move compaction. These are low-priority compactions because they @@ -1498,6 +1502,83 @@ func markedMergeHelper(f *fileMetadata, dst interface{}) (interface{}, bool) { return dst, true } +// TODO: replace this with the updated annotator interface when it's complete. +type tombstoneDensityAnnotator struct{} + +var _ manifest.Annotator = tombstoneDensityAnnotator{} + +func (a tombstoneDensityAnnotator) Zero(interface{}) interface{} { + return nil +} + +func (a tombstoneDensityAnnotator) Accumulate( + f *fileMetadata, dst interface{}, +) (interface{}, bool) { + if !f.StatsValid() || f.IsCompacting() { + return dst, false + } + + // TODO: once the new annotator interface is in place, the threshold below + // should be defined by the MinTombstoneDenseBlocks option. For this reason, + // the annotator shouldn't be created until the option value is known. + if f.Stats.NumTombstoneDenseBlocks > 20 { + switch { + case dst == nil: + return f, true + case f.Stats.NumTombstoneDenseBlocks > dst.(*fileMetadata).Stats.NumTombstoneDenseBlocks: + return f, true + default: + return dst, true + } + } + return dst, true +} + +func (a tombstoneDensityAnnotator) Merge(src interface{}, dst interface{}) interface{} { + switch { + case src == nil: + return dst + case dst == nil: + return src + case src.(*fileMetadata).Stats.NumTombstoneDenseBlocks > dst.(*fileMetadata).Stats.NumTombstoneDenseBlocks: + return src + default: + return dst + } +} + +// pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata +// with various checks to ensure that the file still exists in the expected level +// and isn't already being compacted. 
+func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
+	candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
+) *pickedCompaction {
+	if candidate == nil || candidate.IsCompacting() {
+		return nil
+	}
+
+	inputs := p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate)
+	if inputs.Empty() {
+		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel))
+	}
+
+	pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
+	pc.kind = kind
+	pc.startLevel.files = inputs
+	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
+
+	// Fail-safe to protect against compacting the same sstable concurrently.
+	if inputRangeAlreadyCompacting(env, pc) {
+		return nil
+	}
+
+	if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) {
+		return nil
+	}
+
+	// Preserve the L0 sublevel metadata for the inputs, as the inlined
+	// rewrite-compaction code previously did.
+	if pc.startLevel.level == 0 {
+		pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files)
+	}
+
+	return pc
+}
+
 // pickElisionOnlyCompaction looks for compactions of sstables in the
 // bottommost level containing obsolete records that may now be dropped.
 func (p *compactionPickerByScore) pickElisionOnlyCompaction(
@@ -1511,28 +1592,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
 		return nil
 	}
 	candidate := v.(*fileMetadata)
-	if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
+	if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
 		return nil
 	}
-	lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate)
-	if lf.Empty() {
-		panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
-	}
-
-	// Construct a picked compaction of the elision candidate's atomic
-	// compaction unit.
-	pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel)
-	pc.kind = compactionKindElisionOnly
-	pc.startLevel.files = lf
-	if anyTablesCompacting(lf) {
-		return nil
-	}
-	pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
-	// Fail-safe to protect against compacting the same sstable concurrently.
-	if !inputRangeAlreadyCompacting(env, pc) {
-		return pc
-	}
-	return nil
+	return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
 }
 
 // pickRewriteCompaction attempts to construct a compaction that
@@ -1548,32 +1611,26 @@ func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *
 			continue
 		}
 		candidate := v.(*fileMetadata)
-		if candidate.IsCompacting() {
-			// Try the next level.
-			continue
-		}
-		lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate)
-		if lf.Empty() {
-			panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1))
+		pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
+		if pc != nil {
+			return pc
 		}
+	}
+	return nil
+}
 
-		inputs := lf
-		if anyTablesCompacting(inputs) {
-			// Try the next level.
+// pickTombstoneDensityCompaction looks for a compaction of the table with
+// the most tombstone-dense data blocks, as tracked by the
+// NumTombstoneDenseBlocks stat and surfaced per level by
+// tombstoneDensityAnnotator.
+func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
+	env compactionEnv,
+) (pc *pickedCompaction) {
+	for l := 0; l < numLevels; l++ {
+		v := p.vers.Levels[l].Annotation(tombstoneDensityAnnotator{})
+		if v == nil {
 			continue
 		}
-
-		pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel)
-		pc.outputLevel.level = l
-		pc.kind = compactionKindRewrite
-		pc.startLevel.files = inputs
-		pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())
-
-		// Fail-safe to protect against compacting the same sstable concurrently.
- if !inputRangeAlreadyCompacting(env, pc) { - if pc.startLevel.level == 0 { - pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files) - } + candidate := v.(*fileMetadata) + pc := p.pickedCompactionFromCandidateFile(candidate, env, l, defaultOutputLevel(l, p.baseLevel), compactionKindTombstoneDensity) + if pc != nil { return pc } } diff --git a/internal/base/options.go b/internal/base/options.go index f5f127c5ca..d8c030acd5 100644 --- a/internal/base/options.go +++ b/internal/base/options.go @@ -6,10 +6,12 @@ package base // SSTable block defaults. const ( - DefaultBlockRestartInterval = 16 - DefaultBlockSize = 4096 - DefaultBlockSizeThreshold = 90 - SizeClassAwareBlockSizeThreshold = 60 + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4096 + DefaultBlockSizeThreshold = 90 + SizeClassAwareBlockSizeThreshold = 60 + DefaultNumDeletionsThreshold = 100 + DefaultDeletionSizeRatioThreshold = 0.5 ) // FilterType is the level at which to apply a filter: block or table. diff --git a/internal/manifest/version.go b/internal/manifest/version.go index a90717e836..ee77c1dc5b 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -74,6 +74,8 @@ type TableStats struct { ValueBlocksSize uint64 // CompressionType is the compression type of the table. CompressionType sstable.Compression + // NumTombstoneDenseBlocks is the number of tombstone-dense data blocks in this table. + NumTombstoneDenseBlocks uint64 } // boundType represents the type of key (point or range) present as the smallest diff --git a/internal/testkeys/testkeys_test.go b/internal/testkeys/testkeys_test.go index 2312088c89..7573eeb75f 100644 --- a/internal/testkeys/testkeys_test.go +++ b/internal/testkeys/testkeys_test.go @@ -65,6 +65,10 @@ func TestKeyCount(t *testing.T) { } testCases := map[params]int64{ {26, 1}: 26, + {26, 2}: 702, + {26, 3}: 18278, + {26, 4}: 475254, + {26, 5}: 12356630, {52, 1}: 52, {2, 2}: 6, {2, 3}: 14, diff --git a/iterator_test.go b/iterator_test.go index 651ef2081b..ee283e639b 100644 --- a/iterator_test.go +++ b/iterator_test.go @@ -15,6 +15,7 @@ import ( "sort" "strconv" "strings" + "sync/atomic" "testing" "time" @@ -30,6 +31,7 @@ import ( "github.com/cockroachdb/pebble/vfs" "github.com/stretchr/testify/require" "golang.org/x/exp/rand" + "golang.org/x/sync/errgroup" ) var testKeyValuePairs = []string{ @@ -2722,6 +2724,182 @@ func BenchmarkSeekPrefixTombstones(b *testing.B) { } } +func waitForCompactions(d *DB) { + d.mu.Lock() + // NB: Wait for table stats because some compaction types rely + // on table stats to be collected. + d.waitTableStats() + for d.mu.compact.compactingCount > 0 { + d.mu.compact.cond.Wait() + d.waitTableStats() + } + d.mu.Unlock() +} + +// BenchmarkPointDeletedSwath benchmarks iterator operations on large-ish +// (hundreds of MBs) databases containing broad swaths of keys removed by point +// tombstones. 
+func BenchmarkPointDeletedSwath(b *testing.B) {
+	const maxKeyLen = 5
+	ks := testkeys.Alpha(maxKeyLen)
+
+	opts := func() *Options {
+		return (&Options{
+			DisableWAL:         true,
+			FS:                 vfs.NewMem(),
+			Comparer:           testkeys.Comparer,
+			FormatMajorVersion: FormatNewest,
+		}).EnsureDefaults()
+	}
+	type iteratorOp struct {
+		name string
+		fn   func(*DB, testkeys.Keyspace, *rand.Rand) error
+	}
+	var iterKeyBuf [maxKeyLen]byte
+
+	iterOps := []iteratorOp{
+		{
+			name: "prefix point lookup", fn: func(d *DB, ks testkeys.Keyspace, rng *rand.Rand) error {
+				n := testkeys.WriteKey(iterKeyBuf[:], ks, int64(rng.Intn(int(ks.Count()))))
+				iter, _ := d.NewIter(nil)
+				_ = iter.SeekPrefixGE(iterKeyBuf[:n])
+				return iter.Close()
+			},
+		},
+		{
+			name: "non-prefix point seek", fn: func(d *DB, ks testkeys.Keyspace, rng *rand.Rand) error {
+				n := testkeys.WriteKey(iterKeyBuf[:], ks, int64(rng.Intn(int(ks.Count()))))
+				iter, _ := d.NewIter(nil)
+				_ = iter.SeekGE(iterKeyBuf[:n])
+				return iter.Close()
+			},
+		},
+		{
+			name: "full scan", fn: func(d *DB, ks testkeys.Keyspace, rng *rand.Rand) error {
+				iter, _ := d.NewIter(nil)
+				for valid := iter.First(); valid; valid = iter.Next() {
+				}
+				return iter.Close()
+			},
+		},
+	}
+
+	// Populate an initial database with point keys at every key in the `ks`
+	// keyspace.
+	populated := withStateSetup(b, vfs.NewMem(), opts(), populateKeyspaceSetup(ks))
+	for _, gapLength := range []int{100, 1_000, 10_000, 100_000, 200_000, 400_000, 5_000_000, 10_000_000} {
+		b.Run(fmt.Sprintf("gap=%d", gapLength), func(b *testing.B) {
+			// Extend the `populated` initial database with DELs deleting all
+			// the middle keys in the keyspace in a contiguous swath of
+			// `gapLength` keys.
+			gapDeleted := withStateSetup(b, populated, opts(), deleteGapSetup(ks, gapLength))
+
+			for _, op := range iterOps {
+				b.Run(op.name, func(b *testing.B) {
+					// Run each instance of the test in a fresh DB constructed
+					// from `gapDeleted`. This ensures background compactions
+					// from one iterator operation don't affect another
+					// iterator operation.
+					withStateSetup(b, gapDeleted, opts(), func(_ testing.TB, d *DB) {
+						rng := rand.New(rand.NewSource(1 /* fixed seed */))
+						b.ResetTimer()
+						for i := 0; i < b.N; i++ {
+							if err := op.fn(d, ks, rng); err != nil {
+								b.Fatal(err)
+							}
+						}
+						b.StopTimer()
+					})
+				})
+			}
+		})
+	}
+}
+
+func withStateSetup(
+	t testing.TB, initial vfs.FS, opts *Options, setup func(testing.TB, *DB),
+) vfs.FS {
+	ok, err := vfs.Clone(initial, opts.FS, "", "", vfs.CloneSync)
+	require.NoError(t, err)
+	require.True(t, ok)
+	d, err := Open("", opts)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, d.Close()) }()
+	setup(t, d)
+	return opts.FS
+}
+
+func populateKeyspaceSetup(ks testkeys.Keyspace) func(testing.TB, *DB) {
+	const valSize = 256
+	return func(t testing.TB, d *DB) {
+		t.Logf("Populating keyspace with %d keys, each with %d-byte values", ks.Count(), valSize)
+		// Parallelize population by divvying up the keyspace.
+ var grp errgroup.Group + loadKeyspaces := testkeys.Divvy(ks, 20) + var progress atomic.Uint64 + for l := 0; l < len(loadKeyspaces); l++ { + l := l + grp.Go(func() error { + rng := rand.New(rand.NewSource(1)) + batch := d.NewBatch() + key := make([]byte, ks.MaxLen()) + var val [valSize]byte + for i := int64(0); i < loadKeyspaces[l].Count(); i++ { + rng.Read(val[:]) + n := testkeys.WriteKey(key[:], loadKeyspaces[l], i) + if err := batch.Set(key[:n], val[:], nil); err != nil { + return err + } + if batch.Len() >= 10<<10 /* 10 kib */ { + count := batch.Count() + require.NoError(t, batch.Commit(NoSync)) + if newTotal := progress.Add(uint64(count)); (newTotal / (uint64(ks.Count()) / 100)) != (newTotal-uint64(count))/uint64(ks.Count()/100) { + t.Logf("%.1f%% populated", 100.0*(float64(newTotal)/float64(ks.Count()))) + } + batch = d.NewBatch() + d.AsyncFlush() + } + } + if !batch.Empty() { + return batch.Commit(NoSync) + } + return nil + }) + } + require.NoError(t, grp.Wait()) + } +} + +func deleteGapSetup(ks testkeys.Keyspace, gapLength int) func(testing.TB, *DB) { + return func(t testing.TB, d *DB) { + midpoint := ks.Count() / 2 + gapStart := midpoint - int64(gapLength/2) + gapEnd := midpoint + int64(gapLength/2+(gapLength%2)) + + batch := d.NewBatch() + key := make([]byte, ks.MaxLen()) + for i := gapStart; i <= gapEnd; i++ { + n := testkeys.WriteKey(key[:], ks, i) + if err := batch.Delete(key[:n], nil); err != nil { + t.Fatal(err) + } + if batch.Len() >= 10<<10 /* 10 kib */ { + if err := batch.Commit(NoSync); err != nil { + t.Fatal(err) + } + batch = d.NewBatch() + } + } + if err := batch.Commit(NoSync); err != nil { + t.Fatal(err) + } + if err := d.Flush(); err != nil { + t.Fatal(err) + } + waitForCompactions(d) + } +} + func runBenchmarkQueueWorkload(b *testing.B, deleteRatio float32, initOps int, valueSize int) { const queueCount = 8 // These should be large enough to assign a unique key to each item in the @@ -2802,19 +2980,7 @@ func runBenchmarkQueueWorkload(b *testing.B, deleteRatio float32, initOps int, v _, err = d.AsyncFlush() require.NoError(b, err) - waitForCompactions := func() { - d.mu.Lock() - // NB: Wait for table stats because some compaction types rely - // on table stats to be collected. - d.waitTableStats() - for d.mu.compact.compactingCount > 0 { - d.mu.compact.cond.Wait() - d.waitTableStats() - } - d.mu.Unlock() - } - - waitForCompactions() + waitForCompactions(d) // Log the number of tombstones and live keys in each level after // background compactions are complete. @@ -2863,13 +3029,13 @@ func runBenchmarkQueueWorkload(b *testing.B, deleteRatio float32, initOps int, v // for more information. func BenchmarkQueueWorkload(b *testing.B) { // The portion of processing ops that will be deletes for each subbenchmark. - var deleteRatios = []float32{0.1, 0.3, 0.5} + var deleteRatios = []float32{0.1} // The number of times queues will be processed before running each // subbenchmark. var initOps = []int{400_000, 800_000, 1_200_000, 2_000_000, 3_500_000, 5_000_000, 7_500_000, 10_000_000, 50_000_000} // We vary the value size to identify how compaction behaves when the // relative sizes of tombstones and the keys they delete are different. 
- var valueSizes = []int{128, 2048} + var valueSizes = []int{2048} for _, deleteRatio := range deleteRatios { for _, valueSize := range valueSizes { diff --git a/metrics.go b/metrics.go index 6d2d60ce1c..05678f42b2 100644 --- a/metrics.go +++ b/metrics.go @@ -153,16 +153,17 @@ type Metrics struct { Compact struct { // The total number of compactions, and per-compaction type counts. - Count int64 - DefaultCount int64 - DeleteOnlyCount int64 - ElisionOnlyCount int64 - CopyCount int64 - MoveCount int64 - ReadCount int64 - RewriteCount int64 - MultiLevelCount int64 - CounterLevelCount int64 + Count int64 + DefaultCount int64 + DeleteOnlyCount int64 + ElisionOnlyCount int64 + CopyCount int64 + MoveCount int64 + ReadCount int64 + TombstoneDensityCount int64 + RewriteCount int64 + MultiLevelCount int64 + CounterLevelCount int64 // An estimate of the number of bytes that need to be compacted for the LSM // to reach a stable state. EstimatedDebt uint64 @@ -580,12 +581,13 @@ func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) { redact.Safe(m.Compact.NumInProgress), humanize.Bytes.Int64(m.Compact.InProgressBytes)) - w.Printf(" default: %d delete: %d elision: %d move: %d read: %d rewrite: %d copy: %d multi-level: %d\n", + w.Printf(" default: %d delete: %d elision: %d move: %d read: %d tombstone-density: %d rewrite: %d copy: %d multi-level: %d\n", redact.Safe(m.Compact.DefaultCount), redact.Safe(m.Compact.DeleteOnlyCount), redact.Safe(m.Compact.ElisionOnlyCount), redact.Safe(m.Compact.MoveCount), redact.Safe(m.Compact.ReadCount), + redact.Safe(m.Compact.TombstoneDensityCount), redact.Safe(m.Compact.RewriteCount), redact.Safe(m.Compact.CopyCount), redact.Safe(m.Compact.MultiLevelCount)) diff --git a/metrics_test.go b/metrics_test.go index 9b1d3a17a4..4e34afd906 100644 --- a/metrics_test.go +++ b/metrics_test.go @@ -39,6 +39,7 @@ func exampleMetrics() Metrics { m.Compact.ElisionOnlyCount = 29 m.Compact.MoveCount = 30 m.Compact.ReadCount = 31 + m.Compact.TombstoneDensityCount = 16 m.Compact.RewriteCount = 32 m.Compact.CopyCount = 33 m.Compact.MultiLevelCount = 34 diff --git a/options.go b/options.go index a7542755a3..c278cfb973 100644 --- a/options.go +++ b/options.go @@ -603,6 +603,25 @@ type Options struct { // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). ReadSamplingMultiplier int64 + // NumDeletionsThreshold defines the minimum number of point tombstones + // that must be present in a single data block for that block to be + // considered tombstone-dense for the purposes of triggering a + // tombstone density compaction. The default value is 100. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold defines the minimum ratio of the size of + // point tombstones to the size of the data block that must be reached + // for that block to be considered tombstone-dense for the purposes of + // triggering a tombstone density compaction. The default value is 0.5. + DeletionSizeRatioThreshold float32 + + // MinTombstoneDenseBlocks is the minimum number of tombstone-dense + // data blocks that must be present in a single table for it to be + // eligible for a tombstone density compaction. The default value is 20. + // Tables with a higher number of tombstone-dense blocks are still + // prioritized for compaction. + MinTombstoneDenseBlocks int + // TableCacheShards is the number of shards per table cache. 
// Reducing the value can reduce the number of idle goroutines per DB // instance which can be useful in scenarios with a lot of DB instances @@ -1272,6 +1291,15 @@ func (o *Options) EnsureDefaults() *Options { if o.Experimental.ReadSamplingMultiplier == 0 { o.Experimental.ReadSamplingMultiplier = 1 << 4 } + if o.Experimental.NumDeletionsThreshold == 0 { + o.Experimental.NumDeletionsThreshold = base.DefaultNumDeletionsThreshold + } + if o.Experimental.DeletionSizeRatioThreshold == 0 { + o.Experimental.DeletionSizeRatioThreshold = base.DefaultDeletionSizeRatioThreshold + } + if o.Experimental.MinTombstoneDenseBlocks == 0 { + o.Experimental.MinTombstoneDenseBlocks = 20 + } if o.Experimental.TableCacheShards <= 0 { o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0) } @@ -1399,6 +1427,9 @@ func (o *Options) String() string { } fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate) fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier) + fmt.Fprintf(&buf, " num_deletions_threshold=%d\n", o.Experimental.NumDeletionsThreshold) + fmt.Fprintf(&buf, " deletion_size_ratio_threshold=%f\n", o.Experimental.DeletionSizeRatioThreshold) + fmt.Fprintf(&buf, " min_tombstone_dense_blocks=%d\n", o.Experimental.MinTombstoneDenseBlocks) // We no longer care about strict_wal_tail, but set it to true in case an // older version reads the options. fmt.Fprintf(&buf, " strict_wal_tail=%t\n", true) @@ -1715,6 +1746,14 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64) case "read_sampling_multiplier": o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64) + case "num_deletions_threshold": + o.Experimental.NumDeletionsThreshold, err = strconv.Atoi(value) + case "deletion_size_ratio_threshold": + val, parseErr := strconv.ParseFloat(value, 32) + o.Experimental.DeletionSizeRatioThreshold = float32(val) + err = parseErr + case "min_tombstone_dense_blocks": + o.Experimental.MinTombstoneDenseBlocks, err = strconv.Atoi(value) case "table_cache_shards": o.Experimental.TableCacheShards, err = strconv.Atoi(value) case "table_format": @@ -1989,6 +2028,8 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab writerOpts.FilterType = levelOpts.FilterType writerOpts.IndexBlockSize = levelOpts.IndexBlockSize writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses + writerOpts.NumDeletionsThreshold = o.Experimental.NumDeletionsThreshold + writerOpts.DeletionSizeRatioThreshold = o.Experimental.DeletionSizeRatioThreshold return writerOpts } diff --git a/options_test.go b/options_test.go index 2476340d79..3f4878119e 100644 --- a/options_test.go +++ b/options_test.go @@ -101,6 +101,9 @@ func TestOptionsString(t *testing.T) { multilevel_compaction_heuristic=wamp(0.00, false) read_compaction_rate=16000 read_sampling_multiplier=16 + num_deletions_threshold=100 + deletion_size_ratio_threshold=0.500000 + min_tombstone_dense_blocks=20 strict_wal_tail=true table_cache_shards=8 validate_on_ingest=false @@ -285,6 +288,9 @@ func TestOptionsParse(t *testing.T) { } opts.Experimental.ReadCompactionRate = 300 opts.Experimental.ReadSamplingMultiplier = 400 + opts.Experimental.NumDeletionsThreshold = 500 + opts.Experimental.DeletionSizeRatioThreshold = 0.7 + opts.Experimental.MinTombstoneDenseBlocks = 4 opts.Experimental.TableCacheShards = 500 opts.Experimental.MaxWriterConcurrency = 1 opts.Experimental.ForceWriterParallelism = true diff --git 
a/replay/replay.go b/replay/replay.go index 6a1aef3204..35f7ce6b5c 100644 --- a/replay/replay.go +++ b/replay/replay.go @@ -97,15 +97,16 @@ func (pra PaceByFixedReadAmp) pace(r *Runner, _ workloadStep) time.Duration { // Metrics holds the various statistics on a replay run and its performance. type Metrics struct { CompactionCounts struct { - Total int64 - Default int64 - DeleteOnly int64 - ElisionOnly int64 - Move int64 - Read int64 - Rewrite int64 - Copy int64 - MultiLevel int64 + Total int64 + Default int64 + DeleteOnly int64 + ElisionOnly int64 + Move int64 + Read int64 + TombstoneDensity int64 + Rewrite int64 + Copy int64 + MultiLevel int64 } EstimatedDebt SampledMetric Final *pebble.Metrics @@ -556,6 +557,7 @@ func (r *Runner) Wait() (Metrics, error) { m.CompactionCounts.ElisionOnly = pm.Compact.ElisionOnlyCount m.CompactionCounts.Move = pm.Compact.MoveCount m.CompactionCounts.Read = pm.Compact.ReadCount + m.CompactionCounts.TombstoneDensity = pm.Compact.TombstoneDensityCount m.CompactionCounts.Rewrite = pm.Compact.RewriteCount m.CompactionCounts.Copy = pm.Compact.CopyCount m.CompactionCounts.MultiLevel = pm.Compact.MultiLevelCount diff --git a/sstable/options.go b/sstable/options.go index 37c6e19daa..e478be38ca 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -288,6 +288,13 @@ type WriterOptions struct { // internal options can only be used from within the pebble package. internal sstableinternal.WriterOptions + + // NumDeletionsThreshold mirrors Options.Experimental.NumDeletionsThreshold. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold mirrors + // Options.Experimental.DeletionSizeRatioThreshold. + DeletionSizeRatioThreshold float32 } // SetInternal sets the internal writer options. Note that even though this @@ -330,5 +337,11 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.TableFormat == TableFormatUnspecified { o.TableFormat = TableFormatMinSupported } + if o.NumDeletionsThreshold == 0 { + o.NumDeletionsThreshold = base.DefaultNumDeletionsThreshold + } + if o.DeletionSizeRatioThreshold == 0 { + o.DeletionSizeRatioThreshold = base.DefaultDeletionSizeRatioThreshold + } return o } diff --git a/sstable/properties.go b/sstable/properties.go index 8b597d11c6..bb6cffa9e6 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -100,6 +100,8 @@ type CommonProperties struct { NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"` // Total size of value blocks and value index block. Only serialized if > 0. ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"` + // The number of tombstone-dense data blocks in this table. + NumTombstoneDenseBlocks uint64 `prop:"pebble.num.tombstone-dense-blocks"` // The compression algorithm used to compress blocks. CompressionName string `prop:"rocksdb.compression"` // The compression options used to compress blocks. 
@@ -401,6 +403,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) { if p.ValueBlocksSize > 0 { p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize) } + if p.NumTombstoneDenseBlocks > 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumTombstoneDenseBlocks), p.NumTombstoneDenseBlocks) + } if tblFormat < TableFormatPebblev1 { m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32) diff --git a/sstable/reader_virtual.go b/sstable/reader_virtual.go index 911bef4321..9cbe690913 100644 --- a/sstable/reader_virtual.go +++ b/sstable/reader_virtual.go @@ -76,6 +76,7 @@ func MakeVirtualReader(reader *Reader, p VirtualReaderParams) VirtualReader { v.Properties.NumDeletions = scale(reader.Properties.NumDeletions) v.Properties.NumRangeDeletions = scale(reader.Properties.NumRangeDeletions) v.Properties.NumRangeKeyDels = scale(reader.Properties.NumRangeKeyDels) + v.Properties.NumTombstoneDenseBlocks = scale(reader.Properties.NumTombstoneDenseBlocks) // Note that we rely on NumRangeKeySets for correctness. If the sstable may // contain range keys, then NumRangeKeySets must be > 0. ceilDiv works because diff --git a/sstable/writer.go b/sstable/writer.go index 2fe0544454..0ed3e20166 100644 --- a/sstable/writer.go +++ b/sstable/writer.go @@ -213,6 +213,9 @@ type Writer struct { valueBlockWriter *valueBlockWriter allocatorSizeClasses []int + + numDeletionsThreshold int + deletionSizeRatioThreshold float32 } type pointKeyInfo struct { @@ -599,6 +602,15 @@ type dataBlockBuf struct { // sepScratch is reusable scratch space for computing separator keys. sepScratch []byte + + // numDeletions stores the count of point tombstones in this data block. + // It's used to determine if this data block is considered tombstone-dense + // for the purposes of compaction. + numDeletions int + // deletionSize stores the raw size of point tombstones in this data block. + // It's used to determine if this data block is considered tombstone-dense + // for the purposes of compaction. + deletionSize int } func (d *dataBlockBuf) clear() { @@ -1003,7 +1015,9 @@ func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) err switch key.Kind() { case InternalKeyKindDelete, InternalKeyKindSingleDelete: w.props.NumDeletions++ + w.dataBlockBuf.numDeletions++ w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlockBuf.deletionSize += len(key.UserKey) case InternalKeyKindDeleteSized: var size uint64 if len(value) > 0 { @@ -1017,7 +1031,9 @@ func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) err } w.props.NumDeletions++ w.props.NumSizedDeletions++ + w.dataBlockBuf.numDeletions++ w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlockBuf.deletionSize += len(key.UserKey) w.props.RawPointTombstoneValueSize += size case InternalKeyKindMerge: w.props.NumMergeOperands++ @@ -1317,6 +1333,20 @@ func (w *Writer) maybeAddToFilter(key []byte) { } } +// incrementTombstoneDenseBlocks increments the number of tombstone dense +// blocks if the number of deletions in the data block exceeds a threshold or +// the deletion size exceeds a threshold. It should be called after the +// data block has been finished. +// Invariant: w.dataBlockBuf.uncompressed must already be populated. 
+func (w *Writer) incrementTombstoneDenseBlocks() { + minSize := w.deletionSizeRatioThreshold * float32(len(w.dataBlockBuf.uncompressed)) + if w.dataBlockBuf.numDeletions > w.numDeletionsThreshold || float32(w.dataBlockBuf.deletionSize) > minSize { + w.props.NumTombstoneDenseBlocks++ + } + w.dataBlockBuf.numDeletions = 0 + w.dataBlockBuf.deletionSize = 0 +} + func (w *Writer) flush(key InternalKey) error { // We're finishing a data block. err := w.finishDataBlockProps(w.dataBlockBuf) @@ -1324,6 +1354,7 @@ func (w *Writer) flush(key InternalKey) error { return err } w.dataBlockBuf.finish() + w.incrementTombstoneDenseBlocks() w.dataBlockBuf.compressAndChecksum(w.compression) // Since dataBlockEstimates.addInflightDataBlock was never called, the // inflightSize is set to 0. @@ -1897,7 +1928,9 @@ func (w *Writer) Close() (err error) { // Finish the last data block, or force an empty data block if there // aren't any data blocks at all. if w.dataBlockBuf.dataBlock.EntryCount() > 0 || w.indexBlock.block.EntryCount() == 0 { - bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.dataBlock.Finish(), &w.dataBlockBuf.blockBuf) + w.dataBlockBuf.finish() + w.incrementTombstoneDenseBlocks() + bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.uncompressed, &w.dataBlockBuf.blockBuf) if err != nil { return err } @@ -2119,7 +2152,9 @@ func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...Write Cmp: o.Comparer.Compare, Format: o.Comparer.FormatKey, }, - allocatorSizeClasses: o.AllocatorSizeClasses, + allocatorSizeClasses: o.AllocatorSizeClasses, + numDeletionsThreshold: o.NumDeletionsThreshold, + deletionSizeRatioThreshold: o.DeletionSizeRatioThreshold, } if w.tableFormat >= TableFormatPebblev3 { w.shortAttributeExtractor = o.ShortAttributeExtractor diff --git a/table_stats.go b/table_stats.go index bc1af0bd0b..6b034fe63d 100644 --- a/table_stats.go +++ b/table_stats.go @@ -132,6 +132,7 @@ func (d *DB) collectTableStats() bool { maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0 c.fileMetadata.StatsMarkValid() } + d.mu.tableStats.cond.Broadcast() d.maybeCollectTableStatsLocked() if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions { @@ -307,6 +308,11 @@ func (d *DB) loadTableStats( props := r.CommonProperties() stats.NumEntries = props.NumEntries stats.NumDeletions = props.NumDeletions + stats.NumRangeKeySets = props.NumRangeKeySets + stats.ValueBlocksSize = props.ValueBlocksSize + stats.CompressionType = sstable.CompressionFromString(props.CompressionName) + stats.NumTombstoneDenseBlocks = props.NumTombstoneDenseBlocks + if props.NumPointDeletions() > 0 { if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil { return @@ -319,12 +325,6 @@ func (d *DB) loadTableStats( return } } - // TODO(travers): Once we have real-world data, consider collecting - // additional stats that may provide improved heuristics for compaction - // picking. 
- stats.NumRangeKeySets = props.NumRangeKeySets - stats.ValueBlocksSize = props.ValueBlocksSize - stats.CompressionType = sstable.CompressionFromString(props.CompressionName) return }) if err != nil { @@ -680,6 +680,7 @@ func maybeSetStatsFromProperties(meta physicalMeta, props *sstable.Properties) b meta.Stats.RangeDeletionsBytesEstimate = 0 meta.Stats.ValueBlocksSize = props.ValueBlocksSize meta.Stats.CompressionType = sstable.CompressionFromString(props.CompressionName) + meta.Stats.NumTombstoneDenseBlocks = props.NumTombstoneDenseBlocks meta.StatsMarkValid() return true } diff --git a/testdata/event_listener b/testdata/event_listener index 34b61403a0..58687697d0 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -216,7 +216,7 @@ total | 3 1.7KB 0B 0 | - | 671B | 1 590B | 0 0 WAL: 1 files (0B) in: 48B written: 81B (69% overhead) Flushes: 3 Compactions: 1 estimated debt: 1.7KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -317,7 +317,7 @@ total | 6 3.5KB 0B 0 | - | 1.8KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 82B written: 108B (32% overhead) Flushes: 6 Compactions: 1 estimated debt: 3.5KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (512KB) zombie: 1 (512KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/testdata/ingest b/testdata/ingest index 354e50e978..b4f6eedf21 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -46,7 +46,7 @@ total | 1 569B 0B 0 | - | 569B | 1 569B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -54,7 +54,7 @@ Virtual tables: 0 (0B) Local tables size: 569B Compression types: snappy: 1 Block cache: 6 entries (945B) hit rate: 30.8% -Table cache: 1 entries (728B) hit rate: 50.0% +Table cache: 1 entries (736B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/metrics b/testdata/metrics index d583567c6b..2877bbf151 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -15,7 +15,7 @@ total | 2.8K 2.7KB 0B 2.8K | - | 2.8KB | 2.9K 2.8KB | 2.9K 2.8K WAL: 22 files (24B) in: 25B written: 26B (4% overhead) Flushes: 8 Compactions: 5 estimated debt: 6B in progress: 2 (7B) - default: 27 delete: 28 elision: 29 move: 30 read: 31 rewrite: 32 copy: 33 multi-level: 34 + default: 27 delete: 28 elision: 29 move: 30 read: 31 tombstone-density: 16 rewrite: 32 copy: 33 multi-level: 34 MemTables: 12 (11B) zombie: 14 (13B) Zombie tables: 16 (15B, local: 30B) Backing tables: 1 (2.0MB) @@ -67,7 +67,7 @@ total | 1 589B 0B 0 | - | 28B | 0 0B | 0 0 WAL: 1 files (0B) in: 17B written: 28B (65% overhead) Flushes: 1 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 
read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -75,7 +75,7 @@ Virtual tables: 0 (0B) Local tables size: 589B Compression types: snappy: 1 Block cache: 3 entries (484B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 0.0% +Table cache: 1 entries (736B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -84,7 +84,7 @@ Ingestions: 0 as flushable: 0 (0B in 0 tables) disk-usage ---- -1.9KB +2.0KB batch set b 2 @@ -123,7 +123,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.2KB, local: 1.2KB) Backing tables: 0 (0B) @@ -142,7 +142,7 @@ Iter category stats: disk-usage ---- -3.2KB +3.3KB # Closing iter a will release one of the zombie memtables. @@ -166,7 +166,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.2KB, local: 1.2KB) Backing tables: 0 (0B) @@ -206,7 +206,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 1 (589B, local: 589B) Backing tables: 0 (0B) @@ -214,7 +214,7 @@ Virtual tables: 0 (0B) Local tables size: 595B Compression types: snappy: 1 Block cache: 3 entries (484B) hit rate: 33.3% -Table cache: 1 entries (728B) hit rate: 66.7% +Table cache: 1 entries (736B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -226,7 +226,7 @@ Iter category stats: disk-usage ---- -2.6KB +2.7KB # Closing iter b will release the last zombie sstable and the last zombie memtable. 
@@ -250,7 +250,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -271,7 +271,7 @@ Iter category stats: disk-usage ---- -2.0KB +2.1KB additional-metrics ---- @@ -321,7 +321,7 @@ total | 4 2.6KB 38B 0 | - | 149B | 0 0B | 0 0 WAL: 1 files (0B) in: 116B written: 149B (28% overhead) Flushes: 3 Compactions: 1 estimated debt: 2.6KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -376,7 +376,7 @@ total | 3 2.0KB 41B 0 | - | 149B | 0 0B | 0 0 WAL: 1 files (0B) in: 116B written: 149B (28% overhead) Flushes: 3 Compactions: 2 estimated debt: 0B in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -480,7 +480,7 @@ total | 7 4.3KB 41B 0 | - | 1.9KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 176B written: 187B (6% overhead) Flushes: 8 Compactions: 2 estimated debt: 4.3KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -488,7 +488,7 @@ Virtual tables: 0 (0B) Local tables size: 4.3KB Compression types: snappy: 7 Block cache: 12 entries (1.9KB) hit rate: 9.1% -Table cache: 1 entries (728B) hit rate: 53.8% +Table cache: 1 entries (736B) hit rate: 53.8% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -543,7 +543,7 @@ total | 10 6.1KB 41B 0 | - | 2.0KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 2 estimated debt: 6.1KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -551,7 +551,7 @@ Virtual tables: 0 (0B) Local tables size: 6.1KB Compression types: snappy: 10 Block cache: 12 entries (1.9KB) hit rate: 9.1% -Table cache: 1 entries (728B) hit rate: 53.8% +Table cache: 1 entries (736B) hit rate: 53.8% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -620,7 +620,7 @@ total | 11 5.6KB 41B 2 | - | 2.5KB | 4 2.3KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 2 estimated debt: 5.6KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) 
Zombie tables: 0 (0B, local: 0B) Backing tables: 2 (1.2KB) @@ -722,7 +722,7 @@ total | 6 3.8KB 41B 0 | - | 3.1KB | 5 2.9KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 3 estimated debt: 0B in progress: 0 (0B) - default: 3 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 3 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -776,7 +776,7 @@ total | 1 604B 0B 0 | - | 38B | 0 0B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -814,7 +814,7 @@ total | 1 604B 0B 0 | - | 38B | 0 0B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -822,7 +822,7 @@ Virtual tables: 0 (0B) Local tables size: 0B Compression types: snappy: 1 Block cache: 1 entries (440B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 0.0% +Table cache: 1 entries (736B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -861,7 +861,7 @@ total | 2 1.2KB 0B 0 | - | 627B | 1 589B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 1 estimated debt: 1.2KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -869,7 +869,7 @@ Virtual tables: 0 (0B) Local tables size: 0B Compression types: snappy: 2 Block cache: 6 entries (996B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 50.0% +Table cache: 1 entries (736B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -909,7 +909,7 @@ total | 3 1.7KB 0B 0 | - | 655B | 1 589B | 0 0 WAL: 1 files (0B) in: 44B written: 66B (50% overhead) Flushes: 2 Compactions: 1 estimated debt: 1.7KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -917,7 +917,7 @@ Virtual tables: 0 (0B) Local tables size: 589B Compression types: snappy: 3 Block cache: 6 entries (996B) hit rate: 0.0% -Table cache: 1 entries (728B) hit rate: 50.0% +Table cache: 1 entries (736B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -948,7 +948,7 @@ total | 3 1.7KB 0B 0 | - | 0B | 0 0B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 
1.7KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -986,7 +986,7 @@ total | 1 603B 0B 0 | - | 0B | 0 0B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/version_set.go b/version_set.go index c0ec1758fb..2762aed001 100644 --- a/version_set.go +++ b/version_set.go @@ -842,6 +842,10 @@ func (vs *versionSet) incrementCompactions( vs.metrics.Compact.Count++ vs.metrics.Compact.ReadCount++ + case compactionKindTombstoneDensity: + vs.metrics.Compact.Count++ + vs.metrics.Compact.TombstoneDensityCount++ + case compactionKindRewrite: vs.metrics.Compact.Count++ vs.metrics.Compact.RewriteCount++