diff --git a/compaction.go b/compaction.go index eb27f335940..66296668239 100644 --- a/compaction.go +++ b/compaction.go @@ -135,6 +135,7 @@ const ( compactionKindDeleteOnly compactionKindElisionOnly compactionKindRead + compactionKindTombstoneDensity compactionKindRewrite compactionKindIngestedFlushable ) @@ -153,6 +154,8 @@ func (k compactionKind) String() string { return "elision-only" case compactionKindRead: return "read" + case compactionKindTombstoneDensity: + return "tombstone-density" case compactionKindRewrite: return "rewrite" case compactionKindIngestedFlushable: diff --git a/compaction_picker.go b/compaction_picker.go index 5911c4a1301..e5a8828d7fa 100644 --- a/compaction_picker.go +++ b/compaction_picker.go @@ -592,6 +592,7 @@ func newCompactionPickerByScore( virtualBackings: virtualBackings, } p.initLevelMaxBytes(inProgressCompactions) + p.initTombstoneDensityAnnotator(opts) return p } @@ -672,6 +673,11 @@ type compactionPickerByScore struct { // levelMaxBytes holds the dynamically adjusted max bytes setting for each // level. levelMaxBytes [numLevels]int64 + // tombstoneDensityAnnotator holds the annotator for choosing tombstone + // density compactions. + // NB: This is declared here rather than globally because + // options.Experimental.MinTombstoneDenseRatio is not known until runtime. + tombstoneDensityAnnotator *manifest.Annotator[fileMetadata] } var _ compactionPicker = &compactionPickerByScore{} @@ -1287,6 +1293,13 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact } } + // Check for files which contain excessive point tombstones that could slow + // down reads. Unlike elision-only compactions, these compactions may select + // a file at any level rather than only the lowest level. + if pc := p.pickTombstoneDensityCompaction(env); pc != nil { + return pc + } + // Check for L6 files with tombstones that may be elided. These files may // exist if a snapshot prevented the elision of a tombstone or because of // a move compaction. These are low-priority compactions because they @@ -1415,6 +1428,38 @@ var markedForCompactionAnnotator = &manifest.Annotator[fileMetadata]{ }, } +// pickedCompactionFromCandidateFile creates a pickedCompaction from a *fileMetadata +// with various checks to ensure that the file still exists in the expected level +// and isn't already being compacted. +func (p *compactionPickerByScore) pickedCompactionFromCandidateFile( + candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind, +) *pickedCompaction { + if candidate == nil || candidate.IsCompacting() { + return nil + } + + inputs := p.vers.Levels[startLevel].Find(p.opts.Comparer.Compare, candidate) + if inputs.Empty() { + panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, startLevel)) + } + + pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel) + pc.kind = kind + pc.startLevel.files = inputs + pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) + + // Fail-safe to protect against compacting the same sstable concurrently. + if inputRangeAlreadyCompacting(env, pc) { + return nil + } + + if !pc.setupInputs(p.opts, env.diskAvailBytes, pc.startLevel) { + return nil + } + + return pc +} + // pickElisionOnlyCompaction looks for compactions of sstables in the // bottommost level containing obsolete records that may now be dropped. func (p *compactionPickerByScore) pickElisionOnlyCompaction( @@ -1427,28 +1472,10 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction( if candidate == nil { return nil } - if candidate.IsCompacting() || candidate.LargestSeqNum >= env.earliestSnapshotSeqNum { + if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum { return nil } - lf := p.vers.Levels[numLevels-1].Find(p.opts.Comparer.Compare, candidate) - if lf.Empty() { - panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1)) - } - - // Construct a picked compaction of the elision candidate's atomic - // compaction unit. - pc = newPickedCompaction(p.opts, p.vers, numLevels-1, numLevels-1, p.baseLevel) - pc.kind = compactionKindElisionOnly - pc.startLevel.files = lf - if anyTablesCompacting(lf) { - return nil - } - pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) - // Fail-safe to protect against compacting the same sstable concurrently. - if !inputRangeAlreadyCompacting(env, pc) { - return pc - } - return nil + return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly) } // pickRewriteCompaction attempts to construct a compaction that @@ -1463,36 +1490,59 @@ func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc * // Try the next level. continue } - if candidate.IsCompacting() { - // Try the next level. - continue - } - lf := p.vers.Levels[l].Find(p.opts.Comparer.Compare, candidate) - if lf.Empty() { - panic(fmt.Sprintf("file %s not found in level %d as expected", candidate.FileNum, numLevels-1)) + pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite) + if pc != nil { + return pc } + } + return nil +} - inputs := lf - if anyTablesCompacting(inputs) { - // Try the next level. - continue - } +func (p *compactionPickerByScore) initTombstoneDensityAnnotator(opts *Options) { + p.tombstoneDensityAnnotator = &manifest.Annotator[fileMetadata]{ + Aggregator: manifest.PickFileAggregator{ + Filter: func(f *fileMetadata) (eligible bool, cacheOK bool) { + if f.IsCompacting() { + return false, true + } + if !f.StatsValid() { + return false, false + } + return f.Stats.TombstoneDenseBlocksRatio > opts.Experimental.TombstoneDenseCompactionThreshold, true + }, + Compare: func(a, b *fileMetadata) bool { + return a.Stats.TombstoneDenseBlocksRatio > b.Stats.TombstoneDenseBlocksRatio + }, + }, + } +} - pc = newPickedCompaction(p.opts, p.vers, l, l, p.baseLevel) - pc.outputLevel.level = l - pc.kind = compactionKindRewrite - pc.startLevel.files = inputs - pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter()) +// pickTombstoneDensityCompaction looks for a compaction that eliminates +// regions of extremely high point tombstone density. For each level, it picks +// a file where the ratio of tombstone-dense blocks is at least +// options.Experimental.MinTombstoneDenseRatio, prioritizing compaction of +// files with higher ratios of tombstone-dense blocks. +func (p *compactionPickerByScore) pickTombstoneDensityCompaction( + env compactionEnv, +) (pc *pickedCompaction) { + if p.opts.Experimental.TombstoneDenseCompactionThreshold == -1 { + // Tombstone density compactions are disabled. + return nil + } - // Fail-safe to protect against compacting the same sstable concurrently. - if !inputRangeAlreadyCompacting(env, pc) { - if pc.startLevel.level == 0 { - pc.startLevel.l0SublevelInfo = generateSublevelInfo(pc.cmp, pc.startLevel.files) - } - return pc + var candidate *fileMetadata + var level int + // NB: We don't consider L0 or the lowest level. + for l := 1; l < numLevels-1; l++ { + f := p.tombstoneDensityAnnotator.LevelAnnotation(p.vers.Levels[l]) + newCandidate := p.tombstoneDensityAnnotator.Aggregator.Merge(f, candidate) + if newCandidate != candidate { + candidate = newCandidate + level = l } } - return nil + + return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity) } // pickAutoLPositive picks an automatic compaction for the candidate diff --git a/compaction_picker_test.go b/compaction_picker_test.go index bd2edb32d8b..481f4a6cd16 100644 --- a/compaction_picker_test.go +++ b/compaction_picker_test.go @@ -517,6 +517,7 @@ func TestCompactionPickerL0(t *testing.T) { } vs.picker = picker picker.initLevelMaxBytes(inProgressCompactions) + picker.initTombstoneDensityAnnotator(opts) var buf bytes.Buffer fmt.Fprint(&buf, version.String()) diff --git a/internal/manifest/version.go b/internal/manifest/version.go index a179d0382a2..1adc62f2b3c 100644 --- a/internal/manifest/version.go +++ b/internal/manifest/version.go @@ -75,6 +75,16 @@ type TableStats struct { ValueBlocksSize uint64 // CompressionType is the compression type of the table. CompressionType block.Compression + // TombstoneDenseBlocksRatio is the ratio of data blocks in this table that + // fulfills at least one of the following: + // 1. The block contains at least options.Experimental.NumDeletionsThreshold + // point tombstones. + // 2. The ratio of the uncompressed size of point tombstones to the + // uncompressed size of the block is at least + // options.Experimental.DeletionSizeRatioThreshold. + // This statistic is used to determine eligibility for a tombstone density + // compaction. + TombstoneDenseBlocksRatio float64 } // boundType represents the type of key (point or range) present as the smallest diff --git a/metrics.go b/metrics.go index 6d2d60ce1c2..05678f42b22 100644 --- a/metrics.go +++ b/metrics.go @@ -153,16 +153,17 @@ type Metrics struct { Compact struct { // The total number of compactions, and per-compaction type counts. - Count int64 - DefaultCount int64 - DeleteOnlyCount int64 - ElisionOnlyCount int64 - CopyCount int64 - MoveCount int64 - ReadCount int64 - RewriteCount int64 - MultiLevelCount int64 - CounterLevelCount int64 + Count int64 + DefaultCount int64 + DeleteOnlyCount int64 + ElisionOnlyCount int64 + CopyCount int64 + MoveCount int64 + ReadCount int64 + TombstoneDensityCount int64 + RewriteCount int64 + MultiLevelCount int64 + CounterLevelCount int64 // An estimate of the number of bytes that need to be compacted for the LSM // to reach a stable state. EstimatedDebt uint64 @@ -580,12 +581,13 @@ func (m *Metrics) SafeFormat(w redact.SafePrinter, _ rune) { redact.Safe(m.Compact.NumInProgress), humanize.Bytes.Int64(m.Compact.InProgressBytes)) - w.Printf(" default: %d delete: %d elision: %d move: %d read: %d rewrite: %d copy: %d multi-level: %d\n", + w.Printf(" default: %d delete: %d elision: %d move: %d read: %d tombstone-density: %d rewrite: %d copy: %d multi-level: %d\n", redact.Safe(m.Compact.DefaultCount), redact.Safe(m.Compact.DeleteOnlyCount), redact.Safe(m.Compact.ElisionOnlyCount), redact.Safe(m.Compact.MoveCount), redact.Safe(m.Compact.ReadCount), + redact.Safe(m.Compact.TombstoneDensityCount), redact.Safe(m.Compact.RewriteCount), redact.Safe(m.Compact.CopyCount), redact.Safe(m.Compact.MultiLevelCount)) diff --git a/metrics_test.go b/metrics_test.go index 9b1d3a17a46..4e34afd9067 100644 --- a/metrics_test.go +++ b/metrics_test.go @@ -39,6 +39,7 @@ func exampleMetrics() Metrics { m.Compact.ElisionOnlyCount = 29 m.Compact.MoveCount = 30 m.Compact.ReadCount = 31 + m.Compact.TombstoneDensityCount = 16 m.Compact.RewriteCount = 32 m.Compact.CopyCount = 33 m.Compact.MultiLevelCount = 34 diff --git a/options.go b/options.go index 18a21fef2b7..ba2cd508d9d 100644 --- a/options.go +++ b/options.go @@ -599,6 +599,38 @@ type Options struct { // gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB). ReadSamplingMultiplier int64 + // NumDeletionsThreshold defines the minimum number of point tombstones + // that must be present in a single data block for that block to be + // considered tombstone-dense for the purposes of triggering a + // tombstone density compaction. Data blocks may also be considered + // tombstone-dense if they meet the criteria defined by + // DeletionSizeRatioThreshold below. Tombstone-dense blocks are identified + // when sstables are written, and so this is effectively an option for + // sstable writers. The default value is 100. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold defines the minimum ratio of the size of + // point tombstones to the size of a data block that must be reached + // for that block to be considered tombstone-dense for the purposes of + // triggering a tombstone density compaction. Data blocks may also be + // considered tombstone-dense if they meet the criteria defined by + // NumDeletionsThreshold above. Tombstone-dense blocks are identified + // when sstables are written, and so this is effectively an option for + // sstable writers. The default value is 0.5. + DeletionSizeRatioThreshold float32 + + // TombstoneDenseCompactionThreshold is the minimum percent of data + // blocks in a table that must be tombstone-dense for that table to be + // eligible for a tombstone density compaction. It should be defined as a + // ratio out of 1. The default value is 0.05. + // + // If multiple tables are eligible for a tombstone density compaction, then + // tables with a higher percent of tombstone-dense blocks are still + // prioritized for compaction. + // + // A value of -1 disables tombstone density compactions. + TombstoneDenseCompactionThreshold float64 + // TableCacheShards is the number of shards per table cache. // Reducing the value can reduce the number of idle goroutines per DB // instance which can be useful in scenarios with a lot of DB instances @@ -1268,6 +1300,15 @@ func (o *Options) EnsureDefaults() *Options { if o.Experimental.ReadSamplingMultiplier == 0 { o.Experimental.ReadSamplingMultiplier = 1 << 4 } + if o.Experimental.NumDeletionsThreshold == 0 { + o.Experimental.NumDeletionsThreshold = sstable.DefaultNumDeletionsThreshold + } + if o.Experimental.DeletionSizeRatioThreshold == 0 { + o.Experimental.DeletionSizeRatioThreshold = sstable.DefaultDeletionSizeRatioThreshold + } + if o.Experimental.TombstoneDenseCompactionThreshold == 0 { + o.Experimental.TombstoneDenseCompactionThreshold = 0.05 + } if o.Experimental.TableCacheShards <= 0 { o.Experimental.TableCacheShards = runtime.GOMAXPROCS(0) } @@ -1395,6 +1436,9 @@ func (o *Options) String() string { } fmt.Fprintf(&buf, " read_compaction_rate=%d\n", o.Experimental.ReadCompactionRate) fmt.Fprintf(&buf, " read_sampling_multiplier=%d\n", o.Experimental.ReadSamplingMultiplier) + fmt.Fprintf(&buf, " num_deletions_threshold=%d\n", o.Experimental.NumDeletionsThreshold) + fmt.Fprintf(&buf, " deletion_size_ratio_threshold=%f\n", o.Experimental.DeletionSizeRatioThreshold) + fmt.Fprintf(&buf, " tombstone_dense_compaction_threshold=%f\n", o.Experimental.TombstoneDenseCompactionThreshold) // We no longer care about strict_wal_tail, but set it to true in case an // older version reads the options. fmt.Fprintf(&buf, " strict_wal_tail=%t\n", true) @@ -1711,6 +1755,14 @@ func (o *Options) Parse(s string, hooks *ParseHooks) error { o.Experimental.ReadCompactionRate, err = strconv.ParseInt(value, 10, 64) case "read_sampling_multiplier": o.Experimental.ReadSamplingMultiplier, err = strconv.ParseInt(value, 10, 64) + case "num_deletions_threshold": + o.Experimental.NumDeletionsThreshold, err = strconv.Atoi(value) + case "deletion_size_ratio_threshold": + val, parseErr := strconv.ParseFloat(value, 32) + o.Experimental.DeletionSizeRatioThreshold = float32(val) + err = parseErr + case "tombstone_dense_compaction_threshold": + o.Experimental.TombstoneDenseCompactionThreshold, err = strconv.ParseFloat(value, 64) case "table_cache_shards": o.Experimental.TableCacheShards, err = strconv.Atoi(value) case "table_format": @@ -1985,6 +2037,8 @@ func (o *Options) MakeWriterOptions(level int, format sstable.TableFormat) sstab writerOpts.FilterType = levelOpts.FilterType writerOpts.IndexBlockSize = levelOpts.IndexBlockSize writerOpts.AllocatorSizeClasses = o.AllocatorSizeClasses + writerOpts.NumDeletionsThreshold = o.Experimental.NumDeletionsThreshold + writerOpts.DeletionSizeRatioThreshold = o.Experimental.DeletionSizeRatioThreshold return writerOpts } diff --git a/options_test.go b/options_test.go index 2476340d798..b09742d537d 100644 --- a/options_test.go +++ b/options_test.go @@ -101,6 +101,9 @@ func TestOptionsString(t *testing.T) { multilevel_compaction_heuristic=wamp(0.00, false) read_compaction_rate=16000 read_sampling_multiplier=16 + num_deletions_threshold=100 + deletion_size_ratio_threshold=0.500000 + tombstone_dense_compaction_threshold=0.050000 strict_wal_tail=true table_cache_shards=8 validate_on_ingest=false @@ -285,6 +288,9 @@ func TestOptionsParse(t *testing.T) { } opts.Experimental.ReadCompactionRate = 300 opts.Experimental.ReadSamplingMultiplier = 400 + opts.Experimental.NumDeletionsThreshold = 500 + opts.Experimental.DeletionSizeRatioThreshold = 0.7 + opts.Experimental.TombstoneDenseCompactionThreshold = 0.2 opts.Experimental.TableCacheShards = 500 opts.Experimental.MaxWriterConcurrency = 1 opts.Experimental.ForceWriterParallelism = true diff --git a/replay/replay.go b/replay/replay.go index c6e47285743..0e2809e6f96 100644 --- a/replay/replay.go +++ b/replay/replay.go @@ -97,15 +97,16 @@ func (pra PaceByFixedReadAmp) pace(r *Runner, _ workloadStep) time.Duration { // Metrics holds the various statistics on a replay run and its performance. type Metrics struct { CompactionCounts struct { - Total int64 - Default int64 - DeleteOnly int64 - ElisionOnly int64 - Move int64 - Read int64 - Rewrite int64 - Copy int64 - MultiLevel int64 + Total int64 + Default int64 + DeleteOnly int64 + ElisionOnly int64 + Move int64 + Read int64 + TombstoneDensity int64 + Rewrite int64 + Copy int64 + MultiLevel int64 } EstimatedDebt SampledMetric Final *pebble.Metrics @@ -556,6 +557,7 @@ func (r *Runner) Wait() (Metrics, error) { m.CompactionCounts.ElisionOnly = pm.Compact.ElisionOnlyCount m.CompactionCounts.Move = pm.Compact.MoveCount m.CompactionCounts.Read = pm.Compact.ReadCount + m.CompactionCounts.TombstoneDensity = pm.Compact.TombstoneDensityCount m.CompactionCounts.Rewrite = pm.Compact.RewriteCount m.CompactionCounts.Copy = pm.Compact.CopyCount m.CompactionCounts.MultiLevel = pm.Compact.MultiLevelCount diff --git a/replay/testdata/replay b/replay/testdata/replay index 89fdda344ff..6fb775dec7c 100644 --- a/replay/testdata/replay +++ b/replay/testdata/replay @@ -12,7 +12,7 @@ tree 0 LOCK 98 MANIFEST-000001 122 MANIFEST-000008 - 1240 OPTIONS-000003 + 1359 OPTIONS-000003 0 marker.format-version.000001.013 0 marker.manifest.000002.MANIFEST-000008 simple/ @@ -23,7 +23,7 @@ tree 25 000004.log 586 000005.sst 98 MANIFEST-000001 - 1240 OPTIONS-000003 + 1359 OPTIONS-000003 0 marker.format-version.000001.013 0 marker.manifest.000001.MANIFEST-000001 @@ -60,6 +60,9 @@ cat build/OPTIONS-000003 multilevel_compaction_heuristic=wamp(0.00, false) read_compaction_rate=16000 read_sampling_multiplier=16 + num_deletions_threshold=100 + deletion_size_ratio_threshold=0.500000 + tombstone_dense_compaction_threshold=0.050000 strict_wal_tail=true table_cache_shards=2 validate_on_ingest=false diff --git a/replay/testdata/replay_paced b/replay/testdata/replay_paced index 15a95de63f3..fdaa2a96eb5 100644 --- a/replay/testdata/replay_paced +++ b/replay/testdata/replay_paced @@ -14,7 +14,7 @@ tree 0 LOCK 122 MANIFEST-000008 205 MANIFEST-000011 - 1240 OPTIONS-000003 + 1359 OPTIONS-000003 0 marker.format-version.000001.013 0 marker.manifest.000003.MANIFEST-000011 high_read_amp/ @@ -26,7 +26,7 @@ tree 39 000009.log 560 000010.sst 157 MANIFEST-000011 - 1240 OPTIONS-000003 + 1359 OPTIONS-000003 0 marker.format-version.000001.013 0 marker.manifest.000001.MANIFEST-000011 diff --git a/sstable/options.go b/sstable/options.go index 724e0cd8d72..4d9f24646a1 100644 --- a/sstable/options.go +++ b/sstable/options.go @@ -12,8 +12,18 @@ import ( "github.com/cockroachdb/pebble/sstable/rowblk" ) -// MaximumBlockSize is the maximum permissible size of a block. -const MaximumBlockSize = rowblk.MaximumSize +const ( + // MaximumBlockSize is the maximum permissible size of a block. + MaximumBlockSize = rowblk.MaximumSize + // DefaultNumDeletionsThreshold defines the minimum number of point + // tombstones that must be present in a data block for it to be + // considered tombstone-dense. + DefaultNumDeletionsThreshold = 100 + // DefaultDeletionSizeRatioThreshold defines the minimum ratio of the size + // of point tombstones to the size of the data block in order to consider the + // block as tombstone-dense. + DefaultDeletionSizeRatioThreshold = 0.5 +) var ignoredInternalProperties = map[string]struct{}{ "rocksdb.column.family.id": {}, @@ -244,6 +254,13 @@ type WriterOptions struct { // internal options can only be used from within the pebble package. internal sstableinternal.WriterOptions + + // NumDeletionsThreshold mirrors Options.Experimental.NumDeletionsThreshold. + NumDeletionsThreshold int + + // DeletionSizeRatioThreshold mirrors + // Options.Experimental.DeletionSizeRatioThreshold. + DeletionSizeRatioThreshold float32 } // SetInternal sets the internal writer options. Note that even though this @@ -286,5 +303,11 @@ func (o WriterOptions) ensureDefaults() WriterOptions { if o.TableFormat == TableFormatUnspecified { o.TableFormat = TableFormatMinSupported } + if o.NumDeletionsThreshold == 0 { + o.NumDeletionsThreshold = DefaultNumDeletionsThreshold + } + if o.DeletionSizeRatioThreshold == 0 { + o.DeletionSizeRatioThreshold = DefaultDeletionSizeRatioThreshold + } return o } diff --git a/sstable/properties.go b/sstable/properties.go index 8b597d11c6a..9e10b82361e 100644 --- a/sstable/properties.go +++ b/sstable/properties.go @@ -100,6 +100,13 @@ type CommonProperties struct { NumRangeKeySets uint64 `prop:"pebble.num.range-key-sets"` // Total size of value blocks and value index block. Only serialized if > 0. ValueBlocksSize uint64 `prop:"pebble.value-blocks.size"` + // NumDataBlocks is the number of data blocks in this table. + NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"` + // NumTombstoneDenseBlocks is the number of data blocks in this table that + // are considered tombstone-dense. See the TombstoneDenseBlocksRatio field + // in manifest.TableStats for the criteria used to determine if a data + // block is tombstone-dense. + NumTombstoneDenseBlocks uint64 `prop:"pebble.num.tombstone-dense-blocks"` // The compression algorithm used to compress blocks. CompressionName string `prop:"rocksdb.compression"` // The compression options used to compress blocks. @@ -150,8 +157,6 @@ type Properties struct { IsStrictObsolete bool `prop:"pebble.obsolete.is_strict"` // The name of the merger used in this table. Empty if no merger is used. MergerName string `prop:"rocksdb.merge.operator"` - // The number of blocks in this table. - NumDataBlocks uint64 `prop:"rocksdb.num.data.blocks"` // The number of merge operands in the table. NumMergeOperands uint64 `prop:"rocksdb.merge.operands"` // The number of RANGEKEYUNSETs in this table. @@ -401,6 +406,9 @@ func (p *Properties) save(tblFormat TableFormat, w *rowblk.Writer) { if p.ValueBlocksSize > 0 { p.saveUvarint(m, unsafe.Offsetof(p.ValueBlocksSize), p.ValueBlocksSize) } + if p.NumTombstoneDenseBlocks != 0 { + p.saveUvarint(m, unsafe.Offsetof(p.NumTombstoneDenseBlocks), p.NumTombstoneDenseBlocks) + } if tblFormat < TableFormatPebblev1 { m["rocksdb.column.family.id"] = binary.AppendUvarint([]byte(nil), math.MaxInt32) diff --git a/sstable/properties_test.go b/sstable/properties_test.go index aaf585026f5..1cb6d4e0cca 100644 --- a/sstable/properties_test.go +++ b/sstable/properties_test.go @@ -29,6 +29,7 @@ func TestPropertiesLoad(t *testing.T) { NumRangeDeletions: 17, RawKeySize: 23938, RawValueSize: 1912, + NumDataBlocks: 14, CompressionName: "Snappy", CompressionOptions: "window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; ", }, @@ -36,7 +37,6 @@ func TestPropertiesLoad(t *testing.T) { DataSize: 13913, IndexSize: 325, MergerName: "nullptr", - NumDataBlocks: 14, PropertyCollectorNames: "[]", } @@ -60,15 +60,17 @@ func TestPropertiesLoad(t *testing.T) { var testProps = Properties{ CommonProperties: CommonProperties{ - NumDeletions: 15, - NumEntries: 16, - NumRangeDeletions: 18, - NumRangeKeyDels: 19, - NumRangeKeySets: 20, - RawKeySize: 25, - RawValueSize: 26, - CompressionName: "compression name", - CompressionOptions: "compression option", + NumDeletions: 15, + NumEntries: 16, + NumRangeDeletions: 18, + NumRangeKeyDels: 19, + NumRangeKeySets: 20, + RawKeySize: 25, + RawValueSize: 26, + NumDataBlocks: 14, + NumTombstoneDenseBlocks: 2, + CompressionName: "compression name", + CompressionOptions: "compression option", }, ComparerName: "comparator name", DataSize: 3, @@ -79,7 +81,6 @@ var testProps = Properties{ IndexType: 12, IsStrictObsolete: true, MergerName: "merge operator name", - NumDataBlocks: 14, NumMergeOperands: 17, NumRangeKeyUnsets: 21, NumValueBlocks: 22, diff --git a/sstable/raw_writer.go b/sstable/raw_writer.go index 163d746b03e..5434ce71719 100644 --- a/sstable/raw_writer.go +++ b/sstable/raw_writer.go @@ -206,6 +206,9 @@ type RawWriter struct { valueBlockWriter *valueBlockWriter allocatorSizeClasses []int + + numDeletionsThreshold int + deletionSizeRatioThreshold float32 } type pointKeyInfo struct { @@ -591,6 +594,15 @@ type dataBlockBuf struct { // sepScratch is reusable scratch space for computing separator keys. sepScratch []byte + + // numDeletions stores the count of point tombstones in this data block. + // It's used to determine if this data block is considered tombstone-dense + // for the purposes of compaction. + numDeletions int + // deletionSize stores the raw size of point tombstones in this data block. + // It's used to determine if this data block is considered tombstone-dense + // for the purposes of compaction. + deletionSize int } func (d *dataBlockBuf) clear() { @@ -928,7 +940,9 @@ func (w *RawWriter) addPoint(key InternalKey, value []byte, forceObsolete bool) switch key.Kind() { case InternalKeyKindDelete, InternalKeyKindSingleDelete: w.props.NumDeletions++ + w.dataBlockBuf.numDeletions++ w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlockBuf.deletionSize += len(key.UserKey) case InternalKeyKindDeleteSized: var size uint64 if len(value) > 0 { @@ -942,7 +956,9 @@ func (w *RawWriter) addPoint(key InternalKey, value []byte, forceObsolete bool) } w.props.NumDeletions++ w.props.NumSizedDeletions++ + w.dataBlockBuf.numDeletions++ w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) + w.dataBlockBuf.deletionSize += len(key.UserKey) w.props.RawPointTombstoneValueSize += size case InternalKeyKindMerge: w.props.NumMergeOperands++ @@ -1134,6 +1150,20 @@ func (w *RawWriter) maybeAddToFilter(key []byte) { } } +// maybeIncrementTombstoneDenseBlocks increments the number of tombstone dense +// blocks if the number of deletions in the data block exceeds a threshold or +// the deletion size exceeds a threshold. It should be called after the +// data block has been finished. +// Invariant: w.dataBlockBuf.uncompressed must already be populated. +func (w *RawWriter) maybeIncrementTombstoneDenseBlocks() { + minSize := w.deletionSizeRatioThreshold * float32(len(w.dataBlockBuf.uncompressed)) + if w.dataBlockBuf.numDeletions > w.numDeletionsThreshold || float32(w.dataBlockBuf.deletionSize) > minSize { + w.props.NumTombstoneDenseBlocks++ + } + w.dataBlockBuf.numDeletions = 0 + w.dataBlockBuf.deletionSize = 0 +} + func (w *RawWriter) flush(key InternalKey) error { // We're finishing a data block. err := w.finishDataBlockProps(w.dataBlockBuf) @@ -1141,6 +1171,7 @@ func (w *RawWriter) flush(key InternalKey) error { return err } w.dataBlockBuf.finish() + w.maybeIncrementTombstoneDenseBlocks() w.dataBlockBuf.compressAndChecksum(w.compression) // Since dataBlockEstimates.addInflightDataBlock was never called, the // inflightSize is set to 0. @@ -1695,7 +1726,9 @@ func (w *RawWriter) Close() (err error) { // Finish the last data block, or force an empty data block if there // aren't any data blocks at all. if w.dataBlockBuf.dataBlock.EntryCount() > 0 || w.indexBlock.block.EntryCount() == 0 { - bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.dataBlock.Finish(), &w.dataBlockBuf.blockBuf) + w.dataBlockBuf.finish() + w.maybeIncrementTombstoneDenseBlocks() + bh, err := w.layout.WriteDataBlock(w.dataBlockBuf.uncompressed, &w.dataBlockBuf.blockBuf) if err != nil { return err } @@ -1885,24 +1918,26 @@ func NewRawWriter(writable objstorage.Writable, o WriterOptions) *RawWriter { blockSizeThreshold: (o.IndexBlockSize*o.BlockSizeThreshold + 99) / 100, sizeClassAwareThreshold: (o.IndexBlockSize*o.SizeClassAwareThreshold + 99) / 100, }, - compare: o.Comparer.Compare, - suffixCmp: o.Comparer.CompareSuffixes, - split: o.Comparer.Split, - formatKey: o.Comparer.FormatKey, - compression: o.Compression, - separator: o.Comparer.Separator, - successor: o.Comparer.Successor, - tableFormat: o.TableFormat, - isStrictObsolete: o.IsStrictObsolete, - writingToLowestLevel: o.WritingToLowestLevel, - restartInterval: o.BlockRestartInterval, - checksumType: o.Checksum, - disableKeyOrderChecks: o.internal.DisableKeyOrderChecks, - indexBlock: newIndexBlockBuf(o.Parallelism), - rangeDelBlock: rowblk.Writer{RestartInterval: 1}, - rangeKeyBlock: rowblk.Writer{RestartInterval: 1}, - topLevelIndexBlock: rowblk.Writer{RestartInterval: 1}, - allocatorSizeClasses: o.AllocatorSizeClasses, + compare: o.Comparer.Compare, + suffixCmp: o.Comparer.CompareSuffixes, + split: o.Comparer.Split, + formatKey: o.Comparer.FormatKey, + compression: o.Compression, + separator: o.Comparer.Separator, + successor: o.Comparer.Successor, + tableFormat: o.TableFormat, + isStrictObsolete: o.IsStrictObsolete, + writingToLowestLevel: o.WritingToLowestLevel, + restartInterval: o.BlockRestartInterval, + checksumType: o.Checksum, + disableKeyOrderChecks: o.internal.DisableKeyOrderChecks, + indexBlock: newIndexBlockBuf(o.Parallelism), + rangeDelBlock: rowblk.Writer{RestartInterval: 1}, + rangeKeyBlock: rowblk.Writer{RestartInterval: 1}, + topLevelIndexBlock: rowblk.Writer{RestartInterval: 1}, + allocatorSizeClasses: o.AllocatorSizeClasses, + numDeletionsThreshold: o.NumDeletionsThreshold, + deletionSizeRatioThreshold: o.DeletionSizeRatioThreshold, } if w.tableFormat >= TableFormatPebblev3 { w.shortAttributeExtractor = o.ShortAttributeExtractor diff --git a/sstable/reader_virtual.go b/sstable/reader_virtual.go index 37dd6770a76..85944ec3778 100644 --- a/sstable/reader_virtual.go +++ b/sstable/reader_virtual.go @@ -76,6 +76,8 @@ func MakeVirtualReader(reader *Reader, p VirtualReaderParams) VirtualReader { v.Properties.NumDeletions = scale(reader.Properties.NumDeletions) v.Properties.NumRangeDeletions = scale(reader.Properties.NumRangeDeletions) v.Properties.NumRangeKeyDels = scale(reader.Properties.NumRangeKeyDels) + v.Properties.NumDataBlocks = scale(reader.Properties.NumDataBlocks) + v.Properties.NumTombstoneDenseBlocks = scale(reader.Properties.NumTombstoneDenseBlocks) // Note that we rely on NumRangeKeySets for correctness. If the sstable may // contain range keys, then NumRangeKeySets must be > 0. ceilDiv works because diff --git a/sstable/testdata/virtual_reader_props b/sstable/testdata/virtual_reader_props index 49253c0c5ac..7d041960fa8 100644 --- a/sstable/testdata/virtual_reader_props +++ b/sstable/testdata/virtual_reader_props @@ -21,6 +21,7 @@ props: rocksdb.num.entries: 1 rocksdb.raw.key.size: 3 rocksdb.raw.value.size: 1 + rocksdb.num.data.blocks: 1 # Test 2: Similar to test 1 but force two level iterators. build block-size=1 index-block-size=1 @@ -40,6 +41,7 @@ props: rocksdb.num.entries: 1 rocksdb.raw.key.size: 2 rocksdb.raw.value.size: 1 + rocksdb.num.data.blocks: 1 # Test the constrain bounds function. It performs some subtle shrinking and # expanding of bounds. The current virtual sstable bounds are [b,c]. @@ -121,6 +123,7 @@ props: rocksdb.deleted.keys: 1 rocksdb.num.range-deletions: 1 pebble.num.range-key-sets: 1 + rocksdb.num.data.blocks: 1 build a.SET.1:a @@ -143,3 +146,4 @@ props: rocksdb.num.entries: 2 rocksdb.raw.key.size: 10 rocksdb.raw.value.size: 2 + rocksdb.num.data.blocks: 1 diff --git a/table_stats.go b/table_stats.go index cfd1af0f46a..a6776e81988 100644 --- a/table_stats.go +++ b/table_stats.go @@ -133,6 +133,7 @@ func (d *DB) collectTableStats() bool { maybeCompact = maybeCompact || fileCompensation(c.fileMetadata) > 0 c.fileMetadata.StatsMarkValid() } + d.mu.tableStats.cond.Broadcast() d.maybeCollectTableStatsLocked() if len(hints) > 0 && !d.opts.private.disableDeleteOnlyCompactions { @@ -308,6 +309,11 @@ func (d *DB) loadTableStats( props := r.CommonProperties() stats.NumEntries = props.NumEntries stats.NumDeletions = props.NumDeletions + stats.NumRangeKeySets = props.NumRangeKeySets + stats.ValueBlocksSize = props.ValueBlocksSize + stats.CompressionType = block.CompressionFromString(props.CompressionName) + stats.TombstoneDenseBlocksRatio = float64(props.NumTombstoneDenseBlocks) / float64(props.NumDataBlocks) + if props.NumPointDeletions() > 0 { if err = d.loadTablePointKeyStats(props, v, level, meta, &stats); err != nil { return @@ -320,12 +326,6 @@ func (d *DB) loadTableStats( return } } - // TODO(travers): Once we have real-world data, consider collecting - // additional stats that may provide improved heuristics for compaction - // picking. - stats.NumRangeKeySets = props.NumRangeKeySets - stats.ValueBlocksSize = props.ValueBlocksSize - stats.CompressionType = block.CompressionFromString(props.CompressionName) return }) if err != nil { diff --git a/testdata/event_listener b/testdata/event_listener index 34b61403a05..58687697d07 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -216,7 +216,7 @@ total | 3 1.7KB 0B 0 | - | 671B | 1 590B | 0 0 WAL: 1 files (0B) in: 48B written: 81B (69% overhead) Flushes: 3 Compactions: 1 estimated debt: 1.7KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -317,7 +317,7 @@ total | 6 3.5KB 0B 0 | - | 1.8KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 82B written: 108B (32% overhead) Flushes: 6 Compactions: 1 estimated debt: 3.5KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (512KB) zombie: 1 (512KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/testdata/ingest b/testdata/ingest index 48930711d1a..08f8995a99a 100644 --- a/testdata/ingest +++ b/testdata/ingest @@ -46,7 +46,7 @@ total | 1 569B 0B 0 | - | 569B | 1 569B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -54,7 +54,7 @@ Virtual tables: 0 (0B) Local tables size: 569B Compression types: snappy: 1 Block cache: 6 entries (945B) hit rate: 30.8% -Table cache: 1 entries (736B) hit rate: 50.0% +Table cache: 1 entries (744B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 diff --git a/testdata/metrics b/testdata/metrics index 5760286df07..6ecb18e5aab 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -15,7 +15,7 @@ total | 2.8K 2.7KB 0B 2.8K | - | 2.8KB | 2.9K 2.8KB | 2.9K 2.8K WAL: 22 files (24B) in: 25B written: 26B (4% overhead) Flushes: 8 Compactions: 5 estimated debt: 6B in progress: 2 (7B) - default: 27 delete: 28 elision: 29 move: 30 read: 31 rewrite: 32 copy: 33 multi-level: 34 + default: 27 delete: 28 elision: 29 move: 30 read: 31 tombstone-density: 16 rewrite: 32 copy: 33 multi-level: 34 MemTables: 12 (11B) zombie: 14 (13B) Zombie tables: 16 (15B, local: 30B) Backing tables: 1 (2.0MB) @@ -67,7 +67,7 @@ total | 1 589B 0B 0 | - | 28B | 0 0B | 0 0 WAL: 1 files (0B) in: 17B written: 28B (65% overhead) Flushes: 1 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -75,7 +75,7 @@ Virtual tables: 0 (0B) Local tables size: 589B Compression types: snappy: 1 Block cache: 3 entries (484B) hit rate: 0.0% -Table cache: 1 entries (736B) hit rate: 0.0% +Table cache: 1 entries (744B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -84,7 +84,7 @@ Ingestions: 0 as flushable: 0 (0B in 0 tables) disk-usage ---- -1.9KB +2.0KB batch set b 2 @@ -123,7 +123,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.2KB, local: 1.2KB) Backing tables: 0 (0B) @@ -131,7 +131,7 @@ Virtual tables: 0 (0B) Local tables size: 595B Compression types: snappy: 1 Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.4KB) hit rate: 66.7% +Table cache: 2 entries (1.5KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -142,7 +142,7 @@ Iter category stats: disk-usage ---- -3.2KB +3.3KB # Closing iter a will release one of the zombie memtables. @@ -166,7 +166,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.2KB, local: 1.2KB) Backing tables: 0 (0B) @@ -174,7 +174,7 @@ Virtual tables: 0 (0B) Local tables size: 595B Compression types: snappy: 1 Block cache: 5 entries (946B) hit rate: 33.3% -Table cache: 2 entries (1.4KB) hit rate: 66.7% +Table cache: 2 entries (1.5KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 2 @@ -206,7 +206,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 1 (589B, local: 589B) Backing tables: 0 (0B) @@ -214,7 +214,7 @@ Virtual tables: 0 (0B) Local tables size: 595B Compression types: snappy: 1 Block cache: 3 entries (484B) hit rate: 33.3% -Table cache: 1 entries (736B) hit rate: 66.7% +Table cache: 1 entries (744B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 1 @@ -226,7 +226,7 @@ Iter category stats: disk-usage ---- -2.6KB +2.7KB # Closing iter b will release the last zombie sstable and the last zombie memtable. @@ -250,7 +250,7 @@ total | 1 595B 0B 0 | - | 56B | 0 0B | 0 0 WAL: 1 files (0B) in: 34B written: 56B (65% overhead) Flushes: 2 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -271,7 +271,7 @@ Iter category stats: disk-usage ---- -2.0KB +2.1KB additional-metrics ---- @@ -321,7 +321,7 @@ total | 4 2.6KB 38B 0 | - | 149B | 0 0B | 0 0 WAL: 1 files (0B) in: 116B written: 149B (28% overhead) Flushes: 3 Compactions: 1 estimated debt: 2.6KB in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -376,7 +376,7 @@ total | 3 2.0KB 41B 0 | - | 149B | 0 0B | 0 0 WAL: 1 files (0B) in: 116B written: 149B (28% overhead) Flushes: 3 Compactions: 2 estimated debt: 0B in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -480,7 +480,7 @@ total | 7 4.3KB 41B 0 | - | 1.9KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 176B written: 187B (6% overhead) Flushes: 8 Compactions: 2 estimated debt: 4.3KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -488,7 +488,7 @@ Virtual tables: 0 (0B) Local tables size: 4.3KB Compression types: snappy: 7 Block cache: 12 entries (1.9KB) hit rate: 9.1% -Table cache: 1 entries (736B) hit rate: 53.8% +Table cache: 1 entries (744B) hit rate: 53.8% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -543,7 +543,7 @@ total | 10 6.1KB 41B 0 | - | 2.0KB | 3 1.7KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 2 estimated debt: 6.1KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -551,7 +551,7 @@ Virtual tables: 0 (0B) Local tables size: 6.1KB Compression types: snappy: 10 Block cache: 12 entries (1.9KB) hit rate: 9.1% -Table cache: 1 entries (736B) hit rate: 53.8% +Table cache: 1 entries (744B) hit rate: 53.8% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -620,7 +620,7 @@ total | 11 5.6KB 41B 2 | - | 2.5KB | 4 2.3KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 2 estimated debt: 5.6KB in progress: 0 (0B) - default: 2 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 2 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 2 (1.2KB) @@ -722,7 +722,7 @@ total | 6 3.8KB 41B 0 | - | 3.1KB | 5 2.9KB | 0 0 WAL: 1 files (0B) in: 223B written: 245B (10% overhead) Flushes: 9 Compactions: 3 estimated debt: 0B in progress: 0 (0B) - default: 3 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 3 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -776,7 +776,7 @@ total | 1 604B 0B 0 | - | 38B | 0 0B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -814,7 +814,7 @@ total | 1 604B 0B 0 | - | 38B | 0 0B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -822,7 +822,7 @@ Virtual tables: 0 (0B) Local tables size: 0B Compression types: snappy: 1 Block cache: 1 entries (440B) hit rate: 0.0% -Table cache: 1 entries (736B) hit rate: 0.0% +Table cache: 1 entries (744B) hit rate: 0.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -861,7 +861,7 @@ total | 2 1.2KB 0B 0 | - | 627B | 1 589B | 0 0 WAL: 1 files (0B) in: 27B written: 38B (41% overhead) Flushes: 1 Compactions: 1 estimated debt: 1.2KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -869,7 +869,7 @@ Virtual tables: 0 (0B) Local tables size: 0B Compression types: snappy: 2 Block cache: 6 entries (996B) hit rate: 0.0% -Table cache: 1 entries (736B) hit rate: 50.0% +Table cache: 1 entries (744B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -909,7 +909,7 @@ total | 3 1.7KB 0B 0 | - | 655B | 1 589B | 0 0 WAL: 1 files (0B) in: 44B written: 66B (50% overhead) Flushes: 2 Compactions: 1 estimated debt: 1.7KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 1 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 1 multi-level: 0 MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -917,7 +917,7 @@ Virtual tables: 0 (0B) Local tables size: 589B Compression types: snappy: 3 Block cache: 6 entries (996B) hit rate: 0.0% -Table cache: 1 entries (736B) hit rate: 50.0% +Table cache: 1 entries (744B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 Table iters: 0 @@ -948,7 +948,7 @@ total | 3 1.7KB 0B 0 | - | 0B | 0 0B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 1.7KB in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -986,7 +986,7 @@ total | 1 603B 0B 0 | - | 0B | 0 0B | 0 0 WAL: 1 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 1 estimated debt: 0B in progress: 0 (0B) - default: 1 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 1 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/testdata/table_stats b/testdata/table_stats index 4ab5a40da80..ba1bbc4eeaf 100644 --- a/testdata/table_stats +++ b/testdata/table_stats @@ -602,6 +602,7 @@ rocksdb: rocksdb.raw.value.size: 2 rocksdb.deleted.keys: 1 rocksdb.num.range-deletions: 0 + rocksdb.num.data.blocks: 1 rocksdb.compression: Snappy rocksdb.compression_options: window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; rocksdb.comparator: pebble.internal.testkeys @@ -610,7 +611,6 @@ rocksdb: rocksdb.index.size: 27 rocksdb.block.based.table.index.type: 0 rocksdb.merge.operator: pebble.concatenate - rocksdb.num.data.blocks: 1 rocksdb.merge.operands: 0 rocksdb.property.collectors: [obsolete-key] pebble: @@ -655,6 +655,7 @@ rocksdb.raw.key.size: 3 rocksdb.raw.value.size: 1 pebble.raw.point-tombstone.key.size: 1 rocksdb.deleted.keys: 1 +rocksdb.num.data.blocks: 1 properties file=8 ---- @@ -663,6 +664,7 @@ rocksdb.raw.key.size: 3 rocksdb.raw.value.size: 1 pebble.raw.point-tombstone.key.size: 1 rocksdb.deleted.keys: 1 +rocksdb.num.data.blocks: 1 wait-pending-table-stats 000007 @@ -710,6 +712,7 @@ rocksdb: rocksdb.raw.value.size: 3 rocksdb.deleted.keys: 0 rocksdb.num.range-deletions: 0 + rocksdb.num.data.blocks: 1 rocksdb.compression: Snappy rocksdb.compression_options: window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; rocksdb.comparator: pebble.internal.testkeys @@ -718,7 +721,6 @@ rocksdb: rocksdb.index.size: 27 rocksdb.block.based.table.index.type: 0 rocksdb.merge.operator: pebble.concatenate - rocksdb.num.data.blocks: 1 rocksdb.merge.operands: 0 rocksdb.property.collectors: [obsolete-key] pebble: @@ -768,6 +770,7 @@ rocksdb.num.entries: 1 rocksdb.raw.key.size: 2 rocksdb.raw.value.size: 1 pebble.num.range-key-sets: 1 +rocksdb.num.data.blocks: 1 properties file=13 ---- @@ -775,6 +778,7 @@ rocksdb.num.entries: 1 rocksdb.raw.key.size: 2 rocksdb.raw.value.size: 1 pebble.num.range-key-sets: 1 +rocksdb.num.data.blocks: 1 wait-pending-table-stats 000012 @@ -832,6 +836,7 @@ rocksdb: rocksdb.raw.value.size: 1 rocksdb.deleted.keys: 1 rocksdb.num.range-deletions: 1 + rocksdb.num.data.blocks: 1 rocksdb.compression: Snappy rocksdb.compression_options: window_bits=-14; level=32767; strategy=0; max_dict_bytes=0; zstd_max_train_bytes=0; enabled=0; rocksdb.comparator: pebble.internal.testkeys @@ -840,7 +845,6 @@ rocksdb: rocksdb.index.size: 29 rocksdb.block.based.table.index.type: 0 rocksdb.merge.operator: pebble.concatenate - rocksdb.num.data.blocks: 1 rocksdb.merge.operands: 0 rocksdb.property.collectors: [obsolete-key] pebble: @@ -874,6 +878,7 @@ rocksdb.raw.key.size: 1 rocksdb.raw.value.size: 1 rocksdb.deleted.keys: 1 rocksdb.num.range-deletions: 1 +rocksdb.num.data.blocks: 1 properties file=21 ---- @@ -882,6 +887,7 @@ rocksdb.raw.key.size: 1 rocksdb.raw.value.size: 1 rocksdb.deleted.keys: 1 rocksdb.num.range-deletions: 1 +rocksdb.num.data.blocks: 1 wait-pending-table-stats 000020 diff --git a/tool/testdata/db_lsm b/tool/testdata/db_lsm index ab491677480..a0ae12ed665 100644 --- a/tool/testdata/db_lsm +++ b/tool/testdata/db_lsm @@ -25,7 +25,7 @@ total | 1 709B 0B 0 | - | 0B | 0 0B | 0 0 WAL: 0 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) @@ -59,7 +59,7 @@ total | 1 709B 0B 0 | - | 0B | 0 0B | 0 0 WAL: 0 files (0B) in: 0B written: 0B (0% overhead) Flushes: 0 Compactions: 0 estimated debt: 0B in progress: 0 (0B) - default: 0 delete: 0 elision: 0 move: 0 read: 0 rewrite: 0 copy: 0 multi-level: 0 + default: 0 delete: 0 elision: 0 move: 0 read: 0 tombstone-density: 0 rewrite: 0 copy: 0 multi-level: 0 MemTables: 1 (256KB) zombie: 0 (0B) Zombie tables: 0 (0B, local: 0B) Backing tables: 0 (0B) diff --git a/tool/testdata/sstable_properties b/tool/testdata/sstable_properties index a6cfcc116f5..a324a7b7269 100644 Binary files a/tool/testdata/sstable_properties and b/tool/testdata/sstable_properties differ diff --git a/version_set.go b/version_set.go index c0ec1758fbb..2762aed0016 100644 --- a/version_set.go +++ b/version_set.go @@ -842,6 +842,10 @@ func (vs *versionSet) incrementCompactions( vs.metrics.Compact.Count++ vs.metrics.Compact.ReadCount++ + case compactionKindTombstoneDensity: + vs.metrics.Compact.Count++ + vs.metrics.Compact.TombstoneDensityCount++ + case compactionKindRewrite: vs.metrics.Compact.Count++ vs.metrics.Compact.RewriteCount++