Skip to content

Commit

Permalink
Tiered Compaction: per key placement support (facebook#9964)
Browse files Browse the repository at this point in the history
Summary:
Support per_key_placement for last level compaction, which will
be used for tiered compaction.
* compaction iterator reports which level a key should output to;
* compaction gets the output level information and checks whether it is safe
  to output the data to the penultimate level;
* all compaction output files will be installed.
* extra internal compaction stats added for penultimate level.

Pull Request resolved: facebook#9964

Test Plan:
* Unittest
* db_bench, no significant difference: https://gist.github.com/jay-zhuang/3645f8fb97ec0ab47c10704bb39fd6e4
* microbench manual compaction, no significant difference: https://gist.github.com/jay-zhuang/ba679b3e89e24992615ee9eef310e6dd
* run the db_stress multiple times (not covering the new feature) looks good (internal: https://fburl.com/sandcastle/9w84pp2m)

Reviewed By: ajkr

Differential Revision: D36249494

Pulled By: jay-zhuang

fbshipit-source-id: a96da57c8031c1df83e4a7a8567b657a112b80a3
  • Loading branch information
jay-zhuang authored and facebook-github-bot committed Jul 14, 2022
1 parent 7e1b417 commit 6ce0b2c
Show file tree
Hide file tree
Showing 26 changed files with 4,507 additions and 1,700 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,11 @@ set(SOURCES
db/compaction/compaction_picker_fifo.cc
db/compaction/compaction_picker_level.cc
db/compaction/compaction_picker_universal.cc
db/compaction/compaction_service_job.cc
db/compaction/compaction_state.cc
db/compaction/compaction_outputs.cc
db/compaction/sst_partitioner.cc
db/compaction/subcompaction_state.cc
db/convenience.cc
db/db_filesnapshot.cc
db/db_impl/compacted_db_impl.cc
Expand Down Expand Up @@ -1231,6 +1235,7 @@ if(WITH_TESTS)
db/compaction/compaction_iterator_test.cc
db/compaction/compaction_picker_test.cc
db/compaction/compaction_service_test.cc
db/compaction/tiered_compaction_test.cc
db/comparator_db_test.cc
db/corruption_test.cc
db/cuckoo_table_db_test.cc
Expand Down
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1783,6 +1783,9 @@ write_unprepared_transaction_test: $(OBJ_DIR)/utilities/transactions/write_unpre
timestamped_snapshot_test: $(OBJ_DIR)/utilities/transactions/timestamped_snapshot_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)

tiered_compaction_test: $(OBJ_DIR)/db/compaction/tiered_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)

sst_dump: $(OBJ_DIR)/tools/sst_dump.o $(TOOLS_LIBRARY) $(LIBRARY)
$(AM_LINK)

Expand Down
14 changes: 14 additions & 0 deletions TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,15 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"db/compaction/compaction.cc",
"db/compaction/compaction_iterator.cc",
"db/compaction/compaction_job.cc",
"db/compaction/compaction_outputs.cc",
"db/compaction/compaction_picker.cc",
"db/compaction/compaction_picker_fifo.cc",
"db/compaction/compaction_picker_level.cc",
"db/compaction/compaction_picker_universal.cc",
"db/compaction/compaction_service_job.cc",
"db/compaction/compaction_state.cc",
"db/compaction/sst_partitioner.cc",
"db/compaction/subcompaction_state.cc",
"db/convenience.cc",
"db/db_filesnapshot.cc",
"db/db_impl/compacted_db_impl.cc",
Expand Down Expand Up @@ -368,11 +372,15 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"db/compaction/compaction.cc",
"db/compaction/compaction_iterator.cc",
"db/compaction/compaction_job.cc",
"db/compaction/compaction_outputs.cc",
"db/compaction/compaction_picker.cc",
"db/compaction/compaction_picker_fifo.cc",
"db/compaction/compaction_picker_level.cc",
"db/compaction/compaction_picker_universal.cc",
"db/compaction/compaction_service_job.cc",
"db/compaction/compaction_state.cc",
"db/compaction/sst_partitioner.cc",
"db/compaction/subcompaction_state.cc",
"db/convenience.cc",
"db/db_filesnapshot.cc",
"db/db_impl/compacted_db_impl.cc",
Expand Down Expand Up @@ -5764,6 +5772,12 @@ cpp_unittest_wrapper(name="thread_local_test",
extra_compiler_flags=[])


cpp_unittest_wrapper(name="tiered_compaction_test",
srcs=["db/compaction/tiered_compaction_test.cc"],
deps=[":rocksdb_test_lib"],
extra_compiler_flags=[])


cpp_unittest_wrapper(name="timer_queue_test",
srcs=["util/timer_queue_test.cc"],
deps=[":rocksdb_test_lib"],
Expand Down
83 changes: 78 additions & 5 deletions db/compaction/compaction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,11 @@ void Compaction::SetInputVersion(Version* _input_version) {
void Compaction::GetBoundaryKeys(
VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
Slice* largest_user_key) {
Slice* largest_user_key, int exclude_level) {
bool initialized = false;
const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i].files.empty()) {
if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
continue;
}
if (inputs[i].level == 0) {
Expand Down Expand Up @@ -257,7 +257,9 @@ Compaction::Compaction(
_blob_garbage_collection_age_cutoff < 0 ||
_blob_garbage_collection_age_cutoff > 1
? mutable_cf_options()->blob_garbage_collection_age_cutoff
: _blob_garbage_collection_age_cutoff) {
: _blob_garbage_collection_age_cutoff),
penultimate_level_(EvaluatePenultimateLevel(
immutable_options_, start_level_, output_level_)) {
MarkFilesBeingCompacted(true);
if (is_manual_compaction_) {
compaction_reason_ = CompactionReason::kManualCompaction;
Expand Down Expand Up @@ -303,6 +305,18 @@ Compaction::Compaction(
}
}
}

PopulatePenultimateLevelOutputRange();
}

// Fills penultimate_level_smallest_user_key_ / penultimate_level_largest_user_key_
// with the boundary user keys of the compaction inputs, skipping the inputs
// whose level equals `number_levels_ - 1`. Does nothing when per-key placement
// is not supported for this compaction.
void Compaction::PopulatePenultimateLevelOutputRange() {
  if (SupportsPerKeyPlacement()) {
    const int excluded_level = number_levels_ - 1;
    GetBoundaryKeys(input_vstorage_, inputs_,
                    &penultimate_level_smallest_user_key_,
                    &penultimate_level_largest_user_key_, excluded_level);
  }
}

Compaction::~Compaction() {
Expand All @@ -314,6 +328,37 @@ Compaction::~Compaction() {
}
}

bool Compaction::SupportsPerKeyPlacement() const {
return penultimate_level_ != kInvalidLevel;
}

// Returns the stored penultimate output level; this is kInvalidLevel when
// per-key placement is not supported.
int Compaction::GetPenultimateLevel() const {
  return penultimate_level_;
}

bool Compaction::OverlapPenultimateLevelOutputRange(
const Slice& smallest_key, const Slice& largest_key) const {
if (!SupportsPerKeyPlacement()) {
return false;
}
const Comparator* ucmp =
input_vstorage_->InternalComparator()->user_comparator();

return ucmp->Compare(smallest_key, penultimate_level_largest_user_key_) <=
0 &&
ucmp->Compare(largest_key, penultimate_level_smallest_user_key_) >= 0;
}

bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
if (!SupportsPerKeyPlacement()) {
return false;
}

const Comparator* ucmp =
input_vstorage_->InternalComparator()->user_comparator();

return ucmp->Compare(key, penultimate_level_smallest_user_key_) >= 0 &&
ucmp->Compare(key, penultimate_level_largest_user_key_) <= 0;
}

bool Compaction::InputCompressionMatchesOutput() const {
int base_level = input_vstorage_->base_level();
bool matches =
Expand Down Expand Up @@ -677,8 +722,36 @@ uint64_t Compaction::MinInputFileOldestAncesterTime(
return min_oldest_ancester_time;
}

int Compaction::GetInputBaseLevel() const {
return input_vstorage_->base_level();
// Determines the penultimate output level for a compaction described by
// `immutable_options`, `start_level` and `output_level`.
// Returns `output_level - 1` when per-key placement applies, otherwise
// kInvalidLevel.
int Compaction::EvaluatePenultimateLevel(
    const ImmutableOptions& immutable_options, const int start_level,
    const int output_level) {
  // TODO: currently the per_key_placement feature only supports level and
  // universal compaction
  const bool style_supported =
      immutable_options.compaction_style == kCompactionStyleLevel ||
      immutable_options.compaction_style == kCompactionStyleUniversal;
  if (!style_supported) {
    return kInvalidLevel;
  }

  // Only compactions that output to the last level qualify.
  const int last_level = immutable_options.num_levels - 1;
  if (output_level != last_level) {
    return kInvalidLevel;
  }

  const int candidate_level = output_level - 1;
  assert(candidate_level < immutable_options.num_levels);
  // The candidate must be a positive level that is not above the start level.
  if (candidate_level <= 0 || candidate_level < start_level) {
    return kInvalidLevel;
  }

  // TODO: a public option like `options.preclude_last_level_data_seconds`
  // will be added for the per_key_placement feature and checked here.
  // Currently the feature can only be switched on through this sync point
  // callback, which is set by unittests.
  bool enabled = false;
  TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
                           &enabled);
  if (enabled) {
    return candidate_level;
  }
  return kInvalidLevel;
}

} // namespace ROCKSDB_NAMESPACE
70 changes: 68 additions & 2 deletions db/compaction/compaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,25 @@ class Compaction {

Slice GetLargestUserKey() const { return largest_user_key_; }

int GetInputBaseLevel() const;
// Return true if the compaction supports per_key_placement
bool SupportsPerKeyPlacement() const;

// Get per_key_placement penultimate output level, which is `last_level - 1`
// if per_key_placement feature is supported. Otherwise, return -1.
int GetPenultimateLevel() const;

// Return true if the given range overlaps with the penultimate level output
// range.
bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
const Slice& largest_key) const;

// Return true if the key is within the penultimate level output range for
// the per_key_placement feature, i.e. it is safe to place the key in the
// penultimate level. Different compaction strategies have different rules.
// If per_key_placement is not supported, always return false.
// TODO: currently it doesn't support moving data from the last level to the
// penultimate level
bool WithinPenultimateLevelOutputRange(const Slice& key) const;

CompactionReason compaction_reason() const { return compaction_reason_; }

Expand Down Expand Up @@ -339,14 +357,34 @@ class Compaction {
return notify_on_compaction_completion_;
}

static constexpr int kInvalidLevel = -1;
// Evaluate the penultimate output level. If the compaction supports the
// per_key_placement feature, it returns the penultimate level number.
// Otherwise, it returns kInvalidLevel (-1), which means
// output_to_penultimate_level is not supported.
static int EvaluatePenultimateLevel(const ImmutableOptions& immutable_options,
const int start_level,
const int output_level);

private:
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool mark_as_compacted);

// get the smallest and largest key present in files to be compacted
static void GetBoundaryKeys(VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs,
Slice* smallest_key, Slice* largest_key);
Slice* smallest_key, Slice* largest_key,
int exclude_level = -1);

// Populate the penultimate level output range, which is used to determine
// whether a key is safe to output to the penultimate level (for details, see
// `Compaction::WithinPenultimateLevelOutputRange()`).
// TODO: Currently the penultimate level output range is the min/max keys of
// the non-last-level input files, which is only correct if no key is moved
// from the last level to the penultimate level. A more complicated per-key
// placement that may move data from the last level to the penultimate level
// needs an extra check.
void PopulatePenultimateLevelOutputRange();

// Get the atomic file boundaries for all files in the compaction. Necessary
// in order to avoid the scenario described in
Expand Down Expand Up @@ -444,7 +482,35 @@ class Compaction {

// Blob garbage collection age cutoff.
double blob_garbage_collection_age_cutoff_;

// only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
// means not supported.
const int penultimate_level_;

// Key range for penultimate level output
Slice penultimate_level_smallest_user_key_;
Slice penultimate_level_largest_user_key_;
};

#ifndef NDEBUG
// Helper struct only for tests, which contains the data to decide if a key
// should be output to the penultimate level.
// TODO: remove this when the public feature knob is available
struct PerKeyPlacementContext {
  // Read-only description of the entry being placed, fixed at construction.
  const int level;
  const Slice key;
  const Slice value;
  const SequenceNumber seq_num;

  // Placement decision; starts out false and may be changed by whoever
  // receives this context.
  bool output_to_penultimate_level = false;

  PerKeyPlacementContext(int _level, Slice _key, Slice _value,
                         SequenceNumber _seq_num)
      : level(_level), key(_key), value(_value), seq_num(_seq_num) {}
};
#endif /* !NDEBUG */

// Return sum of sizes of all files in `files`.
extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
Expand Down
53 changes: 52 additions & 1 deletion db/compaction/compaction_iterator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,52 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
}
}

// Decides whether the current key (ikey_) should be output to the penultimate
// level and records the decision in output_to_penultimate_level_.
//
// The decision is made in three steps:
//   1. start from "last level" (false), optionally overridden by the test-only
//      sync point callback in debug builds;
//   2. force the penultimate level for keys whose sequence number is newer
//      than earliest_snapshot_;
//   3. fall back to the last level when the compaction reports that the key is
//      outside the penultimate level output range. If that fallback hits a key
//      newer than earliest_snapshot_, status_ is set to Corruption.
void CompactionIterator::DecideOutputLevel() {
  // Always start from a clean per-key state. Without this, builds with NDEBUG
  // defined never reset the flag (the only other assignment to `false` before
  // the snapshot check lives in the debug-only section below), so a `true`
  // decided for one key would carry over to every following key.
  output_to_penultimate_level_ = false;

#ifndef NDEBUG
  // TODO: will be set by sequence number or key range, for now, it will only be
  // set by unittest
  PerKeyPlacementContext context(level_, ikey_.user_key, value_,
                                 ikey_.sequence);
  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
                           &context);
  output_to_penultimate_level_ = context.output_to_penultimate_level;
#endif /* !NDEBUG */

  // if the key is within the earliest snapshot, it has to output to the
  // penultimate level.
  if (ikey_.sequence > earliest_snapshot_) {
    output_to_penultimate_level_ = true;
  }

  if (output_to_penultimate_level_) {
    // If it's decided to output to the penultimate level, but unsafe to do so,
    // still output to the last level. For example, moving the data from a lower
    // level to a higher level outside of the higher-level input key range is
    // considered unsafe, because the key may conflict with higher-level SSTs
    // not from this compaction.
    // TODO: add statistic for declined output_to_penultimate_level
    bool safe_to_penultimate_level =
        compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
    if (!safe_to_penultimate_level) {
      output_to_penultimate_level_ = false;
      // It could happen when disable/enable `bottommost_temperature` while
      // holding a snapshot. When `bottommost_temperature` is not set
      // (==kUnknown), the data newer than any snapshot is pushed to the last
      // level, but when the per_key_placement feature is enabled on the fly,
      // the data later than the snapshot has to be moved to the penultimate
      // level, which may or may not be safe. So the user needs to make sure all
      // snapshot is released before enabling `bottommost_temperature` feature
      // We will migrate the feature to `last_level_temperature` and maybe make
      // it not dynamically changeable.
      if (ikey_.sequence > earliest_snapshot_) {
        status_ = Status::Corruption(
            "Unsafe to store Seq later than snapshot in the last level if "
            "per_key_placement is enabled");
      }
    }
  }
}

void CompactionIterator::PrepareOutput() {
if (valid_) {
if (ikey_.type == kTypeValue) {
Expand All @@ -1083,6 +1129,10 @@ void CompactionIterator::PrepareOutput() {
GarbageCollectBlobIfNeeded();
}

if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
DecideOutputLevel();
}

// Zeroing out the sequence number leads to better compression.
// If this is the bottommost level (no files in lower levels)
// and the earliest snapshot is larger than this seqno
Expand All @@ -1097,7 +1147,8 @@ void CompactionIterator::PrepareOutput() {
if (valid_ && compaction_ != nullptr &&
!compaction_->allow_ingest_behind() && bottommost_level_ &&
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
ikey_.type != kTypeMerge && current_key_committed_) {
ikey_.type != kTypeMerge && current_key_committed_ &&
!output_to_penultimate_level_) {
if (ikey_.type == kTypeDeletion ||
(ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
ROCKS_LOG_FATAL(
Expand Down
Loading

0 comments on commit 6ce0b2c

Please sign in to comment.