From 8d8eb0e77e13a3902d23fbda742dc47aa7bc418f Mon Sep 17 00:00:00 2001 From: "mayue.fight" Date: Thu, 18 May 2023 13:25:01 -0700 Subject: [PATCH] Support Clip DB to KeyRange (#11379) Summary: This PR is part of the request https://github.com/facebook/rocksdb/issues/11317. (Another part is https://github.com/facebook/rocksdb/pull/11378) ClipDB() will clip the entries in the CF according to the range [begin_key, end_key). All the entries outside this range will be completely deleted (including tombstones). This feature is mainly used to ensure that there is no overlapping Key when calling CreateColumnFamilyWithImports() to import multiple CFs. When Calling ClipDB [begin, end), there are the following steps 1. Quickly and directly delete files without overlap DeleteFilesInRanges(nullptr, begin) + DeleteFilesInRanges(end, nullptr) 2. Delete the Key outside the range Delete[smallest_key, begin) + Delete[end, largest_key] 3. Delete the tombstone through Manul Compact CompactRange(option, nullptr, nullptr) Pull Request resolved: https://github.com/facebook/rocksdb/pull/11379 Reviewed By: ajkr Differential Revision: D45840358 Pulled By: cbi42 fbshipit-source-id: 54152e8a45fd8ede137f99787eb252f0b51440a4 --- CMakeLists.txt | 1 + HISTORY.md | 1 + Makefile | 3 + TARGETS | 6 + db/db_clip_test.cc | 142 +++++++++++++++++++++++ db/db_impl/compacted_db_impl.h | 7 ++ db/db_impl/db_impl.cc | 76 +++++++++++- db/db_impl/db_impl.h | 5 + db/db_impl/db_impl_readonly.h | 7 ++ db/db_test.cc | 7 ++ db/version_set.cc | 43 +++++++ db/version_set.h | 3 + include/rocksdb/db.h | 19 +++ include/rocksdb/utilities/stackable_db.h | 7 ++ src.mk | 1 + 15 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 db/db_clip_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 109981c1b4f..7797bde2cb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1290,6 +1290,7 @@ if(WITH_TESTS) db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc db/db_compaction_test.cc + db/db_clip_test.cc db/db_dynamic_level_test.cc db/db_encryption_test.cc db/db_flush_test.cc diff --git a/HISTORY.md b/HISTORY.md index d76c99e59ac..51533d204ed 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ * New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open. ### Public API Changes +* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the key in CF to a certain range. It will physically deletes all keys outside the range including tombstones. * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. ### Behavior changes diff --git a/Makefile b/Makefile index 75a5d735915..b499e8be132 100644 --- a/Makefile +++ b/Makefile @@ -1480,6 +1480,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 52a29b7747b..ff5321312c9 100644 --- a/TARGETS +++ b/TARGETS @@ -4750,6 +4750,12 @@ cpp_unittest_wrapper(name="db_bloom_filter_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_clip_test", + srcs=["db/db_clip_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_compaction_filter_test", srcs=["db/db_compaction_filter_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/db_clip_test.cc b/db/db_clip_test.cc new file mode 100644 index 00000000000..fd0bb57170f --- /dev/null +++ b/db/db_clip_test.cc @@ -0,0 +1,142 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBClipTest : public DBTestBase { + public: + DBClipTest() : DBTestBase("db_clip_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBClipTest, TestClipRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 3; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... [900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + auto begin_key = Key(251), end_key = Key(751); + ASSERT_OK( + db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key, end_key)); + + for (auto i = 0; i < 251; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 251; i < 751; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 751; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata; + db_->GetLiveFilesMetaData(&all_metadata); + for (auto& md : all_metadata) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + + if (options.comparator->Compare(begin_key, md.smallestkey) <= 0 && + options.comparator->Compare(end_key, md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } + + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + for (auto i = 0; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,3", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,3", FilesPerLevel(0)); + + for (auto i = 1; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,5,3", FilesPerLevel(0)); + + auto begin_key_2 = Key(222), end_key_2 = Key(888); + + ASSERT_OK(db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key_2, + end_key_2)); + + for (auto i = 0; i < 222; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 222; i < 888; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 888; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata_2; + db_->GetLiveFilesMetaData(&all_metadata_2); + for (auto& md : all_metadata_2) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + if (begin_key_2.compare(md.smallestkey) <= 0 && + end_key_2.compare(md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index e0e0d566494..9879d81b618 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -128,6 +128,13 @@ class CompactedDBImpl : public DBImpl { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + // FIXME: some missing overrides for more "write" functions // Share with DBImplReadOnly? diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 07aee535db5..2ed0caf7f24 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4522,7 +4522,6 @@ void DBImpl::GetAllColumnFamilyMetaData( } } - Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; @@ -5705,6 +5704,81 @@ Status DBImpl::CreateColumnFamilyWithImport( return status; } +Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + assert(column_family); + Status status; + // Flush memtable + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (immutable_db_options_.atomic_flush) { + status = AtomicFlushMemTables(flush_opts, FlushReason::kDeleteFiles, + {} /* provided_candidate_cfds */, + false /* entered_write_thread */); + } else { + status = FlushMemTable(cfd, flush_opts, FlushReason::kDeleteFiles, + false /* entered_write_thread */); + } + + if (status.ok()) { + // DeleteFilesInRanges non-overlap files except L0 + std::vector ranges; + ranges.push_back(RangePtr(nullptr, &begin_key)); + ranges.push_back(RangePtr(&end_key, nullptr)); + status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); + } + + // DeleteRange the remaining overlapping keys + bool empty_after_delete = false; + if (status.ok()) { + Slice smallest_user_key, largest_user_key; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + cfd->current()->GetSstFilesBoundaryKeys(&smallest_user_key, + &largest_user_key); + } + // all the files has been deleted after DeleteFilesInRanges; + if (smallest_user_key.empty() && largest_user_key.empty()) { + empty_after_delete = true; + } else { + const Comparator* const ucmp = column_family->GetComparator(); + WriteOptions wo; + // Delete [smallest_user_key, clip_begin_key) + if (ucmp->Compare(smallest_user_key, begin_key) < 0) { + status = DeleteRange(wo, column_family, smallest_user_key, begin_key); + } + + if (status.ok()) { + // Delete [clip_end_key, largest_use_key] + if (ucmp->Compare(end_key, largest_user_key) < 0) { + status = DeleteRange(wo, column_family, end_key, largest_user_key); + if (status.ok()) { + status = Delete(wo, column_family, largest_user_key); + } + } + } + } + } + + if (status.ok() && !empty_after_delete) { + // CompactRange delete all the tombstones + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = true; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + // We could just compact the ranges [null, clip_begin_key] and + // [clip_end_key, null]. But due to how manual compaction calculates the + // last level to compact to and that range tombstones are not dropped + // during non-bottommost compactions, calling CompactRange() on these two + // ranges may not clear all range tombstones. + status = CompactRange(compact_options, nullptr, nullptr); + } + return status; +} + Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 8f782a34dc4..f2da7467db5 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -534,6 +534,11 @@ class DBImpl : public DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override; + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_options) override; diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 1cc37419814..a694acc0033 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -142,6 +142,13 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + // FIXME: some missing overrides for more "write" functions protected: diff --git a/db/db_test.cc b/db/db_test.cc index fdf82153c98..d23daa55dfc 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3107,6 +3107,13 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; Status GetPropertiesOfAllTables( ColumnFamilyHandle* /*column_family*/, diff --git a/db/version_set.cc b/db/version_set.cc index b9610331ef5..674c0e4aa99 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1826,6 +1826,49 @@ uint64_t Version::GetSstFilesSize() { return sst_files_size; } +void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key) { + smallest_user_key->clear(); + largest_user_key->clear(); + bool initialized = false; + const Comparator* ucmp = storage_info_.user_comparator_; + for (int level = 0; level < cfd_->NumberLevels(); level++) { + if (storage_info_.LevelFiles(level).size() == 0) { + continue; + } + if (level == 0) { + // we need to consider all files on level 0 + for (const auto& file : storage_info_.LevelFiles(level)) { + const Slice& start_user_key = file->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = file->largest.user_key(); + if (!initialized || + ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + const Slice& start_user_key = + storage_info_.LevelFiles(level)[0]->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = + storage_info_.LevelFiles(level).back()->largest.user_key(); + if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } +} + void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { uint64_t oldest_time = std::numeric_limits::max(); for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { diff --git a/db/version_set.h b/db/version_set.h index e7e96bc6cd6..25ad365837a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -992,6 +992,9 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + void GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key); + uint64_t GetSstFilesSize(); // Retrieves the file_creation_time of the oldest file in the DB. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 6539eb8aeb4..a42f5b8b6df 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1763,6 +1763,25 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // EXPERIMENTAL + // ClipColumnFamily() will clip the entries in the CF according to the range + // [begin_key, + // end_key). + // Returns OK on success, and a non-OK status on error. + // Any entries outside this range will be completely deleted (including + // tombstones). + // The main difference between ClipColumnFamily(begin, end) and + // DeleteRange(begin, end) + // is that the former physically deletes all keys outside the range, but is + // more heavyweight than the latter. + // This feature is mainly used to ensure that there is no overlapping Key when + // calling + // CreateColumnFamilyWithImports() to import multiple CFs. + // Note that: concurrent updates cannot be performed during Clip. + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of // table files are checked. virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index abb365ad37f..1a87a11366d 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -178,6 +178,13 @@ class StackableDB : public DB { import_options, metadata, handle); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override { + return db_->ClipColumnFamily(column_family, begin_key, end_key); + } + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_opts) override { return db_->VerifyFileChecksums(read_opts); diff --git a/src.mk b/src.mk index baaea29be64..eb70ac04b73 100644 --- a/src.mk +++ b/src.mk @@ -460,6 +460,7 @@ TEST_MAIN_SOURCES = \ db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ db/db_compaction_test.cc \ + db/db_clip_test.cc \ db/db_dynamic_level_test.cc \ db/db_encryption_test.cc \ db/db_flush_test.cc \