From 6fd759aeb0c19f0bfef2251637cd41f364fa0393 Mon Sep 17 00:00:00 2001 From: eldenmoon Date: Thu, 9 Jan 2025 12:00:31 +0800 Subject: [PATCH] [opt] optimize insert_range_from --- .../rowset/segment_v2/variant_column_writer_impl.cpp | 2 ++ be/src/vec/columns/column_object.cpp | 12 ++++++++++-- be/src/vec/data_types/data_type_nothing.h | 7 +------ .../data/variant_p0/test_sub_path_pruning.out | 4 ++-- .../compaction/compaction_sparse_column.groovy | 2 +- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index a26d3d4eab8d54..442595eb0c0b19 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -311,6 +311,8 @@ void VariantStatistics::to_pb(VariantStatisticsPB* stats) const { for (const auto& [path, value] : sparse_column_non_null_size) { stats->mutable_sparse_column_non_null_size()->emplace(path, value); } + LOG(INFO) << "num subcolumns " << subcolumns_non_null_size.size() << ", num sparse columns " + << sparse_column_non_null_size.size(); } void VariantStatistics::from_pb(const VariantStatisticsPB& stats) { diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index c60146a869f939..535e4332bbe540 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -1318,9 +1318,10 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le // We can reach the limit of subcolumns, and in this case // the rest of subcolumns from src will be inserted into sparse column. std::map src_path_and_subcoumn_for_sparse_column; + int idx_hint = 0; for (const auto& entry : src_object.subcolumns) { // Check if we already have such dense column path. - if (auto* subcolumn = get_subcolumn(entry->path); subcolumn != nullptr) { + if (auto* subcolumn = get_subcolumn(entry->path, idx_hint); subcolumn != nullptr) { subcolumn->insert_range_from(entry->data, start, length); } else if (try_add_new_subcolumn(entry->path)) { subcolumn = get_subcolumn(entry->path); @@ -1329,6 +1330,7 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le } else { src_path_and_subcoumn_for_sparse_column.emplace(entry->path.get_path(), entry->data); } + ++idx_hint; } // Paths in sparse column are sorted, so paths from src_dense_column_path_for_sparse_column should be inserted properly @@ -1345,7 +1347,7 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le src_object, std::move(sorted_src_subcolumn_for_sparse_column), start, length); num_rows += length; - finalize(); + // finalize(); ENABLE_CHECK_CONSISTENCY(this); } @@ -1946,6 +1948,12 @@ void ColumnObject::clear_sparse_column() { } Status ColumnObject::finalize(FinalizeMode mode) { + if (is_finalized() && mode == FinalizeMode::READ_MODE) { + doc_structure = nullptr; + _prev_positions.clear(); + ENABLE_CHECK_CONSISTENCY(this); + return Status::OK(); + } Subcolumns new_subcolumns; if (auto root = subcolumns.get_mutable_root(); root == nullptr) { diff --git a/be/src/vec/data_types/data_type_nothing.h b/be/src/vec/data_types/data_type_nothing.h index bb0e095b5a5a36..6741fbd5031b45 100644 --- a/be/src/vec/data_types/data_type_nothing.h +++ b/be/src/vec/data_types/data_type_nothing.h @@ -78,12 +78,7 @@ class DataTypeNothing final : public IDataType { const char* deserialize(const char* buf, MutableColumnPtr* column, int be_exec_version) const override; - [[noreturn]] Field get_default() const override { - throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, - "Method get_default() is not implemented for data type {}.", - get_name()); - __builtin_unreachable(); - } + Field get_default() const override { return Null(); } [[noreturn]] Field get_field(const TExprNode& node) const override { throw doris::Exception(ErrorCode::NOT_IMPLEMENTED_ERROR, diff --git a/regression-test/data/variant_p0/test_sub_path_pruning.out b/regression-test/data/variant_p0/test_sub_path_pruning.out index d1089d6f4cd8c4..ae75160a91d597 100644 --- a/regression-test/data/variant_p0/test_sub_path_pruning.out +++ b/regression-test/data/variant_p0/test_sub_path_pruning.out @@ -229,7 +229,7 @@ 1 {"b":{"c":{"d":{"e":11}}},"c":{"d":{"e":12}},"d":{"e":13},"e":14} -- !sql -- -"1" +1 {"b":{"c":{"d":{"e":11}}},"c":{"d":{"e":12}},"d":{"e":13},"e":14} -- !sql -- @@ -241,7 +241,7 @@ 1 {"d":{"e":11}} -- !sql -- -"1" +1 {"d":{"e":11}} -- !sql -- diff --git a/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy b/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy index 82f60e594cf585..5d753b9738292e 100644 --- a/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy +++ b/regression-test/suites/variant_p1/compaction/compaction_sparse_column.groovy @@ -47,7 +47,7 @@ suite("test_compaction_sparse_column", "p1,nonConcurrent") { try { set_be_config.call("write_buffer_size", "10240") - set_be_config.call("variant_max_subcolumns_count", "3") + set_be_config.call("variant_max_subcolumns_count", "2") sql """ DROP TABLE IF EXISTS ${tableName} """ sql """