add debug point test

eldenmoon · Dec 25, 2024 · 81411f2 · 81411f2
1 parent 5ec1358
commit 81411f2
Show file tree

Hide file tree

Showing 15 changed files with 170 additions and 141 deletions.
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
@@ -1019,9 +1019,8 @@ DEFINE_mInt64(workload_group_scan_task_wait_timeout_ms, "10000");
 
 // Whether use schema dict in backend side instead of MetaService side(cloud mode)
 DEFINE_mBool(variant_use_cloud_schema_dict, "true");
-DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1");
-DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048");
 DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false");
+DEFINE_mInt32(variant_max_subcolumns_count, "5");
 
 // block file cache
 DEFINE_Bool(enable_file_cache, "false");

diff --git a/be/src/common/config.h b/be/src/common/config.h
@@ -1216,15 +1216,13 @@ DECLARE_mInt64(lookup_connection_cache_capacity);
 // level of compression when using LZ4_HC, whose defalut value is LZ4HC_CLEVEL_DEFAULT
 DECLARE_mInt64(LZ4_HC_compression_level);
 // Threshold of a column as sparse column
-// Notice: TEST ONLY
-DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column);
 DECLARE_mBool(variant_use_cloud_schema_dict);
 // Threshold to estimate a column is sparsed
-// Notice: TEST ONLY
-DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column);
 // Treat invalid json format str as string, instead of throwing exception if false
 DECLARE_mBool(variant_throw_exeception_on_invalid_json);
 
+DECLARE_mInt32(variant_max_subcolumns_count);
+
 DECLARE_mBool(enable_merge_on_write_correctness_check);
 // USED FOR DEBUGING
 // core directly if the compaction found there's duplicate key on mow table

diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
@@ -16,8 +16,10 @@
 // under the License.
 #include "olap/rowset/segment_v2/variant_column_writer_impl.h"
 
+#include <fmt/core.h>
 #include <gen_cpp/segment_v2.pb.h>
 
+#include "common/config.h"
 #include "common/status.h"
 #include "olap/olap_common.h"
 #include "olap/rowset/beta_rowset.h"
@@ -43,14 +45,14 @@ VariantColumnWriterImpl::VariantColumnWriterImpl(const ColumnWriterOptions& opts
 
 Status VariantColumnWriterImpl::init() {
     // caculate stats info
-    std::set<std::string> dynamic_paths;
-    RETURN_IF_ERROR(_get_subcolumn_paths_from_stats(dynamic_paths));
-    if (dynamic_paths.empty()) {
+    std::set<std::string> subcolumn_paths;
+    RETURN_IF_ERROR(_get_subcolumn_paths_from_stats(subcolumn_paths));
+    if (subcolumn_paths.empty()) {
         _column = vectorized::ColumnObject::create(true, false);
     } else {
         // create root
         auto col = vectorized::ColumnObject::create(true, true);
-        for (const auto& str_path : dynamic_paths) {
+        for (const auto& str_path : subcolumn_paths) {
             DCHECK(col->add_sub_column(vectorized::PathInData(str_path), 0));
         }
         _column = std::move(col);
@@ -97,19 +99,20 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
             }
         }
     }
-    // Check if the number of all dynamic paths exceeds the limit.
-    if (path_to_total_number_of_non_null_values.size() > vectorized::ColumnObject::MAX_SUBCOLUMNS) {
+
+    // Check if the number of all subcolumn paths exceeds the limit.
+    if (path_to_total_number_of_non_null_values.size() > config::variant_max_subcolumns_count) {
         // Sort paths by total number of non null values.
         std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
         paths_with_sizes.reserve(path_to_total_number_of_non_null_values.size());
         for (const auto& [path, size] : path_to_total_number_of_non_null_values) {
             paths_with_sizes.emplace_back(size, path);
         }
         std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
-        // Fill dynamic_paths with first max_dynamic_paths paths in sorted list.
+        // Fill subcolumn_paths with first subcolumn paths in sorted list.
         // reserve 1 for root column
         for (const auto& [size, path] : paths_with_sizes) {
-            if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS - 1) {
+            if (paths.size() < config::variant_max_subcolumns_count - 1) {
                 VLOG_DEBUG << "pick " << path << " as subcolumn";
                 paths.emplace(path);
             }
@@ -118,8 +121,31 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
             //     new_statistics.sparse_data_paths_statistics.emplace(path, size);
             // }
         }
+        DBUG_EXECUTE_IF("variant_column_writer_impl._get_subcolumn_paths_from_stats", {
+            auto stats = DebugPoints::instance()->get_debug_param_or_default<std::string>(
+                    "variant_column_writer_impl._get_subcolumn_paths_from_stats", "stats", "");
+            auto subcolumns = DebugPoints::instance()->get_debug_param_or_default<std::string>(
+                    "variant_column_writer_impl._get_subcolumn_paths_from_stats", "subcolumns", "");
+            LOG(INFO) << "stats: " << stats;
+            LOG(INFO) << "subcolumns: " << subcolumns;
+            if (stats.empty()) {
+                return Status::Error<ErrorCode::INTERNAL_ERROR>("debug point stats is empty");
+            }
+            std::vector<std::string> sizes;
+            boost::split(sizes, stats, boost::algorithm::is_any_of(","));
+            CHECK_EQ(sizes.size(), paths_with_sizes.size()) << "stats not match " << stats;
+            for (int i = 0; i < sizes.size(); ++i) {
+                CHECK_EQ(fmt::format("{}", paths_with_sizes[i].first), sizes[i]);
+            }
+            std::set<std::string> subcolumns_set;
+            boost::split(subcolumns_set, subcolumns, boost::algorithm::is_any_of(","));
+            if (!std::equal(paths.begin(), paths.end(), subcolumns_set.begin(),
+                            subcolumns_set.end())) {
+                CHECK(false) << "subcolumns not match " << subcolumns;
+            }
+        })
     } else {
-        // Use all dynamic paths from all source columns.
+        // Use all subcolumn paths from all source columns.
         for (const auto& [path, _] : path_to_total_number_of_non_null_values) {
             VLOG_DEBUG << "pick " << path << " as subcolumn";
             paths.emplace(path);

diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp
@@ -662,19 +662,6 @@ void ColumnObject::resize(size_t n) {
     ENABLE_CHECK_CONSISTENCY(this);
 }
 
-bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) {
-    if (num_rows < config::variant_threshold_rows_to_estimate_sparse_column) {
-        return false;
-    }
-    std::vector<double> defaults_ratio;
-    for (size_t i = 0; i < data.size(); ++i) {
-        defaults_ratio.push_back(data[i]->get_ratio_of_default_rows());
-    }
-    double default_ratio = std::accumulate(defaults_ratio.begin(), defaults_ratio.end(), 0.0) /
-                           defaults_ratio.size();
-    return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
-}
-
 void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
     if (is_finalized()) {
         return;
@@ -1273,7 +1260,7 @@ void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo&
 }
 
 bool ColumnObject::try_add_new_subcolumn(const PathInData& path) {
-    if (subcolumns.size() == MAX_SUBCOLUMNS) return false;
+    if (subcolumns.size() == config::variant_max_subcolumns_count) return false;
 
     return add_sub_column(path, num_rows);
 }
@@ -1919,7 +1906,8 @@ Status ColumnObject::finalize(FinalizeMode mode) {
     }
 
     const bool need_pick_subcolumn_to_sparse_column =
-            mode == FinalizeMode::WRITE_MODE && subcolumns.size() > MAX_SUBCOLUMNS;
+            mode == FinalizeMode::WRITE_MODE &&
+            subcolumns.size() > config::variant_max_subcolumns_count;
     // finalize all subcolumns
     for (auto&& entry : subcolumns) {
         const auto& least_common_type = entry->data.get_least_common_type();
@@ -1966,8 +1954,10 @@ Status ColumnObject::finalize(FinalizeMode mode) {
         std::sort(sorted_by_size.begin(), sorted_by_size.end(),
                   [](const auto& a, const auto& b) { return a.second > b.second; });
 
-        // 3. pick MAX_SUBCOLUMNS selected subcolumns
-        for (size_t i = 0; i < std::min(MAX_SUBCOLUMNS, sorted_by_size.size()); ++i) {
+        // 3. pick config::variant_max_subcolumns_count selected subcolumns
+        for (size_t i = 0;
+             i < std::min(size_t(config::variant_max_subcolumns_count), sorted_by_size.size());
+             ++i) {
             // if too many null values, then consider it as sparse column
             if (sorted_by_size[i].second < num_rows * 0.95) {
                 continue;
@@ -2149,16 +2139,6 @@ const DataTypePtr ColumnObject::NESTED_TYPE = std::make_shared<vectorized::DataT
         std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>(
                 std::make_shared<vectorized::DataTypeObject>())));
 
-// const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
-#ifndef NDEBUG
-const size_t ColumnObject::MAX_SUBCOLUMNS = []() -> size_t {
-    std::srand(std::time(nullptr)); // 初始化随机数种子
-    return 2 + std::rand() % 8;    // 随机值范围 [1, 10]
-}();
-#else
-const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
-#endif
-
 DataTypePtr ColumnObject::get_root_type() const {
     return subcolumns.get_root()->data.get_least_common_type();
 }

diff --git a/be/src/vec/columns/column_object.h b/be/src/vec/columns/column_object.h
@@ -100,7 +100,6 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
     constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
     // Nullable(Array(Nullable(Object)))
     const static DataTypePtr NESTED_TYPE;
-    const static size_t MAX_SUBCOLUMNS;
     // Finlize mode for subcolumns, write mode will estimate which subcolumns are sparse columns(too many null values inside column),
     // merge and encode them into a shared column in root column. Only affects in flush block to segments.
     // Otherwise read mode should be as default mode.
@@ -171,8 +170,6 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
         /// Returns last inserted field.
         Field get_last_field() const;
 
-        bool check_if_sparse_column(size_t num_rows);
-
         /// Returns single column if subcolumn in finalizes.
         /// Otherwise -- undefined behaviour.
         IColumn& get_finalized_column();

diff --git a/regression-test/data/variant_p1/compaction/compaction_sparse_column.out b/regression-test/data/variant_p1/compaction/compaction_sparse_column.out
@@ -1,6 +1,6 @@
 -- This file is automatically generated. You should know what you did if you want to edit this
 -- !select_b_bfcompact --
-12291
+12292
 
 -- !select_xxxx_bfcompact --
 12291
@@ -48,7 +48,7 @@
 3	1234	\N	ddddd	1	\N
 
 -- !select_b --
-12291
+12292
 
 -- !select_xxxx --
 12291

diff --git a/...test/suites/inverted_index_p0/test_show_nested_index_file_http_action_with_variant.groovy b/...test/suites/inverted_index_p0/test_show_nested_index_file_http_action_with_variant.groovy
@@ -59,7 +59,6 @@ suite("test_show_nested_index_file_http_action_with_variant", "nonConcurrent,p0"
     }
 
     set_be_config.call("memory_limitation_per_thread_for_schema_change_bytes", "6294967296")
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
     def run_test = { format ->
         def tableName = "test_show_nested_index_file_http_action_with_variant_" + format
 

diff --git a/regression-test/suites/variant_github_events_new_p2/load.groovy b/regression-test/suites/variant_github_events_new_p2/load.groovy
@@ -53,8 +53,6 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
             }
         }
     }
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
-
     def table_name = "github_events"
     sql """DROP TABLE IF EXISTS ${table_name}"""
     table_name = "github_events"

diff --git a/regression-test/suites/variant_github_events_p2/load.groovy b/regression-test/suites/variant_github_events_p2/load.groovy
@@ -160,7 +160,7 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
         DISTRIBUTED BY HASH(k) BUCKETS 4 
         properties("replication_num" = "1", "disable_auto_compaction" = "true", "bloom_filter_columns" = "v", "variant_enable_flatten_nested" = "true");
     """
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
+
     // 2015
     load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
     load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-1.json'}""")

diff --git a/regression-test/suites/variant_log_data_p2/load.groovy b/regression-test/suites/variant_log_data_p2/load.groovy
@@ -72,27 +72,27 @@ suite("regression_test_variant_logdata", "nonConcurrent,p2"){
     create_table.call(table_name, "DUPLICATE", "4")
     // sql "set enable_two_phase_read_opt = false;"
     // no sparse columns
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
+
     load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
     qt_sql_32 """ select json_extract(v, "\$.json.parseFailed") from logdata where  json_extract(v, "\$.json.parseFailed") != 'null' order by k limit 1;"""
     qt_sql_32_1 """select cast(v['json']['parseFailed'] as string) from  logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
     sql "truncate table ${table_name}"
 
     // 0.95 default ratio    
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
     load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
     qt_sql_33 """ select json_extract(v,"\$.json.parseFailed") from logdata where  json_extract(v,"\$.json.parseFailed") != 'null' order by k limit 1;"""
     qt_sql_33_1 """select cast(v['json']['parseFailed'] as string) from  logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
     sql "truncate table ${table_name}"
 
     // always sparse column
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
     load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
     qt_sql_34 """ select json_extract(v, "\$.json.parseFailed") from logdata where  json_extract(v,"\$.json.parseFailed") != 'null' order by k limit 1;"""
     sql "truncate table ${table_name}"
     qt_sql_35 """select json_extract(v,"\$.json.parseFailed")  from logdata where k = 162 and  json_extract(v,"\$.json.parseFailed") != 'null';"""
     qt_sql_35_1 """select cast(v['json']['parseFailed'] as string) from  logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
     // TODO add test case that some certain columns are materialized in some file while others are not materilized(sparse)
     // unique table
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
+
 }
diff --git a/regression-test/suites/variant_p0/desc.groovy b/regression-test/suites/variant_p0/desc.groovy
@@ -97,7 +97,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
         // sparse columns
         def table_name = "sparse_columns"
         create_table table_name
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
         sql """set describe_extend_variant_column = true"""
         sql """insert into  sparse_columns select 0, '{"a": 11245, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}'  as json_str
             union  all select 0, '{"a": 1123}' as json_str union all select 0, '{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096") limit 4096 ;"""
@@ -115,7 +115,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
         table_name = "no_sparse_columns"
         create_table.call(table_name, "4")
         sql "set enable_two_phase_read_opt = false;"
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
+
         sql """insert into  ${table_name} select 0, '{"a": 11245, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}'  as json_str
             union  all select 0, '{"a": 1123}' as json_str union all select 0, '{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096") limit 4096 ;"""
         sql "select * from no_sparse_columns limit 1"
@@ -126,7 +126,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
         table_name = "partition_data"
         create_table_partition.call(table_name, "4")
         sql "set enable_two_phase_read_opt = false;"
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
         sql """insert into  ${table_name} select 2500, '{"a": 1123, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}, "zzz" : null, "oooo" : {"akakaka" : null, "xxxx" : {"xxx" : 123}}}'  as json_str
             union  all select 2500, '{"a" : 1234, "xxxx" : "kaana", "ddd" : {"aaa" : 123, "mxmxm" : [456, "789"]}}' as json_str from numbers("number" = "4096") limit 4096 ;"""
         sql """insert into  ${table_name} select 45000, '{"a": 11245, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}'  as json_str
@@ -274,6 +274,6 @@ suite("regression_test_variant_desc", "nonConcurrent"){
         sql "desc large_tablets"
     } finally {
         // reset flags
-        set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")
+
     }
 }
diff --git a/regression-test/suites/variant_p0/with_index/load.groovy b/regression-test/suites/variant_p0/with_index/load.groovy
@@ -45,8 +45,6 @@ suite("regression_test_variant_with_index", "nonConcurrent"){
         }
         assertTrue(useTime <= OpTimeout, "wait_for_latest_op_on_table_finish timeout")
     }
-    set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
-    set_be_config.call("variant_threshold_rows_to_estimate_sparse_column", "0")
     def table_name = "var_with_index"
     sql "DROP TABLE IF EXISTS var_with_index"
     sql """
@@ -68,7 +66,6 @@ suite("regression_test_variant_with_index", "nonConcurrent"){
     qt_sql_inv_3 """select * from var_with_index where inv match 'hello' and cast(v["a"] as int) > 0 order by k"""
     sql "truncate table var_with_index"
     // set back configs
-    set_be_config.call("variant_threshold_rows_to_estimate_sparse_column", "2048")
     // sql "truncate table ${table_name}"
     sql """insert into var_with_index values(1, '{"a1" : 0, "b1": 3}', 'hello world'), (2, '{"a2" : 123}', 'world'),(3, '{"a3" : 123}', 'hello world')"""
     sql """insert into var_with_index values(4, '{"b1" : 0, "b2": 3}', 'hello world'), (5, '{"b2" : 123}', 'world'),(6, '{"b3" : 123}', 'hello world')"""