Skip to content

Commit

Permalink
add debug point test
Browse files Browse the repository at this point in the history
  • Loading branch information
eldenmoon committed Dec 25, 2024
1 parent 5ec1358 commit 81411f2
Show file tree
Hide file tree
Showing 15 changed files with 170 additions and 141 deletions.
3 changes: 1 addition & 2 deletions be/src/common/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1019,9 +1019,8 @@ DEFINE_mInt64(workload_group_scan_task_wait_timeout_ms, "10000");

// Whether use schema dict in backend side instead of MetaService side(cloud mode)
DEFINE_mBool(variant_use_cloud_schema_dict, "true");
DEFINE_mDouble(variant_ratio_of_defaults_as_sparse_column, "1");
DEFINE_mInt64(variant_threshold_rows_to_estimate_sparse_column, "2048");
DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false");
DEFINE_mInt32(variant_max_subcolumns_count, "5");

// block file cache
DEFINE_Bool(enable_file_cache, "false");
Expand Down
6 changes: 2 additions & 4 deletions be/src/common/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -1216,15 +1216,13 @@ DECLARE_mInt64(lookup_connection_cache_capacity);
// level of compression when using LZ4_HC, whose defalut value is LZ4HC_CLEVEL_DEFAULT
DECLARE_mInt64(LZ4_HC_compression_level);
// Threshold of a column as sparse column
// Notice: TEST ONLY
DECLARE_mDouble(variant_ratio_of_defaults_as_sparse_column);
DECLARE_mBool(variant_use_cloud_schema_dict);
// Threshold to estimate a column is sparsed
// Notice: TEST ONLY
DECLARE_mInt64(variant_threshold_rows_to_estimate_sparse_column);
// Treat invalid json format str as string, instead of throwing exception if false
DECLARE_mBool(variant_throw_exeception_on_invalid_json);

DECLARE_mInt32(variant_max_subcolumns_count);

DECLARE_mBool(enable_merge_on_write_correctness_check);
// USED FOR DEBUGING
// core directly if the compaction found there's duplicate key on mow table
Expand Down
44 changes: 35 additions & 9 deletions be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
// under the License.
#include "olap/rowset/segment_v2/variant_column_writer_impl.h"

#include <fmt/core.h>
#include <gen_cpp/segment_v2.pb.h>

#include "common/config.h"
#include "common/status.h"
#include "olap/olap_common.h"
#include "olap/rowset/beta_rowset.h"
Expand All @@ -43,14 +45,14 @@ VariantColumnWriterImpl::VariantColumnWriterImpl(const ColumnWriterOptions& opts

Status VariantColumnWriterImpl::init() {
// caculate stats info
std::set<std::string> dynamic_paths;
RETURN_IF_ERROR(_get_subcolumn_paths_from_stats(dynamic_paths));
if (dynamic_paths.empty()) {
std::set<std::string> subcolumn_paths;
RETURN_IF_ERROR(_get_subcolumn_paths_from_stats(subcolumn_paths));
if (subcolumn_paths.empty()) {
_column = vectorized::ColumnObject::create(true, false);
} else {
// create root
auto col = vectorized::ColumnObject::create(true, true);
for (const auto& str_path : dynamic_paths) {
for (const auto& str_path : subcolumn_paths) {
DCHECK(col->add_sub_column(vectorized::PathInData(str_path), 0));
}
_column = std::move(col);
Expand Down Expand Up @@ -97,19 +99,20 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
}
}
}
// Check if the number of all dynamic paths exceeds the limit.
if (path_to_total_number_of_non_null_values.size() > vectorized::ColumnObject::MAX_SUBCOLUMNS) {

// Check if the number of all subcolumn paths exceeds the limit.
if (path_to_total_number_of_non_null_values.size() > config::variant_max_subcolumns_count) {
// Sort paths by total number of non null values.
std::vector<std::pair<size_t, std::string_view>> paths_with_sizes;
paths_with_sizes.reserve(path_to_total_number_of_non_null_values.size());
for (const auto& [path, size] : path_to_total_number_of_non_null_values) {
paths_with_sizes.emplace_back(size, path);
}
std::sort(paths_with_sizes.begin(), paths_with_sizes.end(), std::greater());
// Fill dynamic_paths with first max_dynamic_paths paths in sorted list.
// Fill subcolumn_paths with first subcolumn paths in sorted list.
// reserve 1 for root column
for (const auto& [size, path] : paths_with_sizes) {
if (paths.size() < vectorized::ColumnObject::MAX_SUBCOLUMNS - 1) {
if (paths.size() < config::variant_max_subcolumns_count - 1) {
VLOG_DEBUG << "pick " << path << " as subcolumn";
paths.emplace(path);
}
Expand All @@ -118,8 +121,31 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::set<std::st
// new_statistics.sparse_data_paths_statistics.emplace(path, size);
// }
}
DBUG_EXECUTE_IF("variant_column_writer_impl._get_subcolumn_paths_from_stats", {
auto stats = DebugPoints::instance()->get_debug_param_or_default<std::string>(
"variant_column_writer_impl._get_subcolumn_paths_from_stats", "stats", "");
auto subcolumns = DebugPoints::instance()->get_debug_param_or_default<std::string>(
"variant_column_writer_impl._get_subcolumn_paths_from_stats", "subcolumns", "");
LOG(INFO) << "stats: " << stats;
LOG(INFO) << "subcolumns: " << subcolumns;
if (stats.empty()) {
return Status::Error<ErrorCode::INTERNAL_ERROR>("debug point stats is empty");
}
std::vector<std::string> sizes;
boost::split(sizes, stats, boost::algorithm::is_any_of(","));
CHECK_EQ(sizes.size(), paths_with_sizes.size()) << "stats not match " << stats;
for (int i = 0; i < sizes.size(); ++i) {
CHECK_EQ(fmt::format("{}", paths_with_sizes[i].first), sizes[i]);
}
std::set<std::string> subcolumns_set;
boost::split(subcolumns_set, subcolumns, boost::algorithm::is_any_of(","));
if (!std::equal(paths.begin(), paths.end(), subcolumns_set.begin(),
subcolumns_set.end())) {
CHECK(false) << "subcolumns not match " << subcolumns;
}
})
} else {
// Use all dynamic paths from all source columns.
// Use all subcolumn paths from all source columns.
for (const auto& [path, _] : path_to_total_number_of_non_null_values) {
VLOG_DEBUG << "pick " << path << " as subcolumn";
paths.emplace(path);
Expand Down
34 changes: 7 additions & 27 deletions be/src/vec/columns/column_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,19 +662,6 @@ void ColumnObject::resize(size_t n) {
ENABLE_CHECK_CONSISTENCY(this);
}

bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) {
if (num_rows < config::variant_threshold_rows_to_estimate_sparse_column) {
return false;
}
std::vector<double> defaults_ratio;
for (size_t i = 0; i < data.size(); ++i) {
defaults_ratio.push_back(data[i]->get_ratio_of_default_rows());
}
double default_ratio = std::accumulate(defaults_ratio.begin(), defaults_ratio.end(), 0.0) /
defaults_ratio.size();
return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
}

void ColumnObject::Subcolumn::finalize(FinalizeMode mode) {
if (is_finalized()) {
return;
Expand Down Expand Up @@ -1273,7 +1260,7 @@ void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo&
}

bool ColumnObject::try_add_new_subcolumn(const PathInData& path) {
if (subcolumns.size() == MAX_SUBCOLUMNS) return false;
if (subcolumns.size() == config::variant_max_subcolumns_count) return false;

return add_sub_column(path, num_rows);
}
Expand Down Expand Up @@ -1919,7 +1906,8 @@ Status ColumnObject::finalize(FinalizeMode mode) {
}

const bool need_pick_subcolumn_to_sparse_column =
mode == FinalizeMode::WRITE_MODE && subcolumns.size() > MAX_SUBCOLUMNS;
mode == FinalizeMode::WRITE_MODE &&
subcolumns.size() > config::variant_max_subcolumns_count;
// finalize all subcolumns
for (auto&& entry : subcolumns) {
const auto& least_common_type = entry->data.get_least_common_type();
Expand Down Expand Up @@ -1966,8 +1954,10 @@ Status ColumnObject::finalize(FinalizeMode mode) {
std::sort(sorted_by_size.begin(), sorted_by_size.end(),
[](const auto& a, const auto& b) { return a.second > b.second; });

// 3. pick MAX_SUBCOLUMNS selected subcolumns
for (size_t i = 0; i < std::min(MAX_SUBCOLUMNS, sorted_by_size.size()); ++i) {
// 3. pick config::variant_max_subcolumns_count selected subcolumns
for (size_t i = 0;
i < std::min(size_t(config::variant_max_subcolumns_count), sorted_by_size.size());
++i) {
// if too many null values, then consider it as sparse column
if (sorted_by_size[i].second < num_rows * 0.95) {
continue;
Expand Down Expand Up @@ -2149,16 +2139,6 @@ const DataTypePtr ColumnObject::NESTED_TYPE = std::make_shared<vectorized::DataT
std::make_shared<vectorized::DataTypeArray>(std::make_shared<vectorized::DataTypeNullable>(
std::make_shared<vectorized::DataTypeObject>())));

// const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
#ifndef NDEBUG
const size_t ColumnObject::MAX_SUBCOLUMNS = []() -> size_t {
std::srand(std::time(nullptr)); // 初始化随机数种子
return 2 + std::rand() % 8; // 随机值范围 [1, 10]
}();
#else
const size_t ColumnObject::MAX_SUBCOLUMNS = 5;
#endif

DataTypePtr ColumnObject::get_root_type() const {
return subcolumns.get_root()->data.get_least_common_type();
}
Expand Down
3 changes: 0 additions & 3 deletions be/src/vec/columns/column_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
// Nullable(Array(Nullable(Object)))
const static DataTypePtr NESTED_TYPE;
const static size_t MAX_SUBCOLUMNS;
// Finlize mode for subcolumns, write mode will estimate which subcolumns are sparse columns(too many null values inside column),
// merge and encode them into a shared column in root column. Only affects in flush block to segments.
// Otherwise read mode should be as default mode.
Expand Down Expand Up @@ -171,8 +170,6 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> {
/// Returns last inserted field.
Field get_last_field() const;

bool check_if_sparse_column(size_t num_rows);

/// Returns single column if subcolumn in finalizes.
/// Otherwise -- undefined behaviour.
IColumn& get_finalized_column();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select_b_bfcompact --
12291
12292

-- !select_xxxx_bfcompact --
12291
Expand Down Expand Up @@ -48,7 +48,7 @@
3 1234 \N ddddd 1 \N

-- !select_b --
12291
12292

-- !select_xxxx --
12291
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ suite("test_show_nested_index_file_http_action_with_variant", "nonConcurrent,p0"
}

set_be_config.call("memory_limitation_per_thread_for_schema_change_bytes", "6294967296")
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")
def run_test = { format ->
def tableName = "test_show_nested_index_file_http_action_with_variant_" + format

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
}
}
}
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")

def table_name = "github_events"
sql """DROP TABLE IF EXISTS ${table_name}"""
table_name = "github_events"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
DISTRIBUTED BY HASH(k) BUCKETS 4
properties("replication_num" = "1", "disable_auto_compaction" = "true", "bloom_filter_columns" = "v", "variant_enable_flatten_nested" = "true");
"""
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")

// 2015
load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""")
load_json_data.call(table_name, """${getS3Url() + '/regression/gharchive.m/2015-01-01-1.json'}""")
Expand Down
8 changes: 4 additions & 4 deletions regression-test/suites/variant_log_data_p2/load.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -72,27 +72,27 @@ suite("regression_test_variant_logdata", "nonConcurrent,p2"){
create_table.call(table_name, "DUPLICATE", "4")
// sql "set enable_two_phase_read_opt = false;"
// no sparse columns
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")

load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
qt_sql_32 """ select json_extract(v, "\$.json.parseFailed") from logdata where json_extract(v, "\$.json.parseFailed") != 'null' order by k limit 1;"""
qt_sql_32_1 """select cast(v['json']['parseFailed'] as string) from logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
sql "truncate table ${table_name}"

// 0.95 default ratio
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")

load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
qt_sql_33 """ select json_extract(v,"\$.json.parseFailed") from logdata where json_extract(v,"\$.json.parseFailed") != 'null' order by k limit 1;"""
qt_sql_33_1 """select cast(v['json']['parseFailed'] as string) from logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
sql "truncate table ${table_name}"

// always sparse column
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")

load_json_data.call(table_name, """${getS3Url() + '/regression/load/logdata.json'}""")
qt_sql_34 """ select json_extract(v, "\$.json.parseFailed") from logdata where json_extract(v,"\$.json.parseFailed") != 'null' order by k limit 1;"""
sql "truncate table ${table_name}"
qt_sql_35 """select json_extract(v,"\$.json.parseFailed") from logdata where k = 162 and json_extract(v,"\$.json.parseFailed") != 'null';"""
qt_sql_35_1 """select cast(v['json']['parseFailed'] as string) from logdata where cast(v['json']['parseFailed'] as string) is not null and k = 162 limit 1;"""
// TODO add test case that some certain columns are materialized in some file while others are not materilized(sparse)
// unique table
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1")

}
8 changes: 4 additions & 4 deletions regression-test/suites/variant_p0/desc.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
// sparse columns
def table_name = "sparse_columns"
create_table table_name
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")

sql """set describe_extend_variant_column = true"""
sql """insert into sparse_columns select 0, '{"a": 11245, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}' as json_str
union all select 0, '{"a": 1123}' as json_str union all select 0, '{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096") limit 4096 ;"""
Expand All @@ -115,7 +115,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
table_name = "no_sparse_columns"
create_table.call(table_name, "4")
sql "set enable_two_phase_read_opt = false;"
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")

sql """insert into ${table_name} select 0, '{"a": 11245, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}' as json_str
union all select 0, '{"a": 1123}' as json_str union all select 0, '{"a" : 1234, "xxxx" : "kaana"}' as json_str from numbers("number" = "4096") limit 4096 ;"""
sql "select * from no_sparse_columns limit 1"
Expand All @@ -126,7 +126,7 @@ suite("regression_test_variant_desc", "nonConcurrent"){
table_name = "partition_data"
create_table_partition.call(table_name, "4")
sql "set enable_two_phase_read_opt = false;"
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")

sql """insert into ${table_name} select 2500, '{"a": 1123, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}, "zzz" : null, "oooo" : {"akakaka" : null, "xxxx" : {"xxx" : 123}}}' as json_str
union all select 2500, '{"a" : 1234, "xxxx" : "kaana", "ddd" : {"aaa" : 123, "mxmxm" : [456, "789"]}}' as json_str from numbers("number" = "4096") limit 4096 ;"""
sql """insert into ${table_name} select 45000, '{"a": 11245, "b" : [123, {"xx" : 1}], "c" : {"c" : 456, "d" : null, "e" : 7.111}}' as json_str
Expand Down Expand Up @@ -274,6 +274,6 @@ suite("regression_test_variant_desc", "nonConcurrent"){
sql "desc large_tablets"
} finally {
// reset flags
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "0.95")

}
}
3 changes: 0 additions & 3 deletions regression-test/suites/variant_p0/with_index/load.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ suite("regression_test_variant_with_index", "nonConcurrent"){
}
assertTrue(useTime <= OpTimeout, "wait_for_latest_op_on_table_finish timeout")
}
set_be_config.call("variant_ratio_of_defaults_as_sparse_column", "1.0")
set_be_config.call("variant_threshold_rows_to_estimate_sparse_column", "0")
def table_name = "var_with_index"
sql "DROP TABLE IF EXISTS var_with_index"
sql """
Expand All @@ -68,7 +66,6 @@ suite("regression_test_variant_with_index", "nonConcurrent"){
qt_sql_inv_3 """select * from var_with_index where inv match 'hello' and cast(v["a"] as int) > 0 order by k"""
sql "truncate table var_with_index"
// set back configs
set_be_config.call("variant_threshold_rows_to_estimate_sparse_column", "2048")
// sql "truncate table ${table_name}"
sql """insert into var_with_index values(1, '{"a1" : 0, "b1": 3}', 'hello world'), (2, '{"a2" : 123}', 'world'),(3, '{"a3" : 123}', 'hello world')"""
sql """insert into var_with_index values(4, '{"b1" : 0, "b2": 3}', 'hello world'), (5, '{"b2" : 123}', 'world'),(6, '{"b3" : 123}', 'hello world')"""
Expand Down
Loading

0 comments on commit 81411f2

Please sign in to comment.