Skip to content

Commit

Permalink
[InvertedIndex](Variant) support inverted index for array type in var…
Browse files Browse the repository at this point in the history
…iant
  • Loading branch information
eldenmoon committed Feb 19, 2025
1 parent 3a3c4b9 commit ebab86a
Show file tree
Hide file tree
Showing 10 changed files with 1,463 additions and 5 deletions.
8 changes: 6 additions & 2 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,19 @@ bool InvertedIndexColumnWriter::check_support_inverted_index(const TabletColumn&
static std::set<FieldType> invalid_types = {
FieldType::OLAP_FIELD_TYPE_DOUBLE,
FieldType::OLAP_FIELD_TYPE_JSONB,
FieldType::OLAP_FIELD_TYPE_ARRAY,
FieldType::OLAP_FIELD_TYPE_FLOAT,
};
if (column.is_extracted_column() && (invalid_types.contains(column.type()))) {
if (invalid_types.contains(column.type())) {
return false;
}
if (column.is_variant_type()) {
return false;
}
if (column.is_array_type()) {
// only support one level array
const auto& subcolumn = column.get_sub_column(0);
return !subcolumn.is_array_type() && check_support_inverted_index(subcolumn);
}
return true;
}

Expand Down
4 changes: 2 additions & 2 deletions be/src/vec/exprs/vexpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,7 @@ Status VExpr::_evaluate_inverted_index(VExprContext* context, const FunctionBase
context->get_inverted_index_context()
->get_storage_name_and_type_by_column_id(column_id);
auto storage_type = remove_nullable(storage_name_type->second);
auto target_type = cast_expr->get_target_type();
auto target_type = remove_nullable(cast_expr->get_target_type());
auto origin_primitive_type = storage_type->get_type_as_type_descriptor().type;
auto target_primitive_type = target_type->get_type_as_type_descriptor().type;
if (is_complex_type(storage_type)) {
Expand All @@ -673,7 +673,7 @@ Status VExpr::_evaluate_inverted_index(VExprContext* context, const FunctionBase
}
}
if (origin_primitive_type != TYPE_VARIANT &&
(origin_primitive_type == target_primitive_type ||
(storage_type->equals(*target_type) ||
(is_string_type(target_primitive_type) &&
is_string_type(origin_primitive_type)))) {
children_exprs.emplace_back(expr_without_cast(child));
Expand Down
12 changes: 12 additions & 0 deletions be/test/common/schema_util_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@ void construct_subcolumn(TabletSchemaSPtr schema, const FieldType& type, int32_t
vectorized::PathInData col_path(path);
subcol.set_path_info(col_path);
subcol.set_name(col_path.get_path());

if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
TabletColumn array_item_col;
// double does not support inverted index
array_item_col.set_type(FieldType::OLAP_FIELD_TYPE_DOUBLE);
array_item_col.set_is_nullable(true);
array_item_col.set_unique_id(-1);
array_item_col.set_parent_unique_id(col_unique_id);

subcol.add_sub_column(array_item_col);
}

schema->append_column(subcol);
subcolumns->emplace_back(std::move(subcol));
}
Expand Down
12 changes: 12 additions & 0 deletions regression-test/data/variant_github_events_new_p2/load.out
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,15 @@
-- !sql_select_count --
67843

-- !sql_inv --
11 ["async-await", "express", "javascript", "promise"]
106 ["javascript", "nodejs"]
165 ["bem-methodology", "css", "frontend-components", "react-components", "style-guide"]
181 ["bem-methodology", "css", "frontend-components", "react-components", "style-guide"]
448 ["es6", "eslint", "front-end", "frontend", "html", "html-css", "html5", "javascript", "js", "minions", "pam", "salt", "salt-command", "webapp", "yarn"]
496 ["bootstrap", "css", "css-framework", "html", "javascript", "sass", "scss"]
685 ["apollo-client", "apollo-server", "graphql", "javascript", "node", "react"]
826 ["javascript", "react"]
948 ["chrome", "chrome-extension", "firefox", "firefox-addon", "javascript", "jquery"]
962 ["application-programming-interface", "clang", "command-line-interface", "community-feedback-supported", "cpp", "css3", "html5", "javascript", "php8", "py311", "typescript"]

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql1 --
7

-- !sql2 --
2019-01-01 d93d942d985a8fb7547c72dada8d332f ["v", "w", "x", "y", "z"]

-- !sql --
2019-01-01 d93d942d985a8fb7547c72dada8d332f ["v", "w", "x", "y", "z"]

-- !sql3 --
2017-01-01 d93d942d985a8fb7547c72dada8d332e ["m", "n", "o", "p", "q", "r", "s", "t", "u"]

-- !sql --
2017-01-01 d93d942d985a8fb7547c72dada8d332e ["m", "n", "o", "p", "q", "r", "s", "t", "u"]

-- !sql4 --

-- !sql5 --
2017-01-01 6afef581285b6608bf80d5a4e46cf839 ["a", "b", "c"]
2017-01-01 8fcb57ae675f0af4d613d9e6c0e8a2a3 \N
2017-01-01 8fcb57ae675f0af4d613d9e6c0e8a2a4 \N
2017-01-01 8fcb57ae675f0af4d613d9e6c0e8a2a6 \N
2017-01-01 d93d942d985a8fb7547c72dada8d332d ["d", "e", "f", "g", "h", "i", "j", "k", "l"]
2017-01-01 d93d942d985a8fb7547c72dada8d332e ["m", "n", "o", "p", "q", "r", "s", "t", "u"]

-- !sql6 --
2017-01-01 6afef581285b6608bf80d5a4e46cf839 ["a", "b", "c"]
2017-01-01 d93d942d985a8fb7547c72dada8d332d ["d", "e", "f", "g", "h", "i", "j", "k", "l"]
2019-01-01 d93d942d985a8fb7547c72dada8d332f ["v", "w", "x", "y", "z"]

-- !sql7 --
2017-01-01 6afef581285b6608bf80d5a4e46cf839 ["a", "b", "c"]
2017-01-01 d93d942d985a8fb7547c72dada8d332d ["d", "e", "f", "g", "h", "i", "j", "k", "l"]

-- !sql8 --
2019-01-01 d93d942d985a8fb7547c72dada8d332f ["v", "w", "x", "y", "z"]

-- !sql9 --
2017-01-01 6afef581285b6608bf80d5a4e46cf839 ["a", "b", "c"]
2017-01-01 8fcb57ae675f0af4d613d9e6c0e8a2a3 \N
2017-01-01 8fcb57ae675f0af4d613d9e6c0e8a2a4 \N
2017-01-01 8fcb57ae675f0af4d613d9e6c0e8a2a6 \N
2017-01-01 d93d942d985a8fb7547c72dada8d332d ["d", "e", "f", "g", "h", "i", "j", "k", "l"]
2017-01-01 d93d942d985a8fb7547c72dada8d332e ["m", "n", "o", "p", "q", "r", "s", "t", "u"]
2019-01-01 d93d942d985a8fb7547c72dada8d332f ["v", "w", "x", "y", "z"]

-- !sql10 --
2017-01-01 d93d942d985a8fb7547c72dada8d332e ["m", "n", "o", "p", "q", "r", "s", "t", "u"]
2019-01-01 d93d942d985a8fb7547c72dada8d332f ["v", "w", "x", "y", "z"]

24 changes: 23 additions & 1 deletion regression-test/suites/variant_github_events_new_p2/load.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
CREATE TABLE IF NOT EXISTS ${table_name} (
k bigint,
v variant,
INDEX idx_var(v) USING INVERTED PROPERTIES("parser" = "english") COMMENT ''
INDEX idx_var(v) USING INVERTED COMMENT ''
)
DUPLICATE KEY(`k`)
DISTRIBUTED BY HASH(k) BUCKETS 4
Expand Down Expand Up @@ -128,4 +128,26 @@ suite("regression_test_variant_github_events_p2", "nonConcurrent,p2"){
sql """DELETE FROM github_events where k >= 9223372036854775107"""

qt_sql_select_count """ select count(*) from github_events_2; """

trigger_and_wait_compaction("github_events", "full")

// query filtered by inverted index
profile("test_profile_1") {
sql """ set enable_common_expr_pushdown = true; """
sql """ set enable_common_expr_pushdown_for_inverted_index = true; """
sql """ set enable_pipeline_x_engine = true;"""
sql """ set enable_profile = true;"""
sql """ set profile_level = 2;"""
run {
qt_sql_inv """/* test_profile_1 */
select k, v['payload']['pull_request']['head']['repo']['topics'] from github_events where arrays_overlap(cast(v['payload']['pull_request']['head']['repo']['topics'] as array<text>), ['javascript', 'css'] ) order by k
"""
}

check { profileString, exception ->
log.info(profileString)
assertTrue(profileString.contains("RowsInvertedIndexFiltered: 67.682K"))
}
}
qt_sql_inv """select k, v['payload']['pull_request']['head']['repo']['topics'] from github_events where arrays_overlap(cast(v['payload']['pull_request']['head']['repo']['topics'] as array<text>), ['javascript', 'css'] ) order by k limit 10;"""
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression suite: array_contains / arrays_overlap predicates over an array stored inside a
// VARIANT column that carries an inverted index. Each predicate shape is run bare, negated, and
// combined with apply_date filters; arrays_overlap is additionally run with common-expression
// pushdown switched on and off so both runs are recorded side by side in the expected output.
suite("test_array_contains_with_inverted_index"){
    // Table under test; every statement below refers to it through this variable.
    def indexTblName = "tai"
    sql "set disable_inverted_index_v1_for_variant = false"

    // If we pass common exprs to the inverted index, enable_common_expr_pushdown must be true.
    sql """ set enable_common_expr_pushdown = true; """
    sql """ set enable_common_expr_pushdown_for_inverted_index = true; """
    sql """ set enable_pipeline_x_engine = true;"""
    sql """ set enable_profile = true;"""

    sql "DROP TABLE IF EXISTS ${indexTblName}"
    // create 1 replica table; the inverted index storage format is picked at random per run
    def storageFormat = new Random().nextBoolean() ? "V1" : "V2"
    sql """
        CREATE TABLE IF NOT EXISTS `${indexTblName}` (
            `apply_date` date NULL COMMENT '',
            `id` varchar(60) NOT NULL COMMENT '',
            `inventors` variant NULL COMMENT '',
            INDEX index_inverted_inventors(inventors) USING INVERTED COMMENT ''
        ) ENGINE=OLAP
        DUPLICATE KEY(`apply_date`, `id`)
        COMMENT 'OLAP'
        DISTRIBUTED BY HASH(`id`) BUCKETS 1
        PROPERTIES (
            "replication_allocation" = "tag.location.default: 1",
            "is_being_synced" = "false",
            "storage_format" = "V2",
            "light_schema_change" = "true",
            "disable_auto_compaction" = "false",
            "enable_single_replica_compaction" = "false",
            "inverted_index_storage_format" = "$storageFormat"
        );
    """

    // Rows cover populated arrays, a missing column value, an explicit NULL, empty arrays and
    // arrays that contain only nulls.
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '6afef581285b6608bf80d5a4e46cf839', '{"inventors":["a", "b", "c"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', 'd93d942d985a8fb7547c72dada8d332d', '{"inventors":["d", "e", "f", "g", "h", "i", "j", "k", "l"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '48a33ec3453a28bce84b8f96fe161956', '{"inventors":["m"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '021603e7dcfe65d44af0efd0e5aee154', '{"inventors":["n"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '9fcb57ae675f0af4d613d9e6c0e8a2a2', '{"inventors":["o"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`) VALUES ('2017-01-01', '8fcb57ae675f0af4d613d9e6c0e8a2a3'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '8fcb57ae675f0af4d613d9e6c0e8a2a4', NULL); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '8fcb57ae675f0af4d613d9e6c0e8a2a5', '{"inventors":[]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '8fcb57ae675f0af4d613d9e6c0e8a2a6', '{"inventors":[null,null,null]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '8fcb57ae675f0af4d613d9e6c0e8a2a7', '{"inventors":[null,null,null]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2017-01-01', '8fcb57ae675f0af4d613d9e6c0e8a2a8', '{"inventors":[]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2019-01-01', 'a648a447b8f71522f11632eba4b4adde', '{"inventors":["p", "q", "r", "s", "t"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2019-01-01', 'a9fb5c985c90bf05f3bee5ca3ae95260', '{"inventors":["u", "v"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2019-01-01', '0974e7a82e30d1af83205e474fadd0a2', '{"inventors":["w"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2019-01-01', '26823b3995ee38bd145ddd910b2f6300', '{"inventors":["x"]}'); """
    sql """ INSERT INTO `${indexTblName}`(`apply_date`, `id`, `inventors`) VALUES ('2019-01-01', 'ee27ee1da291e46403c408e220bed6e1', '{"inventors":["y"]}'); """

    qt_sql """ select count() from ${indexTblName}"""

    // Expands one array predicate into the nine WHERE clauses exercised by this suite, in the
    // order the expected-output file records them: the predicate alone, AND/OR-ed with
    // apply_date filters, the same four shapes negated, and one parenthesised mixed form.
    def buildWhereClauses = { String predicate ->
        return [
            "${predicate}",
            "${predicate} and apply_date = '2017-01-01'",
            "${predicate} and apply_date = '2019-01-01'",
            "${predicate} or apply_date = '2017-01-01'",
            "!${predicate}",
            "!${predicate} and apply_date = '2017-01-01'",
            "!${predicate} and apply_date = '2019-01-01'",
            "!${predicate} or apply_date = '2017-01-01'",
            "(${predicate} and apply_date = '2017-01-01') or apply_date = '2019-01-01'"
        ].collect { it.toString() }
    }

    // test array_contains with inverted index
    def param_contains = ["'s'", "''", null]
    for (int i = 0 ; i < param_contains.size(); ++i) {
        def p = param_contains[i]
        log.info("param: ${p}")
        def predicate = "array_contains(cast(inventors['inventors'] as array<text>), ${p})".toString()
        for (String whereClause : buildWhereClauses(predicate)) {
            order_qt_sql """ select * from ${indexTblName} where ${whereClause} order by id; """
        }
    }

    // test arrays_overlap with inverted index
    // now if we use inverted index we will not eval exprs
    def param = [["'s'", "'t'"], [], null, ["'s'", "''", "'t'"], ["'s'", null, "'t'"], [null, "''"], ["'s'", null, "'t'", "''"]] // null for arrays_overlap will return null which in predicate will lead to return empty set
    for (int i = 0 ; i < param.size(); ++i) {
        def p = param[i]
        log.info("param: ${p}")
        def predicate = "arrays_overlap(cast(inventors['inventors'] as array<text>), ${p})".toString()
        for (String whereClause : buildWhereClauses(predicate)) {
            // Each clause runs with pushdown enabled first, then disabled.
            for (String pushdown : ["true", "false"]) {
                order_qt_sql """ select /*+SET_VAR(enable_common_expr_pushdown = ${pushdown})*/ * from ${indexTblName} where ${whereClause} order by id; """
            }
        }
    }
}
Loading

0 comments on commit ebab86a

Please sign in to comment.