-
Notifications
You must be signed in to change notification settings - Fork 73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft: Optimization for GLT search #252
base: main
Are you sure you want to change the base?
Changes from all commits
96f18d5
19dbadb
fd94018
1196327
9718d56
1cf9bac
693ad94
979b029
a6f2025
2b8c883
6becc48
7366ed5
a44ecad
8f41624
46725f4
5b76807
8ad0793
50b79ba
0617c48
11fd9b7
c63cccb
61b3eb8
66275da
6702c9d
ff5b61f
93808f0
02b4a30
87880f8
1e69b99
e9fde16
f12aa15
7db1315
d698c01
5c033f4
9c60bd5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -18,6 +18,8 @@ using glt::ir::is_delim; | |||||||||
using glt::streaming_archive::reader::Archive; | ||||||||||
using glt::streaming_archive::reader::File; | ||||||||||
using glt::streaming_archive::reader::Message; | ||||||||||
using std::make_pair; | ||||||||||
using std::pair; | ||||||||||
using std::string; | ||||||||||
using std::vector; | ||||||||||
|
||||||||||
|
@@ -158,15 +160,14 @@ QueryToken::QueryToken( | |||||||||
if (converts_to_int || converts_to_float) { | ||||||||||
converts_to_non_dict_var = true; | ||||||||||
} | ||||||||||
|
||||||||||
if (!converts_to_non_dict_var) { | ||||||||||
// Dictionary variable | ||||||||||
// GLT TODO | ||||||||||
// Actually this is incorrect, because it's possible user enters 23412*34 aiming to | ||||||||||
// match 23412.34. This should be an ambigious type. | ||||||||||
// match 23412.34. we should consider the possibility that middle wildcard causes | ||||||||||
// the converts_to_non_dict_var to be false. | ||||||||||
m_type = Type::DictionaryVar; | ||||||||||
m_cannot_convert_to_non_dict_var = true; | ||||||||||
} else { | ||||||||||
// GLT TODO: think about this carefully. | ||||||||||
m_type = Type::Ambiguous; | ||||||||||
m_possible_types.push_back(Type::IntVar); | ||||||||||
m_possible_types.push_back(Type::FloatVar); | ||||||||||
|
@@ -393,6 +394,152 @@ bool find_matching_message( | |||||||||
return true; | ||||||||||
} | ||||||||||
|
||||||||||
void find_boundaries( | ||||||||||
LogTypeDictionaryEntry const* logtype_entry, | ||||||||||
vector<pair<string, bool>> const& tokens, | ||||||||||
size_t& var_begin_ix, | ||||||||||
size_t& var_end_ix | ||||||||||
) { | ||||||||||
auto const& logtype_string = logtype_entry->get_value(); | ||||||||||
// left boundary is exclusive and right boundary are inclusive, meaning | ||||||||||
// that logtype_string.substr[0, left_boundary) and logtype_string.substr[right_boundary, end) | ||||||||||
// can be safely ignored. | ||||||||||
// They are initialized assuming that the entire logtype can be safely ignored. So if the | ||||||||||
// tokens doesn't contain variable. the behavior is consistent. | ||||||||||
size_t left_boundary{logtype_string.length()}; | ||||||||||
size_t right_boundary{0}; | ||||||||||
// First, match the token from front to end. | ||||||||||
size_t find_start_index{0}; | ||||||||||
bool tokens_contain_variable{false}; | ||||||||||
for (auto const& token : tokens) { | ||||||||||
auto const& token_str = token.first; | ||||||||||
bool contains_variable = token.second; | ||||||||||
size_t found_index = logtype_string.find(token_str, find_start_index); | ||||||||||
if (string::npos == found_index) { | ||||||||||
printf("failed to find: [%s] from %s\n", | ||||||||||
token_str.c_str(), | ||||||||||
logtype_string.substr(find_start_index).c_str()); | ||||||||||
throw; | ||||||||||
} | ||||||||||
// the first time we see a token with variable, we know that | ||||||||||
// we don't care about the variables in the substr before this token in the logtype. | ||||||||||
// Technically, logtype_string.substr[0, token[begin_index]) | ||||||||||
// (since token[begin_index] is the beginning of the token) | ||||||||||
if (contains_variable) { | ||||||||||
tokens_contain_variable = true; | ||||||||||
left_boundary = found_index; | ||||||||||
break; | ||||||||||
} | ||||||||||
// else, the token doesn't contain a variable | ||||||||||
// we can proceed by skipping this token. | ||||||||||
find_start_index = found_index + token_str.length(); | ||||||||||
} | ||||||||||
|
||||||||||
// second, match the token from back | ||||||||||
size_t rfind_end_index = logtype_string.length(); | ||||||||||
for (auto it = tokens.rbegin(); it != tokens.rend(); ++it) { | ||||||||||
auto const& token_str = it->first; | ||||||||||
bool contains_var = it->second; | ||||||||||
|
||||||||||
size_t rfound_index = logtype_string.rfind(token_str, rfind_end_index); | ||||||||||
if (string::npos == rfound_index) { | ||||||||||
printf("failed to find: [%s] from %s\n", | ||||||||||
token_str.c_str(), | ||||||||||
logtype_string.substr(0, rfind_end_index).c_str()); | ||||||||||
throw; | ||||||||||
} | ||||||||||
|
||||||||||
// the first time we see a token with variable, we know that | ||||||||||
// we don't care about the variables in the substr after this token in the logtype. | ||||||||||
// Technically, logtype_string.substr[rfound_index + len(token), end) | ||||||||||
// since logtype_string[rfound_index] is the beginning of the token | ||||||||||
if (contains_var) { | ||||||||||
tokens_contain_variable = true; | ||||||||||
right_boundary = rfound_index + token_str.length(); | ||||||||||
break; | ||||||||||
} | ||||||||||
|
||||||||||
// Note, rfind end index is inclusive. has to subtract by 1 so | ||||||||||
// in the next rfind, we skip the token we have already seen. | ||||||||||
rfind_end_index = rfound_index - 1; | ||||||||||
} | ||||||||||
|
||||||||||
// if we didn't find any variable, we can do an early return | ||||||||||
if (false == tokens_contain_variable) { | ||||||||||
var_begin_ix = logtype_entry->get_num_variables(); | ||||||||||
var_end_ix = 0; | ||||||||||
return; | ||||||||||
} | ||||||||||
|
||||||||||
// Now we have the left boundary and right boundary, try to filter out the variables; | ||||||||||
// var_begin_ix is an inclusive interval | ||||||||||
auto const logtype_variable_num = logtype_entry->get_num_variables(); | ||||||||||
ir::VariablePlaceholder var_placeholder; | ||||||||||
var_begin_ix = 0; | ||||||||||
for (size_t var_ix = 0; var_ix < logtype_variable_num; var_ix++) { | ||||||||||
size_t var_position = logtype_entry->get_variable_info(var_ix, var_placeholder); | ||||||||||
if (var_position < left_boundary) { | ||||||||||
// if the variable is within the left boundary, then it should be skipped. | ||||||||||
var_begin_ix++; | ||||||||||
} else { | ||||||||||
// if the variable is not within the left boundary | ||||||||||
break; | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
// For right boundary, var_end_ix is an exclusive interval | ||||||||||
var_end_ix = logtype_variable_num; | ||||||||||
for (size_t var_ix = 0; var_ix < logtype_variable_num; var_ix++) { | ||||||||||
size_t reversed_ix = logtype_variable_num - 1 - var_ix; | ||||||||||
size_t var_position = logtype_entry->get_variable_info(reversed_ix, var_placeholder); | ||||||||||
if (var_position >= right_boundary) { | ||||||||||
// if the variable is within the right boundary, then it should be skipped. | ||||||||||
var_end_ix--; | ||||||||||
} else { | ||||||||||
// if the variable is not within the right boundary | ||||||||||
break; | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
if (var_end_ix <= var_begin_ix) { | ||||||||||
printf("tokens contain a variable, end index %lu is smaller and equal than begin index " | ||||||||||
"%lu\n", | ||||||||||
var_end_ix, | ||||||||||
var_begin_ix); | ||||||||||
throw; | ||||||||||
} | ||||||||||
} | ||||||||||
Comment on lines
+397
to
+511
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix improper use of In the Apply the following diff to throw an appropriate exception: - throw;
+ throw std::runtime_error("Failed to find token in logtype_string"); Ensure that +#include <stdexcept> Repeat this change for all instances where
🧰 Tools🪛 cppcheck[error] 422-422: Rethrowing current exception with 'throw;', it seems there is no current exception to rethrow. If there is no current exception this calls std (rethrowNoCurrentException) [error] 449-449: Rethrowing current exception with 'throw;', it seems there is no current exception to rethrow. If there is no current exception this calls std (rethrowNoCurrentException) [error] 509-509: Rethrowing current exception with 'throw;', it seems there is no current exception to rethrow. If there is no current exception this calls std (rethrowNoCurrentException) |
||||||||||
|
||||||||||
template <typename EscapeDecoder> | ||||||||||
vector<pair<string, bool>> | ||||||||||
retokenization(std::string_view input_string, EscapeDecoder escape_decoder) { | ||||||||||
vector<pair<string, bool>> retokenized_tokens; | ||||||||||
size_t input_length = input_string.size(); | ||||||||||
string current_token; | ||||||||||
bool contains_variable_placeholder = false; | ||||||||||
for (size_t ix = 0; ix < input_length; ix++) { | ||||||||||
auto const current_char = input_string.at(ix); | ||||||||||
if (enum_to_underlying_type(ir::VariablePlaceholder::Escape) == current_char) { | ||||||||||
escape_decoder(input_string, ix, current_token); | ||||||||||
continue; | ||||||||||
} | ||||||||||
|
||||||||||
if (current_char != '*') { | ||||||||||
current_token += current_char; | ||||||||||
contains_variable_placeholder |= ir::is_variable_placeholder(current_char); | ||||||||||
} else { | ||||||||||
if (!current_token.empty()) { | ||||||||||
retokenized_tokens.emplace_back(current_token, contains_variable_placeholder); | ||||||||||
current_token.clear(); | ||||||||||
} | ||||||||||
} | ||||||||||
} | ||||||||||
if (!current_token.empty()) { | ||||||||||
retokenized_tokens.emplace_back(current_token, contains_variable_placeholder); | ||||||||||
} | ||||||||||
return retokenized_tokens; | ||||||||||
} | ||||||||||
|
||||||||||
SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( | ||||||||||
Archive const& archive, | ||||||||||
string& processed_search_string, | ||||||||||
|
@@ -415,6 +562,31 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( | |||||||||
logtype += escape_char; | ||||||||||
} | ||||||||||
}; | ||||||||||
auto escape_decoder | ||||||||||
= [](std::string_view input_str, size_t& current_pos, string& token) -> void { | ||||||||||
auto const escape_char{enum_to_underlying_type(ir::VariablePlaceholder::Escape)}; | ||||||||||
// Note: we don't need to do a check, because the upstream should guarantee all | ||||||||||
// escapes are followed by some characters | ||||||||||
auto const next_char = input_str.at(current_pos + 1); | ||||||||||
if (escape_char == next_char) { | ||||||||||
// turn two consecutive escape into a single one. | ||||||||||
token += escape_char; | ||||||||||
} else if (is_wildcard(next_char)) { | ||||||||||
// if it is an escape followed by a wildcard, we know no escape has been added. | ||||||||||
// we also remove the original escape because it was purely for query | ||||||||||
token += next_char; | ||||||||||
} else if (ir::is_variable_placeholder(next_char)) { | ||||||||||
// If we are at here, it means we are in the middle of processing a '\\\v' sequence | ||||||||||
// in this case, since we removed only one escape from the previous '\\' sequence | ||||||||||
// we need to remove another escape here. | ||||||||||
token += next_char; | ||||||||||
} else { | ||||||||||
printf("Unexpected\n"); | ||||||||||
throw; | ||||||||||
Comment on lines
+584
to
+585
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct use of In the Replace - throw;
+ throw std::runtime_error("Unexpected character encountered in escape_decoder"); Ensure that 📝 Committable suggestion
Suggested change
🧰 Tools🪛 cppcheck[error] 585-585: Rethrowing current exception with 'throw;', it seems there is no current exception to rethrow. If there is no current exception this calls std (rethrowNoCurrentException) |
||||||||||
} | ||||||||||
current_pos++; | ||||||||||
}; | ||||||||||
|
||||||||||
for (auto const& query_token : query_tokens) { | ||||||||||
// Append from end of last token to beginning of this token, to logtype | ||||||||||
ir::append_constant_to_logtype( | ||||||||||
|
@@ -434,6 +606,7 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( | |||||||||
// ambiguous tokens | ||||||||||
sub_query.mark_wildcard_match_required(); | ||||||||||
if (!query_token.is_var()) { | ||||||||||
// Must mean the token is text only, with * in it. | ||||||||||
logtype += '*'; | ||||||||||
} else { | ||||||||||
logtype += '*'; | ||||||||||
|
@@ -472,6 +645,15 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( | |||||||||
if (possible_logtype_entries.empty()) { | ||||||||||
return SubQueryMatchabilityResult::WontMatch; | ||||||||||
} | ||||||||||
|
||||||||||
// Find boundaries | ||||||||||
auto const retokenized_tokens = retokenization(logtype, escape_decoder); | ||||||||||
for (auto const& logtype_entry : possible_logtype_entries) { | ||||||||||
size_t var_begin_index; | ||||||||||
size_t var_end_index; | ||||||||||
find_boundaries(logtype_entry, retokenized_tokens, var_begin_index, var_end_index); | ||||||||||
sub_query.set_logtype_boundary(logtype_entry->get_id(), var_begin_index, var_end_index); | ||||||||||
} | ||||||||||
sub_query.set_possible_logtypes(possible_logtype_entries); | ||||||||||
|
||||||||||
// Calculate the IDs of the segments that may contain results for the sub-query now that we've | ||||||||||
|
@@ -901,7 +1083,12 @@ Grep::get_converted_logtype_query(Query const& query, size_t segment_id) { | |||||||||
for (auto const& possible_logtype_entry : possible_log_entries) { | ||||||||||
// create one LogtypeQuery for each logtype | ||||||||||
logtype_dictionary_id_t possible_logtype_id = possible_logtype_entry->get_id(); | ||||||||||
LogtypeQuery query_info(sub_query->get_vars(), sub_query->wildcard_match_required()); | ||||||||||
auto const& boundary = sub_query->get_boundary_by_logtype_id(possible_logtype_id); | ||||||||||
LogtypeQuery query_info( | ||||||||||
sub_query->get_vars(), | ||||||||||
sub_query->wildcard_match_required(), | ||||||||||
boundary | ||||||||||
); | ||||||||||
|
||||||||||
// The boundary is a range like [left:right). note it's open on the right side | ||||||||||
auto const& containing_segments | ||||||||||
|
@@ -1155,8 +1342,9 @@ size_t Grep::search_combined_table_and_output( | |||||||||
compressed_msg.resize_var(num_vars); | ||||||||||
compressed_msg.set_logtype_id(logtype_id); | ||||||||||
|
||||||||||
size_t left_boundary = 0; | ||||||||||
size_t right_boundary = num_vars; | ||||||||||
size_t var_begin_ix = num_vars; | ||||||||||
size_t var_end_ix = 0; | ||||||||||
get_union_of_bounds(queries_by_logtype, var_begin_ix, var_end_ix); | ||||||||||
|
||||||||||
bool required_wild_card; | ||||||||||
while (num_matches < limit) { | ||||||||||
|
@@ -1166,8 +1354,8 @@ size_t Grep::search_combined_table_and_output( | |||||||||
compressed_msg, | ||||||||||
required_wild_card, | ||||||||||
query, | ||||||||||
left_boundary, | ||||||||||
right_boundary | ||||||||||
var_begin_ix, | ||||||||||
var_end_ix | ||||||||||
); | ||||||||||
if (found_matched == false) { | ||||||||||
break; | ||||||||||
|
@@ -1232,12 +1420,13 @@ size_t Grep::search_segment_optimized_and_output( | |||||||||
|
||||||||||
auto num_vars = archive.get_logtype_dictionary().get_entry(logtype_id).get_num_variables(); | ||||||||||
|
||||||||||
size_t left_boundary = 0; | ||||||||||
size_t right_boundary = num_vars; | ||||||||||
size_t var_begin_ix = num_vars; | ||||||||||
size_t var_end_ix = 0; | ||||||||||
get_union_of_bounds(sub_queries, var_begin_ix, var_end_ix); | ||||||||||
|
||||||||||
// load timestamps and columns that fall into the ranges. | ||||||||||
logtype_table_manager.load_ts(); | ||||||||||
logtype_table_manager.load_partial_columns(left_boundary, right_boundary); | ||||||||||
logtype_table_manager.load_partial_columns(var_begin_ix, var_end_ix); | ||||||||||
|
||||||||||
std::vector<size_t> matched_row_ix; | ||||||||||
std::vector<bool> wildcard_required; | ||||||||||
|
@@ -1278,4 +1467,22 @@ size_t Grep::search_segment_optimized_and_output( | |||||||||
return num_matches; | ||||||||||
} | ||||||||||
|
||||||||||
// we use a simple assumption atm. | ||||||||||
// if subquery1 has range (a,b) and subquery2 has range (c,d). | ||||||||||
// then the range will be (min(a,c), max(b,d)), even if c > b. | ||||||||||
void Grep::get_union_of_bounds( | ||||||||||
std::vector<LogtypeQuery> const& sub_queries, | ||||||||||
size_t& var_begin_ix, | ||||||||||
size_t& var_end_ix | ||||||||||
) { | ||||||||||
for (auto const& subquery : sub_queries) { | ||||||||||
// we use a simple assumption atm. | ||||||||||
// if subquery1 has range [begin1, end1) and subquery2 has range [begin2, end2). | ||||||||||
// then the range will be (min(begin1, begin2), max(end1, end2)). | ||||||||||
// Note, this would cause some inefficiency if begin1 < end1 < begin2 < end2. | ||||||||||
var_begin_ix = std::min(var_begin_ix, subquery.get_begin_ix()); | ||||||||||
var_end_ix = std::max(var_end_ix, subquery.get_end_ix()); | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
} // namespace glt |
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -179,6 +179,12 @@ class LogTypeDictionaryEntry : public DictionaryEntry<logtype_dictionary_id_t> { | |||||||||||||||||||||||
*/ | ||||||||||||||||||||||||
void read_from_file(streaming_compression::Decompressor& decompressor); | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
/** | ||||||||||||||||||||||||
* Generate a human readable version of value. | ||||||||||||||||||||||||
* @param decompressor | ||||||||||||||||||||||||
*/ | ||||||||||||||||||||||||
std::string get_human_readable_value() const; | ||||||||||||||||||||||||
Comment on lines
+182
to
+186
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fix incorrect documentation for get_human_readable_value() The method documentation appears to be incorrect:
Apply this diff to fix the documentation: /**
- * Generate a human readable version of value.
- * @param decompressor
+ * Generates a human-readable version of the log type entry by combining
+ * constant segments with placeholder indicators.
+ * @return A string representation of the log type entry
*/
std::string get_human_readable_value() const; 📝 Committable suggestion
Suggested change
💡 Codebase verification Implementation of 🔗 Analysis chainVerify implementation matches the header declaration The method declaration looks correct, but let's verify its implementation matches the intended functionality. 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Description: Verify the implementation of get_human_readable_value()
# Expected: Find the implementation and verify it uses m_value and placeholder info
# Search for the implementation
rg -U "std::string\s+LogTypeDictionaryEntry::get_human_readable_value\(\)\s+const\s*\{[\s\S]+?\}" components/core/src/glt/
# Check if the implementation uses required member variables
rg -U "get_human_readable_value.*\{[\s\S]+?m_(value|placeholder_positions)" components/core/src/glt/
Length of output: 1364 |
||||||||||||||||||||||||
|
||||||||||||||||||||||||
private: | ||||||||||||||||||||||||
// Variables | ||||||||||||||||||||||||
std::vector<size_t> m_placeholder_positions; | ||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Address the TODO comment and handle middle wildcard cases
There is a TODO comment indicating that the current logic may not correctly handle patterns like
23412*34
intended to match23412.34
. Middle wildcards might causeconverts_to_non_dict_var
to be false incorrectly.Please implement the necessary logic to account for middle wildcards affecting variable conversion. Would you like assistance in refining this logic or proposing a solution?