diff --git a/components/core/src/glt/Grep.cpp b/components/core/src/glt/Grep.cpp index 5a7356046..b443caebe 100644 --- a/components/core/src/glt/Grep.cpp +++ b/components/core/src/glt/Grep.cpp @@ -18,6 +18,8 @@ using glt::ir::is_delim; using glt::streaming_archive::reader::Archive; using glt::streaming_archive::reader::File; using glt::streaming_archive::reader::Message; +using std::make_pair; +using std::pair; using std::string; using std::vector; @@ -158,15 +160,14 @@ QueryToken::QueryToken( if (converts_to_int || converts_to_float) { converts_to_non_dict_var = true; } - if (!converts_to_non_dict_var) { - // Dictionary variable + // GLT TODO // Actually this is incorrect, because it's possible user enters 23412*34 aiming to - // match 23412.34. This should be an ambigious type. + // match 23412.34. we should consider the possibility that middle wildcard causes + // the converts_to_non_dict_var to be false. m_type = Type::DictionaryVar; m_cannot_convert_to_non_dict_var = true; } else { - // GLT TODO: think about this carefully. m_type = Type::Ambiguous; m_possible_types.push_back(Type::IntVar); m_possible_types.push_back(Type::FloatVar); @@ -393,6 +394,152 @@ bool find_matching_message( return true; } +void find_boundaries( + LogTypeDictionaryEntry const* logtype_entry, + vector> const& tokens, + size_t& var_begin_ix, + size_t& var_end_ix +) { + auto const& logtype_string = logtype_entry->get_value(); + // left boundary is exclusive and right boundary are inclusive, meaning + // that logtype_string.substr[0, left_boundary) and logtype_string.substr[right_boundary, end) + // can be safely ignored. + // They are initialized assuming that the entire logtype can be safely ignored. So if the + // tokens doesn't contain variable. the behavior is consistent. + size_t left_boundary{logtype_string.length()}; + size_t right_boundary{0}; + // First, match the token from front to end. + size_t find_start_index{0}; + bool tokens_contain_variable{false}; + for (auto const& token : tokens) { + auto const& token_str = token.first; + bool contains_variable = token.second; + size_t found_index = logtype_string.find(token_str, find_start_index); + if (string::npos == found_index) { + printf("failed to find: [%s] from %s\n", + token_str.c_str(), + logtype_string.substr(find_start_index).c_str()); + throw; + } + // the first time we see a token with variable, we know that + // we don't care about the variables in the substr before this token in the logtype. + // Technically, logtype_string.substr[0, token[begin_index]) + // (since token[begin_index] is the beginning of the token) + if (contains_variable) { + tokens_contain_variable = true; + left_boundary = found_index; + break; + } + // else, the token doesn't contain a variable + // we can proceed by skipping this token. + find_start_index = found_index + token_str.length(); + } + + // second, match the token from back + size_t rfind_end_index = logtype_string.length(); + for (auto it = tokens.rbegin(); it != tokens.rend(); ++it) { + auto const& token_str = it->first; + bool contains_var = it->second; + + size_t rfound_index = logtype_string.rfind(token_str, rfind_end_index); + if (string::npos == rfound_index) { + printf("failed to find: [%s] from %s\n", + token_str.c_str(), + logtype_string.substr(0, rfind_end_index).c_str()); + throw; + } + + // the first time we see a token with variable, we know that + // we don't care about the variables in the substr after this token in the logtype. + // Technically, logtype_string.substr[rfound_index + len(token), end) + // since logtype_string[rfound_index] is the beginning of the token + if (contains_var) { + tokens_contain_variable = true; + right_boundary = rfound_index + token_str.length(); + break; + } + + // Note, rfind end index is inclusive. has to subtract by 1 so + // in the next rfind, we skip the token we have already seen. + rfind_end_index = rfound_index - 1; + } + + // if we didn't find any variable, we can do an early return + if (false == tokens_contain_variable) { + var_begin_ix = logtype_entry->get_num_variables(); + var_end_ix = 0; + return; + } + + // Now we have the left boundary and right boundary, try to filter out the variables; + // var_begin_ix is an inclusive interval + auto const logtype_variable_num = logtype_entry->get_num_variables(); + ir::VariablePlaceholder var_placeholder; + var_begin_ix = 0; + for (size_t var_ix = 0; var_ix < logtype_variable_num; var_ix++) { + size_t var_position = logtype_entry->get_variable_info(var_ix, var_placeholder); + if (var_position < left_boundary) { + // if the variable is within the left boundary, then it should be skipped. + var_begin_ix++; + } else { + // if the variable is not within the left boundary + break; + } + } + + // For right boundary, var_end_ix is an exclusive interval + var_end_ix = logtype_variable_num; + for (size_t var_ix = 0; var_ix < logtype_variable_num; var_ix++) { + size_t reversed_ix = logtype_variable_num - 1 - var_ix; + size_t var_position = logtype_entry->get_variable_info(reversed_ix, var_placeholder); + if (var_position >= right_boundary) { + // if the variable is within the right boundary, then it should be skipped. + var_end_ix--; + } else { + // if the variable is not within the right boundary + break; + } + } + + if (var_end_ix <= var_begin_ix) { + printf("tokens contain a variable, end index %lu is smaller and equal than begin index " + "%lu\n", + var_end_ix, + var_begin_ix); + throw; + } +} + +template +vector> +retokenization(std::string_view input_string, EscapeDecoder escape_decoder) { + vector> retokenized_tokens; + size_t input_length = input_string.size(); + string current_token; + bool contains_variable_placeholder = false; + for (size_t ix = 0; ix < input_length; ix++) { + auto const current_char = input_string.at(ix); + if (enum_to_underlying_type(ir::VariablePlaceholder::Escape) == current_char) { + escape_decoder(input_string, ix, current_token); + continue; + } + + if (current_char != '*') { + current_token += current_char; + contains_variable_placeholder |= ir::is_variable_placeholder(current_char); + } else { + if (!current_token.empty()) { + retokenized_tokens.emplace_back(current_token, contains_variable_placeholder); + current_token.clear(); + } + } + } + if (!current_token.empty()) { + retokenized_tokens.emplace_back(current_token, contains_variable_placeholder); + } + return retokenized_tokens; +} + SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( Archive const& archive, string& processed_search_string, @@ -415,6 +562,31 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( logtype += escape_char; } }; + auto escape_decoder + = [](std::string_view input_str, size_t& current_pos, string& token) -> void { + auto const escape_char{enum_to_underlying_type(ir::VariablePlaceholder::Escape)}; + // Note: we don't need to do a check, because the upstream should guarantee all + // escapes are followed by some characters + auto const next_char = input_str.at(current_pos + 1); + if (escape_char == next_char) { + // turn two consecutive escape into a single one. + token += escape_char; + } else if (is_wildcard(next_char)) { + // if it is an escape followed by a wildcard, we know no escape has been added. + // we also remove the original escape because it was purely for query + token += next_char; + } else if (ir::is_variable_placeholder(next_char)) { + // If we are at here, it means we are in the middle of processing a '\\\v' sequence + // in this case, since we removed only one escape from the previous '\\' sequence + // we need to remove another escape here. + token += next_char; + } else { + printf("Unexpected\n"); + throw; + } + current_pos++; + }; + for (auto const& query_token : query_tokens) { // Append from end of last token to beginning of this token, to logtype ir::append_constant_to_logtype( @@ -434,6 +606,7 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( // ambiguous tokens sub_query.mark_wildcard_match_required(); if (!query_token.is_var()) { + // Must mean the token is text only, with * in it. logtype += '*'; } else { logtype += '*'; @@ -472,6 +645,15 @@ SubQueryMatchabilityResult generate_logtypes_and_vars_for_subquery( if (possible_logtype_entries.empty()) { return SubQueryMatchabilityResult::WontMatch; } + + // Find boundaries + auto const retokenized_tokens = retokenization(logtype, escape_decoder); + for (auto const& logtype_entry : possible_logtype_entries) { + size_t var_begin_index; + size_t var_end_index; + find_boundaries(logtype_entry, retokenized_tokens, var_begin_index, var_end_index); + sub_query.set_logtype_boundary(logtype_entry->get_id(), var_begin_index, var_end_index); + } sub_query.set_possible_logtypes(possible_logtype_entries); // Calculate the IDs of the segments that may contain results for the sub-query now that we've @@ -901,7 +1083,12 @@ Grep::get_converted_logtype_query(Query const& query, size_t segment_id) { for (auto const& possible_logtype_entry : possible_log_entries) { // create one LogtypeQuery for each logtype logtype_dictionary_id_t possible_logtype_id = possible_logtype_entry->get_id(); - LogtypeQuery query_info(sub_query->get_vars(), sub_query->wildcard_match_required()); + auto const& boundary = sub_query->get_boundary_by_logtype_id(possible_logtype_id); + LogtypeQuery query_info( + sub_query->get_vars(), + sub_query->wildcard_match_required(), + boundary + ); // The boundary is a range like [left:right). note it's open on the right side auto const& containing_segments @@ -1155,8 +1342,9 @@ size_t Grep::search_combined_table_and_output( compressed_msg.resize_var(num_vars); compressed_msg.set_logtype_id(logtype_id); - size_t left_boundary = 0; - size_t right_boundary = num_vars; + size_t var_begin_ix = num_vars; + size_t var_end_ix = 0; + get_union_of_bounds(queries_by_logtype, var_begin_ix, var_end_ix); bool required_wild_card; while (num_matches < limit) { @@ -1166,8 +1354,8 @@ size_t Grep::search_combined_table_and_output( compressed_msg, required_wild_card, query, - left_boundary, - right_boundary + var_begin_ix, + var_end_ix ); if (found_matched == false) { break; @@ -1232,12 +1420,13 @@ size_t Grep::search_segment_optimized_and_output( auto num_vars = archive.get_logtype_dictionary().get_entry(logtype_id).get_num_variables(); - size_t left_boundary = 0; - size_t right_boundary = num_vars; + size_t var_begin_ix = num_vars; + size_t var_end_ix = 0; + get_union_of_bounds(sub_queries, var_begin_ix, var_end_ix); // load timestamps and columns that fall into the ranges. logtype_table_manager.load_ts(); - logtype_table_manager.load_partial_columns(left_boundary, right_boundary); + logtype_table_manager.load_partial_columns(var_begin_ix, var_end_ix); std::vector matched_row_ix; std::vector wildcard_required; @@ -1278,4 +1467,22 @@ size_t Grep::search_segment_optimized_and_output( return num_matches; } +// we use a simple assumption atm. +// if subquery1 has range (a,b) and subquery2 has range (c,d). +// then the range will be (min(a,c), max(b,d)), even if c > b. +void Grep::get_union_of_bounds( + std::vector const& sub_queries, + size_t& var_begin_ix, + size_t& var_end_ix +) { + for (auto const& subquery : sub_queries) { + // we use a simple assumption atm. + // if subquery1 has range [begin1, end1) and subquery2 has range [begin2, end2). + // then the range will be (min(begin1, begin2), max(end1, end2)). + // Note, this would cause some inefficiency if begin1 < end1 < begin2 < end2. + var_begin_ix = std::min(var_begin_ix, subquery.get_begin_ix()); + var_end_ix = std::max(var_end_ix, subquery.get_end_ix()); + } +} + } // namespace glt diff --git a/components/core/src/glt/Grep.hpp b/components/core/src/glt/Grep.hpp index 240859d41..7f678e8d5 100644 --- a/components/core/src/glt/Grep.hpp +++ b/components/core/src/glt/Grep.hpp @@ -212,6 +212,12 @@ class Grep { */ static std::unordered_map get_converted_logtype_query(Query const& query, size_t segment_id); + + static void get_union_of_bounds( + std::vector const& sub_queries, + size_t& var_begin_ix, + size_t& var_end_ix + ); }; } // namespace glt diff --git a/components/core/src/glt/LogTypeDictionaryEntry.cpp b/components/core/src/glt/LogTypeDictionaryEntry.cpp index f5e6595bb..fe81127fa 100644 --- a/components/core/src/glt/LogTypeDictionaryEntry.cpp +++ b/components/core/src/glt/LogTypeDictionaryEntry.cpp @@ -202,4 +202,34 @@ void LogTypeDictionaryEntry::read_from_file(streaming_compression::Decompressor& throw OperationFailed(error_code, __FILENAME__, __LINE__); } } + +string LogTypeDictionaryEntry::get_human_readable_value() const { + string human_readable_value; + + size_t constant_begin_pos = 0; + for (size_t placeholder_ix = 0; placeholder_ix < get_num_placeholders(); ++placeholder_ix) { + VariablePlaceholder placeholder; + size_t placeholder_pos = get_placeholder_info(placeholder_ix, placeholder); + + // Add the constant that's between the last variable and this one, with newlines escaped + human_readable_value + .append(m_value, constant_begin_pos, placeholder_pos - constant_begin_pos); + + if (VariablePlaceholder::Dictionary == placeholder) { + human_readable_value += "v"; + } else if (VariablePlaceholder::Float == placeholder) { + human_readable_value += "f"; + } else if (VariablePlaceholder::Integer == placeholder) { + human_readable_value += "i"; + } + // Move past the variable delimiter + constant_begin_pos = placeholder_pos + 1; + } + // Append remainder of value, if any + if (constant_begin_pos < m_value.length()) { + human_readable_value.append(m_value, constant_begin_pos, string::npos); + } + return human_readable_value; +} + } // namespace glt diff --git a/components/core/src/glt/LogTypeDictionaryEntry.hpp b/components/core/src/glt/LogTypeDictionaryEntry.hpp index 525f15010..221ad5a90 100644 --- a/components/core/src/glt/LogTypeDictionaryEntry.hpp +++ b/components/core/src/glt/LogTypeDictionaryEntry.hpp @@ -179,6 +179,12 @@ class LogTypeDictionaryEntry : public DictionaryEntry { */ void read_from_file(streaming_compression::Decompressor& decompressor); + /** + * Generate a human readable version of value. + * @param decompressor + */ + std::string get_human_readable_value() const; + private: // Variables std::vector m_placeholder_positions; diff --git a/components/core/src/glt/Query.cpp b/components/core/src/glt/Query.cpp index 41e14ecb7..bff53d83d 100644 --- a/components/core/src/glt/Query.cpp +++ b/components/core/src/glt/Query.cpp @@ -175,15 +175,16 @@ void SubQuery::calculate_ids_of_matching_segments() { void SubQuery::clear() { m_vars.clear(); m_possible_logtype_ids.clear(); + m_logtype_boundaries.clear(); m_wildcard_match_required = false; } -bool SubQuery::matches_logtype(logtype_dictionary_id_t const logtype) const { - return m_possible_logtype_ids.count(logtype) > 0; -} - -bool SubQuery::matches_vars(std::vector const& vars) const { - return matches_var(vars, m_vars, 0, 0); +void SubQuery::set_logtype_boundary( + glt::logtype_dictionary_id_t logtype_id, + size_t var_begin_ix, + size_t var_end_ix +) { + m_logtype_boundaries.emplace(logtype_id, QueryBoundary(var_begin_ix, var_end_ix)); } Query::Query( @@ -218,6 +219,6 @@ void Query::make_sub_queries_relevant_to_segment(segment_id_t segment_id) { } bool LogtypeQuery::matches_vars(std::vector const& vars) const { - return matches_var(vars, m_vars, 0, 0); + return matches_var(vars, m_vars, m_var_begin_ix, m_var_end_ix); } } // namespace glt diff --git a/components/core/src/glt/Query.hpp b/components/core/src/glt/Query.hpp index 56462ecd9..ff6b9b814 100644 --- a/components/core/src/glt/Query.hpp +++ b/components/core/src/glt/Query.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -64,6 +65,14 @@ class QueryVar { std::unordered_set m_possible_var_dict_entries; }; +class QueryBoundary { +public: + QueryBoundary(size_t begin, size_t end) : var_begin_ix(begin), var_end_ix(end) {} + + size_t var_begin_ix; + size_t var_end_ix; +}; + /** * Class representing a subquery (or informally, an interpretation) of a user query. It contains a * series of possible logtypes, a set of QueryVars, and whether the query still requires wildcard @@ -133,25 +142,30 @@ class SubQuery { return m_ids_of_matching_segments; } + QueryBoundary const& get_boundary_by_logtype_id(logtype_dictionary_id_t logtype_id) const { + return m_logtype_boundaries.at(logtype_id); + } + /** - * Whether the given logtype ID matches one of the possible logtypes in this subquery - * @param logtype - * @return true if matched, false otherwise - */ - bool matches_logtype(logtype_dictionary_id_t logtype) const; - /** - * Whether the given variables contain the subquery's variables in order (but not necessarily + * GLT TODO: Currently just a quick implementation + * Insert a logtype's begin and end into the subquery. * contiguously) - * @param vars - * @return true if matched, false otherwise + * @param logtype_id + * @param var_begin_ix + * @param var_end_ix */ - bool matches_vars(std::vector const& vars) const; + void set_logtype_boundary( + logtype_dictionary_id_t logtype_id, + size_t var_begin_ix, + size_t var_end_ix + ); private: // Variables std::unordered_set m_possible_logtype_entries; std::unordered_set m_possible_logtype_ids; std::set m_ids_of_matching_segments; + std::unordered_map m_logtype_boundaries; std::vector m_vars; bool m_wildcard_match_required; }; @@ -230,10 +244,15 @@ class Query { class LogtypeQuery { public: // Methods - LogtypeQuery(std::vector const& vars, bool wildcard_match_required) { - m_vars = vars; - m_wildcard_match_required = wildcard_match_required; - } + LogtypeQuery( + std::vector const& vars, + bool wildcard_match_required, + QueryBoundary const& boundary + ) + : m_vars(vars), + m_wildcard_match_required(wildcard_match_required), + m_var_begin_ix(boundary.var_begin_ix), + m_var_end_ix(boundary.var_end_ix) {} /** * Whether the given variables contain the subquery's variables in order (but not necessarily @@ -245,10 +264,17 @@ class LogtypeQuery { bool get_wildcard_flag() const { return m_wildcard_match_required; } + size_t get_begin_ix() const { return m_var_begin_ix; } + + size_t get_end_ix() const { return m_var_end_ix; } + private: // Variables std::vector m_vars; bool m_wildcard_match_required; + // [begin, end) + size_t m_var_begin_ix; + size_t m_var_end_ix; }; class LogtypeQueries { diff --git a/components/core/src/glt/glt/search.cpp b/components/core/src/glt/glt/search.cpp index c258686e5..6a247dea5 100644 --- a/components/core/src/glt/glt/search.cpp +++ b/components/core/src/glt/glt/search.cpp @@ -374,7 +374,7 @@ static size_t search_segments( ); // first search through the single variable table - num_matches += Grep::search_segment_and_output( + num_matches += Grep::search_segment_optimized_and_output( single_table_queries, query, SIZE_MAX, diff --git a/components/core/src/glt/streaming_archive/reader/Archive.cpp b/components/core/src/glt/streaming_archive/reader/Archive.cpp index bfb489cc9..35ef8fbd5 100644 --- a/components/core/src/glt/streaming_archive/reader/Archive.cpp +++ b/components/core/src/glt/streaming_archive/reader/Archive.cpp @@ -407,7 +407,11 @@ void Archive::find_message_matching_with_logtype_query_optimized( if (query.timestamp_is_in_search_time_range(ts)) { // that means we need to loop through every loop. that takes time. for (auto const& possible_sub_query : logtype_query) { - logtype_table.get_next_row(vars_to_load, 0, num_column); + logtype_table.get_next_row( + vars_to_load, + possible_sub_query.get_begin_ix(), + possible_sub_query.get_end_ix() + ); if (possible_sub_query.matches_vars(vars_to_load)) { // Message matches completely, so set remaining properties wildcard.push_back(possible_sub_query.get_wildcard_flag());