From b8e85de841bb18d528addfb0b738bd2c5ec4f8bd Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Tue, 26 Mar 2024 15:48:59 -0400 Subject: [PATCH] feat(mapping-optimizer): Support in operator for mapping optimizer (#5691) Re-apply #5685 This was a TODO item. But on the spans dataset, one easy to encounter situation is a condition like ``` sentry_tags[key] IN (value1, value2) ``` This results in a sql like ``` in((arrayElement(sentry_tags.value, indexOf(sentry_tags.key, 'key')) AS `_snuba_sentry_tags[key]`), ['value1', 'value1']) ``` which scans the entire `sentry_tags.key` and `sentry_tags.value` columns. The optimization here is to use the tags hash map which gives us a condition like ``` hasAny(_sentry_tags_hash_map, array(cityHash64('key=value1'), cityHash64('key=value1'))) ``` --- .github/workflows/ci.yml | 1 + .../processors/physical/mapping_optimizer.py | 160 ++++++++++++++---- .../processors/test_mapping_optimizer.py | 102 +++++++++++ 3 files changed, 228 insertions(+), 35 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8492cc196e..b34896a7be 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -366,6 +366,7 @@ jobs: tests/sentry/search/events \ tests/sentry/event_manager \ tests/sentry/api/endpoints/test_organization_profiling_functions.py \ + tests/snuba/api/endpoints/test_organization_events_stats_mep.py \ tests/snuba/test_snql_snuba.py \ tests/snuba/test_metrics_layer.py \ -vv --cov . --cov-report="xml:.artifacts/snuba.coverage.xml" diff --git a/snuba/query/processors/physical/mapping_optimizer.py b/snuba/query/processors/physical/mapping_optimizer.py index 13f5f73960..b2b6c37ad9 100644 --- a/snuba/query/processors/physical/mapping_optimizer.py +++ b/snuba/query/processors/physical/mapping_optimizer.py @@ -1,6 +1,6 @@ import logging from enum import Enum -from typing import Optional, Tuple +from typing import Optional, Tuple, cast from snuba import environment from snuba.clickhouse.query import Query @@ -13,6 +13,7 @@ ) from snuba.query.conditions import ( BooleanFunctions, + ConditionFunctions, combine_and_conditions, combine_or_conditions, get_first_level_and_conditions, @@ -90,9 +91,8 @@ def __init__( self.__killswitch = killswitch self.__value_subcolumn_name = value_subcolumn_name - # TODO: Add the support for IN conditions. - self.__optimizable_pattern = FunctionCall( - function_name=String("equals"), + self.__equals_condition_pattern = FunctionCall( + function_name=String(ConditionFunctions.EQ), parameters=( Or( [ @@ -106,6 +106,19 @@ def __init__( Param("right_hand_side", Literal(Any(str))), ), ) + self.__in_condition_pattern = FunctionCall( + function_name=String(ConditionFunctions.IN), + parameters=( + mapping_pattern, + Param( + "right_hand_side", + FunctionCall( + Or([String("array"), String("tuple")]), + all_parameters=Literal(), + ), + ), + ), + ) self.__tag_exists_patterns = [ FunctionCall( function_name=String("notEquals"), @@ -155,12 +168,18 @@ def __classify_combined_conditions(self, condition: Expression) -> ConditionClas def __classify_condition(self, condition: Expression) -> ConditionClass: # Expects this to be an individual condition - match = self.__optimizable_pattern.match(condition) + equals_condition_match = self.__equals_condition_pattern.match(condition) + in_condition_match = ( + self.__in_condition_pattern.match(condition) + if equals_condition_match is None + else None + ) if ( - match is not None - and match.string(KEY_COL_MAPPING_PARAM) == f"{self.__column_name}.key" + equals_condition_match is not None + and equals_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" ): - rhs = match.expression("right_hand_side") + rhs = equals_condition_match.expression("right_hand_side") assert isinstance(rhs, LiteralExpr) return ( ConditionClass.NOT_OPTIMIZABLE @@ -168,7 +187,24 @@ def __classify_condition(self, condition: Expression) -> ConditionClass: if rhs.value == "" else ConditionClass.OPTIMIZABLE ) - elif match is None: + if ( + in_condition_match is not None + and in_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" + ): + rhs = in_condition_match.expression("right_hand_side") + assert isinstance(rhs, FunctionExpr) + + params = rhs.parameters + for param in params: + assert isinstance(param, LiteralExpr) + + # tags[foo] IN array('') is not optimizable + if param.value == "": + return ConditionClass.NOT_OPTIMIZABLE + + return ConditionClass.OPTIMIZABLE + elif equals_condition_match is None and in_condition_match is None: # If this condition is not matching an optimizable condition, # check that it does not reference the optimizable column. # If it does, it means we should not optimize this query. @@ -183,35 +219,89 @@ def __classify_condition(self, condition: Expression) -> ConditionClass: return ConditionClass.IRRELEVANT def __replace_with_hash(self, condition: Expression) -> Expression: - match = self.__optimizable_pattern.match(condition) + equals_condition_match = self.__equals_condition_pattern.match(condition) + in_condition_match = ( + self.__in_condition_pattern.match(condition) + if equals_condition_match is None + else None + ) if ( - match is None - or match.string(KEY_COL_MAPPING_PARAM) != f"{self.__column_name}.key" + equals_condition_match is not None + and equals_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" ): - return condition - rhs = match.expression("right_hand_side") - assert isinstance(rhs, LiteralExpr) - key = match.scalar(KEY_MAPPING_PARAM) - assert isinstance(key, (str, int)) - if isinstance(key, str): - key = key.translate(ESCAPE_TRANSLATION) - - return FunctionExpr( - alias=condition.alias, - function_name="has", - parameters=( - Column( - alias=None, - table_name=match.optional_string(TABLE_MAPPING_PARAM), - column_name=self.__hash_map_name, + rhs = equals_condition_match.expression("right_hand_side") + assert isinstance(rhs, LiteralExpr) + key = equals_condition_match.scalar(KEY_MAPPING_PARAM) + assert isinstance(key, (str, int)) + if isinstance(key, str): + key = key.translate(ESCAPE_TRANSLATION) + + return FunctionExpr( + alias=condition.alias, + function_name="has", + parameters=( + Column( + alias=None, + table_name=equals_condition_match.optional_string( + TABLE_MAPPING_PARAM + ), + column_name=self.__hash_map_name, + ), + FunctionExpr( + alias=None, + function_name="cityHash64", + parameters=(LiteralExpr(None, f"{key}={rhs.value}"),), + ), ), - FunctionExpr( - alias=None, - function_name="cityHash64", - parameters=(LiteralExpr(None, f"{key}={rhs.value}"),), + ) + elif ( + in_condition_match is not None + and in_condition_match.string(KEY_COL_MAPPING_PARAM) + == f"{self.__column_name}.key" + ): + key = in_condition_match.scalar(KEY_MAPPING_PARAM) + assert isinstance(key, (str, int)) + if isinstance(key, str): + key = key.translate(ESCAPE_TRANSLATION) + + rhs = in_condition_match.expression("right_hand_side") + assert isinstance(rhs, FunctionExpr) + for param in rhs.parameters: + assert isinstance(param, LiteralExpr) + params = cast(list[LiteralExpr], rhs.parameters) + + return FunctionExpr( + alias=condition.alias, + function_name="hasAny", + parameters=( + Column( + alias=None, + table_name=in_condition_match.optional_string( + TABLE_MAPPING_PARAM + ), + column_name=self.__hash_map_name, + ), + FunctionExpr( + alias=rhs.alias, + function_name="array", + parameters=tuple( + [ + FunctionExpr( + alias=None, + function_name="cityHash64", + parameters=( + LiteralExpr(None, f"{key}={lit.value}"), + ), + ) + for lit in params + ] + ), + ), ), - ), - ) + ) + + return condition def _get_condition_without_redundant_checks( self, condition: Expression, query: Query @@ -265,7 +355,7 @@ def _get_condition_without_redundant_checks( if tag_exist_match: matched_tag_exists_conditions[condition_id] = tag_exist_match if not tag_exist_match: - eq_match = self.__optimizable_pattern.match(cond) + eq_match = self.__equals_condition_pattern.match(cond) if eq_match: tag_eq_match_keys.add(eq_match.scalar(KEY_MAPPING_PARAM)) useful_conditions = [] diff --git a/tests/query/processors/test_mapping_optimizer.py b/tests/query/processors/test_mapping_optimizer.py index 03888c4ec9..ba512fc07b 100644 --- a/tests/query/processors/test_mapping_optimizer.py +++ b/tests/query/processors/test_mapping_optimizer.py @@ -278,6 +278,108 @@ nested_condition("tags", "my_tag", ConditionFunctions.EQ, "a"), id="Non optimizable having", ), + pytest.param( + build_query( + selected_columns=[column("event_id")], + condition=binary_condition( + ConditionFunctions.IN, + nested_expression("tags", "my_tag"), + FunctionCall( + None, + "tuple", + ( + Literal(None, "a"), + Literal(None, "b"), + Literal(None, "c"), + ), + ), + ), + ), + FunctionCall( + None, + "hasAny", + ( + column("_tags_hash_map", True), + FunctionCall( + None, + "array", + ( + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)), + ), + ), + ), + ), + id="Optimizable IN condition using tuple", + ), + pytest.param( + build_query( + selected_columns=[column("event_id")], + condition=binary_condition( + ConditionFunctions.IN, + nested_expression("tags", "my_tag"), + FunctionCall( + None, + "array", + ( + Literal(None, "a"), + Literal(None, "b"), + Literal(None, "c"), + ), + ), + ), + ), + FunctionCall( + None, + "hasAny", + ( + column("_tags_hash_map", True), + FunctionCall( + None, + "array", + ( + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=a"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=b"),)), + FunctionCall(None, "cityHash64", (Literal(None, "my_tag=c"),)), + ), + ), + ), + ), + id="Optimizable IN condition using array", + ), + pytest.param( + build_query( + selected_columns=[column("event_id")], + condition=binary_condition( + ConditionFunctions.IN, + nested_expression("tags", "my_tag"), + FunctionCall( + None, + "array", + ( + Literal(None, "a"), + Literal(None, "b"), + Literal(None, ""), + ), + ), + ), + ), + binary_condition( + ConditionFunctions.IN, + nested_expression("tags", "my_tag"), + FunctionCall( + None, + "array", + ( + Literal(None, "a"), + Literal(None, "b"), + Literal(None, ""), + ), + ), + ), + id="Non optimizable IN condition with empty value", + ), ]