Commit
Fix bug lucene compliance (#734)
---------

Co-authored-by: MoessnerFabian(Group) <[email protected]>
Co-authored-by: Jörg Zimmermann <[email protected]>
3 people authored Jan 9, 2025
1 parent 0c011ab commit 4938911
Showing 4 changed files with 128 additions and 61 deletions.
117 changes: 72 additions & 45 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
@@ -5,10 +5,17 @@
"metadata": {},
"source": [
"# Lucene regex filter\n",
"This presentations contains an example of a filter with a lucene conform regular expression. \n",
"This presentations contains an example of a filter with a Lucene conform regular expression. \n",
"A concatenator that merges different fields form an event is used as a processor for demonstrating the filter function. \n",
"\n",
"Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
"Until now it was necessary to flag the keys in regex_fields, when the value was containing a regular expression. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set document and define concatenator process to test the filter"
]
},
{
@@ -17,11 +24,20 @@
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" 'type': '/logs/'\n",
" }, \n",
" '_op_type': 'create'\n",
" }\n",
@@ -34,30 +50,7 @@
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
" }\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
@@ -73,6 +66,8 @@
" }\n",
" }\n",
"\n",
"concatenator = Factory.create(processor_config)\n",
"\n",
"def concat_with_rule(rule_yaml):\n",
" mydocument = deepcopy(document)\n",
" if rule_file.exists():\n",
@@ -81,21 +76,19 @@
" concatenator = Factory.create(processor_config)\n",
" print(f\"before: {mydocument}\")\n",
" concatenator.process(mydocument)\n",
" print(f\"after: {mydocument}\")\n",
" print(mydocument == expected)\n",
" "
" print(f\"after: {mydocument}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### regex_fields version"
"### Former version with explicit regex_fields annotation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -109,18 +102,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*lo.*\"' \n",
"filter: 'data_stream.type: \".*lo.*\"'\n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
@@ -134,34 +123,34 @@
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lucene conform version without the need of regex_fields"
"### New Lucene conform version without the need of regex_fields"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \"/.*lo.*/\"' \n",
"filter: 'data_stream.type: /.*log.*/' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
@@ -174,6 +163,44 @@
"\"\"\"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Escaping a slash. One escape is needed for yml format, the other one for Lucene syntax. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: /\\\\/lo.*/' \n",
" \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"concat_with_rule(rule_yaml)"
]
}
],
"metadata": {
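
The notebook change above boils down to three spellings of the same filter. As a quick standalone check, a sketch along the following lines can be run against a logprep checkout; LuceneFilter.create and the resulting expression classes are shown in the tests further down in this diff, while the concrete filter strings here are only illustrative and not part of the commit.

# Illustrative sketch only: compare the filter spellings used in the notebook.
# LuceneFilter.create and the expected expression types are taken from
# tests/unit/filter/test_lucene_filter.py in this commit.
from logprep.filter.lucene_filter import LuceneFilter

filter_strings = [
    'data_stream.type: ".*lo.*"',     # old style, needs regex_fields to act as a regex
    "data_stream.type: /.*log.*/",    # new Lucene regex literal
    r"data_stream.type: /\/lo.*/",    # regex matching a literal slash, as in '/logs/'
]

for filter_string in filter_strings:
    expression = LuceneFilter.create(filter_string)
    # Expected: StringFilterExpression for the quoted form,
    # RegExFilterExpression for the two /.../ forms.
    print(f"{filter_string!r:40} -> {type(expression).__name__}")
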
41 changes: 33 additions & 8 deletions logprep/filter/lucene_filter.py
@@ -70,7 +70,7 @@
:linenos:
:caption: Example
filter: 'ip_address: "/192\.168\.0\..*/"'
filter: 'ip_address: /192\.168\.0\..*/'
[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
@@ -107,6 +107,7 @@
Not,
OrOperation,
Phrase,
Regex,
SearchField,
Word,
)
@@ -323,15 +324,32 @@ def _create_field(self, tree: luqum.tree) -> Optional[FilterExpression]:
value = self._strip_quote_from_string(tree.expr.value)
value = self._remove_lucene_escaping(value)
return self._get_filter_expression(key, value)
elif isinstance(tree.expr, Regex):
key = tree.name.replace("\\", "")
key = key.split(".")
if tree.expr.value == "null":
return Null(key)

value = self._strip_quote_from_string(tree.expr.value)
value = self._remove_lucene_escaping(value)
return self._get_filter_expression_regex(key, value)
return None

def _get_filter_expression(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:
@staticmethod
def _check_key_and_modifier(key, value):
key_and_modifier = key[-1].split("|")
if len(key_and_modifier) == 2:
if key_and_modifier[-1] == "re":
return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)
return None

def _get_filter_expression(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:

key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
if key_and_modifier_check is not None:
return key_and_modifier_check

dotted_field = ".".join(key)

@@ -346,12 +364,19 @@ def _get_filter_expression(

return self._special_fields_map[sf_key](key, value)

if value.startswith("/") and value.endswith("/"):
value = value.strip("/")
return RegExFilterExpression(key, value)

return StringFilterExpression(key, value)

def _get_filter_expression_regex(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:

key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
if key_and_modifier_check is not None:
return key_and_modifier_check

value = value.strip("/")
return RegExFilterExpression(key, value)

@staticmethod
def _create_value_expression(word: luqum.tree) -> Union[Exists, Always]:
value = word.value.replace("\\", "")
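
For context on the lucene_filter.py change: the new branch in _create_field fires when luqum parses an unquoted /.../ literal into a Regex node, while a quoted pattern still arrives as a Phrase and stays a plain string match. A rough sketch of that parsing difference follows; it assumes the luqum package logprep builds on, and node details may vary between luqum versions.

# Rough sketch, not part of the commit: show which luqum node type each filter
# spelling produces, which is what the new "elif isinstance(tree.expr, Regex)"
# branch dispatches on.
from luqum.parser import parser
from luqum.tree import Regex

queries = [
    'ip_address: /192\\.168\\.0\\..*/',   # unquoted regex literal -> Regex node
    'ip_address: "/192.168.0.1/"',        # quoted -> Phrase node (plain string match)
]

for query in queries:
    tree = parser.parse(query)   # for these queries the top node is a SearchField
    node = tree.expr             # the value node under the field name
    label = "Regex" if isinstance(node, Regex) else type(node).__name__
    print(f"{query!r:40} -> {label}: {node.value!r}")
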
23 changes: 19 additions & 4 deletions tests/unit/filter/test_lucene_filter.py
@@ -458,27 +458,42 @@ def test_create_filter_error(self, testcase, input_str, message):

def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
"regex_key_one: /.*value.*/ AND regex_key_two: /.*value.*/",
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
RegExFilterExpression(["regex_key_two"], ".*value.*"),
)

def test_creates_lucene_compliance_filter_one_regex_key(self):
def test_creates_StringFilter_not_Regex(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/"',
)

assert lucene_filter == StringFilterExpression(["regex_key_one"], "/.*value.*/")

def test_new_lucene_compliance(self):
lucene_filter = LuceneFilter.create("regex_key_one:/.*value.*/")

assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")

def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND key_two: "value"',
'regex_key_one:/.*value.*/ AND key_two: "/.*value.*/"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
StringFilterExpression(["key_two"], "value"),
StringFilterExpression(["key_two"], "/.*value.*/"),
)

def test_new_lucene_compliance_double_escape(self):
lucene_filter = LuceneFilter.create("regex_key_one:/\\/.*value.*/")

assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\/.*value.*")

def test_new_lucene_compliance_single_escape(self):
lucene_filter = LuceneFilter.create("regex_key_one:/\/.*value.*/")

assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\/.*value.*")
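
The behavioural line these tests draw is: an unquoted /.../ becomes a RegExFilterExpression, while the same pattern in double quotes now stays a StringFilterExpression. A parametrized variant of that check could look like the sketch below; LuceneFilter.create and the expected objects come from the tests above, whereas the import path of the expression classes is an assumption about the repository layout.

# Hedged sketch of a parametrized version of the checks above (not in the commit).
# The expression-class import path is assumed; adjust it to the actual module.
import pytest

from logprep.filter.lucene_filter import LuceneFilter
from logprep.filter.expression.filter_expression import (  # assumed path
    RegExFilterExpression,
    StringFilterExpression,
)


@pytest.mark.parametrize(
    "filter_string, expected",
    [
        ("regex_key_one: /.*value.*/", RegExFilterExpression(["regex_key_one"], ".*value.*")),
        ('regex_key_one: "/.*value.*/"', StringFilterExpression(["regex_key_one"], "/.*value.*/")),
    ],
)
def test_unquoted_pattern_is_regex_quoted_pattern_is_string(filter_string, expected):
    assert LuceneFilter.create(filter_string) == expected
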
8 changes: 4 additions & 4 deletions tests/unit/processor/labeler/test_labeler_rule.py
@@ -218,7 +218,7 @@ def test_null_returns_true_for_matching_document(self):

def test_lucene_regex_matches_returns_true_for_matching_document(self):
rule_definition = {
"filter": 'applyrule: "/.*yes.*/"',
"filter": "applyrule: /.*yes.*/",
"labeler": {"label": {"reporter": ["windows"]}},
}
rule = LabelerRule._create_from_dict(rule_definition)
@@ -228,7 +228,7 @@ def test_lucene_regex_matches_returns_true_for_matching_document(self):

def test_lucene_regex_matches_returns_false_for_non_matching_document(self):
rule_definition = {
"filter": 'applyrule: "/.*yes.*/"',
"filter": "applyrule: /.*yes.*/",
"labeler": {"label": {"reporter": ["windows"]}},
}
rule = LabelerRule._create_from_dict(rule_definition)
@@ -245,7 +245,7 @@ def test_lucene_regex_matches_returns_false_for_non_matching_document(self):

def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):
rule_definition = {
"filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
"filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",
# pylint: disable=line-too-long
"labeler": {"label": {"reporter": ["windows"]}},
}
@@ -257,7 +257,7 @@ def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):

def test_complex_lucene_regex_does_not_match_returns_true_for_matching_document(self):
rule_definition = {
"filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
"filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",
# pylint: disable=line-too-long
"labeler": {"label": {"reporter": ["windows"]}},
}
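
The labeler tests only swap the filter spelling. A minimal sketch of building such a rule outside the test suite might look as follows; the rule dictionary and LabelerRule._create_from_dict come from the tests above, the import path is assumed from the test location, and how the rule is subsequently matched against documents is not shown in this diff.

# Minimal sketch, not part of the commit: a labeler rule using the new
# Lucene regex spelling instead of regex_fields.
from logprep.processor.labeler.rule import LabelerRule  # import path assumed

rule_definition = {
    "filter": "applyrule: /.*yes.*/",                 # unquoted /.../ regex filter
    "labeler": {"label": {"reporter": ["windows"]}},
}

rule = LabelerRule._create_from_dict(rule_definition)
print(type(rule).__name__)   # LabelerRule, ready to be matched against documents
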
