Commit
Fix bug lucene compliance (#734)
---------

Co-authored-by: MoessnerFabian(Group) <[email protected]>
Co-authored-by: Jörg Zimmermann <[email protected]>
3 people authored Jan 9, 2025
1 parent 0c011ab commit 4938911
Showing 4 changed files with 128 additions and 61 deletions.
117 changes: 72 additions & 45 deletions doc/source/development/notebooks/processor_examples/regex.ipynb
@@ -5,10 +5,17 @@
"metadata": {},
"source": [
"# Lucene regex filter\n",
"This presentations contains an example of a filter with a lucene conform regular expression. \n",
"This presentations contains an example of a filter with a Lucene conform regular expression. \n",
"A concatenator that merges different fields form an event is used as a processor for demonstrating the filter function. \n",
"\n",
"Until now it was necessary to flag keys of values that contain a regular expression with regex_fields. "
"Until now it was necessary to flag the keys in regex_fields, when the value was containing a regular expression. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set document and define concatenator process to test the filter"
]
},
{
@@ -17,11 +24,20 @@
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
"\n",
"document = {\n",
" 'data_stream': {\n",
" 'dataset': 'windows', \n",
" 'namespace': 'devopslab', \n",
" 'type': 'logs'\n",
" 'type': '/logs/'\n",
" }, \n",
" '_op_type': 'create'\n",
" }\n",
@@ -34,30 +50,7 @@
" }, \n",
" '_op_type': 'create', \n",
" '_index': 'logs-windows-devopslab'\n",
" }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Define process"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0,\"../../../../../\")\n",
"import tempfile\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"\n",
"from unittest import mock\n",
"from logprep.factory import Factory\n",
" }\n",
"\n",
"rule_path = Path(tempfile.gettempdir()) / \"concatenator\"\n",
"rule_path.mkdir(exist_ok=True)\n",
@@ -73,6 +66,8 @@
" }\n",
" }\n",
"\n",
"concatenator = Factory.create(processor_config)\n",
"\n",
"def concat_with_rule(rule_yaml):\n",
" mydocument = deepcopy(document)\n",
" if rule_file.exists():\n",
@@ -81,21 +76,19 @@
" concatenator = Factory.create(processor_config)\n",
" print(f\"before: {mydocument}\")\n",
" concatenator.process(mydocument)\n",
" print(f\"after: {mydocument}\")\n",
" print(mydocument == expected)\n",
" "
" print(f\"after: {mydocument}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### regex_fields version"
"### Former version with explicit regex_fields annotation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -109,18 +102,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"[Deprecation warning]: regex_fields are no longer necessary. Use lucene regex annotation.\n",
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \".*lo.*\"' \n",
"filter: 'data_stream.type: \".*lo.*\"'\n",
"regex_fields:\n",
" - \"data_stream.type\"\n",
"concatenator:\n",
@@ -134,34 +123,34 @@
" delete_source_fields: false\n",
"\"\"\"\n",
"\n",
"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Lucene conform version without the need of regex_fields"
"### New Lucene conform version without the need of regex_fields"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}\n",
"True\n"
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: \"/.*lo.*/\"' \n",
"filter: 'data_stream.type: /.*log.*/' \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
@@ -174,6 +163,44 @@
"\"\"\"\n",
"concat_with_rule(rule_yaml)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Escaping a slash. One escape is needed for yml format, the other one for Lucene syntax. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create'}\n",
"after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': '/logs/'}, '_op_type': 'create', '_index': '/logs/-windows-devopslab'}\n"
]
}
],
"source": [
"rule_yaml = \"\"\"---\n",
"filter: 'data_stream.type: /\\\\/lo.*/' \n",
" \n",
"concatenator:\n",
" source_fields:\n",
" - data_stream.type\n",
" - data_stream.dataset\n",
" - data_stream.namespace\n",
" target_field: _index\n",
" separator: \"-\"\n",
" overwrite_target: false\n",
" delete_source_fields: false\n",
"\"\"\"\n",
"concat_with_rule(rule_yaml)"
]
}
],
"metadata": {
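
The notebook change above boils down to three spellings of the same filter. As a quick standalone check, a sketch along the following lines can be run against a logprep checkout; LuceneFilter.create and the resulting expression classes are shown in the tests further down in this diff, while the concrete filter strings here are only illustrative and not part of the commit.

# Illustrative sketch only: compare the filter spellings used in the notebook.
# LuceneFilter.create and the expected expression types are taken from
# tests/unit/filter/test_lucene_filter.py in this commit.
from logprep.filter.lucene_filter import LuceneFilter

filter_strings = [
    'data_stream.type: ".*lo.*"',     # old style, needs regex_fields to act as a regex
    "data_stream.type: /.*log.*/",    # new Lucene regex literal
    r"data_stream.type: /\/lo.*/",    # regex matching a literal slash, as in '/logs/'
]

for filter_string in filter_strings:
    expression = LuceneFilter.create(filter_string)
    # Expected: StringFilterExpression for the quoted form,
    # RegExFilterExpression for the two /.../ forms.
    print(f"{filter_string!r:40} -> {type(expression).__name__}")
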
41 changes: 33 additions & 8 deletions logprep/filter/lucene_filter.py
@@ -70,7 +70,7 @@
:linenos:
:caption: Example
filter: 'ip_address: "/192\.168\.0\..*/"'
filter: 'ip_address: /192\.168\.0\..*/'
[Deprecated, but still functional] The field with the regex pattern must be added to the optional field
@@ -107,6 +107,7 @@
Not,
OrOperation,
Phrase,
Regex,
SearchField,
Word,
)
@@ -323,15 +324,32 @@ def _create_field(self, tree: luqum.tree) -> Optional[FilterExpression]:
value = self._strip_quote_from_string(tree.expr.value)
value = self._remove_lucene_escaping(value)
return self._get_filter_expression(key, value)
elif isinstance(tree.expr, Regex):
key = tree.name.replace("\\", "")
key = key.split(".")
if tree.expr.value == "null":
return Null(key)

value = self._strip_quote_from_string(tree.expr.value)
value = self._remove_lucene_escaping(value)
return self._get_filter_expression_regex(key, value)
return None

def _get_filter_expression(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:
@staticmethod
def _check_key_and_modifier(key, value):
key_and_modifier = key[-1].split("|")
if len(key_and_modifier) == 2:
if key_and_modifier[-1] == "re":
return RegExFilterExpression(key[:-1] + key_and_modifier[:-1], value)
return None

def _get_filter_expression(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:

key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
if key_and_modifier_check is not None:
return key_and_modifier_check

dotted_field = ".".join(key)

@@ -346,12 +364,19 @@ def _get_filter_expression(

return self._special_fields_map[sf_key](key, value)

if value.startswith("/") and value.endswith("/"):
value = value.strip("/")
return RegExFilterExpression(key, value)

return StringFilterExpression(key, value)

def _get_filter_expression_regex(
self, key: List[str], value
) -> Union[RegExFilterExpression, StringFilterExpression]:

key_and_modifier_check = LuceneTransformer._check_key_and_modifier(key, value)
if key_and_modifier_check is not None:
return key_and_modifier_check

value = value.strip("/")
return RegExFilterExpression(key, value)

@staticmethod
def _create_value_expression(word: luqum.tree) -> Union[Exists, Always]:
value = word.value.replace("\\", "")
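
For context on the lucene_filter.py change: the new branch in _create_field fires when luqum parses an unquoted /.../ literal into a Regex node, while a quoted pattern still arrives as a Phrase and stays a plain string match. A rough sketch of that parsing difference follows; it assumes the luqum package logprep builds on, and node details may vary between luqum versions.

# Rough sketch, not part of the commit: show which luqum node type each filter
# spelling produces, which is what the new "elif isinstance(tree.expr, Regex)"
# branch dispatches on.
from luqum.parser import parser
from luqum.tree import Regex

queries = [
    'ip_address: /192\\.168\\.0\\..*/',   # unquoted regex literal -> Regex node
    'ip_address: "/192.168.0.1/"',        # quoted -> Phrase node (plain string match)
]

for query in queries:
    tree = parser.parse(query)   # for these queries the top node is a SearchField
    node = tree.expr             # the value node under the field name
    label = "Regex" if isinstance(node, Regex) else type(node).__name__
    print(f"{query!r:40} -> {label}: {node.value!r}")
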
23 changes: 19 additions & 4 deletions tests/unit/filter/test_lucene_filter.py
@@ -458,27 +458,42 @@ def test_create_filter_error(self, testcase, input_str, message):

def test_creates_lucene_compliance_filter_two_matching_regex_keys_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND regex_key_two: "/.*value.*/"',
"regex_key_one: /.*value.*/ AND regex_key_two: /.*value.*/",
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
RegExFilterExpression(["regex_key_two"], ".*value.*"),
)

def test_creates_lucene_compliance_filter_one_regex_key(self):
def test_creates_StringFilter_not_Regex(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/"',
)

assert lucene_filter == StringFilterExpression(["regex_key_one"], "/.*value.*/")

def test_new_lucene_compliance(self):
lucene_filter = LuceneFilter.create("regex_key_one:/.*value.*/")

assert lucene_filter == RegExFilterExpression(["regex_key_one"], ".*value.*")

def test_creates_lucene_compliance_filter_one_matching_one_missmatch_regex_key_of_two(self):
lucene_filter = LuceneFilter.create(
'regex_key_one: "/.*value.*/" AND key_two: "value"',
'regex_key_one:/.*value.*/ AND key_two: "/.*value.*/"',
)

assert lucene_filter == And(
RegExFilterExpression(["regex_key_one"], ".*value.*"),
StringFilterExpression(["key_two"], "value"),
StringFilterExpression(["key_two"], "/.*value.*/"),
)

def test_new_lucene_compliance_double_escape(self):
lucene_filter = LuceneFilter.create("regex_key_one:/\\/.*value.*/")

assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\/.*value.*")

def test_new_lucene_compliance_single_escape(self):
lucene_filter = LuceneFilter.create("regex_key_one:/\/.*value.*/")

assert lucene_filter == RegExFilterExpression(["regex_key_one"], "\/.*value.*")
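
The behavioural line these tests draw is: an unquoted /.../ becomes a RegExFilterExpression, while the same pattern in double quotes now stays a StringFilterExpression. A parametrized variant of that check could look like the sketch below; LuceneFilter.create and the expected objects come from the tests above, whereas the import path of the expression classes is an assumption about the repository layout.

# Hedged sketch of a parametrized version of the checks above (not in the commit).
# The expression-class import path is assumed; adjust it to the actual module.
import pytest

from logprep.filter.lucene_filter import LuceneFilter
from logprep.filter.expression.filter_expression import (  # assumed path
    RegExFilterExpression,
    StringFilterExpression,
)


@pytest.mark.parametrize(
    "filter_string, expected",
    [
        ("regex_key_one: /.*value.*/", RegExFilterExpression(["regex_key_one"], ".*value.*")),
        ('regex_key_one: "/.*value.*/"', StringFilterExpression(["regex_key_one"], "/.*value.*/")),
    ],
)
def test_unquoted_pattern_is_regex_quoted_pattern_is_string(filter_string, expected):
    assert LuceneFilter.create(filter_string) == expected
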
8 changes: 4 additions & 4 deletions tests/unit/processor/labeler/test_labeler_rule.py
@@ -218,7 +218,7 @@ def test_null_returns_true_for_matching_document(self):

def test_lucene_regex_matches_returns_true_for_matching_document(self):
rule_definition = {
"filter": 'applyrule: "/.*yes.*/"',
"filter": "applyrule: /.*yes.*/",
"labeler": {"label": {"reporter": ["windows"]}},
}
rule = LabelerRule._create_from_dict(rule_definition)
@@ -228,7 +228,7 @@ def test_lucene_regex_matches_returns_true_for_matching_document(self):

def test_lucene_regex_matches_returns_false_for_non_matching_document(self):
rule_definition = {
"filter": 'applyrule: "/.*yes.*/"',
"filter": "applyrule: /.*yes.*/",
"labeler": {"label": {"reporter": ["windows"]}},
}
rule = LabelerRule._create_from_dict(rule_definition)
@@ -245,7 +245,7 @@ def test_lucene_regex_matches_returns_false_for_non_matching_document(self):

def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):
rule_definition = {
"filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
"filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",
# pylint: disable=line-too-long
"labeler": {"label": {"reporter": ["windows"]}},
}
@@ -257,7 +257,7 @@ def test_complex_lucene_regex_matches_returns_true_for_matching_document(self):

def test_complex_lucene_regex_does_not_match_returns_true_for_matching_document(self):
rule_definition = {
"filter": r'applyrule: "/(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/"',
"filter": r"applyrule: /(?:(?=.*[a-z])(?:(?=.*[A-Z])(?=.*[\d\W])|(?=.*\W)(?=.*\d))|(?=.*\W)(?=.*[A-Z])(?=.*\d)).{8,}/",
# pylint: disable=line-too-long
"labeler": {"label": {"reporter": ["windows"]}},
}
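
The labeler tests only swap the filter spelling. A minimal sketch of building such a rule outside the test suite might look as follows; the rule dictionary and LabelerRule._create_from_dict come from the tests above, the import path is assumed from the test location, and how the rule is subsequently matched against documents is not shown in this diff.

# Minimal sketch, not part of the commit: a labeler rule using the new
# Lucene regex spelling instead of regex_fields.
from logprep.processor.labeler.rule import LabelerRule  # import path assumed

rule_definition = {
    "filter": "applyrule: /.*yes.*/",                 # unquoted /.../ regex filter
    "labeler": {"label": {"reporter": ["windows"]}},
}

rule = LabelerRule._create_from_dict(rule_definition)
print(type(rule).__name__)   # LabelerRule, ready to be matched against documents
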
