From 12326c641c92aef2f38bb9819320fd6137c12ed5 Mon Sep 17 00:00:00 2001 From: MoshiMoshi0 Date: Tue, 7 Jan 2025 19:11:54 +0100 Subject: [PATCH] Support multi line regex in strip_ignore_text --- changedetectionio/html_tools.py | 47 ++++++++++++++----- .../tests/test_ignore_regex_text.py | 41 +++++++++++++++- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index b710077f1e5..36c8f1eee32 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -363,22 +363,41 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None # wordlist - list of regex's (str) or words (str) # Preserves all linefeeds and other whitespacing, its not the job of this to remove that def strip_ignore_text(content, wordlist, mode="content"): - i = 0 - output = [] ignore_text = [] ignore_regex = [] - ignored_line_numbers = [] + ignore_regex_multiline = [] + ignored_lines = [] for k in wordlist: # Is it a regex? 
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE) if res: - ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k))) + res = re.compile(perl_style_slash_enclosed_regex_to_options(k)) + if res.flags & re.DOTALL or res.flags & re.MULTILINE: + ignore_regex_multiline.append(res) + else: + ignore_regex.append(res) else: ignore_text.append(k.strip()) - for line in content.splitlines(keepends=True): - i += 1 + for r in ignore_regex_multiline: + for match in r.finditer(content): + content_lines = content[:match.end()].splitlines(keepends=True) + match_lines = content[match.start():match.end()].splitlines(keepends=True) + + end_line = len(content_lines) + start_line = end_line - len(match_lines) + + if end_line - start_line <= 1: + # Match is empty or in the middle of the line + ignored_lines.append(start_line) + else: + for i in range(start_line, end_line): + ignored_lines.append(i) + + line_index = 0 + lines = content.splitlines(keepends=True) + for line in lines: # Always ignore blank lines in this mode. 
(when this function gets called) got_match = False for l in ignore_text: @@ -390,17 +409,19 @@ def strip_ignore_text(content, wordlist, mode="content"): if r.search(line): got_match = True - if not got_match: - # Not ignored, and should preserve "keepends" - output.append(line) - else: - ignored_line_numbers.append(i) + if got_match: + ignored_lines.append(line_index) + + line_index += 1 + + ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)]) # Used for finding out what to highlight if mode == "line numbers": - return ignored_line_numbers + return [i + 1 for i in sorted(ignored_lines)] # sets are unordered, so sort for ascending line numbers - return ''.join(output) + output_lines = sorted(set(range(len(lines))) - ignored_lines) # sets are unordered, so sort to keep document order + return ''.join([lines[i] for i in output_lines]) def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str: from xml.sax.saxutils import escape as xml_escape diff --git a/changedetectionio/tests/test_ignore_regex_text.py b/changedetectionio/tests/test_ignore_regex_text.py index 3488318256e..dca89817f75 100644 --- a/changedetectionio/tests/test_ignore_regex_text.py +++ b/changedetectionio/tests/test_ignore_regex_text.py @@ -32,7 +32,6 @@ def test_strip_regex_text_func(): ] stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines) - assert "but 1 lines" in stripped_content assert "igNORe-cAse text" not in stripped_content assert "but 1234 lines" not in stripped_content @@ -42,6 +41,46 @@ def test_strip_regex_text_func(): # Check line number reporting stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers") assert stripped_content == [2, 5, 6, 7, 8, 10] + + stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s']) + assert "but 1 lines" not in stripped_content + assert "skip 5 lines" not in stripped_content + + stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'], mode="line numbers") + assert stripped_content == [4, 5] + + stripped_content = 
html_tools.strip_ignore_text(test_content, ['/.+/s']) + assert stripped_content == "" + + stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'], mode="line numbers") + assert stripped_content == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] + + stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m']) + assert "but 1 lines" not in stripped_content + assert "skip 5 lines" not in stripped_content + + stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'], mode="line numbers") + assert stripped_content == [4, 5] + + stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m']) + assert "but sometimes we want to remove the lines." not in stripped_content + assert "but not always." not in stripped_content + + stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'], mode="line numbers") + assert stripped_content == [2, 11] + + stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms']) + assert "but sometimes we want to remove the lines." not in stripped_content + assert "but 1 lines" not in stripped_content + assert "but 1234 lines" not in stripped_content + assert "igNORe-cAse text we dont want to keep" not in stripped_content + assert "but not always." not in stripped_content + + stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'], mode="line numbers") + assert stripped_content == [2, 3, 4, 9, 10, 11] + + stripped_content = html_tools.strip_ignore_text("\n\ntext\n\ntext\n\n", ['/^$/ms'], mode="line numbers") + assert stripped_content == [1, 2, 4, 6] # Check that linefeeds are preserved when there are is no matching ignores content = "some text\n\nand other text\n"