Skip to content

Commit

Permalink
Improve adblock decoder.
Browse files Browse the repository at this point in the history
Before this patch, the following cases were not decoded:

  * |http://example.com/*
  * |http://example.org^

This patch fixes PyFunceble/adblock-decoder#3.

Contributors:
  * @smed79
  • Loading branch information
funilrys committed May 1, 2023
1 parent 49b32da commit d32914b
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 4 deletions.
85 changes: 82 additions & 3 deletions PyFunceble/converter/adblock_input_line2subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
limitations under the License.
"""

from typing import Any, List, Optional, Set, Union
from typing import Any, List, Optional, Set, Tuple, Union

from PyFunceble.converter.base import ConverterBase
from PyFunceble.converter.url2netloc import Url2Netloc
Expand All @@ -68,13 +68,15 @@ class AdblockInputLine2Subject(ConverterBase):
_aggressive: bool = False

_regex_helper: Optional[RegexHelper] = None
url2netloc: Optional[Url2Netloc] = None

def __init__(
self,
data_to_convert: Optional[Any] = None,
aggressive: bool = False,
*,
regex_helper: Optional[RegexHelper] = None,
url2netloc: Optional[Url2Netloc] = None,
) -> None:
if aggressive is not None:
self.aggressive = aggressive
Expand All @@ -84,6 +86,11 @@ def __init__(
else:
self._regex_helper = regex_helper

if url2netloc is None:
self.url2netloc = Url2Netloc()
else:
self.url2netloc = url2netloc

super().__init__(data_to_convert=data_to_convert)

@ConverterBase.data_to_convert.setter
Expand Down Expand Up @@ -144,8 +151,7 @@ def should_be_ignored(line: str) -> bool:

return any(line.startswith(x) for x in starting_chars)

@staticmethod
def extract_base(subject: Union[str, List[str]]) -> Union[str, List[str]]:
def extract_base(self, subject: Union[str, List[str]]) -> Union[str, List[str]]:
"""
Extracts the base of the given subject (supposedly a URL).
Expand All @@ -160,10 +166,38 @@ def extract_base(subject: Union[str, List[str]]) -> Union[str, List[str]]:
subject = subject.replace("*", "").replace("~", "")

try:
# TODO: Fix this.
return Url2Netloc(subject).get_converted()
except ValueError:
return subject

def split_seprators(self, line: str) -> Tuple[Set[str], Set[str]]:
    """
    Splits the given line around the known separators, providing the
    2 possible parts: the targets (what precedes a separator) and the
    options (what follows it).

    Every known separator found in the line is applied independently,
    each time at its last occurrence. When no separator is found, both
    sets are empty.

    :param line:
        The line to split.

    :return:
        A tuple :code:`(targets, options)`, both being sets of strings.

    Example:

        Given: :code:`"||example.com$script,domain=example.org"` returns
        :code:`({"||example.com"}, {"script,domain=example.org"})`
    """

    # The bare "$" is also the middle character of "#$#".
    separators = ("##", "#?#", "#@#", "#$#", "$")

    targets: Set[str] = set()
    options: Set[str] = set()

    for separator in separators:
        # Same split point as line.rsplit(separator, 1): the last occurrence.
        position = line.rfind(separator)

        if position < 0:
            continue

        if (
            separator == "$"
            and position > 0
            and line[position - 1 : position + 2] == "#$#"
        ):
            # This "$" belongs to a "#$#" separator which was already
            # handled; splitting on it again would produce a bogus
            # target ending with "#" and an option starting with "#".
            continue

        targets.add(line[:position])
        options.add(line[position + len(separator) :])

    return targets, options

def _decode_multiple_subject(self, decoded: str) -> Set[str]:
"""
Implementation of the decoding of the case that multiple
Expand Down Expand Up @@ -427,6 +461,50 @@ def _decode_v6(self, line: str, *, aggressive: bool = False) -> Set[str]:

return {x for x in result if "." in x}

def _decode_v7(self, line: str, *, aggressive: bool = False) -> Set[str]:
    """
    Implementation of our seventh decoding mode.

    In this mode we try to decode the explicit URL:

        |http://example.org/.*
        |https://example.org/.*

    Only lines starting with exactly one :code:`|` and not ending with
    :code:`|` are handled; anything else produces an empty set.

    :param line:
        The line to decode.
    :param aggressive:
        When :code:`True`, the part following a separator is split on
        commas and handed to :code:`_decode_options` too.

    :return:
        The decoded subjects. Only entries containing a :code:`.` are kept.
    """

    local_line = line.strip()
    result = set()

    # All guards look at the stripped line, so that surrounding whitespace
    # can't hide the leading "|" anchor.
    if (
        not local_line.startswith("|")
        or local_line.startswith("||")
        or local_line.endswith("|")
    ):
        return result

    # Drop the leading "|" anchor, then any trailing "^" separator marker.
    local_line = local_line[1:].rstrip("^")

    targets, options = self.split_seprators(local_line)

    for target in targets:
        result.update(self._decode_multiple_subject(target))

    if aggressive:
        for option in options:
            result.update(self._decode_options(option.split(",")))

    if not options:
        # No separator was found: try to decode the whole line as-is.
        result.update(self._decode_multiple_subject(local_line))

    return {x for x in result if "." in x}

def get_converted(self) -> List[str]:
"""
Provides the converted data.
Expand All @@ -452,6 +530,7 @@ def convert(self, data: Any, *, aggressive: bool = False) -> List[str]:
result.update(self._decode_v3(data, aggressive=aggressive))
result.update(self._decode_v5(data, aggressive=aggressive))
result.update(self._decode_v6(data, aggressive=aggressive))
result.update(self._decode_v7(data, aggressive=aggressive))

result.update(self._decode_v4(data, aggressive=aggressive))

Expand Down
58 changes: 57 additions & 1 deletion tests/converter/test_adblock_input_line2subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from typing import List

from PyFunceble.converter.adblock_input_line2subject import AdblockInputLine2Subject
from PyFunceble.converter.url2netloc import Url2Netloc
from PyFunceble.helpers.regex import RegexHelper


Expand Down Expand Up @@ -274,6 +275,55 @@ class TestAdblockInputLine2Subject(unittest.TestCase):
"aggressive": ["example.com", "example.net", "example.org"],
},
},
{
"subject": "|http://example.org/hello-world^$scripts,image",
"expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
},
{
"subject": "|http://example.org/*",
"expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
},
{
"subject": "|http://example.org^",
"expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
},
{
"subject": "|http://example.org",
"expected": {"aggressive": ["example.org"], "standard": ["example.org"]},
},
{
"subject": "|https://example.org/^$domain=example.com",
"expected": {
"aggressive": ["example.com", "example.org"],
"standard": ["example.org"],
},
},
{
"subject": "|ftp://example.org$domain=example.com|example.net",
"expected": {
"aggressive": ["example.com", "example.net", "example.org"],
"standard": ["example.org"],
},
},
{
"subject": "|http://example.com$script,image,domain=example.org|foo.example.net",
"expected": {
"aggressive": ["example.com", "example.org", "foo.example.net"],
"standard": ["example.com"],
},
},
{
"subject": "|http://example.com,https://example.de$script,image,domain=example.org|foo.example.net",
"expected": {
"aggressive": [
"example.com",
"example.de",
"example.org",
"foo.example.net",
],
"standard": ["example.com", "example.de"],
},
},
]

def setUp(self) -> None:
Expand All @@ -296,12 +346,18 @@ def test_init_with_helper(self) -> None:
"""

regex_helper = RegexHelper()
self.converter = AdblockInputLine2Subject(regex_helper=regex_helper)
url2netloc = Url2Netloc()
self.converter = AdblockInputLine2Subject(
regex_helper=regex_helper, url2netloc=url2netloc
)

# pylint: disable=protected-access
self.assertIsInstance(self.converter._regex_helper, RegexHelper)
self.assertEqual(id(regex_helper), id(self.converter._regex_helper))

self.assertIsInstance(self.converter.url2netloc, Url2Netloc)
self.assertEqual(id(url2netloc), id(self.converter.url2netloc))

def test_set_data_to_convert_no_string(self) -> None:
"""
Tests the method which let us set the data to work with for the case
Expand Down

0 comments on commit d32914b

Please sign in to comment.