From 686662f0426bcc30d6765e0859d7bb90365ed49c Mon Sep 17 00:00:00 2001 From: masklinn Date: Mon, 15 Jul 2024 21:20:24 +0200 Subject: [PATCH] regex-based POC Uses ua-parser/uap-rust#3 Fixes #166 --- .github/workflows/ci.yml | 30 +++++----------- doc/conf.py | 2 ++ doc/installation.rst | 18 ++++++---- pyproject.toml | 15 +++++--- setup.py | 14 +++++--- src/ua_parser/__main__.py | 17 +++++---- src/ua_parser/regex.py | 76 +++++++++++++++++++++++++++++++++++++++ tests/test_core.py | 19 ++++++++-- tox.ini | 22 ++++++++---- 9 files changed, 160 insertions(+), 53 deletions(-) create mode 100644 src/ua_parser/regex.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c2a7957..bdac57d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,15 +2,8 @@ name: CI on: push: - branches: [ '*' ] pull_request: - branches: [ '*' ] workflow_dispatch: - schedule: - # cron is kinda random, assumes 22:00 UTC is a low ebb, eastern - # countries are very early morning, and US are mid-day to - # mid-afternoon - - cron: '0 22 * * 2' jobs: checks: @@ -88,19 +81,14 @@ jobs: - sdist - source python-version: - - "3.8" - "3.9" - "3.10" - "3.11" - "3.12" - "3.13" - - "pypy-3.8" - - "pypy-3.9" - "pypy-3.10" # - "pypy-3.11" - # don't enable graal because it's slower than even pypy and - # fails because oracle/graalpython#385 - # - "graalpy-23" + - "graalpy-24" include: - source: sdist artifact: dist/*.tar.gz @@ -119,17 +107,17 @@ jobs: - name: Install test dependencies run: | python -mpip install --upgrade pip - # cyaml is outright broken on pypy - if ! ${{ startsWith(matrix.python-version, 'pypy-') }}; then - # if binary wheels are not available for the current - # package install libyaml-dev so we can install pyyaml - # from source - if ! 
pip download --only-binary pyyaml -rrequirements_dev.txt > /dev/null 2>&1; then - sudo apt install libyaml-dev - fi + # if binary wheels are not available for the current + # package install libyaml-dev so we can install pyyaml + # from source + if ! pip download --only-binary :all: pyyaml > /dev/null 2>&1; then + sudo apt install libyaml-dev fi python -mpip install pytest pyyaml + # install rs accelerator if available, ignore if not + python -mpip install ua-parser-rs || true + # re2 is basically impossible to install from source so don't # bother, and suppress installation failure so the test does # not fail (re2 tests will just be skipped for versions / diff --git a/doc/conf.py b/doc/conf.py index f0d3838..cc07643 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,9 +19,11 @@ rst_epilog = """ .. |pyyaml| replace:: ``PyYaml`` .. |re2| replace:: ``google-re2`` +.. |regex| replace:: ``regex`` .. _pyyaml: https://pyyaml.org .. _re2: https://pypi.org/project/google-re2 +.. _regex: https://pypi.org/project/ua-parser-rs """ # -- General configuration --------------------------------------------------- diff --git a/doc/installation.rst b/doc/installation.rst index e8ca58d..d4bf7ba 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -5,11 +5,14 @@ Installation Python Version ============== -ua-parser currently supports Python 3.8 and newer, as well as recent -versions of PyPy supporting the same standards. +ua-parser currently supports CPython 3.9 and newer, recent Pypy +(supporting 3.10), and Graal 24. -.. note:: While PyPy is supported, it is not *fast*, and google-re2 is - not supported on it. +.. note:: + + While pypy and graal are supported, they are rather slow when using + pure python mode and ``[re2]`` is not supported, so using the + ``[regex]`` feature is very strongly recommended. Installation ============ @@ -21,13 +24,14 @@ Installation Optional Dependencies ===================== -ua-parser currently has two optional dependencies, |re2|_ and -|pyyaml|_. 
These dependencies will be detected and used automatically +ua-parser currently has three optional dependencies, |regex|_, |re2|_ and +|pyyaml|_. These dependencies will be detected and used automatically if installed, but can also be installed via and alongside ua-parser: .. code-block:: sh + $ pip install 'ua-parser[regex]' $ pip install 'ua-parser[re2]' $ pip install 'ua-parser[yaml]' - $ pip install 'ua-parser[re2,yaml]' + $ pip install 'ua-parser[regex,yaml]' diff --git a/pyproject.toml b/pyproject.toml index 1dae0e6..b7b0280 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,9 +7,8 @@ name = "ua-parser" description = "Python port of Browserscope's user agent parser" version = "1.0.0a1" readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [] -optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] } license = {text = "Apache 2.0"} urls = {repository = "https://github.com/ua-parser/uap-python"} @@ -35,14 +34,20 @@ classifiers = [ "Topic :: Internet :: WWW/HTTP", "Topic :: Software Development :: Libraries :: Python Modules", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy" + "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Python :: Implementation :: GraalPy", ] +[project.optional-dependencies] +yaml = ["PyYaml"] +re2 = ["google-re2"] +regex = ["ua-parser-rs"] + [tool.setuptools.packages.find] where = ["src"] @@ -63,7 +68,7 @@ known-first-party = ["ua_parser"] combine-as-imports = true [tool.mypy] -python_version = "3.8" +python_version = "3.9" files = "src,tests" # can't use strict because it's only global diff --git a/setup.py b/setup.py index 
c694778..f423348 100644 --- a/setup.py +++ b/setup.py @@ -67,16 +67,20 @@ def run(self) -> None: dest_lazy = outdir / "_lazy.py" dest_legacy = outdir / "_regexes.py" - with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open( - "wb" - ) as legacy: + with ( + dest.open("wb") as eager, + dest_lazy.open("wb") as lazy, + dest_legacy.open("wb") as legacy, + ): eager = EagerWriter(eager) lazy = LazyWriter(lazy) legacy = LegacyWriter(legacy) for section in ["user_agent_parsers", "os_parsers", "device_parsers"]: - with eager.section(section), lazy.section(section), legacy.section( - section + with ( + eager.section(section), + lazy.section(section), + legacy.section(section), ): extract = EXTRACTORS[section] for p in regexes[section]: diff --git a/src/ua_parser/__main__.py b/src/ua_parser/__main__.py index d4ff29b..c461a28 100644 --- a/src/ua_parser/__main__.py +++ b/src/ua_parser/__main__.py @@ -39,11 +39,13 @@ from .caching import Cache, Local from .loaders import load_builtins, load_yaml from .re2 import Resolver as Re2Resolver +from .regex import Resolver as RegexResolver from .user_agent_parser import Parse CACHEABLE = { "basic": True, "re2": True, + "regex": True, "legacy": False, } @@ -178,6 +180,8 @@ def get_parser( r = BasicResolver(rules) elif parser == "re2": r = Re2Resolver(rules) + elif parser == "regex": + r = RegexResolver(rules) else: sys.exit(f"unknown parser {parser!r}") @@ -327,6 +331,7 @@ def run_threaded(args: argparse.Namespace) -> None: ("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))), ("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))), ("re2", Re2Resolver(load_builtins())), + ("regex", RegexResolver(load_builtins())), ] for name, resolver in resolvers: print(f"{name:11}: ", end="", flush=True) @@ -436,14 +441,14 @@ def __call__( bench.add_argument( "--bases", nargs="+", - choices=["basic", "re2", "legacy"], - default=["basic", "re2", "legacy"], + choices=["basic", "re2", "regex", 
"legacy"], + default=["basic", "re2", "regex", "legacy"], help="""Base resolvers to benchmark. `basic` is a linear search through the regexes file, `re2` is a prefiltered regex set - implemented in C++, `legacy` is the legacy API (essentially a - basic resolver with a clearing cache of fixed 200 entries, but - less layered so usually slightly faster than an equivalent - basic-based resolver).""", + implemented in C++, `regex` is a prefiltered regex set implemented + in Rust, `legacy` is the legacy API (essentially a basic resolver + with a clearing cache of fixed 200 entries, but less layered so + usually slightly faster than an equivalent basic-based resolver).""", ) bench.add_argument( "--caches", diff --git a/src/ua_parser/regex.py b/src/ua_parser/regex.py new file mode 100644 index 0000000..704df16 --- /dev/null +++ b/src/ua_parser/regex.py @@ -0,0 +1,76 @@ +__all__ = ["Resolver"] + +from operator import attrgetter + +import ua_parser_rs # type: ignore + +from .core import ( + Device, + Domain, + Matchers, + OS, + PartialResult, + UserAgent, +) + + +class Resolver: + ua: ua_parser_rs.UserAgentExtractor + os: ua_parser_rs.OSExtractor + de: ua_parser_rs.DeviceExtractor + + def __init__(self, matchers: Matchers) -> None: + ua, os, de = matchers + self.ua = ua_parser_rs.UserAgentExtractor( + map( + attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"), + ua, + ) + ) + self.os = ua_parser_rs.OSExtractor( + map( + attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"), + os, + ) + ) + self.de = ua_parser_rs.DeviceExtractor( + map( + attrgetter("regex", "regex_flag", "family", "brand", "model"), + de, + ) + ) + + def __call__(self, ua: str, domains: Domain, /) -> PartialResult: + user_agent = os = device = None + if Domain.USER_AGENT in domains: + if m := self.ua.extract(ua): + user_agent = UserAgent( + m.family, + m.major, + m.minor, + m.patch, + m.patch_minor, + ) + if Domain.OS in domains: + if m := self.os.extract(ua): + os = OS( 
+ m.family, + m.major, + m.minor, + m.patch, + m.patch_minor, + ) + if Domain.DEVICE in domains: + if m := self.de.extract(ua): + device = Device( + m.family, + m.brand, + m.model, + ) + return PartialResult( + domains=domains, + string=ua, + user_agent=user_agent, + os=os, + device=device, + ) diff --git a/tests/test_core.py b/tests/test_core.py index 4c80126..310ddec 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -53,6 +53,19 @@ else: PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2")) +try: + from ua_parser import regex +except ImportError: + PARSERS.append( + pytest.param( + None, + id="regex", + marks=pytest.mark.skip(reason="regex parser not available"), + ) + ) +else: + PARSERS.append(pytest.param(Parser(regex.Resolver(load_builtins())), id="regex")) + UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)} @@ -64,7 +77,7 @@ CORE_DIR / "test_resources" / "firefox_user_agent_strings.yaml", CORE_DIR / "test_resources" / "pgts_browser_list.yaml", ], - ids=attrgetter("name"), + ids=attrgetter("stem"), ) def test_ua(parser, test_file): with test_file.open("rb") as f: @@ -90,7 +103,7 @@ def test_ua(parser, test_file): CORE_DIR / "tests" / "test_os.yaml", CORE_DIR / "test_resources" / "additional_os_tests.yaml", ], - ids=attrgetter("name"), + ids=attrgetter("stem"), ) def test_os(parser, test_file): with test_file.open("rb") as f: @@ -111,7 +124,7 @@ def test_os(parser, test_file): [ CORE_DIR / "tests" / "test_device.yaml", ], - ids=attrgetter("name"), + ids=attrgetter("stem"), ) def test_devices(parser, test_file): with test_file.open("rb") as f: diff --git a/tox.ini b/tox.ini index bb4af08..17dd84e 100644 --- a/tox.ini +++ b/tox.ini @@ -1,12 +1,14 @@ [tox] min_version = 4.0 -env_list = py3{8,9,10,11,12} - pypy3.{8,9,10} +env_list = py3{9,10,11,12} + pypy3.10 + #graalpy-24 flake8, black, typecheck labels = - test = py3{8,9,10,11,12},pypy3.{8,9,10} - cpy = py3{8,9,10,11,12} - pypy = pypy3.{8,9,10} + test = 
py3{9,10,11,12},pypy3.10,graalpy-24 + cpy = py3{9,10,11,12} + pypy = pypy3.10 + #graal = graalpy-24 check = flake8, black, typecheck [testenv] @@ -20,13 +22,21 @@ deps = pytest pyyaml google-re2 + ua-parser-rs commands = pytest -Werror --doctest-glob="*.rst" {posargs} -[testenv:pypy3.{8,9,10}] +[testenv:pypy3.10] deps = pytest pyyaml + ua-parser-rs + +[testenv:graalpy-24] +deps = + pytest + pyyaml + ua-parser-rs [testenv:flake8] package = skip