From 788dbe24786eb9948eb209bfdff46b698f7be8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 1 Jul 2024 20:40:17 +0200 Subject: [PATCH 01/28] Initial implementation and docs --- .bandit.yml | 3 + .flake8 | 4 + .pre-commit-config.yaml | 12 +++ docs/api.rst | 6 +- docs/conf.py | 19 +++- docs/usage.rst | 178 ++++++++++++++++++++++++++++++++- form2request/__init__.py | 2 + form2request/_base.py | 207 +++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + tox.ini | 20 +++- 10 files changed, 444 insertions(+), 8 deletions(-) create mode 100644 .bandit.yml create mode 100644 form2request/_base.py diff --git a/.bandit.yml b/.bandit.yml new file mode 100644 index 0000000..2237265 --- /dev/null +++ b/.bandit.yml @@ -0,0 +1,3 @@ +skips: +- B101 # assert_used, needed for mypy +exclude_dirs: ['tests'] diff --git a/.flake8 b/.flake8 index 9ee8f89..f06e676 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,10 @@ [flake8] +extend-select = TC, TC1 ignore = +max-line-length = 88 per-file-ignores = + # F401: Imported but unused + form2request/__init__.py:F401 # D100-D104: Missing docstring docs/conf.py:D100 tests/__init__.py:D104 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 663563f..6d5a2de 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,8 +17,20 @@ repos: - flake8-debugger - flake8-docstrings - flake8-string-format + - flake8-type-checking - repo: https://github.com/asottile/pyupgrade rev: v3.16.0 hooks: - id: pyupgrade args: [--py38-plus] +- repo: https://github.com/pycqa/bandit + rev: 1.7.9 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.4.2 diff --git a/docs/api.rst b/docs/api.rst index 29c1176..05c1a65 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -2,4 +2,8 @@ API reference ============= -… +.. autofunction:: form2request.request_from_form + +.. 
autoclass:: form2request.Request + :members: + :undoc-members: diff --git a/docs/conf.py b/docs/conf.py index acc9c71..1440069 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,11 +9,24 @@ html_theme = "sphinx_rtd_theme" -autodoc_member_order = "groupwise" - -intersphinx_disabled_reftypes = [] +intersphinx_disabled_reftypes = [ + "lxml.etree.FormElement", +] intersphinx_mapping = { "lxml": ("https://lxml.de/apidoc/", None), "parsel": ("https://parsel.readthedocs.io/en/stable", None), "python": ("https://docs.python.org/3", None), + "scrapy": ("https://docs.scrapy.org/en/latest", None), } + +nitpick_ignore = [ + *( + ("py:class", cls) + for cls in ( + # https://github.com/sphinx-doc/sphinx/issues/11225 + "Element", + "FormdataType", + "FormElement", + ) + ), +] diff --git a/docs/usage.rst b/docs/usage.rst index d9237b5..971b802 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,4 +2,180 @@ Usage ===== -… +:ref:`Given an HTML form
`: + +.. _fromstring-example: + +>>> from lxml.html import fromstring +>>> html = b"""
""" +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +You can use :func:`~form2request.request_from_form` to generate :ref:`form +submission request data `: + +>>> from form2request import request_from_form +>>> request_from_form(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +:func:`~form2request.request_from_form` supports :ref:`user-defined form data +` and :ref:`choosing a specific form submission button (or none) +`. + + +.. _form: + +Getting a form +============== + +:func:`~form2request.request_from_form` requires an +:class:`lxml.html.FormElement` object. + +You can build one using :func:`lxml.html.fromstring` to parse an HTML document +and :meth:`lxml.html.HtmlElement.xpath` to find a form element in that +document, as :ref:`seen above `. + +Here are some examples of XPath expressions that can be useful to find a form +element using :meth:`~lxml.html.HtmlElement.xpath`: + +- To find a form by one of its attributes, such as ``id`` or ``name``, use + ``//form[@=""]``. For example, to find ``
`: + +>>> from parsel import Selector +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form")[0].root +>>> type(form) + + +A similar example, with a :doc:`Scrapy ` response: + +>>> from scrapy.http import TextResponse +>>> response = TextResponse("https://example.com", body=html) +>>> form = response.css("form")[0].root +>>> type(form) + + + +.. _data: + +Setting form data +================= + +While there are forms made entirely of hidden fields, like :ref:`the one above +`, most often you will work with forms that expect +user-defined data: + +>>> html = b"""""" +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +Use the second parameter of :func:`~form2request.request_from_form`, to define +the corresponding data: + +>>> request_from_form(form, {"foo": "bar"}) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +You may sometimes find forms where more than one field has the same ``name`` +attribute: + +>>> html = b"""
""" +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +To specify values for all same-name fields, instead of a dictionary, use an +iterable of key-value tuples: + +>>> request_from_form(form, (("foo", "bar"), ("foo", "baz"))) +Request(url='https://example.com?foo=bar&foo=baz', method='GET', headers=[], body=b'') + + +.. _click: + +Configuring form submission +=========================== + +When an HTML form is submitted, the way the submission is triggered has an +impact on the resulting request data. + +Given a submit button with ``name`` and ``value`` attributes: + +>>> html = b"""
""" +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +If you submit the form by clicking that button, those attributes are included +in the request data, which is what :func:`~form2request.request_from_form` does +by default: + +>>> request_from_form(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +However, sometimes it is possible to submit a form without clicking a submit +button, even when there is such a button. In such cases, the button data should +not be part of the request data. For such cases, set ``click`` to ``False``: + +>>> request_from_form(form, click=False) +Request(url='https://example.com', method='GET', headers=[], body=b'') + +You may also find forms with more than one submit button: + +>>> html = b"""
""" +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +By default, :func:`~form2request.request_from_form` clicks the first submission +element: + +>>> request_from_form(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +To change that, set ``click`` to the element that should be clicked: + +>>> submit_baz = form.xpath('.//*[@value="baz"]')[0] +>>> request_from_form(form, click=submit_baz) +Request(url='https://example.com?foo=baz', method='GET', headers=[], body=b'') + + +.. _request: + +Using request data +================== + +:class:`~form2request.Request` is a simple data container that you can use to +build an actual request object: + +>>> request_data = request_from_form(form) + +Here are some examples for popular Python libraries and frameworks: + +>>> from requests import Request +>>> request = Request(request_data.method, request_data.url, headers=request_data.headers, data=request_data.body) +>>> request + + + +>>> from scrapy import Request +>>> request = Request(request_data.url, method=request_data.method, headers=request_data.headers, body=request_data.body) +>>> request + diff --git a/form2request/__init__.py b/form2request/__init__.py index 4908450..c6d4a5c 100644 --- a/form2request/__init__.py +++ b/form2request/__init__.py @@ -1 +1,3 @@ """Build HTTP requests out of HTML forms.""" + +from ._base import Request, request_from_form diff --git a/form2request/_base.py b/form2request/_base.py new file mode 100644 index 0000000..cc538cc --- /dev/null +++ b/form2request/_base.py @@ -0,0 +1,207 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Tuple, Union, cast +from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit + +from w3lib.html import strip_html5_whitespace + +if TYPE_CHECKING: + from lxml.etree import Element # nosec + from lxml.html import FormElement # nosec 
+ from lxml.html import InputElement # nosec + from lxml.html import MultipleSelectOptions # nosec + from lxml.html import SelectElement # nosec + from lxml.html import TextareaElement # nosec + +FormdataVType = Union[str, Iterable[str]] +FormdataKVType = Tuple[str, FormdataVType] +FormdataType = Optional[Union[Dict[str, FormdataVType], Iterable[FormdataKVType]]] + + +def _is_listlike(x: Any) -> bool: + """Return ``True`` if *x* is a list-like object or ``False`` otherwise. + + A list-like object is an iterable, excluding strings or bytes. + """ + return hasattr(x, "__iter__") and not isinstance(x, (str, bytes)) + + +def _value( + ele: InputElement | SelectElement | TextareaElement, +) -> tuple[str | None, None | str | MultipleSelectOptions]: + n = ele.name + v = ele.value + if ele.tag == "select": + return _select_value(cast("SelectElement", ele), n, v) + return n, v + + +def _select_value( + ele: SelectElement, n: str | None, v: None | str | MultipleSelectOptions +) -> tuple[str | None, None | str | MultipleSelectOptions]: + multiple = ele.multiple + if v is None and not multiple: + # Match browser behavior on simple select tag without options selected + # And for select tags without options + o = ele.value_options + return (n, o[0]) if o else (None, None) + return n, v + + +def _url(form: FormElement) -> str: + if form.base_url is None: + raise ValueError(f"{form} has no base_url set.") + action = form.get("action") + if action is None: + return form.base_url + return urljoin(form.base_url, strip_html5_whitespace(action)) + + +def _method(form: FormElement) -> str: + method = form.method + assert method is not None + method = method.upper() + if method not in {"GET", "POST"}: + method = "GET" + return method + + +class _NoClickables(ValueError): + pass + + +def _click_data( + form: FormElement, click: None | bool | Element +) -> tuple[()] | tuple[str, str]: + if click is False: + return () + if click in {None, True}: + clickables = list( + form.xpath( + 
'descendant::input[re:test(@type, "^(submit|image)$", "i")]' + '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]', + namespaces={"re": "http://exslt.org/regular-expressions"}, + ) + ) + if not clickables: + if click: + raise _NoClickables + else: + return () + click = clickables[0] + return click.get("name"), click.get("value") + + +def _data(data: FormdataType, click_data: tuple[()] | tuple[str, str]) -> FormdataType: + data = data or {} + if click_data: + assert len(click_data) == 2 + if isinstance(data, dict): + data = dict(data) + data[click_data[0]] = click_data[1] + else: + data = list(data) + data.append(click_data) + return data + + +def _query(form: FormElement, data: FormdataType) -> str: + keys = dict(data or ()).keys() + if not data: + data = [] + inputs = form.xpath( + "descendant::textarea" + "|descendant::select" + "|descendant::input[not(@type) or @type[" + ' not(re:test(., "^(?:submit|image|reset)$", "i"))' + " and (../@checked or" + ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]', + namespaces={"re": "http://exslt.org/regular-expressions"}, + ) + values: list[FormdataKVType] = [ + (k, "" if v is None else v) + for k, v in (_value(e) for e in inputs) + if k and k not in keys + ] + items = data.items() if isinstance(data, dict) else data + values.extend((k, v) for k, v in items if v is not None) + encoded_values = [ + (k.encode(), v.encode()) + for k, vs in values + for v in (cast("Iterable[str]", vs) if _is_listlike(vs) else [cast("str", vs)]) + ] + return urlencode(encoded_values, doseq=True) + + +@dataclass +class Request: + """HTTP request data.""" + + url: str + method: str + headers: list[tuple[str, str]] + body: bytes + + +def request_from_form( + form: FormElement, + data: FormdataType = None, + /, + *, + click: None | bool | Element = None, +) -> Request: + """Return a form submission request. + + *form* should be an instance of :class:`lxml.html.FormElement`. 
+ + *data* should be either a dictionary or a list of 2-item tuples indicating + the key-value pairs to include in the request as submission data. + + *click* can be any of: + + - ``None`` (default): the first submission element of the form (e.g. a + submit button) is used to build a request for a click-based + form submission. + + If no submission elements are found, the request is built for a + non-click-based form submission, i.e. a form submission triggered by a + non-click event, such as pressing the Enter key while the focus is in + a single-line text input field of the form. + + - ``True`` behaves like ``None``, but raises a :exc:`ValueError` + exception if no submission element is found in the form. + + - ``False`` builds a request for a non-click-based form submission. + + - A submission element of *form*, to build a request for a form submission + based on the clicking of that specific element. + + On forms with multiple submission elements, specifying the right + submission element here may be necessary. + """ + url = _url(form) + method = _method(form) + try: + click_data = _click_data(form, click) + except _NoClickables: + raise ValueError( + f"No clickable elements found in form {form}. Set click=False or " + f"point it to the element to be clicked." 
+ ) + data = _data(data, click_data) + query = _query(form, data) + headers = [] + body = b"" + if method == "GET": + url = urlunsplit(urlsplit(url)._replace(query=query)) + else: + assert method == "POST" + headers = [("Content-Type", "application/x-www-form-urlencoded")] + body = query.encode() + return Request( + url=url, + method=method, + headers=headers, + body=body, + ) diff --git a/pyproject.toml b/pyproject.toml index a566415..597272c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ classifiers = [ requires-python = ">=3.8" dependencies = [ "lxml >= 4.4.1", + "w3lib >= 1.17.0", ] [project.urls] diff --git a/tox.ini b/tox.ini index 4de7873..e68601f 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = pre-commit,mypy,docs,twinecheck,min,py38,py39,py310,py311,py312 +envlist = pre-commit,mypy,docs,doctest,twinecheck,min,py38,py39,py310,py311,py312 [testenv] deps = @@ -10,14 +10,14 @@ commands = --cov-report=term-missing:skip-covered \ --cov-report=xml \ --cov=form2request \ - --doctest-glob="*.rst" --doctest-modules \ - {posargs:docs form2request tests} + {posargs:tests} [testenv:min] basepython = python3.8 deps = {[testenv]deps} lxml==4.4.1 + w3lib==1.17.0 [testenv:pre-commit] deps = @@ -41,6 +41,20 @@ setenv = commands = sphinx-build -W -n -b html . 
{envtmpdir}/html +[testenv:doctest] +deps = + {[testenv]deps} + parsel + requests + scrapy +commands = + pytest \ + --cov-report=term-missing:skip-covered \ + --cov-report=xml \ + --cov=form2request \ + --doctest-glob="*.rst" --doctest-modules \ + {posargs:docs} + [testenv:twinecheck] basepython = python3 deps = From 612dd72c54205ce15c1331d96b4d201e8224e601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 09:24:15 +0200 Subject: [PATCH 02/28] Remove the AI mention from the docs --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 586477c..a6b7f0d 100644 --- a/README.rst +++ b/README.rst @@ -20,8 +20,8 @@ form2request .. description starts -``form2request`` is an AI-powered Python 3.8+ library to build HTTP requests -out of HTML forms. +``form2request`` is a Python 3.8+ library to build HTTP requests out of HTML +forms. .. description ends From 83d02003e796f392132222df5b09e7d13fe85921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 10:19:57 +0200 Subject: [PATCH 03/28] Support formaction and formmethod, and raise NotImplementedError for enctype and formenctype with a known unsupported value --- form2request/_base.py | 45 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/form2request/_base.py b/form2request/_base.py index cc538cc..88e0ba3 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -8,6 +8,7 @@ if TYPE_CHECKING: from lxml.etree import Element # nosec + from lxml.html import HtmlElement # nosec from lxml.html import FormElement # nosec from lxml.html import InputElement # nosec from lxml.html import MultipleSelectOptions # nosec @@ -49,18 +50,18 @@ def _select_value( return n, v -def _url(form: FormElement) -> str: +def _url(form: FormElement, click_element: Optional[HtmlElement]) -> str: if form.base_url is None: raise ValueError(f"{form} has no base_url 
set.") - action = form.get("action") + action = (click_element.get("formaction") if click_element else None) or form.get("action") if action is None: return form.base_url return urljoin(form.base_url, strip_html5_whitespace(action)) -def _method(form: FormElement) -> str: - method = form.method - assert method is not None +def _method(form: FormElement, click_element: Optional[HtmlElement]) -> str: + method = (click_element.get("formmethod") if click_element else None) or form.method + assert method is not None # lxml’s form.method is always filled method = method.upper() if method not in {"GET", "POST"}: method = "GET" @@ -71,11 +72,11 @@ class _NoClickables(ValueError): pass -def _click_data( +def _click_element( form: FormElement, click: None | bool | Element -) -> tuple[()] | tuple[str, str]: +) -> Optional[HtmlElement]: if click is False: - return () + return None if click in {None, True}: clickables = list( form.xpath( @@ -88,15 +89,15 @@ def _click_data( if click: raise _NoClickables else: - return () + return None click = clickables[0] - return click.get("name"), click.get("value") + return click -def _data(data: FormdataType, click_data: tuple[()] | tuple[str, str]) -> FormdataType: +def _data(data: FormdataType, click_element: Optional[HtmlElement]) -> FormdataType: data = data or {} - if click_data: - assert len(click_data) == 2 + if click_element and (name := click_element.get("name")): + click_data = (name, click_element.get("value")) if isinstance(data, dict): data = dict(data) data[click_data[0]] = click_data[1] @@ -180,16 +181,26 @@ def request_from_form( On forms with multiple submission elements, specifying the right submission element here may be necessary. """ - url = _url(form) - method = _method(form) + if form.get("enctype") == "multipart/form-data": + raise NotImplementedError( + f"{form} has enctype set to multipart/form-data, which " + f"form2request does not currently support." 
+ ) try: - click_data = _click_data(form, click) + click_element = _click_element(form, click) except _NoClickables: raise ValueError( f"No clickable elements found in form {form}. Set click=False or " f"point it to the element to be clicked." ) - data = _data(data, click_data) + if click_element and click_element.get("formenctype") == "multipart/form-data": + raise NotImplementedError( + f"{click_element} has formenctype set to multipart/form-data, " + f"which form2request does not currently support." + ) + url = _url(form, click_element) + method = _method(form, click_element) + data = _data(data, click_element) query = _query(form, data) headers = [] body = b"" From 564f5fc00e4ca008e786b9f4184f3df89b8eed91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 10:21:03 +0200 Subject: [PATCH 04/28] docs/conf.py: remove leftover --- docs/conf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 1440069..36483ae 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,9 +9,7 @@ html_theme = "sphinx_rtd_theme" -intersphinx_disabled_reftypes = [ - "lxml.etree.FormElement", -] +intersphinx_disabled_reftypes = [] intersphinx_mapping = { "lxml": ("https://lxml.de/apidoc/", None), "parsel": ("https://parsel.readthedocs.io/en/stable", None), From d4d0b61b6c17204b71ed3b55c5d949982c517c87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 10:21:31 +0200 Subject: [PATCH 05/28] Use from None to hide internal exception --- form2request/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/form2request/_base.py b/form2request/_base.py index 88e0ba3..5974d99 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -192,7 +192,7 @@ def request_from_form( raise ValueError( f"No clickable elements found in form {form}. Set click=False or " f"point it to the element to be clicked." 
- ) + ) from None if click_element and click_element.get("formenctype") == "multipart/form-data": raise NotImplementedError( f"{click_element} has formenctype set to multipart/form-data, " From c227a4da6c95896eb224c50387c8f7791658f8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 10:36:14 +0200 Subject: [PATCH 06/28] Solve issues reported by CI --- .gitignore | 1 + dist/form2request-0.0.0.tar.gz | Bin 4457 -> 0 bytes form2request/_base.py | 5 ++--- tox.ini | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) delete mode 100644 dist/form2request-0.0.0.tar.gz diff --git a/.gitignore b/.gitignore index b53725c..eda423e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /.coverage /coverage.xml +/dist/ diff --git a/dist/form2request-0.0.0.tar.gz b/dist/form2request-0.0.0.tar.gz deleted file mode 100644 index f99bed94a6483bfd6e1f6ffaf4415a2e7b5c50d8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4457 zcmZ{kc{J4j_s10}W#89^tR=~wbx5IXkti9mWs8ZilrV|xvSdk?A$ziveaYBD$WV64 zzGj^fGh^oc`qt ztc`B*^2N?p+F|L#KO|C}W}mcr@D(llAjP6|8rE)eqDXrIfwt_Bzn+SG+42feE_e|FdG#8Y^@+2g7!P*x#D8SIK#L z|F-#Q2MS7Yr7%b?9k+aK?fGR0MREuged+uP-RbL@AK=~6UjYolc6En<&)x?Jmj~I! 
zKE5g8b?T!BR3}xdbwznl7r287C!Uo4PIQvBhy~RxXw71pWIm)x>C<&=c$ zJMP@gFP415-JMyx71b5pb*iu_`HEPHTT1GB4Lzc4RbUv=-@)W9{KBv)0;#jJ{baG* zB_M7^u+rdedzSY5v~T9$c{TGr%s$>-T~}Rvy?J(G=&AOcFiX#$+$s^(Pp;{5`te(= zWiB1pWBu6^^L}ev>JYSjge8=HHGLR)UZm}2IRd_|WA>3T@_4dktp7y=k00AB-!F}x zQA=Mvq2#*ht~Ts(D3EZ|FSYHE8n~5Lm%p9(Ea8TumH6@$wAgg;pkBw{BUPxj;hi(g`~=c~b%*5vKq%LwAVrs6v_-HsBy!R2O!&NGo6 zH=VE+NY~}Co24cPEE`8k2=QAoA^Jc1Q9MtYu5DEUo&PtTXrO@4#(o^Shehu-TWJ=X28TG%b{@a}@LX!$fx0?UB z=P0zcL7y@t_3?$s8Q1evtNo+Tvy|@qwtNzWE)PMa@a~+7I8$ zl-~A>8e5IeTe_*m)7<;+087msptW=kjMaWWS5`zYi=26&lI(hT!)-NB?+deNXZd(- zRc4ih^iIS#ncZHH1aLhdHs+l$z)n)ol@_DhWr4a?&%VY3K8hVD3f~LS&aif)b<$#b zK{}gFnbKp;Tr0lj@eRi3Qb$7^8hE*l=K^eWHlY0v%m(OCV2c1MW|6o46XJRA0Q>G7 zckvh+e?mNxh>fdqAw{?m^;Yth&1t`KC!L6{B|OE1n8vQf7n)Tt@EPZCCM)N3OJj1`l=_s-9NkmgI`l@LaERI4 zXx={*i~C?!QR)0hD84XwoYdh?$_NX5c{PLK?N-;}(JxEo>4+q?EAL#>3#;^xT~QCM z<8TYeN>ecrNRVu}>`7}(>psZwNYdlaC!sKtx$2-s$F@1GIvF1MsvK*fVpA@WVI3Z3 zjZd}LeZ*+zCgE}t3+|e?m@W9y=!-o6Xrqwv@-(L}v#HRz^XC_N6pR$Vrg-mL;u)x+)raqc=qcYRw-z1LVyw6h= z!&~3!_C6d&=CVKgP#4?Hqs(LC?K01q-0+0mZBI22^zgWXwLUHm5_EzksOP@30ee0C zGwQ;G4gG}d$EGl zucuuUVZ@X%l@MAwPnN}Foy)<6mkt$C7k3}xu?$<75u)8AU}u($Y5+UkYv?QhZmj@P zi$xm=z5$XTr1DnImP+Pj^}yQ%@CHgpJ-t4E?OqD%3eblV$lxszPDa0jSS|`$HfQH; zKb;&G1hISQLaBKIJQ3tWcFMqo4EtxLKn6Gpv>K>t6bQhxt} zEdmFqW(^JGit(SGY?65LVJoOMf^?l zsn#D`yF85%yVaiEltRSD3tqNKBvYnP^I@Ti=zD?h!+d5z#j~438H}-=IVfeb=s8AB zo)c>hO7?Kt3O5b{{C=7}IpK!`KlJVfB8wI!Q z4Pp0>5<1WR?FD*^XtS@;Q9wliNRCrD_(AnAa8Y`Z1m86!Q!p)3B7dENx+#1LXHlQD zmPDmlCM7V1n*Ff-;GXp->$+M|0ZuioHGvW~uMk`lwGYSf@rpP!_d$+s&Q^5jn?0?x z6>;#Qkxf-d%4|s4iGFWXQJhio}Tq{NyqaV!qEJ7iebJ+(@ zmRb>fwRGkhmoiUukV{f;a>K1hZg~^(U1wRsjOJ!^nvW6Ei-VPH=xfXGO6FQB`UL!q zBY(fWB4c>y^d9rmCJ^>eVTU-NCvIir)pLMc}tJvp!+x{6S!N$g~pl~Nu! 
z(rt$w$S4H!7S?L^yxcpZ6h+@|b`ozun9=E}Unf_NhOR6 zu(NnXk6_ily@H%R|GeXvHri0dOAR;iTjJ|pE)Ow7fyE=1G0^DTuNuv;Zubbuw?%$N z0?QL?6dFunGzG*^U>JZ3{#U~&=wzU<#A@BMCDsFsj_g9n)P$E=Q9Ec`W>7!1B_ZAu z*@CReY~L>O>kjEk9LJ2m52zwM@@Ps0JK*0u?S?-e1Dtcg% zjQg_=)>qLyJ`GS9dDd{LeU>l?9B|;$(ivZ>e)}p~6zEgY2nuWqT>>1Zv(pCwms}*} zKP(Fgz`>j2AE2b1r^&1!ec2V%XYr(J%m5kb2Hm5J1Bnk!xq)m}jwO!UmL&C1*Tvdc9 z`@huk@7d`9WEW6{Ii+zztBXXt0wRWk5C!^A&3@|u7DUp9=M{=p1jPB?&fAI}+R<;@ zY31Tx45o|3ekd|8#XcTxM||0fCMFQE7L;^+6o>)@&1}VwB(!}TaEJou7Jz&QOg0lJ zQ0yFu@F3KhDV8#O!9i;Qz5r?$(ErFo59+^?m4{pwumy1xbj;I?Nq|!A0?y%KusAB< zV%FxxC8jyFtwjM1*dxgrGSL6E`cG4TuOr(52(~PBb41Kb(CFaZ z2g&j{1wozWSDgUh-o-rTDF;8Ke5pWSn?c2>yfUltXv7^}d$Wq)o_x6QbLqLCqA>l= zE8D7-)^A_ZPPMK;`@i%r$rLx`IHcc|+hn|N6jd(M!XaMJp@4f~C0#m#v+Rq_(O7SMBft~w{z2HCxn&0vr2^>9wkVR4_FI)L~1=srJl;>*1 zWa7O@Kd}XGzC9?Mx@(#pau=`BU&s<@ff)DJwL0$qL?-$2lvZ1z-qHh^K2`r*n`80J z|CYdm1Kt3(07ieU1v^u$V(=pn_NquEgf?Ut{7D4;tU&$eYtAE^^@<9brGW9Amek{V z4=Ro;Jef8+HdRTLHNNL*yh^a_V!77MV(TM&Wrs(x5J%PDIB^)6l1=v5cJ8waoh_3! zM;lN*V(^ja?MfcmM z(7y_1W_B~j2q8UdeEWUmzSoDsehR&J8AIrXorI-Ex_5)^MZDPKQV0w&Z+wvLJ!JLb z`n7!J;O5orR#QVkDmnAvEw1(`Cmx&M86HPt>&^MMi+93TkK5{2Gky?R>{NGYVCI-h z|F?(gUDzQIfpB5K+<_?j^-ISH1aCtJfob!Ofa+beI3?)1+2Wzg5Vlrc53n6n=7do> z4<~)v>UFe^;gNDnrt$f~a%AYmktq?H&7bw(kfu?3K#z=fG%XO^;(wC#)<~7)p`3Ze z_=Ry+_hgq_mhokZ_|l)r@S_k{kG<{3kBptR@+;x-5VZ9JIY-3JCqZX141;>;frBJL zj=cJa8}uL6X~|SgF9*#P9`C%#zyVCj?kX6jV&LjKZJ)ZIAHbH-$O!QDG#OO*64u-L4HR$MIig#vhQV6}$fNZep{gDVnz)=z&du|FO0>mIt zUM)taSAzIofRR$!I|lIA_rblJR~okeV5e0cQWI)X?(z2*p_rq@=59aj4Sv5f;o5)0 z-b2Mk0jqYPeq#Y`mNfFz#$3A%alJJ*V!(F?iYYhJgH-9aD8FIk_ z1pjAHS~7G9&L_Wsu!anO|Fd2_`GCxADjY66{rDvJ`{Rtfmmjn}pD*TtU_%~cW*XjO GH2(v8Z(#WV diff --git a/form2request/_base.py b/form2request/_base.py index 5974d99..22b8b24 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -7,7 +7,6 @@ from w3lib.html import strip_html5_whitespace if TYPE_CHECKING: - from lxml.etree import Element # nosec from lxml.html import HtmlElement # nosec from lxml.html import FormElement # 
nosec from lxml.html import InputElement # nosec @@ -73,7 +72,7 @@ class _NoClickables(ValueError): def _click_element( - form: FormElement, click: None | bool | Element + form: FormElement, click: None | bool | HtmlElement ) -> Optional[HtmlElement]: if click is False: return None @@ -150,7 +149,7 @@ def request_from_form( data: FormdataType = None, /, *, - click: None | bool | Element = None, + click: None | bool | HtmlElement = None, ) -> Request: """Return a form submission request. diff --git a/tox.ini b/tox.ini index e68601f..2d05d07 100644 --- a/tox.ini +++ b/tox.ini @@ -28,6 +28,7 @@ commands = pre-commit run --all-files --show-diff-on-failure basepython = python3.12 deps = mypy==1.10.0 + lxml-stubs commands = mypy form2request tests @@ -58,7 +59,7 @@ commands = [testenv:twinecheck] basepython = python3 deps = - twine==5.1.0 + twine==5.1.1 build==1.2.1 commands = python -m build --sdist From 29d1f6f8503748529c24f133b40b6bf78a184cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 10:44:09 +0200 Subject: [PATCH 07/28] Solve additional CI issues --- docs/conf.py | 2 +- form2request/_base.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 36483ae..8ab46e1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,9 +22,9 @@ ("py:class", cls) for cls in ( # https://github.com/sphinx-doc/sphinx/issues/11225 - "Element", "FormdataType", "FormElement", + "HtmlElement", ) ), ] diff --git a/form2request/_base.py b/form2request/_base.py index 22b8b24..b5f2f53 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -7,8 +7,8 @@ from w3lib.html import strip_html5_whitespace if TYPE_CHECKING: - from lxml.html import HtmlElement # nosec from lxml.html import FormElement # nosec + from lxml.html import HtmlElement # nosec from lxml.html import InputElement # nosec from lxml.html import MultipleSelectOptions # nosec from lxml.html import SelectElement # nosec @@ 
-49,16 +49,18 @@ def _select_value( return n, v -def _url(form: FormElement, click_element: Optional[HtmlElement]) -> str: +def _url(form: FormElement, click_element: HtmlElement | None) -> str: if form.base_url is None: raise ValueError(f"{form} has no base_url set.") - action = (click_element.get("formaction") if click_element else None) or form.get("action") + action = (click_element.get("formaction") if click_element else None) or form.get( + "action" + ) if action is None: return form.base_url return urljoin(form.base_url, strip_html5_whitespace(action)) -def _method(form: FormElement, click_element: Optional[HtmlElement]) -> str: +def _method(form: FormElement, click_element: HtmlElement | None) -> str: method = (click_element.get("formmethod") if click_element else None) or form.method assert method is not None # lxml’s form.method is always filled method = method.upper() @@ -73,7 +75,7 @@ class _NoClickables(ValueError): def _click_element( form: FormElement, click: None | bool | HtmlElement -) -> Optional[HtmlElement]: +) -> HtmlElement | None: if click is False: return None if click in {None, True}: @@ -93,7 +95,7 @@ def _click_element( return click -def _data(data: FormdataType, click_element: Optional[HtmlElement]) -> FormdataType: +def _data(data: FormdataType, click_element: HtmlElement | None) -> FormdataType: data = data or {} if click_element and (name := click_element.get("name")): click_data = (name, click_element.get("value")) From ec0a7c9681ee811a57a038fb9a8dad0d5ed8b15d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 10:47:22 +0200 Subject: [PATCH 08/28] Add doctest to GitHub Actions --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d2ca22..70870f8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,7 +41,7 @@ jobs: fail-fast: false matrix: python-version: ['3.12'] - 
tox-job: ["pre-commit", "mypy", "docs", "twinecheck"] + tox-job: ["pre-commit", "mypy", "docs", "doctest", "twinecheck"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} From 3475f1781b69d18375c21d8daae0ee4348fae090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 2 Jul 2024 14:30:09 +0200 Subject: [PATCH 09/28] Complete test coverage --- .coveragerc | 3 + docs/usage.rst | 14 + form2request/_base.py | 76 +++-- tests/test_main.py | 626 +++++++++++++++++++++++++++++++++++++++++- tox.ini | 3 - 5 files changed, 673 insertions(+), 49 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..0856c03 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[report] +exclude_lines = + if TYPE_CHECKING: diff --git a/docs/usage.rst b/docs/usage.rst index 971b802..ed030a2 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -109,6 +109,20 @@ iterable of key-value tuples: >>> request_from_form(form, (("foo", "bar"), ("foo", "baz"))) Request(url='https://example.com?foo=bar&foo=baz', method='GET', headers=[], body=b'') +Sometimes, you might want to prevent a value from a field from being included +in the generated request data. For example, because the field is removed or +disabled through JavaScript, or because the field or a parent element has the +``disabled`` attribute (currently not supported by form2request): + +>>> html = b"""
""" +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +To remove a field value, set it to ``None``: + +>>> request_from_form(form, {"foo": None}) +Request(url='https://example.com', method='GET', headers=[], body=b'') + .. _click: diff --git a/form2request/_base.py b/form2request/_base.py index b5f2f53..95bd402 100644 --- a/form2request/_base.py +++ b/form2request/_base.py @@ -27,46 +27,33 @@ def _is_listlike(x: Any) -> bool: return hasattr(x, "__iter__") and not isinstance(x, (str, bytes)) -def _value( - ele: InputElement | SelectElement | TextareaElement, -) -> tuple[str | None, None | str | MultipleSelectOptions]: - n = ele.name - v = ele.value - if ele.tag == "select": - return _select_value(cast("SelectElement", ele), n, v) - return n, v - - -def _select_value( - ele: SelectElement, n: str | None, v: None | str | MultipleSelectOptions -) -> tuple[str | None, None | str | MultipleSelectOptions]: - multiple = ele.multiple - if v is None and not multiple: - # Match browser behavior on simple select tag without options selected - # And for select tags without options - o = ele.value_options - return (n, o[0]) if o else (None, None) - return n, v - - -def _url(form: FormElement, click_element: HtmlElement | None) -> str: +def _url(form: FormElement, click_element: Optional[HtmlElement]) -> str: if form.base_url is None: raise ValueError(f"{form} has no base_url set.") - action = (click_element.get("formaction") if click_element else None) or form.get( - "action" - ) + action = (click_element.get("formaction") if click_element is not None else None) or form.get("action") if action is None: return form.base_url return urljoin(form.base_url, strip_html5_whitespace(action)) -def _method(form: FormElement, click_element: HtmlElement | None) -> str: - method = (click_element.get("formmethod") if click_element else None) or form.method +def _method(form: FormElement, click_element: Optional[HtmlElement]) -> str: + method = 
None + if click_element is not None: + method = click_element.get("formmethod") + if method: + method_src = click_element + else: + method = form.method + method_src = form assert method is not None # lxml’s form.method is always filled - method = method.upper() - if method not in {"GET", "POST"}: - method = "GET" - return method + upper_method = method.upper() + if upper_method not in {"GET", "POST"}: + attribute = "formmethod" if method_src is click_element else "method" + raise NotImplementedError( + f"form2request does not support the {attribute} attribute of " + f"{method_src}: {method!r}" + ) + return upper_method class _NoClickables(ValueError): @@ -75,7 +62,7 @@ class _NoClickables(ValueError): def _click_element( form: FormElement, click: None | bool | HtmlElement -) -> HtmlElement | None: +) -> Optional[HtmlElement]: if click is False: return None if click in {None, True}: @@ -95,9 +82,9 @@ def _click_element( return click -def _data(data: FormdataType, click_element: HtmlElement | None) -> FormdataType: +def _data(data: FormdataType, click_element: Optional[HtmlElement]) -> FormdataType: data = data or {} - if click_element and (name := click_element.get("name")): + if click_element is not None and (name := click_element.get("name")): click_data = (name, click_element.get("value")) if isinstance(data, dict): data = dict(data) @@ -123,7 +110,7 @@ def _query(form: FormElement, data: FormdataType) -> str: ) values: list[FormdataKVType] = [ (k, "" if v is None else v) - for k, v in (_value(e) for e in inputs) + for k, v in ((e.name, e.value) for e in inputs) if k and k not in keys ] items = data.items() if isinstance(data, dict) else data @@ -182,11 +169,6 @@ def request_from_form( On forms with multiple submission elements, specifying the right submission element here may be necessary. 
""" - if form.get("enctype") == "multipart/form-data": - raise NotImplementedError( - f"{form} has enctype set to multipart/form-data, which " - f"form2request does not currently support." - ) try: click_element = _click_element(form, click) except _NoClickables: @@ -194,10 +176,16 @@ def request_from_form( f"No clickable elements found in form {form}. Set click=False or " f"point it to the element to be clicked." ) from None - if click_element and click_element.get("formenctype") == "multipart/form-data": + if click_element is not None and (enctype := click_element.get("formenctype")): + if enctype != "application/x-www-form-urlencoded": + raise NotImplementedError( + f"{click_element} has formenctype set to {enctype!r}, which " + f"form2request does not currently support." + ) + elif (enctype := form.get("enctype")) and enctype != "application/x-www-form-urlencoded": raise NotImplementedError( - f"{click_element} has formenctype set to multipart/form-data, " - f"which form2request does not currently support." + f"{form} has enctype set to {enctype!r}, which form2request does " + f"not currently support." ) url = _url(form, click_element) method = _method(form, click_element) diff --git a/tests/test_main.py b/tests/test_main.py index e6ef5c8..83a1e46 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,2 +1,624 @@ -def test_main(): - assert True +import pytest +from lxml.html import fromstring + +from form2request import Request, request_from_form + +@pytest.mark.parametrize( + ("base_url", "html", "data", "click", "expected"), + ( + # Empty form. + ( + "https://example.com", + b"""
""", + None, + None, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Hidden field. + ( + "https://example.com", + b"""
""", + None, + None, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data not defined by any form field. + # We need to support this, for example, to make it easy to deal with + # forms that may have fields injected with JavaScript. + ( + "https://example.com", + b"""
""", + {"a": "b"}, + None, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data setting a value for a form field. + ( + "https://example.com", + b"""
""", + {"a": "b"}, + None, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data overriding the value of a form field. + # Also needed for JavaScript use cases. + ( + "https://example.com", + b"""
""", + {"a": "c"}, + None, + Request( + "https://example.com?a=c", + "GET", + [], + b"", + ), + ), + # User data with None as value not present in the form is ignored. + ( + "https://example.com", + b"""
""", + {"a": None}, + None, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # User data setting a value from a form field to None removes that + # value. + ( + "https://example.com", + b"""
""", + {"a": None}, + None, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # User data overriding the value of a form field to None removes that + # value. + ( + "https://example.com", + b"""
""", + {"a": None}, + None, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Form field with an unset value. + ( + "https://example.com", + b"""
""", + None, + None, + Request( + "https://example.com?a=", + "GET", + [], + b"", + ), + ), + # User data as an iterable of key-value tuples. + ( + "https://example.com", + b"""
""", + (("a", "b"), ("a", "c")), + None, + Request( + "https://example.com?a=b&a=c", + "GET", + [], + b"", + ), + ), + # A submit button is “clicked” by default, i.e. its attributes are + # taken into account. + ( + "https://example.com", + b"""
""", + None, + None, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # You can disable the clicking of any submit button. + ( + "https://example.com", + b"""
""", + None, + False, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # You can force the clicking of the first submit button. + ( + "https://example.com", + b"""
""", + None, + True, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # Forcing the clicking of the first submit button will trigger a + # ValueError if there are no submit buttons. + ( + "https://example.com", + b"""
""", + None, + True, + ValueError, + ), + # If there are 2 or more submit buttons, the first one is used by + # default. + ( + "https://example.com", + b"""
""", + None, + None, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # You can force a specific submit button to be used. + ( + "https://example.com", + b"""
""", + None, + './/*[@value="c"]', + Request( + "https://example.com?a=c", + "GET", + [], + b"", + ), + ), + # Only the application/x-www-form-urlencoded enctype (default) is + # supported. + *( + ( + "https://example.com", + f"""
""".encode(), + None, + None, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ) + for enctype in ( + "", + "application/x-www-form-urlencoded", + ) + ), + # Any other value raises a NotImplementedError exception. + *( + ( + "https://example.com", + f"""
""".encode(), + None, + None, + NotImplementedError, + ) + for enctype in ( + "multipart/form-data", + "text/plain", + "foo", + ) + ), + # The formenctype from the submit button is taken into account. + ( + "https://example.com", + b"""
""", + None, + None, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Even if the form has an unsupported enctype, things work if the + # submit button sets a supported one. + ( + "https://example.com", + b"""
""", + None, + None, + NotImplementedError, + ), + # Only submit buttons are detected as such. + *( + ( + "https://example.com", + f"""
{button}