diff --git a/.bandit.yml b/.bandit.yml new file mode 100644 index 0000000..2237265 --- /dev/null +++ b/.bandit.yml @@ -0,0 +1,3 @@ +skips: +- B101 # assert_used, needed for mypy +exclude_dirs: ['tests'] diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..0856c03 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[report] +exclude_lines = + if TYPE_CHECKING: diff --git a/.flake8 b/.flake8 index 9ee8f89..f06e676 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,10 @@ [flake8] +extend-select = TC, TC1 ignore = +max-line-length = 88 per-file-ignores = + # F401: Imported but unused + form2request/__init__.py:F401 # D100-D104: Missing docstring docs/conf.py:D100 tests/__init__.py:D104 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d2ca22..70870f8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,7 +41,7 @@ jobs: fail-fast: false matrix: python-version: ['3.12'] - tox-job: ["pre-commit", "mypy", "docs", "twinecheck"] + tox-job: ["pre-commit", "mypy", "docs", "doctest", "twinecheck"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.gitignore b/.gitignore index b53725c..bc020ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /.coverage /coverage.xml +/dist/ +/.tox/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 663563f..6d5a2de 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,8 +17,20 @@ repos: - flake8-debugger - flake8-docstrings - flake8-string-format + - flake8-type-checking - repo: https://github.com/asottile/pyupgrade rev: v3.16.0 hooks: - id: pyupgrade args: [--py38-plus] +- repo: https://github.com/pycqa/bandit + rev: 1.7.9 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.4.2 diff --git a/README.rst b/README.rst index 586477c..a6b7f0d 100644 --- 
a/README.rst +++ b/README.rst @@ -20,8 +20,8 @@ form2request .. description starts -``form2request`` is an AI-powered Python 3.8+ library to build HTTP requests -out of HTML forms. +``form2request`` is a Python 3.8+ library to build HTTP requests out of HTML +forms. .. description ends diff --git a/dist/form2request-0.0.0.tar.gz b/dist/form2request-0.0.0.tar.gz deleted file mode 100644 index f99bed9..0000000 Binary files a/dist/form2request-0.0.0.tar.gz and /dev/null differ diff --git a/docs/api.rst b/docs/api.rst index 29c1176..18f8d4b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -2,4 +2,8 @@ API reference ============= -… +.. autofunction:: form2request.form2request + +.. autoclass:: form2request.Request + :members: + :undoc-members: diff --git a/docs/conf.py b/docs/conf.py index acc9c71..f2435e9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,11 +9,24 @@ html_theme = "sphinx_rtd_theme" -autodoc_member_order = "groupwise" - intersphinx_disabled_reftypes = [] intersphinx_mapping = { "lxml": ("https://lxml.de/apidoc/", None), "parsel": ("https://parsel.readthedocs.io/en/stable", None), "python": ("https://docs.python.org/3", None), + "scrapy": ("https://docs.scrapy.org/en/latest", None), } + +nitpick_ignore = [ + *( + ("py:class", cls) + for cls in ( + # https://github.com/sphinx-doc/sphinx/issues/11225 + "FormdataType", + "FormElement", + "HtmlElement", + "Selector", + "SelectorList", + ) + ), +] diff --git a/docs/usage.rst b/docs/usage.rst index d9237b5..08abe3a 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,4 +2,221 @@ Usage ===== -… +:ref:`Given an HTML form
`: + +.. _parsel-example: + +>>> from parsel import Selector +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +You can use :func:`~form2request.form2request` to generate form submission +request data: + +>>> from form2request import form2request +>>> req = form2request(form) +>>> req +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +:func:`~form2request.form2request` does not make requests, but you can use its +output to build requests with any HTTP client software, e.g. with the requests_ +library: + +.. _requests: https://requests.readthedocs.io/en/latest/ + +.. _requests-example: + +>>> import requests +>>> requests.request(req.method, req.url, headers=req.headers, data=req.body) # doctest: +SKIP + + +:func:`~form2request.form2request` supports :ref:`user-defined form data +<data>`, :ref:`choosing a specific submit button (or none) <click>`, and +:ref:`overriding form attributes <override>`. + + +.. _form: + +Getting a form +============== + +:func:`~form2request.form2request` requires an HTML form object. You can get +one using :doc:`parsel <parsel:index>`, as :ref:`seen above <parsel-example>`, +or you can use :doc:`lxml <lxml:index>`: + +.. _fromstring-example: + +>>> from lxml.html import fromstring +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +If you use a library or framework based on :doc:`parsel <parsel:index>` or +:doc:`lxml <lxml:index>`, chances are they also let you get a form object. For +example, when using a :doc:`Scrapy <scrapy:index>` response: + +>>> from scrapy.http import TextResponse +>>> response = TextResponse("https://example.com", body=html) +>>> form = response.css("form") + +Here are some examples of XPath expressions that can be useful to get a form +using parsel’s :meth:`Selector.xpath <parsel.selector.Selector.xpath>` or +lxml’s :meth:`HtmlElement.xpath <lxml.html.HtmlElement.xpath>`: + +- To find a form by one of its attributes, such as ``id`` or ``name``, use + ``//form[@<attribute>="<value>"]``. For example, to find ``
`, ``#`` + (e.g. ``#foo``) finds by ``id``, and ``[=""]`` (e.g. + ``[name=foo]`` or ``[name="foo bar"]``) finds by any other attribute. + +- To find a form by index, by order of appearance in the HTML code, use + ``(//form)[n]``, where ``n`` is a 1-based index. For example, to find the + 2nd form, use ``(//form)[2]``. + +If you prefer, you could use the XPath of an element inside the form, and then +visit parent elements until you reach the form element. For example: + +.. code-block:: python + + element = root.xpath('//input[@name="zip_code"]')[0] + while True: + if element.tag == "form": + break + element = element.getparent() + form = element + + +.. _data: + +Setting form data +================= + +While there are forms made entirely of hidden fields, like :ref:`the one above +`, most often you will work with forms that expect +user-defined data: + +>>> html = b"""""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +Use the ``data`` parameter of :func:`~form2request.form2request`, to define +the corresponding data: + +>>> form2request(form, {"foo": "bar"}) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +You may sometimes find forms where more than one field has the same ``name`` +attribute: + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +To specify values for all same-name fields, instead of a dictionary, use an +iterable of key-value tuples: + +>>> form2request(form, (("foo", "bar"), ("foo", "baz"))) +Request(url='https://example.com?foo=bar&foo=baz', method='GET', headers=[], body=b'') + +.. _remove-data: + +Sometimes, you might want to prevent a value from a field from being included +in the generated request data. For example, because the field is removed or +disabled through JavaScript, or because the field or a parent element has the +``disabled`` attribute (currently not supported by form2request): + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +To remove a field value, set it to ``None``: + +>>> form2request(form, {"foo": None}) +Request(url='https://example.com', method='GET', headers=[], body=b'') + + +.. _click: + +Choosing a submit button +======================== + +When an HTML form is submitted, the way form submission is triggered has an +impact on the resulting request data. + +Given a submit button with ``name`` and ``value`` attributes: + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +If you submit the form by clicking that button, those attributes are included +in the request data, which is what :func:`~form2request.form2request` does +by default: + +>>> form2request(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +However, sometimes it is possible to submit a form without clicking a submit +button, even when there is such a button. In such cases, the button data should +not be part of the request data. For such cases, set ``click`` to ``False``: + +>>> form2request(form, click=False) +Request(url='https://example.com', method='GET', headers=[], body=b'') + +You may also find forms with more than one submit button: + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +By default, :func:`~form2request.form2request` clicks the first submit button: + +>>> form2request(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +To change that, set ``click`` to the element that should be clicked: + +>>> submit_baz = form.css("[value=baz]") +>>> form2request(form, click=submit_baz) +Request(url='https://example.com?foo=baz', method='GET', headers=[], body=b'') + + +.. _override: + +Overriding form attributes +========================== + +You can override the method_ and enctype_ attributes of a form: + +.. _enctype: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/form#enctype +.. _method: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/form#method + +>>> form2request(form, method="POST", enctype="text/plain") +Request(url='https://example.com', method='POST', headers=[('Content-Type', 'text/plain')], body=b'foo=bar') + + +.. _request: + +Using request data +================== + +The output of :func:`~form2request.form2request`, +:class:`~form2request.Request`, is a simple request data container: + +>>> req = form2request(form) +>>> req +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +While :func:`~form2request.form2request` does not make requests, you can use +its output request data to build an actual request with any HTTP client +software, like the requests_ library (see an example :ref:`above +<requests-example>`) or the :doc:`Scrapy <scrapy:index>` web scraping +framework: + +.. 
_Scrapy: https://docs.scrapy.org/en/latest/
+
+>>> from scrapy import Request
+>>> Request(req.url, method=req.method, headers=req.headers, body=req.body)
+
diff --git a/form2request/__init__.py b/form2request/__init__.py
index 4908450..9868802 100644
--- a/form2request/__init__.py
+++ b/form2request/__init__.py
@@ -1 +1,3 @@
 """Build HTTP requests out of HTML forms."""
+
+from ._base import Request, form2request
diff --git a/form2request/_base.py b/form2request/_base.py
new file mode 100644
index 0000000..ceeb8f7
--- /dev/null
+++ b/form2request/_base.py
@@ -0,0 +1,256 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple, Union
+from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit
+
+from parsel import Selector, SelectorList
+from w3lib.html import strip_html5_whitespace
+
+if TYPE_CHECKING:
+    from lxml.html import FormElement  # nosec
+    from lxml.html import HtmlElement  # nosec
+
+FormdataVType = Union[str, Iterable[str]]
+FormdataKVType = Tuple[str, FormdataVType]
+FormdataType = Optional[Union[Dict[str, FormdataVType], Iterable[FormdataKVType]]]
+
+
+def _parsel_to_lxml(element: HtmlElement | Selector | SelectorList) -> HtmlElement:
+    """Return the lxml element behind *element*, unwrapping parsel objects."""
+    if isinstance(element, SelectorList):
+        element = element[0]
+    if isinstance(element, Selector):
+        element = element.root
+    return element
+
+
+def _enctype(
+    form: FormElement, click_element: HtmlElement | None, enctype: None | str
+) -> str:
+    """Return the lowercased enctype that applies to a POST submission of *form*."""
+    if enctype:
+        enctype = enctype.lower()
+        if enctype not in {"application/x-www-form-urlencoded", "text/plain"}:
+            raise ValueError(
+                f"The specified form enctype ({enctype!r}) is not supported "
+                f"for forms with the POST method."
+            )
+    elif click_element is not None and (
+        enctype := (click_element.get("formenctype") or "").lower()
+    ):
+        if enctype == "multipart/form-data":
+            raise NotImplementedError(
+                f"{click_element} has formenctype set to {enctype!r}, which "
+                f"form2request does not currently support for forms with the "
+                f"POST method."
+            )
+    elif (
+        enctype := (form.get("enctype") or "").lower()
+    ) and enctype == "multipart/form-data":
+        raise NotImplementedError(
+            f"{form} has enctype set to {enctype!r}, which form2request does "
+            f"not currently support for forms with the POST method."
+        )
+    return enctype
+
+
+def _url(form: FormElement, click_element: HtmlElement | None) -> str:
+    """Return the submission URL of *form*, resolved against its base URL."""
+    if form.base_url is None:
+        raise ValueError(f"{form} has no base_url set.")
+    action = (
+        click_element.get("formaction") if click_element is not None else None
+    ) or form.get("action")
+    if action is None:
+        return form.base_url
+    return urljoin(form.base_url, strip_html5_whitespace(action))
+
+
+# Sentinel marking a form method that was passed in by the caller.
+USER = object()
+
+
+def _method(
+    form: FormElement, click_element: HtmlElement | None, method: None | str
+) -> str:
+    """Return the HTTP method, ``GET`` or ``POST``, for submitting *form*."""
+    if method:
+        method_src = USER
+    else:
+        if click_element is not None:
+            method = click_element.get("formmethod")
+        if method:
+            method_src = click_element
+        else:
+            method = form.method
+            assert method is not None  # lxml’s form.method is always filled
+            method_src = form
+    method = method.upper()
+    if method_src is USER and method not in {"GET", "POST"}:
+        raise ValueError(f"The specified form method ({method!r}) is not supported.")
+    if method == "DIALOG":
+        if method_src is click_element:
+            raise NotImplementedError(
+                f"Found unsupported form method {method!r} in the formmethod "
+                f"attribute of the submission button."
+            )
+        raise NotImplementedError(f"Found unsupported form method {method!r}.")
+    if method not in {"GET", "POST"}:
+        method = "GET"
+    return method
+
+
+def _click_element(
+    form: FormElement, click: None | bool | HtmlElement
+) -> HtmlElement | None:
+    """Return the lxml element to click when submitting *form*, or ``None``."""
+    if click is False:
+        return None
+    if click is None or click is True:
+        clickables = list(
+            form.xpath(
+                'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
+                '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
+                namespaces={"re": "http://exslt.org/regular-expressions"},
+            )
+        )
+        if not clickables:
+            if click:
+                raise ValueError(
+                    f"No clickable elements found in form {form}. Set click=False or "
+                    f"point it to the element to be clicked."
+                )
+            else:
+                return None
+        click = clickables[0]
+    else:
+        click = _parsel_to_lxml(click)
+    return click
+
+
+def _data(
+    form: FormElement, data: FormdataType, click_element: HtmlElement | None
+) -> list[tuple[str, str]]:
+    """Return the key-value pairs to submit for *form*.
+
+    Values of named form fields are combined with user-provided *data* and
+    with the ``name`` and ``value`` of *click_element*, if any. Keys in
+    *data* replace same-name form fields, and ``None`` values are dropped.
+    """
+    # Copy *data* into a concrete container up front: it is read more than
+    # once below, which would exhaust a one-shot iterable such as a generator,
+    # and the caller's object must not be modified.
+    if isinstance(data, dict):
+        data = dict(data)
+    else:
+        data = list(data or ())
+    if click_element is not None and (name := click_element.get("name")):
+        value = click_element.get("value")
+        if isinstance(data, dict):
+            data[name] = value
+        else:
+            data.append((name, value))
+    items = list(data.items()) if isinstance(data, dict) else data
+    keys = {k for k, _ in items}
+    # Inputs of type submit, image or reset, and unchecked checkboxes and
+    # radio buttons, are filtered out by the XPath expression itself.
+    inputs = form.xpath(
+        "descendant::textarea"
+        "|descendant::select"
+        "|descendant::input[not(@type) or @type["
+        ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
+        " and (../@checked or"
+        ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
+        namespaces={"re": "http://exslt.org/regular-expressions"},
+    )
+    values: list[FormdataKVType] = [
+        (k, "" if v is None else v)
+        for k, v in ((e.name, e.value) for e in inputs)
+        if k and k not in keys
+    ]
+    values.extend((k, v) for k, v in items if v is not None)
+    # Expand multi-valued entries into one pair per value.
+    return [
+        (k, v)
+        for k, vs in values
+        for v in ([vs] if isinstance(vs, (str, bytes)) else vs)
+    ]
+
+
+@dataclass
+class Request:
+    """HTTP request data."""
+
+    url: str
+    method: str
+    headers: list[tuple[str, str]]
+    body: bytes
+
+
+def form2request(
+    form: FormElement | Selector | SelectorList,
+    data: FormdataType = None,
+    *,
+    click: None | bool | HtmlElement = None,
+    method: None | str = None,
+    enctype: None | str = None,
+) -> Request:
+    """Return request data for an HTML form submission.
+
+    *form* must be an instance of :class:`parsel.selector.Selector` or
+    :class:`parsel.selector.SelectorList` that points to an HTML form, or an
+    instance of :class:`lxml.html.FormElement`.
+
+    *data* should be either a dictionary or a list of 2-item tuples indicating
+    the key-value pairs to include in the request as submission data. Keys with
+    ``None`` as value exclude matching form fields.
+
+    *click* can be any of:
+
+    -   ``None`` (default): the first submission element of the form (e.g. a
+        submit button) is used to build a request for a click-based
+        form submission.
+
+        If no submission elements are found, the request is built for a
+        non-click-based form submission, i.e. a form submission triggered by a
+        non-click event, such as pressing the Enter key while the focus is in
+        a single-line text input field of the form.
+
+    -   ``True`` behaves like ``None``, but raises a :exc:`ValueError`
+        exception if no submission element is found in the form.
+
+    -   ``False`` builds a request for a non-click-based form submission.
+
+    -   A submit button of *form*, to build a request for a form submission
+        based on the clicking of that button.
+
+        On forms with multiple submit buttons, specifying the right button here
+        may be necessary.
+
+    *method* and *enctype* may be used to override matching form attributes.
+    """
+    form = _parsel_to_lxml(form)
+    click_element = _click_element(form, click)
+    url = _url(form, click_element)
+    method = _method(form, click_element, method)
+    headers = []
+    body = ""
+    data = _data(form, data, click_element)
+    if method == "GET":
+        url = urlunsplit(urlsplit(url)._replace(query=urlencode(data, doseq=True)))
+    else:
+        assert method == "POST"
+        enctype = _enctype(form, click_element, enctype)
+        if enctype == "text/plain":
+            headers = [("Content-Type", "text/plain")]
+            body = "\n".join(f"{k}={v}" for k, v in data)
+        else:
+            headers = [("Content-Type", "application/x-www-form-urlencoded")]
+            body = urlencode(data, doseq=True)
+    return Request(
+        url=url,
+        method=method,
+        headers=headers,
+        body=body.encode(),
+    )
diff --git a/pyproject.toml b/pyproject.toml
index a566415..8088639 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,8 @@ classifiers = [
 requires-python = ">=3.8"
 dependencies = [
     "lxml >= 4.4.1",
+    "parsel >= 1.8.1",
+    "w3lib >= 1.19.0",
 ]
 
 [project.urls]
diff --git a/tests/test_main.py b/tests/test_main.py
index e6ef5c8..82c9788 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -1,2 +1,774 @@
-def test_main():
-    assert True
+import pytest
+from lxml.html import fromstring
+from parsel import Selector
+
+from form2request import Request, form2request
+
+
+@pytest.mark.parametrize(
+    ("base_url", "html", "kwargs", "expected"),
+    (
+        # Empty form.
+        (
+            "https://example.com",
+            b"""
""", + {}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Hidden field. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data not defined by any form field. + # We need to support this, for example, to make it easy to deal with + # forms that may have fields injected with JavaScript. + ( + "https://example.com", + b"""
""", + {"data": {"a": "b"}}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data setting a value for a form field. + ( + "https://example.com", + b"""
""", + {"data": {"a": "b"}}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data overriding the value of a form field. + # Also needed for JavaScript use cases. + ( + "https://example.com", + b"""
""", + {"data": {"a": "c"}}, + Request( + "https://example.com?a=c", + "GET", + [], + b"", + ), + ), + # User data with None as value not present in the form is ignored. + ( + "https://example.com", + b"""
""", + {"data": {"a": None}}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # User data setting a value from a form field to None removes that + # value. + ( + "https://example.com", + b"""
""", + {"data": {"a": None}}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # User data overriding the value of a form field to None removes that + # value. + ( + "https://example.com", + b"""
""", + {"data": {"a": None}}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Form field with an unset value. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com?a=", + "GET", + [], + b"", + ), + ), + # User data as an iterable of key-value tuples. + ( + "https://example.com", + b"""
""", + {"data": (("a", "b"), ("a", "c"))}, + Request( + "https://example.com?a=b&a=c", + "GET", + [], + b"", + ), + ), + # A submit button is “clicked” by default, i.e. its attributes are + # taken into account. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # You can disable the clicking of any submit button. + ( + "https://example.com", + b"""
""", + {"click": False}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # You can force the clicking of the first submit button. + ( + "https://example.com", + b"""
""", + {"click": True}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # Forcing the clicking of the first submit button will trigger a + # ValueError if there are no submit buttons. + ( + "https://example.com", + b"""
""", + {"click": True}, + ValueError, + ), + # If there are 2 or more submit buttons, the first one is used by + # default. + ( + "https://example.com", + b"""
+
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # You can force a specific submit button to be used. + ( + "https://example.com", + b"""
+
""", + {"click": './/*[@value="c"]'}, + Request( + "https://example.com?a=c", + "GET", + [], + b"", + ), + ), + # Supported enctypes are application/x-www-form-urlencoded (default) + # and text/plain. Unknown enctypes are treated as the default one. + *( + ( + "https://example.com", + f"""
""".encode(), + {}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ) + for enctype in ( + "", + "application/x-www-form-urlencoded", + "text/plain", + "foo", + ) + ), + # multipart/form-data raises a NotImplementedError exception when the + # method is POST. + ( + "https://example.com", + b"""
""", + {}, + NotImplementedError, + ), + # multipart/form-data does work when method is GET (default). + ( + "https://example.com", + b"""
+
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # The formenctype from the submit button is taken into account, even if + # it has an unknown value. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com", + "POST", + [("Content-Type", "application/x-www-form-urlencoded")], + b"", + ), + ), + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com", + "POST", + [("Content-Type", "application/x-www-form-urlencoded")], + b"", + ), + ), + ( + "https://example.com", + b"""
+
""", + {}, + NotImplementedError, + ), + # enctype may be overridden, in which case it raises ValueError for + # both unknown and unsupported values when method is POST. + ( + "https://example.com", + b"""
""", + {"enctype": "multipart/form-data"}, + ValueError, + ), + ( + "https://example.com", + b"""
""", + {"enctype": "a"}, + ValueError, + ), + # Only submit buttons are detected as such. + *( + ( + "https://example.com", + f"""
{button}