diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c87037..56c7dd4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -6,24 +6,24 @@ jobs: strategy: max-parallel: 3 matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -U black pytest pytest-cov - python setup.py -q install + python -m pip install --upgrade ".[dev]" - name: Style Check + if: matrix.python-version == '3.8' run: | black --check cdxj_indexer/* black --check test/* @@ -31,9 +31,10 @@ jobs: - name: Test with pytest run: | set -e - pytest -v --cov=cdxj_indexer --cov-report=xml + python -m pytest -v --cov=cdxj_indexer --cov-report=xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + if: matrix.python-version == '3.8' + uses: codecov/codecov-action@v4 with: verbose: true diff --git a/README.rst b/README.rst index 01bd9bd..b21db2a 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ CDXJ Indexer A command-line tool for generating CDXJ (and CDX) indexes from WARC and ARC files. The indexer is a new tool redesigned for fast and flexible indexing. (Based on the indexing functionality from `pywb `_) -Install with ``pip install cdxj-indexer`` or install locally with ``python setup.py install`` +Install with ``pip install cdxj-indexer`` or install locally with ``pip install .`` (or ``pip install -e ".[dev]"`` to install in editable/development mode and include all dev dependencies: black, pytest, ...) The indexer supports classic CDX index format as well as the more flexible CDXJ. 
With CDXJ, the indexer supports custom fields and ``request`` record access for WARC files. See the examples below and the command line ``-h`` option for latest features. (This is a work in progress). @@ -42,5 +42,9 @@ More advanced use cases: add additonal http headers as fields. ``http:`` prefix The CDXJ Indexer extends the ``Indexer`` functionality in `warcio `_ and should be flexible to extend. +Contributing +~~~~~~~~~~~~~~~~~~~~ +Run tests with ``python -m pytest -v --cov=cdxj_indexer --cov-report term-missing``. +To build the sdist/wheel, first install the ``build`` package with ``python -m pip install build`` and then run ``python -m build --sdist --wheel``. \ No newline at end of file diff --git a/cdxj_indexer/__init__.py b/cdxj_indexer/__init__.py index ab71815..9e3c5dd 100644 --- a/cdxj_indexer/__init__.py +++ b/cdxj_indexer/__init__.py @@ -1,3 +1,5 @@ from cdxj_indexer.main import CDXJIndexer, iter_file_or_dir from cdxj_indexer.postquery import append_method_query_from_req_resp from cdxj_indexer.bufferiter import buffering_record_iter + +__version__ = "1.5.0-dev0" diff --git a/cdxj_indexer/postquery.py b/cdxj_indexer/postquery.py index aafb110..073c090 100644 --- a/cdxj_indexer/postquery.py +++ b/cdxj_indexer/postquery.py @@ -1,3 +1,4 @@ +from multipart import MultipartParser from warcio.utils import to_native_str from urllib.parse import unquote_plus, urlencode @@ -6,12 +7,13 @@ from cdxj_indexer.amf import amf_parse import base64 -import cgi import json import sys +import re MAX_QUERY_LENGTH = 4096 + # ============================================================================ def append_method_query_from_req_resp(req, resp): len_ = req.http_headers.get_header("Content-Length") @@ -93,27 +95,17 @@ def handle_binary(query_data): query = handle_binary(query_data) elif mime.startswith("multipart/"): - env = { - "REQUEST_METHOD": "POST", - "CONTENT_TYPE": mime, - "CONTENT_LENGTH": len(query_data), - } - - args = dict(fp=BytesIO(query_data), 
environ=env, keep_blank_values=True) - - args["encoding"] = "utf-8" - - try: - data = cgi.FieldStorage(**args) - except ValueError: - # Content-Type multipart/form-data may lack "boundary" info - query = handle_binary(query_data) - else: + if boundary_match := re.search(r'boundary="?([^";\s]+)', mime, re.I): + data = MultipartParser( + stream=BytesIO(query_data), boundary=boundary_match[1], charset="utf-8" + ) values = [] - for item in data.list: + for item in data.parts(): values.append((item.name, item.value)) - query = urlencode(values, True) + else: + # Content-Type multipart/form-data may lack "boundary" info + query = handle_binary(query_data) elif mime.startswith("application/json"): try: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ccf30f3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,51 @@ +[project] +name = "cdxj_indexer" +description = "CDXJ Indexer for WARC and ARC files" +readme = "README.rst" +authors = [ + { name = "Ilya Kreymer", email = "ikreymer@gmail.com" } +] +license = { text = "Apache 2.0" } +dynamic = ["version"] +dependencies = [ + 'warcio', + 'surt', + 'py3amf', + 'multipart' +] +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Web Environment", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Utilities", +] + +[project.optional-dependencies] +lint = [ + "black", +] +test = [ + "pytest", + "pytest-cov", +] +dev = [ + "cdxj_indexer[lint]", + "cdxj_indexer[test]", +] + +[project.scripts] +cdxj-indexer = "cdxj_indexer.main:main" + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] 
+version = {attr = "cdxj_indexer.__version__"} \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100755 index 9679105..0000000 --- a/setup.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# vim: set sw=4 et: - -from setuptools import setup, find_packages -from setuptools.command.test import test as TestCommand -import glob - -__version__ = "1.4.5" - - -class PyTest(TestCommand): - def finalize_options(self): - TestCommand.finalize_options(self) - # should work with setuptools <18, 18 18.5 - self.test_suite = " " - - def run_tests(self): - import pytest - import sys - import os - - errcode = pytest.main( - [ - "--doctest-modules", - "./cdxj_indexer", - "--cov", - "cdxj_indexer", - "-v", - "test/", - ] - ) - sys.exit(errcode) - - -setup( - name="cdxj_indexer", - version=__version__, - author="Ilya Kreymer", - author_email="ikreymer@gmail.com", - license="Apache 2.0", - packages=find_packages(), - url="https://github.com/webrecorder/cdxj-indexer", - description="CDXJ Indexer for WARC and ARC files", - long_description=open("README.rst").read(), - provides=[ - "cdxj_indexer", - ], - install_requires=[ - "warcio", - "surt", - # temp fix for requests - "idna<3.0", - "py3amf", - ], - zip_safe=True, - entry_points=""" - [console_scripts] - cdxj-indexer=cdxj_indexer.main:main - """, - cmdclass={"test": PyTest}, - test_suite="", - tests_require=[ - "pytest", - "pytest-cov", - ], - classifiers=[ - "Development Status :: 4 - Beta", - "Environment :: Web Environment", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities", - ], -) diff --git a/test/test_indexer.py b/test/test_indexer.py index dee7ea0..42bd8b9 100644 --- a/test/test_indexer.py 
+++ b/test/test_indexer.py @@ -10,7 +10,6 @@ from cdxj_indexer.main import write_cdx_index, main, CDXJIndexer -import pkg_resources TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")