From 74c681b86ce6973d238adb2d5918adf9235d022b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:11:55 +0000 Subject: [PATCH 1/9] Add missing newline according to new black conventions --- cdxj_indexer/postquery.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cdxj_indexer/postquery.py b/cdxj_indexer/postquery.py index aafb110..938af01 100644 --- a/cdxj_indexer/postquery.py +++ b/cdxj_indexer/postquery.py @@ -12,6 +12,7 @@ MAX_QUERY_LENGTH = 4096 + # ============================================================================ def append_method_query_from_req_resp(req, resp): len_ = req.http_headers.get_header("Content-Length") From 05e6c529ecd004536f7488771f5b85f3e5af2475 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:12:32 +0000 Subject: [PATCH 2/9] Drop support for Python 3.7, add 3.11 and 3.12 --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3c87037..b42e83b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -6,7 +6,7 @@ jobs: strategy: max-parallel: 3 matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - name: checkout From 26fb63989b0f0d2c485e750e863b932671941ec6 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:15:47 +0000 Subject: [PATCH 3/9] Replace deprecated (in 3.12) cgi package with multipart external dependency --- cdxj_indexer/postquery.py | 29 ++++++++++------------------- setup.py | 1 + 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/cdxj_indexer/postquery.py b/cdxj_indexer/postquery.py index 938af01..073c090 100644 --- a/cdxj_indexer/postquery.py +++ b/cdxj_indexer/postquery.py @@ -1,3 +1,4 @@ +from multipart import MultipartParser from warcio.utils import to_native_str from urllib.parse import unquote_plus, urlencode @@ -6,9 +7,9 @@ from cdxj_indexer.amf import amf_parse import base64 
-import cgi import json import sys +import re MAX_QUERY_LENGTH = 4096 @@ -94,27 +95,17 @@ def handle_binary(query_data): query = handle_binary(query_data) elif mime.startswith("multipart/"): - env = { - "REQUEST_METHOD": "POST", - "CONTENT_TYPE": mime, - "CONTENT_LENGTH": len(query_data), - } - - args = dict(fp=BytesIO(query_data), environ=env, keep_blank_values=True) - - args["encoding"] = "utf-8" - - try: - data = cgi.FieldStorage(**args) - except ValueError: - # Content-Type multipart/form-data may lack "boundary" info - query = handle_binary(query_data) - else: + if boundary_match := re.match(r".*boundary=(\w*?)(?:\s|$|;).*", mime): + data = MultipartParser( + stream=BytesIO(query_data), boundary=boundary_match[1], charset="utf-8" + ) values = [] - for item in data.list: + for item in data.parts(): values.append((item.name, item.value)) - query = urlencode(values, True) + else: + # Content-Type multipart/form-data may lack "boundary" info + query = handle_binary(query_data) elif mime.startswith("application/json"): try: diff --git a/setup.py b/setup.py index 9679105..0220633 100755 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ def run_tests(self): # temp fix for requests "idna<3.0", "py3amf", + "multipart", ], zip_safe=True, entry_points=""" From 8520dcd8d91ce936c7b330898997fc2ca4b755a3 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:16:04 +0000 Subject: [PATCH 4/9] Remove idna constraint now that requests is OK --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 0220633..59d7060 100755 --- a/setup.py +++ b/setup.py @@ -48,8 +48,6 @@ def run_tests(self): install_requires=[ "warcio", "surt", - # temp fix for requests - "idna<3.0", "py3amf", "multipart", ], From f85082041ae8a9254b151c0f2bd839c3f4fe244a Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:38:40 +0000 Subject: [PATCH 5/9] Migrate from setup.py to pyproject.toml --- .github/workflows/ci.yaml | 5 +-- README.rst | 2 +- 
cdxj_indexer/__init__.py | 2 + pyproject.toml | 51 ++++++++++++++++++++++++++ setup.py | 77 --------------------------------------- 5 files changed, 56 insertions(+), 81 deletions(-) create mode 100644 pyproject.toml delete mode 100755 setup.py diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b42e83b..5d2c62a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,8 +20,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -U black pytest pytest-cov - python setup.py -q install + python -m pip install --upgrade ".[dev]" - name: Style Check run: | @@ -31,7 +30,7 @@ jobs: - name: Test with pytest run: | set -e - pytest -v --cov=cdxj_indexer --cov-report=xml + python -m pytest -v --cov=cdxj_indexer --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 diff --git a/README.rst b/README.rst index 01bd9bd..ed18e3f 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ CDXJ Indexer A command-line tool for generating CDXJ (and CDX) indexes from WARC and ARC files. The indexer is a new tool redesigned for fast and flexible indexing. (Based on the indexing functionality from `pywb `_) -Install with ``pip install cdxj-indexer`` or install locally with ``python setup.py install`` +Install with ``pip install cdxj-indexer`` or install locally with ``pip install .`` (or ``pip install -e ".[dev]"`` to install in editable/development mode and include all dev dependencies: black, pytest, ...) The indexer supports classic CDX index format as well as the more flexible CDXJ. With CDXJ, the indexer supports custom fields and ``request`` record access for WARC files. See the examples below and the command line ``-h`` option for latest features. (This is a work in progress). 
diff --git a/cdxj_indexer/__init__.py b/cdxj_indexer/__init__.py index ab71815..9e3c5dd 100644 --- a/cdxj_indexer/__init__.py +++ b/cdxj_indexer/__init__.py @@ -1,3 +1,5 @@ from cdxj_indexer.main import CDXJIndexer, iter_file_or_dir from cdxj_indexer.postquery import append_method_query_from_req_resp from cdxj_indexer.bufferiter import buffering_record_iter + +__version__ = "1.5.0-dev0" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ccf30f3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,51 @@ +[project] +name = "cdxj_indexer" +description = "CDXJ Indexer for WARC and ARC files" +readme = "README.rst" +authors = [ + { name = "Ilya Kreymer", email = "ikreymer@gmail.com" } +] +license = { text = "Apache 2.0" } +dynamic = ["version"] +dependencies = [ + 'warcio', + 'surt', + 'py3amf', + 'multipart' +] +classifiers = [ + "Development Status :: 4 - Beta", + "Environment :: Web Environment", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Utilities", +] + +[project.optional-dependencies] +lint = [ + "black", +] +test = [ + "pytest", + "pytest-cov", +] +dev = [ + "cdxj_indexer[lint]", + "cdxj_indexer[test]", +] + +[project.scripts] +cdxj-indexer = "cdxj_indexer.main:main" + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic] +version = {attr = "cdxj_indexer.__version__"} \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100755 index 59d7060..0000000 --- a/setup.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# vim: set sw=4 et: - -from setuptools import setup, find_packages -from setuptools.command.test 
import test as TestCommand -import glob - -__version__ = "1.4.5" - - -class PyTest(TestCommand): - def finalize_options(self): - TestCommand.finalize_options(self) - # should work with setuptools <18, 18 18.5 - self.test_suite = " " - - def run_tests(self): - import pytest - import sys - import os - - errcode = pytest.main( - [ - "--doctest-modules", - "./cdxj_indexer", - "--cov", - "cdxj_indexer", - "-v", - "test/", - ] - ) - sys.exit(errcode) - - -setup( - name="cdxj_indexer", - version=__version__, - author="Ilya Kreymer", - author_email="ikreymer@gmail.com", - license="Apache 2.0", - packages=find_packages(), - url="https://github.com/webrecorder/cdxj-indexer", - description="CDXJ Indexer for WARC and ARC files", - long_description=open("README.rst").read(), - provides=[ - "cdxj_indexer", - ], - install_requires=[ - "warcio", - "surt", - "py3amf", - "multipart", - ], - zip_safe=True, - entry_points=""" - [console_scripts] - cdxj-indexer=cdxj_indexer.main:main - """, - cmdclass={"test": PyTest}, - test_suite="", - tests_require=[ - "pytest", - "pytest-cov", - ], - classifiers=[ - "Development Status :: 4 - Beta", - "Environment :: Web Environment", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Utilities", - ], -) From 18c7c1ae005b5ee941ac2a400c43764d0819adbe Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:50:46 +0000 Subject: [PATCH 6/9] Add contributing instructions --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index ed18e3f..b21db2a 100644 --- a/README.rst +++ b/README.rst @@ -42,5 +42,9 @@ More advanced use cases: add additonal http headers as fields. 
``http:`` prefix The CDXJ Indexer extends the ``Indexer`` functionality in `warcio `_ and should be flexible to extend. +Contributing +~~~~~~~~~~~~~~~~~~~~ +Run tests with ``python -m pytest -v --cov=cdxj_indexer --cov-report term-missing``. +To build the sdist/wheel, first install the ``build`` package with ``python -m pip install build`` and then run ``python -m build --sdist --wheel``. \ No newline at end of file From c3eef0fc4b44eb225e2b16556e4d5819c5e060b2 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:51:12 +0000 Subject: [PATCH 7/9] Remove unused import --- test/test_indexer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_indexer.py b/test/test_indexer.py index dee7ea0..42bd8b9 100644 --- a/test/test_indexer.py +++ b/test/test_indexer.py @@ -10,7 +10,6 @@ from cdxj_indexer.main import write_cdx_index, main, CDXJIndexer -import pkg_resources TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data") From 39b9d6c9b5f890941185938379f05d35deb6efd8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:54:35 +0000 Subject: [PATCH 8/9] Upgrade github actions --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5d2c62a..c79354a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,10 +10,10 @@ jobs: steps: - name: checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -33,6 +33,6 @@ jobs: python -m pytest -v --cov=cdxj_indexer --cov-report=xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v4 with: verbose: true From 320f98c342f5ad64d33ef84525e347349048c09f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 22 May 2024 09:55:10 +0000
Subject: [PATCH 9/9] Update CI to run style checks and coverage upload only once --- .github/workflows/ci.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c79354a..56c7dd4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,6 +23,7 @@ jobs: python -m pip install --upgrade ".[dev]" - name: Style Check + if: matrix.python-version == '3.8' run: | black --check cdxj_indexer/* black --check test/* @@ -33,6 +34,7 @@ jobs: python -m pytest -v --cov=cdxj_indexer --cov-report=xml - name: Upload coverage to Codecov + if: matrix.python-version == '3.8' uses: codecov/codecov-action@v4 with: verbose: true