diff --git a/.github/workflows/Publish.yaml b/.github/workflows/Publish.yaml index 99f3299c..18ae2760 100644 --- a/.github/workflows/Publish.yaml +++ b/.github/workflows/Publish.yaml @@ -19,6 +19,17 @@ jobs: python-version-file: pyproject.toml architecture: x64 + - name: Generate fuzzy rules + run: python rules/generate_rules.py + + - name: Build Javascript wombatSetup.js + uses: addnab/docker-run-action@v3 + with: + image: node:20-bookworm + options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh + run: | + /src/build_js.sh + - name: Build packages run: | pip install -U pip build diff --git a/.github/workflows/PublishDockerDevImage.yaml b/.github/workflows/PublishDockerDevImage.yaml index 7cb32def..73503d05 100644 --- a/.github/workflows/PublishDockerDevImage.yaml +++ b/.github/workflows/PublishDockerDevImage.yaml @@ -12,6 +12,14 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Build Javascript wombatSetup.js + uses: addnab/docker-run-action@v3 + with: + image: node:20-bookworm + options: -v ${{ github.workspace }}/src/warc2zim/statics:/output -v ${{ github.workspace }}/rules:/src/rules -v ${{ github.workspace }}/javascript:/src/javascript -v ${{ github.workspace }}/build_js.sh:/src/build_js.sh + run: | + /src/build_js.sh + - name: Build and push Docker image uses: openzim/docker-publish-action@v10 with: diff --git a/.github/workflows/QA.yaml b/.github/workflows/QA.yaml index 48ccee5a..2a1ac3cb 100644 --- a/.github/workflows/QA.yaml +++ b/.github/workflows/QA.yaml @@ -24,6 +24,9 @@ jobs: pip install -U pip pip install -e .[lint,scripts,test,check] + - name: Generate fuzzy rules + run: python rules/generate_rules.py + - name: Check black formatting run: inv lint-black @@ -32,3 +35,20 @@ jobs: - name: Check pyright run: inv check-pyright + + - name: Set up Node.JS + uses: actions/setup-node@v4 + with: 
+ node-version: 20 + + - name: Install JS dependencies + working-directory: javascript + run: yarn install + + - name: Check prettier formatting + working-directory: javascript + run: yarn prettier-check + + - name: Check eslint rules + working-directory: javascript + run: yarn eslint diff --git a/.github/workflows/Tests.yaml b/.github/workflows/Tests.yaml index 838269fb..163d9c0d 100644 --- a/.github/workflows/Tests.yaml +++ b/.github/workflows/Tests.yaml @@ -24,6 +24,9 @@ jobs: pip install -U pip pip install -e .[test,scripts] + - name: Generate fuzzy rules + run: python rules/generate_rules.py + - name: Run the tests run: inv coverage --args "-vvv" @@ -32,6 +35,19 @@ jobs: with: token: ${{ secrets.CODECOV_TOKEN }} + - name: Set up Node.JS + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Install JS dependencies + working-directory: javascript + run: yarn install + + - name: Run JS tests + working-directory: javascript + run: yarn test + build_python: runs-on: ubuntu-22.04 steps: @@ -43,6 +59,14 @@ jobs: python-version-file: pyproject.toml architecture: x64 + - name: Install dependencies (and project) + run: | + pip install -U pip build + pip install -e . 
+ + - name: Generate fuzzy rules + run: python rules/generate_rules.py + - name: Ensure we can build Python targets run: | pip install -U pip build diff --git a/.gitignore b/.gitignore index f6522214..e5de7a75 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,133 @@ -# Created by https://www.toptal.com/developers/gitignore/api/python,macos -# Edit at https://www.toptal.com/developers/gitignore?templates=python,macos +# Created by https://www.toptal.com/developers/gitignore/api/linux,macos,python,node,visualstudiocode,intellij +# Edit at https://www.toptal.com/developers/gitignore?templates=linux,macos,python,node,visualstudiocode,intellij + +### Intellij ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### Intellij Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file 
is removed but is still being accessed +.nfs* ### macOS ### # General @@ -34,6 +162,146 @@ Temporary Items # iCloud generated files *.icloud +### Node ### +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# Snowpack dependency directory (https://snowpack.dev/) +web_modules/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional stylelint cache +.stylelintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variable files +.env +.env.development.local +.env.test.local +.env.production.local +.env.local + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# Next.js build output +.next +out + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# vuepress v2.x temp and cache directory +.temp + +# Docusaurus cache and generated files 
+.docusaurus + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + +# yarn v2 +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + +### Node Patch ### +# Serverless Webpack directories +.webpack/ + +# Optional stylelint cache + +# SvelteKit build / generate output +.svelte-kit + ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ @@ -79,7 +347,6 @@ htmlcov/ .nox/ .coverage .coverage.* -.cache nosetests.xml coverage.xml *.cover @@ -93,7 +360,6 @@ cover/ *.pot # Django stuff: -*.log local_settings.py db.sqlite3 db.sqlite3-journal @@ -157,7 +423,6 @@ celerybeat.pid *.sage.py # Environments -.env .venv env/ venv/ @@ -206,14 +471,40 @@ poetry.toml # LSP config files pyrightconfig.json -# End of https://www.toptal.com/developers/gitignore/api/python,macos +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +# End of https://www.toptal.com/developers/gitignore/api/linux,macos,python,node,visualstudiocode,intellij # ignore all vscode, this is not standard configuration in this place .vscode # installed at build time -wombat.js +src/warc2zim/statics/wombat.js -# temporary directories using during development +# temporary directories used during development output -tmp \ No newline at end of file +tmp + +# rule files are generated by rules/generate_rules.py +src/warc2zim/rules.py +javascript/src/fuzzyRules.js + +# wombatSetup.js is generated with rollup +src/warc2zim/statics/wombatSetup.js diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
b93b4a92..8d465d48 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,3 +25,17 @@ repos: 'types_or': [python, pyi] require_serial: true minimum_pre_commit_version: '2.9.2' +- repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 + hooks: + - id: prettier + files: javascript\/.*$ # files in javascript folder + args: + - --config + - javascript/.prettierrc.json +- repo: https://github.com/pre-commit/mirrors-eslint + rev: v8.51.0 + hooks: + - id: eslint + types: [file] + files: javascript\/src\/.*(?:\.[jt]sx?|\.vue)$ # *.js, *.jsx, *.ts, *.tsx, *.vue in javascript/src folder diff --git a/Dockerfile b/Dockerfile index 6a7091c7..f265b3ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,12 +13,14 @@ WORKDIR /output # Copy pyproject.toml and its dependencies COPY pyproject.toml openzim.toml README.md /src/ +COPY rules/generate_rules.py /src/rules/generate_rules.py COPY src/warc2zim/__about__.py /src/src/warc2zim/__about__.py # Install Python dependencies RUN pip install --no-cache-dir /src # Copy code + associated artifacts +COPY rules /src/rules COPY src /src/src COPY *.md /src/ diff --git a/README.md b/README.md index 83827c87..81cd00fb 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,11 @@ We have documentation about the [functional architecture](docs/functional_archit ## Contributing +Requirements: +- proper Python version (see pyproject.toml) with pip +- optionally Docker +- optionally Node LTS version (20 recommended) + First, clone this repository. If you do not already have it on your system, install hatch to build the software and manage virtual environments (you might be interested by our detailed [Developer Setup](https://github.com/openzim/_python-bootstrap/wiki/Developer-Setup) as well). @@ -96,6 +101,27 @@ Start a hatch shell: this will install software including dependencies in an iso hatch shell ``` +### Regenerate wombatSetup.js + +wombatSetup.js is the JS code used to setup wombat when the ZIM is used. 
+ +It is normally retrieved by Python build process (see openzim.toml for details). + +Recommended solution to develop this JS code is to install Node.JS on your system, and then + +```bash +cd javascript +yarn build-dev # or yarn build-prod +``` + +Should you want to regenerate this code without installing Node.JS, you can simply run the following command. + +```bash +docker run -v $PWD/src/warc2zim/statics:/output -v $PWD/rules:/src/rules -v $PWD/javascript:/src/javascript -v $PWD/build_js.sh:/src/build_js.sh -it --rm --entrypoint /src/build_js.sh node:20-bookworm +``` + +It will install Python3 on-top of Node.JS in a Docker container, generate JS fuzzy rules and bundle JS code straight to `src/warc2zim/statics/wombatSetup.js` where the file is expected to be placed. + ## License [GPLv3](https://www.gnu.org/licenses/gpl-3.0) or later, see diff --git a/build_js.sh b/build_js.sh new file mode 100755 index 00000000..67e4e235 --- /dev/null +++ b/build_js.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Custom script to install Python on top of a Docker Node-JS image, then install +# required Python deps, generate fuzzy rules, and finally bundle JS script + +apt-get update -y + +apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv + +rm -rf /var/lib/apt/lists/* + +python3 -m venv /local + +/local/bin/python -m pip install --no-cache-dir -U \ + pip \ + jinja2==3.1.3 + +/local/bin/python /src/rules/generate_rules.py + +cd /src/javascript + +OUTPUT_DIR=/output yarn build-prod diff --git a/docs/software_architecture.md b/docs/software_architecture.md index 96fd326a..393a622c 100644 --- a/docs/software_architecture.md +++ b/docs/software_architecture.md @@ -22,6 +22,8 @@ Static JS rewriting is simply a matter of pure textual manipulation with regular Dynamic JS rewriting is done with [wombat JS library](https://github.com/webrecorder/wombat). The same fuzzy rules that are used for static rewritting are injected into wombat configuration. 
Code to rewrite URLs is an adapted version of the code used to compute ZIM paths. +For wombat setup, including the URL rewriting part, we need to pass wombat configuration info. This code is developed in the `javascript` folder. For URL parsing, it relies on the [uri-js library](https://www.npmjs.com/package/uri-js). This javascript code is bundled into a single `wombatSetup.js` file with [rollup bundler](https://rollupjs.org), the same bundler used by webrecorder team to bundle wombat. + ## cdxj_indexer and warcio [cdxj_indexer Python library](https://pypi.org/project/cdxj-indexer/) is a thin wrapper over [warcio Python library](https://pypi.org/project/warcio/). It used to iterate all record in WARCs. diff --git a/docs/technical_architecture.md b/docs/technical_architecture.md index 09965dd9..e9e3ab7d 100644 --- a/docs/technical_architecture.md +++ b/docs/technical_architecture.md @@ -1,6 +1,24 @@ # Technical architecture -## High level overview +## Fuzzy rules + +Fuzzy rules are stored in `rules/rules.json`. This configuration file is then used by `rules/generate_rules.py` to generate Python and JS code. + +Should you update these fuzzy rules, you hence have to: +- regenerate Python and JS files by running `python rules/generate_rules.py` +- bundle again Javascript `wombatSetup.js` (see below). + +## Wombat configuration + +Wombat configuration contains some static configuration and the dynamic URL rewriting, including fuzzy rules. + +It is bundled by rollup with `cd javascript && yarn build-prod` and the result is pushed to proper scraper location for inclusion at build time. + +Tests are available and run with `cd javascript && yarn test`. + +## Scraper operations + +### High level overview The scraper behavior is done in two phases. @@ -10,7 +28,7 @@ Second, the WARC records are iterated to be transformed and appended inside the In both phases, WARC records are iterated in natural order, i.e. as they have been retrieved online during the crawl. 
-## Transformation of URL into ZIM path +### Transformation of URL into ZIM path Transforming a URL into a ZIM path has to respect the ZIM specification: path must not be url-encoded (i.e. it must be decoded) and it must be stored as UTF-8. @@ -22,7 +40,7 @@ Computation of the ZIM path is hence mostly straightforward: - decode the hostname which is puny-encoded - decode the path and query parameter which might be url-encoded -## URL rewriting +### URL rewriting In addition to the computation of the relative path from the current document URL to the URL to rewrite, URL rewriting also consists in computing the proper ZIM path (with same operation as above) and properly encoding it so that the resulting URL respects [RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986). Some important stuff has to be noted in this encoding. @@ -37,7 +55,15 @@ Below is an example case of the rewrite operation on an image URL found in an HT - Image rewritten URL: `../../../ex%C3%A9mple.com/a/resource/image.png%3Ffoo%3Dbar` - Image ZIM Path: `exémple.com/a/resource/image.png?foo=bar` -## Different kinds of WARC records +### JS Rewriting + +JS Rewriting is a bit special because rules to apply are different whether we are using "classic" Javascript or "module" Javascript. + +Detection of Javascript modules starts at the HTML level where we have a ` - + + - + + @@ -384,6 +429,7 @@

Various Javascript

+ diff --git a/test-website/content/javascript/cont!nt.txt b/test-website/content/javascript/cont!nt.txt new file mode 100644 index 00000000..168b100e --- /dev/null +++ b/test-website/content/javascript/cont!nt.txt @@ -0,0 +1 @@ +This is working OK diff --git a/test-website/content/javascript/not_working.png b/test-website/content/javascript/not_working.png new file mode 100644 index 00000000..9d572cad Binary files /dev/null and b/test-website/content/javascript/not_working.png differ diff --git a/test-website/content/javascript/script02!b.js b/test-website/content/javascript/script02!b.js new file mode 100644 index 00000000..59d072f2 --- /dev/null +++ b/test-website/content/javascript/script02!b.js @@ -0,0 +1,3 @@ +const span02b = document.getElementById('span02b'); +span02b.innerHTML = 'This is working OK'; +span02b.classList.add('green'); diff --git a/test-website/content/javascript/script02.js b/test-website/content/javascript/script02.js deleted file mode 100644 index 5538776c..00000000 --- a/test-website/content/javascript/script02.js +++ /dev/null @@ -1,3 +0,0 @@ -const span02 = document.getElementById("span02"); -span02.innerHTML="This is working OK"; -span02.classList.add("green"); diff --git a/test-website/content/javascript/script02a.js b/test-website/content/javascript/script02a.js new file mode 100644 index 00000000..8454a99c --- /dev/null +++ b/test-website/content/javascript/script02a.js @@ -0,0 +1,3 @@ +const span02a = document.getElementById('span02a'); +span02a.innerHTML = 'This is working OK'; +span02a.classList.add('green'); diff --git a/test-website/content/javascript/script10.js b/test-website/content/javascript/script10.js new file mode 100644 index 00000000..4b89d82f --- /dev/null +++ b/test-website/content/javascript/script10.js @@ -0,0 +1,5 @@ +const img10 = document.getElementById('img10'); +const origSrc = img10.getAttribute('src') +const newSrc = origSrc.replace('not_working', 'working') +console.debug('Replacing ' + origSrc + ' with ' 
+ newSrc) +img10.src = newSrc; \ No newline at end of file diff --git a/test-website/content/javascript/working.png b/test-website/content/javascript/working.png new file mode 100644 index 00000000..0a542b00 Binary files /dev/null and b/test-website/content/javascript/working.png differ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..4b3b88aa --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,11 @@ +import pytest + + +@pytest.fixture(scope="module") +def no_js_notify(): + """Fixture to not care about notification of detection of a JS file""" + + def no_js_notify_handler(_: str): + pass + + yield no_js_notify_handler diff --git a/tests/test_fuzzy_rules.py b/tests/test_fuzzy_rules.py index 66d3750f..6b09ca88 100644 --- a/tests/test_fuzzy_rules.py +++ b/tests/test_fuzzy_rules.py @@ -218,41 +218,67 @@ def test_fuzzyrules_youtube_embed(youtube_embed_case): @pytest.fixture( params=[ ContentForTests( - "gcs-vimeo.akamaized.net/123.mp4", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4", + "gcs-vimeo.akamaized.net/123.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - "vod.akamaized.net/123.mp4", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4", + "vod.akamaized.net/123.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - "vod-progressive.akamaized.net/123.mp4", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4", + "vod-progressive.akamaized.net/123.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - "foovod.akamaized.net/123.mp4", - "vimeo-cdn.fuzzy.replayweb.page/123.mp4", + "vod-adaptive.akamaized.net/123.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - "vod.akamaized.net/1/23.mp4", - "vimeo-cdn.fuzzy.replayweb.page/1/23.mp4", + "vod.akamaized.net/123.mp4?foo=bar&range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - 
"vod.akamaized.net/a/23.mp4", - "vimeo-cdn.fuzzy.replayweb.page/23.mp4", + "vod.akamaized.net/123.mp4?foo=bar&range=123-456&bar=foo", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - "vod.akamaized.net/foo/bar/23.mp4", - "vimeo-cdn.fuzzy.replayweb.page/23.mp4", + "vod.akamaized.net/123.mp4?range=123-456&bar=foo", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", + ), + ContentForTests( + "foovod.akamaized.net/123.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/123.mp4?range=123-456", ), ContentForTests( - "foo.akamaized.net/123.mp4", + "vod.akamaized.net/1/23.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456", + ), + ContentForTests( + "vod.akamaized.net/a/23.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456", + ), + ContentForTests( + "vod.akamaized.net/foo/bar/23.mp4?range=123-456", + "vimeo-cdn.fuzzy.replayweb.page/23.mp4?range=123-456", + ), + ContentForTests( + "foo.akamaized.net/123.mp4?range=123-456", + ), + ContentForTests( + "vod.akamaized.net/23.mp4", + "vimeo-cdn.fuzzy.replayweb.page/23.mp4", ), ContentForTests( - "vod.akamaized.net/23.mp4?foo", + "vod.akamaized.net/23/12332.mp4", + "vimeo-cdn.fuzzy.replayweb.page/23/12332.mp4", ), ContentForTests( - "vod.akamaized.net/23.mp3", + "https://vod-progressive.akamaized.net/exp=1635528595" + "~acl=%2Fvimeo-prod-skyfire-std-us" + "%2F01%2F4423%2F13%2F347119375%2F1398505169.mp4" + "~hmac=27c31f1990aab5e5429f7f7db5b2dcbcf8d2f5c92184d53102da36920d33d53e" + "/vimeo-prod-skyfire-std-us/01/4423/13/347119375/1398505169.mp4", + "vimeo-cdn.fuzzy.replayweb.page/01/4423/13/347119375/1398505169.mp4", ), ] ) diff --git a/tests/test_html_rewriting.py b/tests/test_html_rewriting.py index cfac82c7..5b1c95fe 100644 --- a/tests/test_html_rewriting.py +++ b/tests/test_html_rewriting.py @@ -42,7 +42,7 @@ def no_rewrite_content(request): yield request.param -def test_no_rewrite(no_rewrite_content): +def test_no_rewrite(no_rewrite_content, 
no_js_notify): assert ( HtmlRewriter( ArticleUrlRewriter( @@ -50,6 +50,7 @@ def test_no_rewrite(no_rewrite_content): ), "", "", + no_js_notify, ) .rewrite(no_rewrite_content.input_str) .content @@ -105,12 +106,13 @@ def escaped_content(request): yield request.param -def test_escaped_content(escaped_content): +def test_escaped_content(escaped_content, no_js_notify): transformed = ( HtmlRewriter( ArticleUrlRewriter(HttpUrl(f"http://{escaped_content.article_url}"), set()), "", "", + no_js_notify, ) .rewrite(escaped_content.input_str) .content @@ -195,7 +197,7 @@ def rewrite_url(request): yield request.param -def test_rewrite(rewrite_url): +def test_rewrite(rewrite_url, no_js_notify): assert ( HtmlRewriter( ArticleUrlRewriter( @@ -204,6 +206,7 @@ def test_rewrite(rewrite_url): ), "", "", + no_js_notify, ) .rewrite(rewrite_url.input_str) .content @@ -211,7 +214,7 @@ def test_rewrite(rewrite_url): ) -def test_extract_title(): +def test_extract_title(no_js_notify): content = """ Page title @@ -227,6 +230,7 @@ def test_extract_title(): lambda _: "kiwix.org", # pyright: ignore[reportGeneralTypeIssues, reportArgumentType] "", "", + no_js_notify, ) .rewrite(content) .title @@ -234,11 +238,12 @@ def test_extract_title(): ) -def test_rewrite_attributes(): +def test_rewrite_attributes(no_js_notify): rewriter = HtmlRewriter( ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), {ZimPath("kiwix.org/foo")}), "", "", + no_js_notify, ) assert ( @@ -260,9 +265,14 @@ def test_rewrite_attributes(): ) -def test_rewrite_css(): +def test_rewrite_css(no_js_notify): output = ( - HtmlRewriter(ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set()), "", "") + HtmlRewriter( + ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set()), + "", + "", + no_js_notify, + ) .rewrite( "", @@ -275,7 +285,7 @@ def test_rewrite_css(): ) -def test_head_insert(): +def test_head_insert(no_js_notify): content = """ A test content @@ -286,16 +296,118 @@ def test_head_insert(): content = dedent(content) url_rewriter = 
ArticleUrlRewriter(HttpUrl("http://kiwix.org/"), set()) - assert HtmlRewriter(url_rewriter, "", "").rewrite(content).content == content + assert ( + HtmlRewriter(url_rewriter, "", "", no_js_notify).rewrite(content).content + == content + ) - assert HtmlRewriter(url_rewriter, "PRE_HEAD_INSERT", "").rewrite( + assert HtmlRewriter(url_rewriter, "PRE_HEAD_INSERT", "", no_js_notify).rewrite( content ).content == content.replace("", "PRE_HEAD_INSERT") - assert HtmlRewriter(url_rewriter, "", "POST_HEAD_INSERT").rewrite( + assert HtmlRewriter(url_rewriter, "", "POST_HEAD_INSERT", no_js_notify).rewrite( content ).content == content.replace("", "POST_HEAD_INSERT") - assert HtmlRewriter(url_rewriter, "PRE_HEAD_INSERT", "POST_HEAD_INSERT").rewrite( - content - ).content == content.replace("", "PRE_HEAD_INSERT").replace( + assert HtmlRewriter( + url_rewriter, "PRE_HEAD_INSERT", "POST_HEAD_INSERT", no_js_notify + ).rewrite(content).content == content.replace( + "", "PRE_HEAD_INSERT" + ).replace( "", "POST_HEAD_INSERT" ) + + +@pytest.mark.parametrize( + "js_src,expected_js_module_path", + [ + ("my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("../my-module-script.js", "kiwix.org/my-module-script.js"), + ("../../../my-module-script.js", "kiwix.org/my-module-script.js"), + ("/my-module-script.js", "kiwix.org/my-module-script.js"), + ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"), + ( + "https://myserver.com/my-module-script.js", + "myserver.com/my-module-script.js", + ), + ], +) +def test_js_module_detected_script(js_src, expected_js_module_path): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + HttpUrl("http://kiwix.org/my_folder/my_article.html"), set() + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite(f'') + + assert 
len(js_modules) == 1 + assert js_modules[0].value == expected_js_module_path + + +@pytest.mark.parametrize( + "js_src,expected_js_module_path", + [ + ("my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("../my-module-script.js", "kiwix.org/my-module-script.js"), + ("../../../my-module-script.js", "kiwix.org/my-module-script.js"), + ("/my-module-script.js", "kiwix.org/my-module-script.js"), + ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"), + ( + "https://myserver.com/my-module-script.js", + "myserver.com/my-module-script.js", + ), + ], +) +def test_js_module_detected_module_preload(js_src, expected_js_module_path): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + HttpUrl("http://kiwix.org/my_folder/my_article.html"), set() + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite(f'') + + assert len(js_modules) == 1 + assert js_modules[0].value == expected_js_module_path + + +@pytest.mark.parametrize( + "script_src", + [ + (''), + (''), + (''), + ], +) +def test_no_js_module_detected(script_src): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + HtmlRewriter( + url_rewriter=ArticleUrlRewriter( + HttpUrl("http://kiwix.org/my_folder/my_article.html"), set() + ), + pre_head_insert="", + post_head_insert="", + notify_js_module=custom_notify, + ).rewrite(script_src) + + assert len(js_modules) == 0 diff --git a/tests/test_js_rewriting.py b/tests/test_js_rewriting.py index 0c425274..fe83ae9f 100644 --- a/tests/test_js_rewriting.py +++ b/tests/test_js_rewriting.py @@ -1,11 +1,16 @@ import pytest from warc2zim.content_rewriting.js import JsRewriter -from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl +from warc2zim.url_rewriting import ArticleUrlRewriter, HttpUrl, 
ZimPath from .utils import ContentForTests +@pytest.fixture +def simple_js_rewriter(no_js_notify) -> JsRewriter: + return JsRewriter(url_rewriter=lambda x: x, notify_js_module=no_js_notify) + + @pytest.fixture( params=[ "a = this;", @@ -24,9 +29,9 @@ def rewrite_this_js_content(request): ) -def test_this_js_rewrite(rewrite_this_js_content): +def test_this_js_rewrite(simple_js_rewriter: JsRewriter, rewrite_this_js_content): assert ( - JsRewriter(lambda x: x).rewrite(rewrite_this_js_content.input_str) + simple_js_rewriter.rewrite(rewrite_this_js_content.input_str) == rewrite_this_js_content.expected_str ) @@ -103,9 +108,9 @@ def rewrite_wrapped_content(request): yield request.param -def test_wrapped_rewrite(rewrite_wrapped_content): +def test_wrapped_rewrite(simple_js_rewriter: JsRewriter, rewrite_wrapped_content): assert ( - JsRewriter(lambda x: x).rewrite(rewrite_wrapped_content.input_str) + simple_js_rewriter.rewrite(rewrite_wrapped_content.input_str) == rewrite_wrapped_content.expected_str ) @@ -187,7 +192,7 @@ def __post_init__(self): import * from "../../../example.com/file.js" import A from "../../../example.com/path/file2.js"; -import {C, D} from "abc.js"; +import {C, D} from "./abc.js"; import {X, Y} from "../parent.js"; import {E, F, G} from "../../path.js"; import { Z } from "../../path.js"; @@ -217,12 +222,14 @@ def rewrite_import_content(request): yield request.param -def test_import_rewrite(rewrite_import_content): +def test_import_rewrite(no_js_notify, rewrite_import_content): url_rewriter = ArticleUrlRewriter( HttpUrl(rewrite_import_content.article_url), set() ) assert ( - JsRewriter(url_rewriter).rewrite(rewrite_import_content.input_str) + JsRewriter(url_rewriter=url_rewriter, notify_js_module=no_js_notify).rewrite( + rewrite_import_content.input_str, opts={"isModule": True} + ) == rewrite_import_content.expected_str ) @@ -271,7 +278,38 @@ def no_rewrite_js_content(request): yield request.param -def test_no_rewrite(no_rewrite_js_content): - assert ( 
- JsRewriter(lambda x: x).rewrite(no_rewrite_js_content) == no_rewrite_js_content +def test_no_rewrite(simple_js_rewriter: JsRewriter, no_rewrite_js_content): + assert simple_js_rewriter.rewrite(no_rewrite_js_content) == no_rewrite_js_content + + +@pytest.mark.parametrize( + "js_src,expected_js_module_path", + [ + ("./my-module-script.js", "kiwix.org/my_folder/my-module-script.js"), + ("../my-module-script.js", "kiwix.org/my-module-script.js"), + ("../../../my-module-script.js", "kiwix.org/my-module-script.js"), + ("/my-module-script.js", "kiwix.org/my-module-script.js"), + ("//myserver.com/my-module-script.js", "myserver.com/my-module-script.js"), + ( + "https://myserver.com/my-module-script.js", + "myserver.com/my-module-script.js", + ), + ], +) +def test_js_rewrite_nested_module_detected(js_src, expected_js_module_path): + + js_modules = [] + + def custom_notify(zim_path: ZimPath): + js_modules.append(zim_path) + + url_rewriter = ArticleUrlRewriter( + HttpUrl("http://kiwix.org/my_folder/my_article.html"), set() + ) + + JsRewriter(url_rewriter=url_rewriter, notify_js_module=custom_notify).rewrite( + f'import * from "{js_src}"', opts={"isModule": True} ) + + assert len(js_modules) == 1 + assert js_modules[0].value == expected_js_module_path diff --git a/tests/test_url_rewriting.py b/tests/test_url_rewriting.py index e4633e2b..1057f829 100644 --- a/tests/test_url_rewriting.py +++ b/tests/test_url_rewriting.py @@ -112,6 +112,13 @@ ["kiwix.org/foo.html"], False, ), + ( + "https://kiwix.org/a/article/document.html", + "foo?param=value", + "foo%3Fparam%3Dvalue", + ["kiwix.org/a/article/foo?param=value"], + False, + ), ( "https://kiwix.org/a/article/document.html", "foo?param=value%2F", @@ -133,6 +140,83 @@ ["kiwix.org/a/article/foo/"], False, ), + ( + "https://kiwix.org/a/article/document.html", + "/fo o.html", + "../../fo%20o.html", + ["kiwix.org/fo o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo+o.html", + "../../fo%2Bo.html", + 
["kiwix.org/fo+o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%2Bo.html", + "../../fo%2Bo.html", + ["kiwix.org/fo+o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/foo.html?param=val+ue", + "../../foo.html%3Fparam%3Dval%20ue", + ["kiwix.org/foo.html?param=val ue"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo~o.html", + "../../fo~o.html", + ["kiwix.org/fo~o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo-o.html", + "../../fo-o.html", + ["kiwix.org/fo-o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo_o.html", + "../../fo_o.html", + ["kiwix.org/fo_o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%7Eo.html", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../fo~o.html", + ["kiwix.org/fo~o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%2Do.html", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../fo-o.html", + ["kiwix.org/fo-o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/fo%5Fo.html", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../fo_o.html", + ["kiwix.org/fo_o.html"], + False, + ), + ( + "https://kiwix.org/a/article/document.html", + "/foo%2Ehtml", # must not be encoded / must be decoded (RFC 3986 #2.3) + "../../foo.html", + ["kiwix.org/foo.html"], + False, + ), ], ) def test_relative_url( diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py index 7855873c..00560a75 100644 --- a/tests/test_warc_to_zim.py +++ b/tests/test_warc_to_zim.py @@ -346,7 +346,7 @@ def test_warc_to_zim_specify_params_and_metadata(self, tmp_path): "example.com/": "Example Domain", "_zim_static/__wb_module_decl.js": "_zim_static/__wb_module_decl.js", "_zim_static/wombat.js": "_zim_static/wombat.js", - "_zim_static/wombat_setup.js": "_zim_static/wombat_setup.js", + 
"_zim_static/wombatSetup.js": "_zim_static/wombatSetup.js", } zim_fh = Archive(zim_output)