diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 57a21a0..7569299 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -38,5 +38,6 @@ jobs: poetry run flake8 src --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | - # pip install pytest poetry run pytest + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 diff --git a/.gitignore b/.gitignore index 96bb478..826ad19 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,6 @@ dmypy.json # Ignore demo-output dir demo-output/ + +# Ignore dir containing verification artifacts +alto2txt-verify/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ac4197d..bc73ec6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v4.4.0 hooks: - id: trailing-whitespace # Leave demo-files unaltered after download from BL website. 
@@ -12,20 +12,20 @@ repos: - id: check-xml - id: check-added-large-files - repo: https://github.com/python-poetry/poetry - rev: '1.3.2' + rev: '1.3.0' hooks: - id: poetry-check - id: poetry-lock - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 23.1.0 hooks: - id: black - repo: https://github.com/pre-commit/mirrors-autopep8 - rev: v1.6.0 # Use the sha / tag you want to point at + rev: v2.0.1 # Use the sha / tag you want to point at hooks: - id: autopep8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.971 # Use the sha / tag you want to point at + rev: v1.0.1 # Use the sha / tag you want to point at hooks: - id: mypy - repo: https://github.com/pre-commit/mirrors-isort @@ -33,7 +33,7 @@ repos: hooks: - id: isort - repo: https://github.com/hadialqattan/pycln - rev: v1.2.5 + rev: v2.1.3 hooks: - id: pycln args: [--config=pyproject.toml] diff --git a/README.md b/README.md index 47358b4..ca5bb2e 100644 --- a/README.md +++ b/README.md @@ -1,62 +1,64 @@ -# `alto2txt`: Extract plain text from newspapers +# `alto2txt`: Extract plain text from digital newspaper OCR scans -![GitHub](https://img.shields.io/github/license/Living-with-Machines/alto2txt) ![PyPI](https://img.shields.io/pypi/v/alto2txt) [![DOI](https://zenodo.org/badge/259340615.svg)](https://zenodo.org/badge/latestdoi/259340615) +![GitHub](https://img.shields.io/github/license/Living-with-Machines/alto2txt) ![PyPI](https://img.shields.io/pypi/v/alto2txt) [![DOI](https://zenodo.org/badge/259340615.svg)](https://zenodo.org/badge/latestdoi/259340615) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) +*Version extract_text 0.3.4* +`alto2txt` converts `XML` `ALTO`/`METS` Optical Character Recognition (OCR) scans into plaintext files with minimal metadata. 
-Converts `XML` (in `METS` `1.8`/`ALTO` `1.4`, `METS` `1.3`/`ALTO` `1.4`, `BLN` or `UKP` format) publications to plaintext articles and generates minimal metadata. - +**`XML` compatibility: `METS 1.8`/`ALTO 1.4`, `METS 1.3`/`ALTO 1.4`, `BLN`, or `UKP` format** ## [Full documentation and demo instructions.](https://living-with-machines.github.io/alto2txt/#/) +`ALTO` and `METS` are industry standards maintained by the [US Library of Congress](https://www.loc.gov/librarians/standards) targeting newspaper digitization used by hundreds of modern, large-scale newspaper digitization projects. One text file is output per article, each complemented by one `XML` metadata file[^1] . -## Installation - -### Installation using an Anaconda environment - -We recommend installation via Anaconda: +[`METS` (Metadata Encoding and Transmission Standard)](http://www.loc.gov/standards/mets/) is a standard for encoding descriptive, administrative, and structural metadata regarding objects within a digital library, expressed in `XML`. [`ALTO` (Analyzed Layout and Text Objects)](https://www.loc.gov/standards/alto/) is an [`XML schema`](https://en.wikipedia.org/wiki/XML_schema) for technical metadata describing the layout and content of text resources such as book or newspaper pages. `ALTO` is often used in combination with `METS` but can also be used independently. Details of the `ALTO` schema are available at https://github.com/altoxml/schema. -* Refer to the [Anaconda website and follow the instructions](https://docs.anaconda.com/anaconda/install/). 
-* Create a new environment for `alto2txt` +## Quick Install -```bash -conda create -n py37alto python=3.7 -``` +### `pip` -* Activate the environment: +As of version `v0.3.4` `alto2txt` is available on [`PyPI`](https://pypi.org/project/alto2txt/) and can be installed via -```bash -conda activate py37alto +```console +$ pip install alto2txt ``` -### Installation using pip, outside an Anaconda environment +### `conda` -Note, the use of ``alto2txt`` outside a conda environment has not been as extensively tested as within a conda environment. Whilst we believe that this should work, please use with caution. +If you are comfortable with the command line, git, and already have Python & Anaconda installed, you can install `alto2txt` by navigating to an empty directory in the terminal and running the following commands: -```bash -pip install alto2txt +```console +$ git clone https://github.com/Living-with-machines/alto2txt.git +$ cd alto2txt +$ conda create -n py37alto python=3.7 +$ conda activate py37alto +$ pip install . ``` ### Installation of a test release -If you need (or want) to install a test release of `alto2txt` you will likely be advised of the specific version number to install. This examaple command will install `v0.3.1-alpha.20`: +If you need (or want) to install a test release of `alto2txt` you will likely be advised of the specific version number to install. This command will install `v0.3.1-alpha.20`: ```bash -pip install -i https://test.pypi.org/simple/ alto2txt==0.3.1a20 +$ pip install -i https://test.pypi.org/simple/ alto2txt==0.3.1a20 ``` -## Usage - -Downsampling can be used to convert only every Nth issue of each newspaper. One text file is output per article, each complemented by one `XML` metadata file. - +[Click here](https://living-with-machines.github.io/alto2txt/#Demo.md) for more in-depth installation instructions using demo files. 
+## Usage +> *Note*: the formatting below is altered for readability ``` -usage: alto2txt [-h] [-p [PROCESS_TYPE]] [-l [LOG_FILE]] [-d [DOWNSAMPLE]] [-n [NUM_CORES]] +$ alto2txt -h + +usage: alto2txt [-h] + [-p [PROCESS_TYPE]] + [-l [LOG_FILE]] + [-d [DOWNSAMPLE]] + [-n [NUM_CORES]] xml_in_dir txt_out_dir -alto2txt [-h] [-p [PROCESS_TYPE]] [-l [LOG_FILE]] [-d [DOWNSAMPLE]] [-n [NUM_CORES]] - xml_in_dir txt_out_dir Converts XML publications to plaintext articles @@ -75,91 +77,92 @@ optional arguments: -n [NUM_CORES], --num-cores [NUM_CORES] Number of cores (Spark only). Default 1") ``` +To read about downsampling, logs, and using spark see [Advanced Information](https://living-with-machines.github.io/alto2txt/#/advanced). -`xml_in_dir` is expected to hold `XML` for multiple publications, in the following structure: - -``` -xml_in_dir -|-- publication -| |-- year -| | |-- issue -| | | |-- xml_content -| |-- year -|-- publication -``` - -However, if `-p|--process-type single` is provided then `xml_in_dir` is expected to hold `XML` for a single publication, in the following structure: - -``` -xml_in_dir -|-- year -| |-- issue -| | |-- xml_content -|-- year -``` - -`txt_out_dir` is created with an analogous structure to `xml_in_dir`. +## Process Types -`PROCESS_TYPE` can be one of: +`-p | --process-type` can be one of: * `single`: Process single publication. * `serial`: Process publications serially. * `multi`: Process publications using multiprocessing (default). * `spark`: Process publications using Spark. -`DOWNSAMPLE` must be a positive integer, default 1. +### Process Multiple Publications -The following `XSLT` files need to be in an `extract_text.xslts` module: +For default settings, (`multi`) multiprocessing assumes the following directory structure for multiple publications in `xml_in_dir`: -* `extract_text_mets18.xslt`: `METS 1.8 XSL` file. -* `extract_text_mets13.xslt`: `METS 1.3 XSL` file. -* `extract_text_bln.xslt`: `BLN XSL` file. 
-* `extract_text_ukp.xslt`: `UKP XSL` file. - -## Process publications +``` +xml_in_dir/ + ├── publication + │ ├── year + │ │ └── issue + │ │ └── xml_content + │ └── year + └── publication +``` +Assuming `xml_in_dir` follows this structure, run alto2txt with the following in the terminal: -Assume folder `BNA` exists and matches the structure above. +```console +$ alto2txt xml_in_dir txt_out_dir +``` -Extract text from every publication: +To downsample and only process every 100th edition: -```bash -alto2txt BNA txt +```console +$ alto2txt xml_in_dir txt_out_dir -d 100 ``` -Extract text from every 100th issue of every publication: -```bash -alto2txt BNA txt -d 100 -``` +### Process Single Publication -## Process a single publication +[A demo for processing a single publication is available here.](https://living-with-machines.github.io/alto2txt/#/?id=process-single-publication) -Extract text from every issue of a single publication: +If `-p|--process-type single` is provided then `xml_in_dir` is expected to hold `XML` for a single publication, in the following structure: -```bash -alto2txt -p single BNA/0000151 txt +``` +xml_in_dir/ + ├── year + │ └── issue + │ └── xml_content + └── year ``` -Extract text from every 100th issue of a single publication: +Assuming `xml_in_dir` follows this structure, run `alto2txt` with the following in the terminal in the folder `xml_in_dir` is stored in: -```bash -alto2txt -p single BNA/0000151 txt -d 100 +```console +$ alto2txt -p single xml_in_dir txt_out_dir ``` +To downsample and only process every 100th edition from the one publication: + +```console +$ alto2txt -p single xml_in_dir txt_out_dir -d 100 +``` + +### Plain Text Files Output + +`txt_out_dir` is created with an analogous structure to `xml_in_dir`. +One `.txt` file and one metadata `.xml` file are produced per article. + + ## Configure logging By default, logs are put in `out.log`. To specify an alternative location for logs, use the `-l` flag e.g. 
-```bash -alto2txt -l mylog.txt BNA txt -d 100 2> err.log +```console +$ alto2txt -l mylog.txt xml_in_dir txt_out_dir -d 100 2> err.log ``` ## Process publications via Spark -[Information on running on spark.](spark_instructions.md) +[Information on running on spark.](https://living-with-machines.github.io/alto2txt/#/advanced?id=using-spark) +## Contributing + +Suggestions, code, tests, further documentation and features – especially to cover various OCR output formats – are needed and welcome. For details and examples see the [Contributing](https://living-with-machines.github.io/alto2txt/#/contributing) section. ## Future work @@ -191,3 +194,6 @@ This data is "CC0 1.0 Universal Public Domain" - [No Copyright - Other Known Leg This software has been developed as part of the [Living with Machines](https://livingwithmachines.ac.uk) project. This project, funded by the UK Research and Innovation (UKRI) Strategic Priority Fund, is a multidisciplinary collaboration delivered by the Arts and Humanities Research Council (AHRC), with The Alan Turing Institute, the British Library and the Universities of Cambridge, East Anglia, Exeter, and Queen Mary University of London. Grant reference: AH/S01179X/1 + +> Last updated 2023-02-21 +[^1]: For a more detailed description see: https://www.coloradohistoricnewspapers.org/forum/what-is-metsalto/ diff --git a/alto2txt-verify.sh b/alto2txt-verify.sh new file mode 100755 index 0000000..977cbec --- /dev/null +++ b/alto2txt-verify.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# +# Verification checks to run after processing newspapers with alto2txt. +# +# Call this script with two command line arguments: +# 1. the directory path containing the input data processed by alto2txt +# 2. 
the directory path containing the output data generated by alto2txt +# +# Both input & output directories should have identical subdirectory +# structure of the following form: title_code/YYYY/MMDD/ +# +# This script checks that the directory structures are identical. +# It does not check the validity of the alto2txt text or metadata files. + +input_dir=$1 +output_dir=$2 + +if [[ ! -e $1 ]]; then + echo "Error: Input directory $1 does not exist." + exit +fi +if [[ ! -e $2 ]]; then + echo "Error: Output directory $2 does not exist." + exit +fi +echo ">>> alto2txt verification <<<" +echo "Input directory: '$input_dir'" +echo "Output directory: '$output_dir'" + +# Create subdirectories to store temporary files and results. +subdir="alto2txt-verify" +if [[ -e $subdir ]]; then + echo "Error: $subdir subdirectory already exists." + exit +fi +mkdir $subdir + +input_subdir="$subdir/input" +if [[ ! -e $input_subdir ]]; then + mkdir $input_subdir +fi +output_subdir="$subdir/output" +if [[ ! -e $output_subdir ]]; then + mkdir $output_subdir +fi + +# Loop over the title_code/YYYY subdirectories in the input +# directory and get a list of MMDD subdirectories for each. +sep="-" +for yyyy_dir in $(find $input_dir -mindepth 2 -maxdepth 2 -not -path '*/.*' -type d); do + yyyy="${yyyy_dir##*/}" + title_dir="${yyyy_dir:0:(${#yyyy_dir}-5)}" + title="${title_dir##*/}" + txtfile="$input_subdir/$title$sep$yyyy.txt" + (cd $input_dir && find "$title/$yyyy" -mindepth 1 -maxdepth 1 -not -path '*/.*' -type d) > $txtfile +done + +# Repeat for the output subdirectories. +for yyyy_dir in $(find $output_dir -mindepth 2 -maxdepth 2 -not -path '*/.*' -type d); do + yyyy="${yyyy_dir##*/}" + title_dir="${yyyy_dir:0:(${#yyyy_dir}-5)}" + title="${title_dir##*/}" + txtfile="$output_subdir/$title$sep$yyyy.txt" + (cd $output_dir && find "$title/$yyyy" -mindepth 1 -maxdepth 1 -not -path '*/.*' -type d) > $txtfile +done + +# Compare the results. 
+success=true +titleyearcheck="$subdir/missing-title-years.txt" +mmddcheck="$subdir/missing-mmdd-directories" +affectedtitles="$subdir/affected-titles.txt" +first=true +first_cmp=true +for filename in $(ls $input_subdir); do + title="${filename%$sep*}" + if [[ ! -e $output_subdir/$filename ]]; then + # Make a list of any .txt files in $input_subdir that aren't found in $output_subdir + success=false + echo "$title" >> $affectedtitles + if [ "$first" = true ]; then + echo "WARNING: Missing title-years detected in alto2txt output" + echo "Missing title-years are listed in: $titleyearcheck" + first=false + fi + echo "$filename" >> $titleyearcheck + else + # Compare .txt files that are found in both subdirs + if (! cmp -s "$input_subdir/$filename" "$output_subdir/$filename"); then + success=false + echo "$title" >> $affectedtitles + if [ "$first_cmp" = true ]; then + echo "WARNING: Missing MMDD directories detected in alto2txt output" + echo "Missing MMDD directories are are listed in subdirectory: $mmddcheck" + if [[ ! -e $mmddcheck ]]; then + mkdir $mmddcheck + fi + first_cmp=false + fi + comm -23 <(sort $input_subdir/$filename) <(sort $output_subdir/$filename) > $mmddcheck/$filename + fi + fi +done + +if [[ -e $affectedtitles ]]; then + sort -u $affectedtitles -o $affectedtitles + echo ">>> All affected titles are listed in: $affectedtitles" +fi + +if [ "$success" = true ]; then + echo "SUCCESS: alto2txt verification checks passed." 
+fi diff --git a/docs/Demo.md b/docs/Demo.md index d801c52..5c7176d 100644 --- a/docs/Demo.md +++ b/docs/Demo.md @@ -12,24 +12,30 @@ If you are comfortable with the command line, git, and already have Python & Ana Navigate to an empty directory in the terminal and run the following commands: +```console +$ git clone https://github.com/Living-with-machines/alto2txt.git +$ cd alto2txt +$ conda create -n py37alto python=3.7 +$ conda activate py37alto ``` -> git clone https://github.com/Living-with-machines/alto2txt.git -> cd alto2txt -> conda create -n py37alto python=3.7 -> conda activate py37alto -``` + To install that checkout you can -``` -> pip install pyproject.toml + +```console +$ pip install . ``` You can simply install the latest release (but this may not be up to date with the GitHub `main` branch) + +```console +$ pip install alto2txt ``` -> pip install alto2txt -``` + regardless this should make the following command run + +```console +$ alto2txt -p single demo-files demo-output ``` -> alto2txt -p single demo-files demo-output -``` + and the resulting plain text files of the articles will be in `alto2txt/demo-output/`. Read on for a more in-depth explanation. @@ -59,21 +65,21 @@ cd ~myFolder/alto2txt Create a new [Conda environment](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) with Python 3.7. The environment name can be whatever you choose, here it is `py37alto`: -``` -conda create -n py37alto python=3.7 +```console +$ conda create -n py37alto python=3.7 ``` After creating the environment, activate it: -``` -conda activate py37alto +```console +$ conda activate py37alto ``` #### Install Required Packages Install the required packages which are outlined in `pyproject.toml`: -``` -pip install pyproject.toml +```console +$ pip install . ``` Follow the instructions to download and install the packages. 
You should now have all the required Python packages within your conda environment to run `alto2txt`. @@ -84,7 +90,7 @@ Follow the instructions to download and install the packages. You should now hav Make sure you have navigated to the `alto2txt` directory in your terminal or Anaconda prompt. For this demo, we are using a single edition for a single publication. The output files will be created in `/demo-output` which you can check is currently empty. ``` -alto2txt -p single demo-files demo-output +$ alto2txt -p single demo-files demo-output ``` Here we use the positional argument `-p` to determine which process type, in this case `single`. The script can be run on many publications and years by default, but in this case we only have one publication. [Click here](/#process-types) to read more about different process types. @@ -178,26 +184,26 @@ Running these steps for your own files works in the same way. Your source and/or #### Run on a single publication, multiple years, multiple editions -``` -alto2txt -p single input-directory output-directory +```console +$ alto2txt -p single input-directory output-directory ``` #### Run on multiple publications, multiple years, multiple editions -``` -alto2txt input-directory output-directory +```console +$ alto2txt input-directory output-directory ``` #### Extract every 100th edition from every publication -``` -alto2txt input-directory output-directory -d 100 +```console +$ alto2txt input-directory output-directory -d 100 ``` Where `-d` determines the downsample value. 
#### Extract every 100th edition from one publication -``` -alto2txt -p single input-directory output-directory -d 100 +```console +$ alto2txt -p single input-directory output-directory -d 100 ``` diff --git a/docs/README.md b/docs/README.md index d539874..773caf3 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,14 +1,42 @@ -# `alto2txt`: Extract plain text from digitised newspapers +# `alto2txt`: Extract plain text from digital newspaper OCR scans -*Version extract_text 0.3.0* +*Version extract_text 0.3.4* -`alto2txt` converts `XML` publications to plaintext articles with minimal metadata. -ALTO and METS are the current industry standards for newspaper digitisation used by hundreds of modern, large-scale newspaper digitisation projects. -One text file is output per article, each complemented by one `XML` metadata file. +![GitHub](https://img.shields.io/github/license/Living-with-Machines/alto2txt) ![PyPI](https://img.shields.io/pypi/v/alto2txt) [![DOI](https://zenodo.org/badge/259340615.svg)](https://zenodo.org/badge/latestdoi/259340615) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) -[METS (Metadata Encoding and Transmission Standard)](http://www.loc.gov/standards/mets/) is a standard for encoding descriptive, administrative, and structural metadata regarding objects within a digital library, expressed in XML. [ALTO (Analyzed Layout and Text Objects](http://www.loc.gov/standards/alto/) is an 'XML Schema that details technical metadata for describing the layout and content of physical text resources, such as pages of a book or a newspaper'. ALTO extends METS but can also be used independently. -**`XML` compatibility: METS 1.8/ALTO 1.4, METS 1.3/ALTO 1.4, BLN, or UKP format** +`alto2txt` converts `XML` `ALTO`/`METS` Optical Character Recognition (OCR) scans into plaintext files with minimal metadata. 
+ +**`XML` compatibility: `METS 1.8`/`ALTO 1.4`, `METS 1.3`/`ALTO 1.4`, `BLN`, or `UKP` format** + +`ALTO` and `METS` are industry standards maintained by the [US Library of Congress](https://www.loc.gov/librarians/standards) targeting newspaper digitization used by hundreds of modern, large-scale newspaper digitization projects. One text file is output per article, each complemented by one `XML` metadata file[^1] . + +[`METS` (Metadata Encoding and Transmission Standard)](http://www.loc.gov/standards/mets/) is a standard for encoding descriptive, administrative, and structural metadata regarding objects within a digital library, expressed in `XML`. [`ALTO` (Analyzed Layout and Text Objects)](https://www.loc.gov/standards/alto/) is an [`XML schema`](https://en.wikipedia.org/wiki/XML_schema) for technical metadata describing the layout and content of text resources such as book or newspaper pages. `ALTO` is often used in combination with `METS` but can also be used independently. Details of the `ALTO` schema are available at https://github.com/altoxml/schema. + + +## Quick Install + +### `pip` + +As of version `v0.3.4` `alto2txt` is available on [`PyPI`](https://pypi.org/project/alto2txt/) and can be installed via + +```bash +pip install alto2txt +``` + +### `conda` + +If you are comfortable with the command line, git, and already have Python & Anaconda installed, you can install `alto2txt` by navigating to an empty directory in the terminal and running the following commands: + +```bash +git clone https://github.com/Living-with-machines/alto2txt.git +cd alto2txt +conda create -n py37alto python=3.7 +conda activate py37alto +pip install . +``` + +[Click here](/Demo.md) for more in-depth installation instructions using demo files. ## Usage @@ -40,22 +68,6 @@ optional arguments: ``` To read about downsampling, logs, and using spark see [Advanced Information](advanced.md). 
- -## Quick Install - -If you are comfortable with the command line, git, and already have Python & Anaconda installed, you can install `alto2txt` by navigating to an empty directory in the terminal and run the following commands: - -``` -> git clone https://github.com/Living-with-machines/alto2txt.git -> cd alto2txt -> conda create -n py37alto python=3.7 -> conda activate py37alto -> pip install pyproject.toml -``` - -[Click here](/Demo.md) for more in-depth installation instructions using demo files. - - ## Process Types @@ -81,14 +93,14 @@ xml_in_dir/ ``` Assuming `xml_in_dir` follows this structure, run alto2txt with the following in the terminal: -```bash -alto2txt xml_in_dir txt_out_dir +```console +$ alto2txt xml_in_dir txt_out_dir ``` To downsample and only process every 100th edition: -```bash -alto2txt xml_in_dir txt_out_dir -d 100 +```console +$ alto2txt xml_in_dir txt_out_dir -d 100 ``` @@ -108,14 +120,14 @@ xml_in_dir/ Assuming `xml_in_dir` follows this structure, run `alto2txt` with the following in the terminal in the folder `xml_in_dir` is stored in: -```bash -alto2txt -p single xml_in_dir txt_out_dir +```console +$ alto2txt -p single xml_in_dir txt_out_dir ``` To downsample and only process every 100th edition from the one publication: -```bash -alto2txt -p single xml_in_dir txt_out_dir -d 100 +```console +$ alto2txt -p single xml_in_dir txt_out_dir -d 100 ``` ## Plain Text Files Output @@ -135,8 +147,9 @@ Quality assurance is performed to check for: ## Future work -* Export more metadata from ALTO, probably by parsing METS first. +* Export more metadata from `ALTO`, probably by parsing `METS` first. * Check and ensure that articles that span multiple pages are pulled into a single article file. * Smarter handling of articles spanning multiple pages. 
-> Last updated 2022-11-10 +> Last updated 2023-02-22 +[^1]: For a more detailed description see: https://www.coloradohistoricnewspapers.org/forum/what-is-metsalto/ diff --git a/docs/_coverpage.md b/docs/_coverpage.md index c793c4f..c168ca1 100644 --- a/docs/_coverpage.md +++ b/docs/_coverpage.md @@ -1,7 +1,7 @@ -# Alto2Txt +# alto2txt -> Extract text from digitised newspapers articles +> Extract plain text from newspaper OCR scans @@ -11,4 +11,4 @@ - Made with docsify +Made with docsify. diff --git a/docs/_sidebar.md b/docs/_sidebar.md index 35b2994..c92fa4d 100644 --- a/docs/_sidebar.md +++ b/docs/_sidebar.md @@ -1,4 +1,5 @@ -* [Alto2Txt](/) +* [`alto2txt`](/) * [Demo](Demo.md) * [Advanced](advanced.md) +* [Contributing](contributing.md) diff --git a/docs/advanced.md b/docs/advanced.md index 7eb19c3..3091d49 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -1,5 +1,4 @@ -# Further Information - +# Advanced features and utilities ## `XSTL`: `XML` Stylesheet @@ -42,8 +41,8 @@ By default, logs are put in `out.log`. To specify an alternative location for logs, use the `-l` flag e.g. -```bash -alto2txt -l mylog.txt ~/xml_in_dir ~/txt_out_dir -d 100 2> err.log +```console +$ alto2txt -l mylog.txt ~/xml_in_dir ~/txt_out_dir -d 100 2> err.log ``` ## Using Spark @@ -56,16 +55,16 @@ When running via Spark ensure that: For example, the code can be run on Urika requesting as follows... -Install the code as a package: +Install the code as a package with the `spark` option: -```bash -python setup.py install +```console +poetry install --with spark ``` Run `spark-submit`: -```bash -spark-submit ./extract_publications_text.py \ +```console +$ spark-submit ./extract_publications_text.py \ -p spark \ -n 144 \ -l /mnt/lustre/at003/at003//log.out \ @@ -83,22 +82,16 @@ this would request 144/36 = 4 workers/executors and nodes. To update the version number: -1. Edit `README.md`: - -``` -# Extract plain text from newspapers (extract_text 0.3.0) -``` - -2. Edit `setup.py`: +1. 
Edit `pyproject.toml`: ``` -version="0.3.0", +version = "0.3.4" ``` -3. Exit `extract_text/xslts/extract_text_common.xslt`: +2. Edit `extract_text/xslts/extract_text_common.xslt`: ``` -0.3.0 +0.3.4 ``` ## Documentation with Docsify diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..2104891 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,165 @@ +# Contributing + +Contributions via [GitHub issues](https://github.com/Living-with-machines/alto2txt/issues) and [pull requests](https://github.com/Living-with-machines/alto2txt/pulls) are very welcome. To install locally for contributions we recommend using [`poetry`](https://python-poetry.org/). + +## Local checkout + +1. Install [`poetry`](https://python-poetry.org/docs/#installation) +2. `$ git clone https://github.com/Living-with-machines/alto2txt.git` +3. `$ cd alto2txt` +4. `$ poetry install` + +## [`pre-commit`](https://pre-commit.com/) local changes + +Whatever contribution you make, be it code or documentation, your changes will need to pass our [`pre-commit`](https://pre-commit.com/) [configuration](https://github.com/Living-with-machines/alto2txt/blob/main/.pre-commit-config.yaml). To prepare that: + +1. Follow [installation instructions](https://pre-commit.com/#install) +2. Add `pre-commit` [git commit hooks](https://pre-commit.com/#3-install-the-git-hook-scripts) + +```console +$ cd path/to/alto2txt +$ pre-commit install +``` + +3. Make your local commit and see if any `pre-commit` changes are added. 
See a [best practice guide](https://www.conventionalcommits.org/en/v1.0.0/#examples) for writing these: + +```console +$ git commit -m "docs: add docs/contributing.md and enable in sidebar" +``` + +This *should* then run `pre-commit` checks that print messages like this to the screen (this is simplified as an example): + +```console +[INFO] Initializing environment for https://github.com/pre-commit/pre-commit-hooks. 
+[INFO] Initializing environment for https://github.com/psf/black. +[INFO] Installing environment for https://github.com/pre-commit/pre-commit-hooks. +[INFO] Installing environment for https://github.com/pre-commit/mirrors-autopep8. +[INFO] Installing environment for https://github.com/pre-commit/mirrors-mypy. +[INFO] This may take a few minutes... +trim trailing whitespace.................................................Failed +- hook id: trailing-whitespace +- exit code: 1 +- files were modified by this hook + +Fixing docs/contributing.md + +fix end of files.........................................................Passed +Check Yaml...............................................................Passed +Fix End of Files.........................................................Passed +check yaml...............................................................Passed +check xml............................................(no files to check)Skipped +check for added large files..............................................Passed +poetry-check.........................................(no files to check)Skipped +poetry-lock..............................................................Passed +black................................................(no files to check)Skipped +autopep8.............................................(no files to check)Skipped +mypy.................................................(no files to check)Skipped +isort................................................(no files to check)Skipped +pycln................................................(no files to check)Skipped +``` + +In this simple case, adding the new changes generated by `pre-commit` (trimming trailing whitespace in `docs/contributing.md`) should then pass and complete the commit: + +```console +$ git add docs/contributing.md +$ git commit -m "docs: add docs/contributing.md and enable in sidebar" + +trim trailing whitespace.................................................Passed +fix end of 
files.........................................................Passed +check yaml...............................................................Passed +check xml............................................(no files to check)Skipped +check for added large files..............................................Passed +poetry-check.........................................(no files to check)Skipped +poetry-lock..............................................................Passed +black................................................(no files to check)Skipped +autopep8.............................................(no files to check)Skipped +mypy.................................................(no files to check)Skipped +isort................................................(no files to check)Skipped +pycln................................................(no files to check)Skipped +[doc-copy-edits 18be46e] fix: update extract_text_common.xslt version + 1 file changed, 1 insertion(+), 1 deletion(-) +``` + +## Running tests + +If you make changes to the code, please add a test (and/or modify current tests) within the `tests/` folder to help ensure your contribution produces correct results. It also makes it much easier (and quicker) to review contributions and either accept or suggest additional changes. Tests are run via + +```console +$ poetry run pytest +``` + +See `pytest` [examples](https://docs.pytest.org/en/7.1.x/getting-started.html#create-your-first-test) for advice on writing tests and using [`fixtures`](https://docs.pytest.org/en/7.1.x/reference/fixtures.html#fixture-availability). Fixtures for our tests are in `tests/conftest.py`. + +If you need to debug changes and/or tests, you can run + +```console +$ poetry run pytest --pdb +``` + +to drop into [`ipython`](https://ipython.readthedocs.io/en/stable/) to interactively debug how a test failed. 
+ +Once the tests have finished running, `pytest` prints a summary of how much code was covered by the tests via the [coverage](https://coverage.readthedocs.io/en/7.1.0/) package. For example: + +```console +================================= test session starts ================================= +platform darwin -- Python 3.10.10, pytest-7.2.1, pluggy-1.0.0 +rootdir: /Users/you-user-name/path-to-git-checkouts/alto2txt, configfile: pyproject.toml +plugins: cov-4.0.0 +collected 8 items + +tests/test_e2e.py ...s.ss [ 87%] +tests/test_import.py . [100%] + +--------- coverage: platform darwin, python 3.10.10-final-0 ---------- +Name Stmts Miss Cover +--------------------------------------------------------------- +src/alto2txt/extract_publications_text.py 21 1 95% +src/alto2txt/spark_xml_to_text.py 29 29 0% +src/alto2txt/xml_to_text.py 111 42 62% +src/alto2txt/xml_to_text_entry.py 38 6 84% +--------------------------------------------------------------- +TOTAL 320 78 76% + +5 files skipped due to complete coverage. +``` + +*Generally* higher coverage is better. In the example above, any contributions to testing `spark_xml_to_text.py` would be much appreciated. + +## Documentation with Docsify + +Documentation is a collection of [`markdown`](https://www.markdownguide.org/basic-syntax/) files rendered by [`docsify`](https://docsify.js.org/) and statically hosted on [GitHub Pages](https://pages.github.com/). To contribute: + +1. Edit the `.md` files within `docs/`. +2. Add any extra pages to `_sidebar.md` or reorder them +3. Generate an [issue](https://github.com/Living-with-machines/alto2txt/issues) describing what you've added +4. Make a [pull request](https://github.com/Living-with-machines/alto2txt/pulls) + +To preview locally from the terminal: + +1. Navigate to your `alto2txt` repository (and follow installation instructions above if needed). +3. `$ poetry shell` to activate your local `python` environment +4. 
`$ cd docs && python -m http.server 3000` to render the `docs` +5. Navigate to `http://localhost:3000` in a browser to render changes as you make them + +## Update Version + +Once you've made a contribution, you may need to update the `alto2txt` version number. If requested, here's an example of following that process: + +1. Edit `README.md`: + +```md +# Extract plain text from newspapers (extract_text 0.3.4) +``` + +2. Edit `pyproject.toml`: + +```toml +version = "0.3.4" +``` + +3. Edit `extract_text/xslts/extract_text_common.xslt`: + +```xml +0.3.4 +``` diff --git a/docs/index.html b/docs/index.html index 12fa727..d578e19 100644 --- a/docs/index.html +++ b/docs/index.html @@ -12,7 +12,7 @@
+ diff --git a/poetry.lock b/poetry.lock index 4862390..c7362ab 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,16 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. + +[[package]] +name = "appnope" +version = "0.1.3" +description = "Disable App Nap on macOS >= 10.9" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "appnope-0.1.3-py2.py3-none-any.whl", hash = "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e"}, + {file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"}, +] [[package]] name = "asttokens" @@ -19,73 +31,46 @@ six = "*" test = ["astroid", "pytest"] [[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" +name = "backcall" +version = "0.2.0" +description = "Specifications for callback functions passed in to an API" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = "*" files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, + {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, + {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, ] -[package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy 
(>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] - [[package]] name = "black" -version = "23.1.0" +version = "21.12b0" description = "The uncompromising code formatter." category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.6.2" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = 
"sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", 
hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-21.12b0-py3-none-any.whl", hash = "sha256:a615e69ae185e08fdd73e4715e260e2479c861b5740057fde6e8b4e3b7dd589f"}, + {file = "black-21.12b0.tar.gz", hash = "sha256:77b80f693a569e2e527958459634f18df9b0ba2625ba4e0c2d5da5be42e6f2b3"}, ] [package.dependencies] -click = ">=8.0.0" +click = ">=7.1.2" mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" +pathspec = ">=0.9.0,<1" platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +tomli = ">=0.2.6,<2.0.0" typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} +typing-extensions = [ + {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}, + {version = ">=3.10.0.0,<3.10.0.1 || >3.10.0.1", markers = "python_version >= \"3.10\""}, +] [package.extras] colorama = ["colorama (>=0.4.3)"] d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +python2 = ["typed-ast (>=1.4.3)"] uvloop = ["uvloop (>=0.15.2)"] [[package]] @@ -130,63 +115,63 @@ files = [ [[package]] name = "coverage" -version = "7.1.0" +version = "7.2.3" description = "Code coverage measurement for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "coverage-7.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b946bbcd5a8231383450b195cfb58cb01cbe7f8949f5758566b881df4b33baf"}, - {file = "coverage-7.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ec8e767f13be637d056f7e07e61d089e555f719b387a7070154ad80a0ff31801"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d4a5a5879a939cb84959d86869132b00176197ca561c664fc21478c1eee60d75"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:b643cb30821e7570c0aaf54feaf0bfb630b79059f85741843e9dc23f33aaca2c"}, - {file = "coverage-7.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32df215215f3af2c1617a55dbdfb403b772d463d54d219985ac7cd3bf124cada"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:33d1ae9d4079e05ac4cc1ef9e20c648f5afabf1a92adfaf2ccf509c50b85717f"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:29571503c37f2ef2138a306d23e7270687c0efb9cab4bd8038d609b5c2393a3a"}, - {file = "coverage-7.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:63ffd21aa133ff48c4dff7adcc46b7ec8b565491bfc371212122dd999812ea1c"}, - {file = "coverage-7.1.0-cp310-cp310-win32.whl", hash = "sha256:4b14d5e09c656de5038a3f9bfe5228f53439282abcab87317c9f7f1acb280352"}, - {file = "coverage-7.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:8361be1c2c073919500b6601220a6f2f98ea0b6d2fec5014c1d9cfa23dd07038"}, - {file = "coverage-7.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:da9b41d4539eefd408c46725fb76ecba3a50a3367cafb7dea5f250d0653c1040"}, - {file = "coverage-7.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5b15ed7644ae4bee0ecf74fee95808dcc34ba6ace87e8dfbf5cb0dc20eab45a"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d12d076582507ea460ea2a89a8c85cb558f83406c8a41dd641d7be9a32e1274f"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2617759031dae1bf183c16cef8fcfb3de7617f394c813fa5e8e46e9b82d4222"}, - {file = "coverage-7.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4e4881fa9e9667afcc742f0c244d9364d197490fbc91d12ac3b5de0bf2df146"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:9d58885215094ab4a86a6aef044e42994a2bd76a446dc59b352622655ba6621b"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ffeeb38ee4a80a30a6877c5c4c359e5498eec095878f1581453202bfacc8fbc2"}, - {file = "coverage-7.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3baf5f126f30781b5e93dbefcc8271cb2491647f8283f20ac54d12161dff080e"}, - {file = "coverage-7.1.0-cp311-cp311-win32.whl", hash = "sha256:ded59300d6330be27bc6cf0b74b89ada58069ced87c48eaf9344e5e84b0072f7"}, - {file = "coverage-7.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:6a43c7823cd7427b4ed763aa7fb63901ca8288591323b58c9cd6ec31ad910f3c"}, - {file = "coverage-7.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:7a726d742816cb3a8973c8c9a97539c734b3a309345236cd533c4883dda05b8d"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc7c85a150501286f8b56bd8ed3aa4093f4b88fb68c0843d21ff9656f0009d6a"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5b4198d85a3755d27e64c52f8c95d6333119e49fd001ae5798dac872c95e0f8"}, - {file = "coverage-7.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddb726cb861c3117a553f940372a495fe1078249ff5f8a5478c0576c7be12050"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:51b236e764840a6df0661b67e50697aaa0e7d4124ca95e5058fa3d7cbc240b7c"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:7ee5c9bb51695f80878faaa5598040dd6c9e172ddcf490382e8aedb8ec3fec8d"}, - {file = "coverage-7.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c31b75ae466c053a98bf26843563b3b3517b8f37da4d47b1c582fdc703112bc3"}, - {file = "coverage-7.1.0-cp37-cp37m-win32.whl", hash = "sha256:3b155caf3760408d1cb903b21e6a97ad4e2bdad43cbc265e3ce0afb8e0057e73"}, - {file = "coverage-7.1.0-cp37-cp37m-win_amd64.whl", hash = 
"sha256:2a60d6513781e87047c3e630b33b4d1e89f39836dac6e069ffee28c4786715f5"}, - {file = "coverage-7.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2cba5c6db29ce991029b5e4ac51eb36774458f0a3b8d3137241b32d1bb91f06"}, - {file = "coverage-7.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:beeb129cacea34490ffd4d6153af70509aa3cda20fdda2ea1a2be870dfec8d52"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c45948f613d5d18c9ec5eaa203ce06a653334cf1bd47c783a12d0dd4fd9c851"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef382417db92ba23dfb5864a3fc9be27ea4894e86620d342a116b243ade5d35d"}, - {file = "coverage-7.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c7c0d0827e853315c9bbd43c1162c006dd808dbbe297db7ae66cd17b07830f0"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e5cdbb5cafcedea04924568d990e20ce7f1945a1dd54b560f879ee2d57226912"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:9817733f0d3ea91bea80de0f79ef971ae94f81ca52f9b66500c6a2fea8e4b4f8"}, - {file = "coverage-7.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:218fe982371ac7387304153ecd51205f14e9d731b34fb0568181abaf7b443ba0"}, - {file = "coverage-7.1.0-cp38-cp38-win32.whl", hash = "sha256:04481245ef966fbd24ae9b9e537ce899ae584d521dfbe78f89cad003c38ca2ab"}, - {file = "coverage-7.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8ae125d1134bf236acba8b83e74c603d1b30e207266121e76484562bc816344c"}, - {file = "coverage-7.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2bf1d5f2084c3932b56b962a683074a3692bce7cabd3aa023c987a2a8e7612f6"}, - {file = "coverage-7.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:98b85dd86514d889a2e3dd22ab3c18c9d0019e696478391d86708b805f4ea0fa"}, - {file = 
"coverage-7.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38da2db80cc505a611938d8624801158e409928b136c8916cd2e203970dde4dc"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3164d31078fa9efe406e198aecd2a02d32a62fecbdef74f76dad6a46c7e48311"}, - {file = "coverage-7.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db61a79c07331e88b9a9974815c075fbd812bc9dbc4dc44b366b5368a2936063"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9ccb092c9ede70b2517a57382a601619d20981f56f440eae7e4d7eaafd1d1d09"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:33ff26d0f6cc3ca8de13d14fde1ff8efe1456b53e3f0273e63cc8b3c84a063d8"}, - {file = "coverage-7.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d47dd659a4ee952e90dc56c97d78132573dc5c7b09d61b416a9deef4ebe01a0c"}, - {file = "coverage-7.1.0-cp39-cp39-win32.whl", hash = "sha256:d248cd4a92065a4d4543b8331660121b31c4148dd00a691bfb7a5cdc7483cfa4"}, - {file = "coverage-7.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:7ed681b0f8e8bcbbffa58ba26fcf5dbc8f79e7997595bf071ed5430d8c08d6f3"}, - {file = "coverage-7.1.0-pp37.pp38.pp39-none-any.whl", hash = "sha256:755e89e32376c850f826c425ece2c35a4fc266c081490eb0a841e7c1cb0d3bda"}, - {file = "coverage-7.1.0.tar.gz", hash = "sha256:10188fe543560ec4874f974b5305cd1a8bdcfa885ee00ea3a03733464c4ca265"}, + {file = "coverage-7.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e58c0d41d336569d63d1b113bd573db8363bc4146f39444125b7f8060e4e04f5"}, + {file = "coverage-7.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:344e714bd0fe921fc72d97404ebbdbf9127bac0ca1ff66d7b79efc143cf7c0c4"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974bc90d6f6c1e59ceb1516ab00cf1cdfbb2e555795d49fa9571d611f449bcb2"}, + {file = 
"coverage-7.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0743b0035d4b0e32bc1df5de70fba3059662ace5b9a2a86a9f894cfe66569013"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d0391fb4cfc171ce40437f67eb050a340fdbd0f9f49d6353a387f1b7f9dd4fa"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a42e1eff0ca9a7cb7dc9ecda41dfc7cbc17cb1d02117214be0561bd1134772b"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:be19931a8dcbe6ab464f3339966856996b12a00f9fe53f346ab3be872d03e257"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72fcae5bcac3333a4cf3b8f34eec99cea1187acd55af723bcbd559adfdcb5535"}, + {file = "coverage-7.2.3-cp310-cp310-win32.whl", hash = "sha256:aeae2aa38395b18106e552833f2a50c27ea0000122bde421c31d11ed7e6f9c91"}, + {file = "coverage-7.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:83957d349838a636e768251c7e9979e899a569794b44c3728eaebd11d848e58e"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dfd393094cd82ceb9b40df4c77976015a314b267d498268a076e940fe7be6b79"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182eb9ac3f2b4874a1f41b78b87db20b66da6b9cdc32737fbbf4fea0c35b23fc"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bb1e77a9a311346294621be905ea8a2c30d3ad371fc15bb72e98bfcfae532df"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca0f34363e2634deffd390a0fef1aa99168ae9ed2af01af4a1f5865e362f8623"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55416d7385774285b6e2a5feca0af9652f7f444a4fa3d29d8ab052fafef9d00d"}, + {file = 
"coverage-7.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:06ddd9c0249a0546997fdda5a30fbcb40f23926df0a874a60a8a185bc3a87d93"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fff5aaa6becf2c6a1699ae6a39e2e6fb0672c2d42eca8eb0cafa91cf2e9bd312"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ea53151d87c52e98133eb8ac78f1206498c015849662ca8dc246255265d9c3c4"}, + {file = "coverage-7.2.3-cp311-cp311-win32.whl", hash = "sha256:8f6c930fd70d91ddee53194e93029e3ef2aabe26725aa3c2753df057e296b925"}, + {file = "coverage-7.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:fa546d66639d69aa967bf08156eb8c9d0cd6f6de84be9e8c9819f52ad499c910"}, + {file = "coverage-7.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b2317d5ed777bf5a033e83d4f1389fd4ef045763141d8f10eb09a7035cee774c"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be9824c1c874b73b96288c6d3de793bf7f3a597770205068c6163ea1f326e8b9"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3b2803e730dc2797a017335827e9da6da0e84c745ce0f552e66400abdfb9a1"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f69770f5ca1994cb32c38965e95f57504d3aea96b6c024624fdd5bb1aa494a1"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1127b16220f7bfb3f1049ed4a62d26d81970a723544e8252db0efde853268e21"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:aa784405f0c640940595fa0f14064d8e84aff0b0f762fa18393e2760a2cf5841"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3146b8e16fa60427e03884301bf8209221f5761ac754ee6b267642a2fd354c48"}, + {file = "coverage-7.2.3-cp37-cp37m-win32.whl", hash = "sha256:1fd78b911aea9cec3b7e1e2622c8018d51c0d2bbcf8faaf53c2497eb114911c1"}, 
+ {file = "coverage-7.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f3736a5d34e091b0a611964c6262fd68ca4363df56185902528f0b75dbb9c1f"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:981b4df72c93e3bc04478153df516d385317628bd9c10be699c93c26ddcca8ab"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c0045f8f23a5fb30b2eb3b8a83664d8dc4fb58faddf8155d7109166adb9f2040"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f760073fcf8f3d6933178d67754f4f2d4e924e321f4bb0dcef0424ca0215eba1"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c86bd45d1659b1ae3d0ba1909326b03598affbc9ed71520e0ff8c31a993ad911"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:172db976ae6327ed4728e2507daf8a4de73c7cc89796483e0a9198fd2e47b462"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d2a3a6146fe9319926e1d477842ca2a63fe99af5ae690b1f5c11e6af074a6b5c"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f649dd53833b495c3ebd04d6eec58479454a1784987af8afb77540d6c1767abd"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7c4ed4e9f3b123aa403ab424430b426a1992e6f4c8fd3cb56ea520446e04d152"}, + {file = "coverage-7.2.3-cp38-cp38-win32.whl", hash = "sha256:eb0edc3ce9760d2f21637766c3aa04822030e7451981ce569a1b3456b7053f22"}, + {file = "coverage-7.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:63cdeaac4ae85a179a8d6bc09b77b564c096250d759eed343a89d91bce8b6367"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20d1a2a76bb4eb00e4d36b9699f9b7aba93271c9c29220ad4c6a9581a0320235"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ea748802cc0de4de92ef8244dd84ffd793bd2e7be784cd8394d557a3c751e21"}, + {file = 
"coverage-7.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21b154aba06df42e4b96fc915512ab39595105f6c483991287021ed95776d934"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd214917cabdd6f673a29d708574e9fbdb892cb77eb426d0eae3490d95ca7859"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2e58e45fe53fab81f85474e5d4d226eeab0f27b45aa062856c89389da2f0d9"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87ecc7c9a1a9f912e306997ffee020297ccb5ea388421fe62a2a02747e4d5539"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:387065e420aed3c71b61af7e82c7b6bc1c592f7e3c7a66e9f78dd178699da4fe"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ea3f5bc91d7d457da7d48c7a732beaf79d0c8131df3ab278e6bba6297e23c6c4"}, + {file = "coverage-7.2.3-cp39-cp39-win32.whl", hash = "sha256:ae7863a1d8db6a014b6f2ff9c1582ab1aad55a6d25bac19710a8df68921b6e30"}, + {file = "coverage-7.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f04becd4fcda03c0160d0da9c8f0c246bc78f2f7af0feea1ec0930e7c93fa4a"}, + {file = "coverage-7.2.3-pp37.pp38.pp39-none-any.whl", hash = "sha256:965ee3e782c7892befc25575fa171b521d33798132692df428a09efacaffe8d0"}, + {file = "coverage-7.2.3.tar.gz", hash = "sha256:d298c2815fa4891edd9abe5ad6e6cb4207104c7dd9fd13aea3fdebf6f9b91259"}, ] [package.dependencies] @@ -195,6 +180,18 @@ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.1 [package.extras] toml = ["tomli"] +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +category = "dev" +optional = false +python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = 
"decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + [[package]] name = "distlib" version = "0.3.6" @@ -209,14 +206,14 @@ files = [ [[package]] name = "exceptiongroup" -version = "1.1.0" +version = "1.1.1" description = "Backport of PEP 654 (exception groups)" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.0-py3-none-any.whl", hash = "sha256:327cbda3da756e2de031a3107b81ab7b3770a602c4d16ca618298c526f4bec1e"}, - {file = "exceptiongroup-1.1.0.tar.gz", hash = "sha256:bcb67d800a4497e1b404c2dd44fca47d3b7a5e5433dbab67f96c1a685cdfdf23"}, + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, ] [package.extras] @@ -239,19 +236,19 @@ tests = ["asttokens", "littleutils", "pytest", "rich"] [[package]] name = "filelock" -version = "3.9.0" +version = "3.11.0" description = "A platform independent file lock." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "filelock-3.9.0-py3-none-any.whl", hash = "sha256:f58d535af89bb9ad5cd4df046f741f8553a418c01a7856bf0d173bbc9f6bd16d"}, - {file = "filelock-3.9.0.tar.gz", hash = "sha256:7b319f24340b51f55a2bf7a12ac0755a9b03e718311dac567a0f4f7fabd2f5de"}, + {file = "filelock-3.11.0-py3-none-any.whl", hash = "sha256:f08a52314748335c6460fc8fe40cd5638b85001225db78c2aa01c8c0db83b318"}, + {file = "filelock-3.11.0.tar.gz", hash = "sha256:3618c0da67adcc0506b015fd11ef7faf1b493f0b40d87728e19986b536890c37"}, ] [package.extras] -docs = ["furo (>=2022.12.7)", "sphinx (>=5.3)", "sphinx-autodoc-typehints (>=1.19.5)"] -testing = ["covdefaults (>=2.2.2)", "coverage (>=7.0.1)", "pytest (>=7.2)", "pytest-cov (>=4)", "pytest-timeout (>=2.1)"] +docs = ["furo (>=2023.3.27)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.2.2)", "diff-cover (>=7.5)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] [[package]] name = "flake8" @@ -291,14 +288,14 @@ pygments = ">=2.2.0" [[package]] name = "identify" -version = "2.5.18" +version = "2.5.22" description = "File identification library for Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "identify-2.5.18-py2.py3-none-any.whl", hash = "sha256:93aac7ecf2f6abf879b8f29a8002d3c6de7086b8c28d88e1ad15045a15ab63f9"}, - {file = "identify-2.5.18.tar.gz", hash = "sha256:89e144fa560cc4cffb6ef2ab5e9fb18ed9f9b3cb054384bab4b95c12f6c309fe"}, + {file = "identify-2.5.22-py2.py3-none-any.whl", hash = "sha256:f0faad595a4687053669c112004178149f6c326db71ee999ae4636685753ad2f"}, + {file = "identify-2.5.22.tar.gz", hash = "sha256:f7a93d6cf98e29bd07663c60728e7a4057615068d7a639d132dc883b2d54d31e"}, ] [package.extras] @@ -336,6 +333,63 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] 
+[[package]] +name = "ipython" +version = "7.34.0" +description = "IPython: Productive Interactive Computing" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "ipython-7.34.0-py3-none-any.whl", hash = "sha256:c175d2440a1caff76116eb719d40538fbb316e214eda85c5515c303aacbfb23e"}, + {file = "ipython-7.34.0.tar.gz", hash = "sha256:af3bdb46aa292bce5615b1b2ebc76c2080c5f77f54bda2ec72461317273e7cd6"}, +] + +[package.dependencies] +appnope = {version = "*", markers = "sys_platform == \"darwin\""} +backcall = "*" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} +pickleshare = "*" +prompt-toolkit = ">=2.0.0,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.1.0" +pygments = "*" +setuptools = ">=18.5" +traitlets = ">=4.2" + +[package.extras] +all = ["Sphinx (>=1.3)", "ipykernel", "ipyparallel", "ipywidgets", "nbconvert", "nbformat", "nose (>=0.10.1)", "notebook", "numpy (>=1.17)", "pygments", "qtconsole", "requests", "testpath"] +doc = ["Sphinx (>=1.3)"] +kernel = ["ipykernel"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["ipywidgets", "notebook"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["ipykernel", "nbformat", "nose (>=0.10.1)", "numpy (>=1.17)", "pygments", "requests", "testpath"] + +[[package]] +name = "jedi" +version = "0.18.2" +description = "An autocompletion tool for Python that can be used for text editors." 
+category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "jedi-0.18.2-py2.py3-none-any.whl", hash = "sha256:203c1fd9d969ab8f2119ec0a3342e0b49910045abe6af0a3ae83a5764d54639e"}, + {file = "jedi-0.18.2.tar.gz", hash = "sha256:bae794c30d07f6d910d32a7048af09b5a39ed740918da923c6b780790ebac612"}, +] + +[package.dependencies] +parso = ">=0.8.0,<0.9.0" + +[package.extras] +docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] + [[package]] name = "lxml" version = "4.9.2" @@ -429,6 +483,21 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=0.29.7)"] +[[package]] +name = "matplotlib-inline" +version = "0.1.6" +description = "Inline Matplotlib backend for Jupyter" +category = "dev" +optional = false +python-versions = ">=3.5" +files = [ + {file = "matplotlib-inline-0.1.6.tar.gz", hash = "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"}, + {file = "matplotlib_inline-0.1.6-py3-none-any.whl", hash = "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311"}, +] + +[package.dependencies] +traitlets = "*" + [[package]] name = "mccabe" version = "0.7.0" @@ -470,26 +539,69 @@ setuptools = "*" [[package]] name = "packaging" -version = "23.0" +version = "23.1" description = "Core utilities for Python packages" category = 
"dev" optional = false python-versions = ">=3.7" files = [ - {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, - {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] + +[[package]] +name = "parso" +version = "0.8.3" +description = "A Python Parser" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "parso-0.8.3-py2.py3-none-any.whl", hash = "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"}, + {file = "parso-0.8.3.tar.gz", hash = "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0"}, ] +[package.extras] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["docopt", "pytest (<6.0.0)"] + [[package]] name = "pathspec" -version = "0.11.0" +version = "0.11.1" description = "Utility library for gitignore style pattern matching of file paths." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pathspec-0.11.0-py3-none-any.whl", hash = "sha256:3a66eb970cbac598f9e5ccb5b2cf58930cd8e3ed86d393d541eaf2d8b1705229"}, - {file = "pathspec-0.11.0.tar.gz", hash = "sha256:64d338d4e0914e91c1792321e6907b5a593f1ab1851de7fc269557a21b30ebbc"}, + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, +] + +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." 
+category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + +[[package]] +name = "pickleshare" +version = "0.7.5" +description = "Tiny 'shelve'-like database with concurrency support" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, + {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] [[package]] @@ -550,6 +662,45 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" +[[package]] +name = "prompt-toolkit" +version = "3.0.38" +description = "Library for building powerful interactive command lines in Python" +category = "dev" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "prompt_toolkit-3.0.38-py3-none-any.whl", hash = "sha256:45ea77a2f7c60418850331366c81cf6b5b9cf4c7fd34616f733c5427e6abbb1f"}, + {file = "prompt_toolkit-3.0.38.tar.gz", hash = "sha256:23ac5d50538a9a38c8bde05fecb47d0b403ecd0662857a86f886f798563d5b9b"}, +] + +[package.dependencies] +wcwidth = "*" + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + +[[package]] +name = "py4j" +version = "0.10.9.7" +description = "Enables Python programs to dynamically access 
arbitrary Java objects" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, + {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, +] + [[package]] name = "pycodestyle" version = "2.9.1" @@ -576,33 +727,53 @@ files = [ [[package]] name = "pygments" -version = "2.14.0" +version = "2.15.0" description = "Pygments is a syntax highlighting package written in Python." category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "Pygments-2.14.0-py3-none-any.whl", hash = "sha256:fa7bd7bd2771287c0de303af8bfdfc731f51bd2c6a47ab69d117138893b82717"}, - {file = "Pygments-2.14.0.tar.gz", hash = "sha256:b3ed06a9e8ac9a9aae5a6f5dbe78a8a58655d17b43b93c078f094ddc476ae297"}, + {file = "Pygments-2.15.0-py3-none-any.whl", hash = "sha256:77a3299119af881904cd5ecd1ac6a66214b6e9bed1f2db16993b54adede64094"}, + {file = "Pygments-2.15.0.tar.gz", hash = "sha256:f7e36cffc4c517fbc252861b9a6e4644ca0e5abadf9a113c72d1358ad09b9500"}, ] [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyspark" +version = "3.4.0" +description = "Apache Spark Python API" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pyspark-3.4.0.tar.gz", hash = "sha256:167a23e11854adb37f8602de6fcc3a4f96fd5f1e323b9bb83325f38408c5aafd"}, +] + +[package.dependencies] +py4j = "0.10.9.7" + +[package.extras] +connect = ["googleapis-common-protos (>=1.56.4)", "grpcio (>=1.48.1)", "grpcio-status (>=1.48.1)", "numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=1.0.0)"] +ml = ["numpy (>=1.15)"] +mllib = ["numpy (>=1.15)"] +pandas-on-spark = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=1.0.0)"] +sql = ["numpy (>=1.15)", "pandas (>=1.0.5)", "pyarrow (>=1.0.0)"] + [[package]] name = "pytest" -version = "7.2.1" +version = 
"7.3.0" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.1-py3-none-any.whl", hash = "sha256:c7c6ca206e93355074ae32f7403e8ea12163b1163c976fee7d4d84027c162be5"}, - {file = "pytest-7.2.1.tar.gz", hash = "sha256:d45e0952f3727241918b8fd0f376f5ff6b301cc0777c6f9a556935c92d8a7d42"}, + {file = "pytest-7.3.0-py3-none-any.whl", hash = "sha256:933051fa1bfbd38a21e73c3960cebdad4cf59483ddba7696c48509727e17f201"}, + {file = "pytest-7.3.0.tar.gz", hash = "sha256:58ecc27ebf0ea643ebfdf7fb1249335da761a00c9f955bcd922349bcb68ee57d"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} @@ -612,7 +783,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "pytest-cov" @@ -685,14 +856,14 @@ files = [ [[package]] name = "setuptools" -version = "67.3.2" +version = "67.6.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "setuptools-67.3.2-py3-none-any.whl", hash = "sha256:bb6d8e508de562768f2027902929f8523932fcd1fb784e6d573d2cafac995a48"}, - {file = "setuptools-67.3.2.tar.gz", hash = "sha256:95f00380ef2ffa41d9bba85d95b27689d923c93dfbafed4aecd7cf988a25e012"}, + {file = "setuptools-67.6.1-py3-none-any.whl", hash = "sha256:e728ca814a823bf7bf60162daf9db95b93d532948c4c0bea762ce62f60189078"}, + {file = "setuptools-67.6.1.tar.gz", hash = 
"sha256:257de92a9d50a60b8e22abfcbb771571fde0dbf3ec234463212027a4eeecbe9a"}, ] [package.extras] @@ -714,16 +885,32 @@ files = [ [[package]] name = "tomli" -version = "2.0.1" +version = "1.2.3" description = "A lil' TOML parser" category = "dev" optional = false +python-versions = ">=3.6" +files = [ + {file = "tomli-1.2.3-py3-none-any.whl", hash = "sha256:e3069e4be3ead9668e21cb9b074cd948f7b3113fd9c8bba083f48247aab8b11c"}, + {file = "tomli-1.2.3.tar.gz", hash = "sha256:05b6166bff487dc068d322585c7ea4ef78deed501cc124060e0f238e89a9231f"}, +] + +[[package]] +name = "traitlets" +version = "5.9.0" +description = "Traitlets Python configuration system" +category = "dev" +optional = false python-versions = ">=3.7" files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, + {file = "traitlets-5.9.0-py3-none-any.whl", hash = "sha256:9e6ec080259b9a5940c797d58b613b5e31441c2257b87c2e795c5228ae80d2d8"}, + {file = "traitlets-5.9.0.tar.gz", hash = "sha256:f6cde21a9c68cf756af02035f72d5a723bf607e862e7be33ece505abf4a3bad9"}, ] +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] +test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] + [[package]] name = "typed-ast" version = "1.5.4" @@ -792,23 +979,35 @@ platformdirs = ">=2,<3" docs = ["proselint (>=0.10.2)", "sphinx (>=3)", "sphinx-argparse (>=0.2.5)", "sphinx-rtd-theme (>=0.4.3)", "towncrier (>=21.3)"] testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", "packaging (>=20.0)", "pytest (>=4)", "pytest-env (>=0.6.2)", "pytest-freezegun (>=0.4.1)", "pytest-mock (>=2)", "pytest-randomly (>=1)", "pytest-timeout (>=1)"] +[[package]] +name = "wcwidth" +version = "0.2.6" +description = "Measures the displayed width of unicode strings in a terminal" +category = "dev" +optional = false 
+python-versions = "*" +files = [ + {file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, + {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, +] + [[package]] name = "zipp" -version = "3.13.0" +version = "3.15.0" description = "Backport of pathlib-compatible object wrapper for zip files" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "zipp-3.13.0-py3-none-any.whl", hash = "sha256:e8b2a36ea17df80ffe9e2c4fda3f693c3dad6df1697d3cd3af232db680950b0b"}, - {file = "zipp-3.13.0.tar.gz", hash = "sha256:23f70e964bc11a34cef175bc90ba2914e1e4545ea1e3e2f67c079671883f9cb6"}, + {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, + {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [metadata] lock-version = "2.0" python-versions = ">=3.7.0" -content-hash = "3f21ad4b8185c5eeaa0b9476990b910e49cfd3f093beb0e731c0f68b1a962376" +content-hash = "28d7a08b50392937dff26fc17e542b22030ed59dba6bc05580d9b6de9c39fb8d" diff --git a/pyproject.toml b/pyproject.toml index 8c9e3ea..7f09895 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,9 +26,17 @@ documentation = 
"https://living-with-machines.github.io/alto2txt/" [tool.poetry.dependencies] python = ">=3.7.0" lxml = "^4.7.1" +pyspark = "^3.4.0" -[tool.poetry.dev-dependencies] -black = "^23.1" +[tool.poetry.group.spark] +optional = true + +[tool.poetry.group.spark.dependencies] +pyspark = "^3.3.1" + +[tool.poetry.group.dev.dependencies] +ipython = "7.34.0" +black = "^21.12b0" flake8 = "^5.0.4" pytest = "^7.2.1" icecream = "^2.1.2" @@ -36,8 +44,6 @@ pre-commit = "^2.21.0" coverage = {extras = ["toml"], version = "^7.1.0"} pytest-cov = "^4.0.0" -[tool.poetry.group.dev.dependencies] - [build-system] requires = ["poetry-core>=1.0.0", "setuptools", "wheel"] build-backend = "poetry.core.masonry.api" @@ -55,6 +61,8 @@ relative_files = true addopts = """ --cov=src/alto2txt --cov-report=term:skip-covered +--pdbcls=IPython.terminal.debugger:TerminalPdb +--doctest-modules """ pythonpath = [ "src" diff --git a/src/alto2txt/extract_publications_text.py b/src/alto2txt/extract_publications_text.py index 61f16f5..eb300fe 100755 --- a/src/alto2txt/extract_publications_text.py +++ b/src/alto2txt/extract_publications_text.py @@ -39,6 +39,7 @@ Downsample. Default 1 -n [NUM_CORES], --num-cores [NUM_CORES] Number of cores (Spark only). Default 1") + -t, --test-output Verify output txt files xml_in_dir is expected to hold XML for multiple publications, in the following structure: @@ -80,10 +81,40 @@ * extract_text_ukp.xslt: UKP XSL file. 
""" -from argparse import ArgumentParser +from argparse import ArgumentParser, BooleanOptionalAction +from os import PathLike +from pathlib import Path +from subprocess import CompletedProcess, run +from typing import Final from alto2txt import xml_to_text_entry +VERIFY_SCRIPT_PATH: Final[PathLike | str] = "alto2txt-verify.sh" +VERIFY_SCRIPT_TEMP_PATH: Final[Path] = Path(Path(VERIFY_SCRIPT_PATH).stem) +VERIFY_TEMP_INPUT_PATH: Final[PathLike] = VERIFY_SCRIPT_TEMP_PATH / "input" +VERIFY_TEMP_OUTPUT_PATH: Final[PathLike] = VERIFY_SCRIPT_TEMP_PATH / "output" + + +class ScriptPathError(Exception): + ... + + +def verify_output( + input_dir: PathLike, + output_dir: PathLike, + script_path: PathLike | str = VERIFY_SCRIPT_PATH, +) -> CompletedProcess: + """Run script_path to verify the output_dir paths match the input_dir paths. + + Note: + * This assumes the script_path is in the root directory of the package + """ + try: + assert Path(script_path).is_file() + except AssertionError: + raise ScriptPathError(f"{script_path} is not a script file") + return run(["bash", script_path, input_dir, output_dir]) + def main(): """ @@ -132,6 +163,12 @@ def main(): default=1, help="Number of cores (Spark only). 
Default 1", ) + parser.add_argument( + "-t", + "--test-output", + action=BooleanOptionalAction, + help="Verify output txt files", + ) args = parser.parse_args() xml_in_dir = args.xml_in_dir txt_out_dir = args.txt_out_dir @@ -142,6 +179,8 @@ def main(): xml_to_text_entry.xml_publications_to_text( xml_in_dir, txt_out_dir, process_type, log_file, num_cores, downsample ) + if args.test_output: + verify_output(xml_in_dir, txt_out_dir) if __name__ == "__main__": diff --git a/src/alto2txt/xml_to_text.py b/src/alto2txt/xml_to_text.py index 3d9d95e..d3440b4 100755 --- a/src/alto2txt/xml_to_text.py +++ b/src/alto2txt/xml_to_text.py @@ -107,7 +107,6 @@ def issue_to_text(publication, year, issue, issue_dir, txt_out_dir, xslts): issue_out_stub = os.path.splitext(input_filename)[0] issue_out_path = os.path.join(issue_out_dir, issue_out_stub) try: - xslt( document_tree, input_path=etree.XSLT.strparam(os.path.abspath(issue_dir)), diff --git a/src/alto2txt/xslts/extract_text_common.xslt b/src/alto2txt/xslts/extract_text_common.xslt index 9384585..a77f46b 100644 --- a/src/alto2txt/xslts/extract_text_common.xslt +++ b/src/alto2txt/xslts/extract_text_common.xslt @@ -8,7 +8,7 @@ alto2txt - 0.3.1 + 0.3.4 https://github.com/Living-with-machines/alto2txt diff --git a/tests/test_e2e.py b/tests/test_e2e.py index f9429a4..eb25183 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -1,13 +1,42 @@ import sys +from os import PathLike, getcwd +from pathlib import Path +from shutil import rmtree +from subprocess import CompletedProcess +from typing import Final import pytest from icecream import ic from alto2txt import extract_publications_text as ept +DEMO_FILES_PATH: Final[str] = "demo-files" +DEMO_OUTPUT_PATH: Final[str] = "demo-files" -def test_cli_no_args(capsys): +@pytest.fixture +def demo_files_path(path: str = DEMO_FILES_PATH) -> str: + return path + + +@pytest.fixture +def demo_output_path(path: str = DEMO_OUTPUT_PATH) -> str: + return path + + +@pytest.fixture +def 
demo_output_dir(tmp_path: Path, demo_output_path: PathLike) -> Path: + output_dir = tmp_path / demo_output_path + output_dir.mkdir() + return output_dir + + +@pytest.fixture +def set_test_dir(request, monkeypatch): + monkeypatch.chdir(request.fspath.dirname) + + +def test_cli_no_args(capsys): # Test that not supplying any args should give a usage message with pytest.raises(SystemExit): sys.argv[1:] = "" @@ -17,24 +46,24 @@ def test_cli_no_args(capsys): assert captured.err.startswith("usage") -def test_input_dir_args(): +def test_input_dir_args(demo_files_path, demo_output_path): # Test that an error is raised `xml_in_dir` and `txt_out_dir` are the same. with pytest.raises(AssertionError) as ae: - sys.argv[1:] = ["demo-files", "demo-files"] + sys.argv[1:] = [demo_files_path, demo_files_path] ept.main() assert ic(ae.match("should be different")) # Test that a non-existant `xml_in_dir` if caught with pytest.raises(AssertionError) as ae: - sys.argv[1:] = ["non-existant-input-dir", "demo-output"] + sys.argv[1:] = ["non-existant-input-dir", demo_output_path] ept.main() assert ic(ae.match("non-existant-input-dir")) assert ic(ae.match("xml_in_dir.+not found")) -def test_output_dir_args(tmp_path): +def test_output_dir_args(tmp_path, demo_files_path): # Use the `tmp_path` fixture to ensure that the so-called "non-existant" dirs aren't # later inadvertently created within the repo. 
@@ -56,7 +85,7 @@ def test_output_dir_args(tmp_path): for output_dir in output_dirs_list: assert not output_dir.exists() - sys.argv[1:] = ["demo-files", str(output_dir)] + sys.argv[1:] = [demo_files_path, str(output_dir)] ept.main() assert output_dir.exists() @@ -65,7 +94,7 @@ def test_output_dir_args(tmp_path): file_not_dir.touch() with pytest.raises(AssertionError) as ae: - sys.argv[1:] = ["demo-files", str(file_not_dir)] + sys.argv[1:] = [demo_files_path, str(file_not_dir)] ept.main() assert ae.match("output-file.txt") @@ -73,7 +102,7 @@ def test_output_dir_args(tmp_path): @pytest.mark.skip("Correct behaviour not confirmed. See GH Issue #27") -def test_non_empty_output_dir(tmp_path): +def test_non_empty_output_dir(demo_files_path, tmp_path): # Run twice to ensure that on the second run `txt_out_dir` is not empty. # What should the correct behaviour be here? # See https://github.com/Living-with-machines/alto2txt/issues/27 @@ -81,11 +110,11 @@ def test_non_empty_output_dir(tmp_path): run_twice_dir = str(tmp_path / "run-twice") # Run first time to ensure that there is already content - sys.argv[1:] = ["demo-files", run_twice_dir] + sys.argv[1:] = [demo_files_path, run_twice_dir] ept.main() with pytest.raises(ValueError) as ve: - sys.argv[1:] = ["demo-files", run_twice_dir] + sys.argv[1:] = [demo_files_path, run_twice_dir] ept.main() # TODO: confirm the expected behaviour here. These assert statements are @@ -97,7 +126,7 @@ def test_non_empty_output_dir(tmp_path): assert False -def test_log_file_args(tmp_path): +def test_log_file_args(demo_files_path, demo_output_dir, tmp_path): # Test path to `-l log_file`. Does it exist? 
# If so is it overwritten, or clobbered @@ -106,11 +135,11 @@ def test_log_file_args(tmp_path): assert not log_file.exists() # Test that a non-existant `txt_out_dir` if caught - output_dir = tmp_path / "output-dir" - output_dir.mkdir() - output_dir = str(output_dir) + # output_dir = tmp_path / demo_output_path + # output_dir.mkdir() + # output_dir = str(output_dir) - sys.argv[1:] = ["--l", str(log_file), "demo-files", output_dir] + sys.argv[1:] = ["-l", str(log_file), demo_files_path, str(demo_output_dir)] ept.main() # It does exist after we run it @@ -120,7 +149,7 @@ def test_log_file_args(tmp_path): # Run a second time and check that the logfile is roughly twice the original size # This asserts that the correct behaviour is that alto2txt always appends to an existing # logfile and does not overwrite it. - sys.argv[1:] = ["--l", str(log_file), "demo-files", output_dir] + sys.argv[1:] = ["-l", str(log_file), demo_files_path, str(demo_output_dir)] ept.main() second_run_size = log_file.stat().st_size @@ -143,6 +172,56 @@ def test_processor_args(): @pytest.mark.skip("Not yet implemented") def test_downsample_arg(): # Test `-d` - # What does this does/mean? + # What does this do/mean? 
assert False + + +class TestVerifyOutput: + + """Test running verify_output wrapper of alto2txt-verify.sh""" + + def rm_temp_files(self) -> None: + """Ensure temp files generated from verify_output are removed.""" + if ept.VERIFY_SCRIPT_TEMP_PATH.exists(): + rmtree(ept.VERIFY_SCRIPT_TEMP_PATH) + + @property + def abosulte_default_temp_path(self) -> Path: + return getcwd() / ept.VERIFY_SCRIPT_TEMP_PATH + + def get_input_path(self, path: PathLike) -> Path: + return Path(ept.VERIFY_TEMP_INPUT_PATH) / path + + def get_output_path(self, path: PathLike) -> Path: + return Path(ept.VERIFY_TEMP_OUTPUT_PATH) / path + + def setup_method(self) -> None: + self.rm_temp_files() + + def teardown_method(self) -> None: + self.rm_temp_files() + + def test_verify_path_error( + self, demo_files_path, demo_output_dir, set_test_dir + ) -> None: + """Test ScriptPathError raised due to script not in default relative path.""" + with pytest.raises(ept.ScriptPathError) as excinfo: + ept.verify_output(demo_files_path, demo_output_dir) + assert str(ept.VERIFY_SCRIPT_PATH) in str(excinfo) + + def test_verify_output(self, demo_files_path, demo_output_dir, capsys) -> None: + """Test paths are correctly generated to verify output.""" + test_file_name = Path("0002647-1824.txt") + correct_input_file_path: Path = self.get_input_path(test_file_name) + correct_output_file_path: Path = self.get_output_path(test_file_name) + assert not correct_input_file_path.exists() + assert not correct_output_file_path.exists() + completed_process: CompletedProcess = ept.verify_output( + demo_files_path, demo_output_dir + ) + assert completed_process.returncode == 0 # Default success code + script_temp_path: Path = self.abosulte_default_temp_path + assert script_temp_path.is_dir() + assert correct_input_file_path.exists() + # assert correct_output_file_path.exists() diff --git a/tests/test_import.py b/tests/test_import.py index 318f0e7..67bb6a9 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -3,5 +3,5 
@@ def test_import(): import alto2txt # noqa F401 assert True - except (ImportError): + except ImportError: assert False