diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dbd786b..496cb85 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: restore-keys: | ${{ runner.os }}-pip- - run: pip install -r ci-requirements.txt - - run: python setup.py develop + - run: python -m pip install --editable . - run: python -m unittest coverage: @@ -51,6 +51,6 @@ jobs: restore-keys: | ${{ runner.os }}-pip- - run: pip install -r ci-requirements.txt - - run: python setup.py develop + - run: python -m pip install --editable . - run: coverage run -m unittest - uses: codecov/codecov-action@v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 04097d1..8fc31e9 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: with: python-version: 3.8 - run: pip install -r dev-requirements.txt - - run: python setup.py sdist bdist_wheel + - run: python -m build - run: twine upload dist/* env: TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} diff --git a/MANIFEST.in b/MANIFEST.in index 56dd42b..e9f96d9 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include dphon/data \ No newline at end of file +graft dphon/data diff --git a/README.md b/README.md index d788e66..9858c74 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # dphon + [![ci](https://github.com/direct-phonology/dphon/workflows/ci/badge.svg)](https://github.com/direct-phonology/dphon/actions?query=workflow%3Aci) [![codecov](https://codecov.io/gh/direct-phonology/dphon/branch/main/graph/badge.svg?token=uGbgB5UFtk)](https://codecov.io/gh/direct-phonology/dphon) ![pyversions](https://img.shields.io/pypi/pyversions/dphon.svg?style=flat) @@ -19,6 +20,7 @@ if you're on windows and are seeing incorrectly formatted output in your termina ## usage ### basics + the main function of `dphon` is to look for instances of text reuse in a corpus of old chinese texts. 
instead of relying purely on graphemes, it does this by performing grapheme-to-phoneme conversion, and determining possible reuse based on whether passages are likely to have _sounded_ similar (or rhymed) when spoken aloud. you will need to have files stored locally as utf-8 encoded plain-text (`.txt`) or json-lines (`.jsonl`) format. for the former, one file is assumed to represent one document. for the latter, one file can contain any number of lines, each of which is a document, with required keys `id` (a unique identifier) and `text` (text content) and any number of optional keys. you can obtain a representative corpus of old chinese sourced from the kanseki repository via [`direct-phonology/ect-krp`](https://github.com/direct-phonology/ect-krp). @@ -42,7 +44,7 @@ which would look for phonetically similar passages between `text_a` and `text_b` the numbers next to the identifiers are _token indices_, and may vary depending on how the text is tokenized – `dphon` currently uses character-based tokenization. whitespace will be removed, and the output will be aligned to make it easier to spot differences between the two sequences. by default, insertions are highlighted in green, and mismatches (differences between the two sequences) are highlighted in red. additional (non-matching) context added to either side of match sequences is displayed using a dimmed color (see "advanced usage" below for more information on colorization). -matches are sorted by the ratio of their phomenic similarity to their graphic similarity – in other words, matches between texts that sound highly similar but were written very differently will be at the top of the list. +matches are sorted by the ratio of their phonemic similarity to their graphic similarity – in other words, matches between texts that sound highly similar but were written very differently will be at the top of the list. 
by default, `dphon` only returns matches that display at least one instance of _graphic variation_ – a case where two different graphemes are used in the same place to represent the same sound. these cases are highlighted in blue. if you're interested in all instances of reuse, regardless of graphic variation, you can use the `--all` flag: @@ -51,16 +53,19 @@ $ dphon --all text_a.txt text_b.txt ``` you can view the full list of command options with: + ```sh $ dphon --help ``` this tool is under active development, and results may vary. to find the version you are running: + ```sh $ dphon --version ``` ### advanced usage + by default, `dphon` uses your system's `$PAGER` to display output, since the results can be quite long. on MacOS and Linux, this will likely be `less`, which supports additional options like searching through the output once it's displayed. for more information, see the man page: ```sh @@ -100,7 +105,7 @@ if two characters have the same phonemes, they're treated as a match. for charac in version 1.0, `dphon`'s default reconstruction was based on Schuessler 2007[1](#note1), but used a single "dummy" character to represent all the lexemes in a rhyming group. [the dictionary](dphon/data/sound_table_v1.json) was compiled by John O'Leary ([@valgrinderror](https://github.com/valgrinderror)) and Gian Duri Rominger ([@GDRom](https://github.com/GDRom)). since version 2.0, `dphon` uses [a dictionary](dphon/data/sound_table_v2.json) based on the Baxter-Sagart 2014 reconstruction[2](#note2), with additional work by Rominger. -the matching algorithm is based on Paul Vierthaler's [`chinesetextreuse`](https://github.com/vierth/chinesetextreuse) project[3](#note3), with some modifications. it uses a [BLAST](https://en.wikipedia.org/wiki/BLAST_(biotechnology))-like strategy to identify initial match candidates, and then extend them via phonetic [edit distance](https://en.wikipedia.org/wiki/Edit_distance) comparison. 
finally, the results are aligned using a version of the [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) that operates on phonemes, powered by the `lingpy` library[4](#note4). +the matching algorithm is based on Paul Vierthaler's [`chinesetextreuse`](https://github.com/vierth/chinesetextreuse) project[3](#note3), with some modifications. it uses a [BLAST](https://en.wikipedia.org/wiki/BLAST_%28biotechnology%29)-like strategy to identify initial match candidates, and then extend them via phonetic [edit distance](https://en.wikipedia.org/wiki/Edit_distance) comparison. finally, the results are aligned using a version of the [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) that operates on phonemes, powered by the `lingpy` library[4](#note4). ## development setup @@ -133,22 +138,25 @@ $ pip install -e . now your changes will be automatically picked up when you run `dphon`. pull requests can be made against `main`. + ## code documentation + code documentation is [available on github pages](https://direct-phonology.github.io/dphon) and is generated with `pdoc3`. to build the docs: + ```sh $ pdoc --html --output-dir docs dphon ``` ## tests + unit tests are written with `unittest`. you can run them with: ```sh $ python -m unittest ``` - ## releases the package is built and published to pyPI automatically using `twine` when using GitHub's release functionality. 
diff --git a/dev-requirements.txt b/dev-requirements.txt index 04d11ec..cc62691 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,6 @@ -r requirements.txt coverage -wheel +build twine setuptools pdoc3 diff --git a/dphon/__init__.py b/dphon/__init__.py index 5fa9130..f6bb6f4 100644 --- a/dphon/__init__.py +++ b/dphon/__init__.py @@ -1 +1 @@ -__version__ = "2.0.3" +__version__ = "2.0.4" diff --git a/pyproject.toml b/pyproject.toml index 1b68d94..acfbbc4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,61 @@ [build-system] -requires = ["setuptools>=42", "wheel"] -build-backend = "setuptools.build_meta" \ No newline at end of file +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "dphon" +readme = "README.md" +description = "Tools and algorithms for phonology-aware Early Chinese NLP." +dynamic = ["version"] +license = { file = "LICENSE" } +keywords = ["old chinese", "early chinese", "phonology", "linguistics", "nlp"] +classifiers = [ + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: Chinese (Traditional)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing :: Linguistic", + "Topic :: Utilities", +] +dependencies = [ + "docopt", + "spacy>=3", + "python-levenshtein", + "lingpy", + "rich", + "jsonlines", +] +authors = [ + { name = "Nick Budak", email = "budak@stanford.edu" }, + { name = "Gian Duri Rominger", email = "gromin@uw.edu" }, +] +requires-python = ">=3.8" + +[project.urls] +Repository = "https://github.com/direct-phonology/dphon" +Issues = "https://github.com/direct-phonology/dphon/issues" +Documentation = "https://direct-phonology.github.io/dphon/" + +[project.optional-dependencies] +dev = 
["check-manifest", "mypy", "pylint"] +test = ["coverage"] + +[project.scripts] +dphon = "dphon.cli:run" + +[project.entry-points.spacy_factories] +ngrams = "dphon.ngrams:create_ngrams" +g2p = "dphon.g2p:create_graphemes_to_phonemes" +ngram_phonemes_index = "dphon.index:create_ngram_phonemes_lookup_index" + +[tool.setuptools] +packages = ["dphon"] + +[tool.setuptools.dynamic] +version = { attr = "dphon.__version__" } diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 0792c86..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[metadata] -license_files = LICENSE \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 3adc714..0000000 --- a/setup.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Packaging settings for dphon.""" - -import pathlib - -from setuptools import find_packages, setup - -from dphon import __version__ - -here = pathlib.Path(__file__).parent.resolve() -long_description = (here / "README.md").read_text(encoding="utf8") - -setup( - name="dphon", - version=__version__, - description="Tools and algorithms for phonology-aware Early Chinese NLP.", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/direct-phonology/dphon", - include_package_data=True, - author="Nick Budak", - author_email="nbudak@princeton.edu", - classifiers=[ - "Environment :: Console", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Natural Language :: Chinese (Traditional)", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Topic :: Text Processing :: Linguistic", - "Topic :: Utilities", - ], - keywords="old chinese, phonology, linguistics, nlp", - packages=find_packages(), - python_requires=">=3.8", - 
install_requires=["docopt", "spacy>=3", - "python-levenshtein", "lingpy", "rich", "jsonlines"], - extras_require={ - "dev": ["check-manifest", "mypy", "pylint"], - "test": ["coverage"], - }, - entry_points={ - "console_scripts": [ - "dphon=dphon.cli:run", - ], - "spacy_factories": [ - "ngrams=dphon.ngrams:create_ngrams", - "g2p=dphon.g2p:create_graphemes_to_phonemes", - "ngram_phonemes_index=dphon.index:create_ngram_phonemes_lookup_index", - ] - }, - project_urls={ - "Source": "https://github.com/direct-phonology/dphon", - "Tracker": "https://github.com/direct-phonology/dphon/issues", - }, -)