Skip to content

Commit

Permalink
Merge branch 'rl-0.10.0-RC01'
Browse files Browse the repository at this point in the history
  • Loading branch information
amenezes committed Dec 4, 2022
2 parents 2f331d6 + 6312b8a commit 5a4d45e
Show file tree
Hide file tree
Showing 24 changed files with 283 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
tests:
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', 'pypy-3.8', 'pypy-3.9']
python-version: ['3.8', '3.9', '3.10', '3.11', 'pypy-3.8', 'pypy-3.9']
os: [ubuntu]
fail-fast: true
runs-on: ${{ matrix.os }}-latest
Expand Down
42 changes: 42 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
exclude: >
(?x)(
^alembic.ini$|
^migrations/
)
fail_fast: false
repos:
- repo: local
hooks:
- id: black
name: black
entry: black
language: system
types: [python]
- id: isort
name: isort
entry: isort
language: system
types: [python]
args: ["--profile", "black"]
- id: flake8
name: flake8
entry: flake8
language: system
types: [ python ]
- id: mypy
name: mypy
entry: mypy
language: system
types: [ python ]
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.3.0
hooks:
- id: forbid-crlf
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: check-case-conflict
- id: check-merge-conflict
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
14 changes: 5 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ VERSION := $(shell cat aiopytesseract/__init__.py | grep '__version__ ' | cut -d
lint:
ifeq ($(SKIP_STYLE), )
@echo "> running isort..."
isort aiopytesseract/
isort tests/
isort aiopytesseract
isort tests
isort examples
@echo "> running black..."
black aiopytesseract
Expand All @@ -23,11 +23,13 @@ tests:

docs:
@echo "> generate project documentation..."
portray server
@cp README.md docs/index.md
mkdocs serve

install-deps:
@echo "> installing dependencies..."
pip install -r requirements-dev.txt
pre-commit install

tox:
@echo "> running tox..."
Expand All @@ -47,16 +49,10 @@ about:

ci: lint tests
ifeq ($(GITHUB_HEAD_REF), false)
@echo "> download CI dependencies"
curl -L https://codeclimate.com/downloads/test-reporter/test-reporter-latest-linux-amd64 > ./cc-test-reporter
chmod +x ./cc-test-reporter
@echo "> uploading report..."
codecov --file coverage.xml -t $$CODECOV_TOKEN
./cc-test-reporter format-coverage -t coverage.py -o codeclimate.json
./cc-test-reporter upload-coverage -i codeclimate.json -r $$CC_TEST_REPORTER_ID
endif

all: install-deps ci


.PHONY: lint tests ci docs install-deps tox all
23 changes: 6 additions & 17 deletions aiopytesseract/__init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
from .commands import (
confidence,
deskew,
get_languages,
get_tesseract_version,
image_to_boxes,
image_to_data,
image_to_hocr,
image_to_osd,
image_to_pdf,
image_to_string,
languages,
run,
tesseract_parameters,
tesseract_version,
)
from .commands import (confidence, deskew, get_languages,
get_tesseract_version, image_to_boxes, image_to_data,
image_to_hocr, image_to_osd, image_to_pdf,
image_to_string, languages, run, tesseract_parameters,
tesseract_version)
from .models import OSD, Box, Data, Parameter

__version__ = "0.9.0"
__version__ = "0.10.0"
__all__ = [
"__version__",
"OSD",
Expand Down
12 changes: 5 additions & 7 deletions aiopytesseract/base_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,13 @@
from typing import Any, List, Optional, Tuple

from ._logger import logger
from .constants import (
AIOPYTESSERACT_DEFAULT_ENCODING,
AIOPYTESSERACT_DEFAULT_TIMEOUT,
OUTPUT_FILE_EXTENSIONS,
TESSERACT_CMD,
)
from .constants import (AIOPYTESSERACT_DEFAULT_ENCODING,
AIOPYTESSERACT_DEFAULT_TIMEOUT, OUTPUT_FILE_EXTENSIONS,
TESSERACT_CMD)
from .exceptions import TesseractRuntimeError, TesseractTimeoutError
from .returncode import ReturnCode
from .validators import file_exists, language_is_valid, oem_is_valid, psm_is_valid
from .validators import (file_exists, language_is_valid, oem_is_valid,
psm_is_valid)


async def execute_cmd(cmd_args: str, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT):
Expand Down
11 changes: 4 additions & 7 deletions aiopytesseract/validators.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from pathlib import Path

from .constants import OCR_ENGINE_MODES, PAGE_SEGMENTATION_MODES, TESSERACT_LANGUAGES
from .exceptions import (
LanguageInvalidException,
NoSuchFileException,
OEMInvalidException,
PSMInvalidException,
)
from .constants import (OCR_ENGINE_MODES, PAGE_SEGMENTATION_MODES,
TESSERACT_LANGUAGES)
from .exceptions import (LanguageInvalidException, NoSuchFileException,
OEMInvalidException, PSMInvalidException)


async def psm_is_valid(psm: int) -> None:
Expand Down
15 changes: 15 additions & 0 deletions docs/best-practices.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## Tips to improve recognition

### Reference

- [https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html](https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html)
- [https://www.pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/](https://www.pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/)


## Tesseract Command line

### Reference

- [https://tesseract-ocr.github.io/tessdoc/](https://tesseract-ocr.github.io/tessdoc/)
- [https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html](https://tesseract-ocr.github.io/tessdoc/Command-Line-Usage.html)
- [https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc)
24 changes: 24 additions & 0 deletions docs/development.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Development

## Install development dependencies

```bash
make install-deps
```

> OR: pip install -r requirements-dev.txt
## Execute tests

```bash
make tests
```

> OR: pytest
## Generating documentation locally.

```bash
pip install 'aiopytesseract[docs]'
make docs
```
131 changes: 131 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
[![ci](https://github.com/amenezes/aiopytesseract/actions/workflows/ci.yml/badge.svg)](https://github.com/amenezes/aiopytesseract/actions/workflows/ci.yml)
[![codecov](https://codecov.io/gh/amenezes/aiopytesseract/branch/master/graph/badge.svg)](https://codecov.io/gh/amenezes/aiopytesseract)
[![PyPI version](https://badge.fury.io/py/aiopytesseract.svg)](https://badge.fury.io/py/aiopytesseract)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/aiopytesseract)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

# aiopytesseract

A Python [asyncio](https://docs.python.org/3/library/asyncio.html) wrapper for [Tesseract-OCR](https://tesseract-ocr.github.io/tessdoc/).

## Installation

Install and update using pip:

````bash
pip install aiopytesseract
````

## Usage

```python
from pathlib import Path

import aiopytesseract


# list all available languages by tesseract installation
await aiopytesseract.languages()
await aiopytesseract.get_languages()


# tesseract version
await aiopytesseract.tesseract_version()
await aiopytesseract.get_tesseract_version()


# tesseract parameters
await aiopytesseract.tesseract_parameters()


# confidence only info
await aiopytesseract.confidence("tests/samples/file-sample_150kB.png")


# deskew info
await aiopytesseract.deskew("tests/samples/file-sample_150kB.png")


# extract text from an image: locally or bytes
await aiopytesseract.image_to_string("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_string(
Path("tests/samples/file-sample_150kB.png").read_bytes(), dpi=220, lang='eng+por'
)


# box estimates
await aiopytesseract.image_to_boxes("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_boxes(Path("tests/samples/file-sample_150kB.png").read_bytes())


# boxes, confidence and page numbers
await aiopytesseract.image_to_data("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_data(Path("tests/samples/file-sample_150kB.png").read_bytes())


# information about orientation and script detection
await aiopytesseract.image_to_osd("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_osd(Path("tests/samples/file-sample_150kB.png").read_bytes())


# generate a searchable PDF
await aiopytesseract.image_to_pdf("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_pdf(Path("tests/samples/file-sample_150kB.png").read_bytes())


# generate HOCR output
await aiopytesseract.image_to_hocr("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_hocr(Path("tests/samples/file-sample_150kB.png").read_bytes())


# multi output
async with aiopytesseract.run(
Path('tests/samples/file-sample_150kB.png').read_bytes(),
'output',
'alto tsv txt'
) as resp:
# will generate (output.xml, output.tsv and output.txt)
print(resp)
alto_file, tsv_file, txt_file = resp
```

## Examples

If you want to test **aiopytesseract** easily, you can use one of the following options:

- docker
- docker-compose
- [streamlit](https://streamlit.io)

### Docker

Just copy and paste the following line.

```bash
docker run --rm --name aiopytesseract -p 8501:8501 amenezes/aiopytesseract
```

### docker-compose

After cloning this repo, run the command below:

```bash
docker-compose up -d
```

### streamlit app

For this option, first install `aiopytesseract` and `streamlit`, then execute:

```bash
streamlit run https://github.com/amenezes/aiopytesseract/blob/master/examples/streamlit/app.py
```

> note: The streamlit example needs **python >= 3.10**

## Links

- License: [Apache License](https://choosealicense.com/licenses/apache-2.0/)
- Code: [https://github.com/amenezes/aiopytesseract](https://github.com/amenezes/aiopytesseract)
- Issue tracker: [https://github.com/amenezes/aiopytesseract/issues](https://github.com/amenezes/aiopytesseract/issues)
- Docs: [https://aiopytesseract.amenezes.net](https://aiopytesseract.amenezes.net)
15 changes: 0 additions & 15 deletions docs/more.md

This file was deleted.

42 changes: 42 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
site_name: aiopytesseract
repo_url: https://github.com/amenezes/aiopytesseract
repo_name: amenezes/aiopytesseract
theme:
name: material
features:
- navigation.instant
- navigation.top
- navigation.prune
- toc.integrate
- search.highlight
- search.suggest
- search.share
- content.code.annotate
- content.tooltips
- toc.follow
palette:
- scheme: default
primary: blue grey
accent: indigo
toggle:
icon: material/lightbulb-on
name: Switch to dark mode
- scheme: slate
primary: blue grey
accent: indigo
toggle:
icon: material/lightbulb
name: Switch to light mode
icon:
repo: fontawesome/brands/github-alt
extra:
social:
- icon: fontawesome/brands/github
link: https://github.com/amenezes/aiopytesseract
- icon: fontawesome/solid/bug
link: https://github.com/amenezes/aiopytesseract/issues
- icon: fontawesome/solid/envelope
link: mailto:[email protected]
nav:
- Best practices: best-practices.md
- Development: development.md
Loading

0 comments on commit 5a4d45e

Please sign in to comment.