diff --git a/.tool-versions b/.tool-versions index 1569bf5..f1c0c50 100644 --- a/.tool-versions +++ b/.tool-versions @@ -1 +1 @@ -python 3.12.0 +python 3.12.1 diff --git a/README.md b/README.md index 64474a7..664fa9f 100644 --- a/README.md +++ b/README.md @@ -18,67 +18,123 @@ pip install aiopytesseract ## Usage -```python -from pathlib import Path +### List all available languages by Tesseract installation +``` python import aiopytesseract - -# list all available languages by tesseract installation await aiopytesseract.languages() await aiopytesseract.get_languages() +``` +### Tesseract version + +``` python +import aiopytesseract -# tesseract version await aiopytesseract.tesseract_version() await aiopytesseract.get_tesseract_version() +``` + +### Tesseract parameters +``` python +import aiopytesseract -# tesseract parameters await aiopytesseract.tesseract_parameters() +``` +### Confidence only info + +``` python +import aiopytesseract -# confidence only info await aiopytesseract.confidence("tests/samples/file-sample_150kB.png") +``` +### Deskew info + +``` python +import aiopytesseract -# deskew info await aiopytesseract.deskew("tests/samples/file-sample_150kB.png") +``` +### Extract text from an image: locally or bytes + +``` python +from pathlib import Path + +import aiopytesseract -# extract text from an image: locally or bytes await aiopytesseract.image_to_string("tests/samples/file-sample_150kB.png") await aiopytesseract.image_to_string( - Path("tests/samples/file-sample_150kB.png")read_bytes(), dpi=220, lang='eng+por' + Path("tests/samples/file-sample_150kB.png").read_bytes(), dpi=220, lang='eng+por' ) +``` + +### Box estimates +``` python +from pathlib import Path + +import aiopytesseract -# box estimates await aiopytesseract.image_to_boxes("tests/samples/file-sample_150kB.png") await aiopytesseract.image_to_boxes(Path("tests/samples/file-sample_150kB.png") +``` + +### Boxes, confidence and page numbers + +``` python +from pathlib import Path +import 
aiopytesseract -# boxes, confidence and page numbers await aiopytesseract.image_to_data("tests/samples/file-sample_150kB.png") await aiopytesseract.image_to_data(Path("tests/samples/file-sample_150kB.png") +``` +### Information about orientation and script detection + +``` python +from pathlib import Path + +import aiopytesseract -# information about orientation and script detection await aiopytesseract.image_to_osd("tests/samples/file-sample_150kB.png") await aiopytesseract.image_to_osd(Path("tests/samples/file-sample_150kB.png") +``` + +### Generate a searchable PDF + +``` python +from pathlib import Path +import aiopytesseract -# generate a searchable PDF await aiopytesseract.image_to_pdf("tests/samples/file-sample_150kB.png") await aiopytesseract.image_to_pdf(Path("tests/samples/file-sample_150kB.png") +``` + +### Generate HOCR output + +``` python +from pathlib import Path +import aiopytesseract -# generate HOCR output await aiopytesseract.image_to_hocr("tests/samples/file-sample_150kB.png") await aiopytesseract.image_to_hocr(Path("tests/samples/file-sample_150kB.png") +``` +### Multi output + +``` python +from pathlib import Path + +import aiopytesseract -# multi ouput async with aiopytesseract.run( Path('tests/samples/file-sample_150kB.png').read_bytes(), 'output', @@ -89,7 +145,43 @@ async with aiopytesseract.run( alto_file, tsv_file, txt_file = resp ``` -For more details on Tesseract best practices and the aiopytesseract, see the folder: `docs`. 
+### Config variables + +``` python +from pathlib import Path + +import aiopytesseract + +async with aiopytesseract.run( + Path('tests/samples/text-with-chars-and-numbers.png').read_bytes(), + 'output', + 'alto tsv txt', + config=[("tessedit_char_whitelist", "0123456789")] +) as resp: + # will generate (output.xml, output.tsv and output.txt) + print(resp) + alto_file, tsv_file, txt_file = resp +``` + +``` python +from pathlib import Path + +import aiopytesseract + +await aiopytesseract.image_to_string( + "tests/samples/text-with-chars-and-numbers.png", + config=[("tessedit_char_whitelist", "0123456789")] +) + +await aiopytesseract.image_to_string( + Path("tests/samples/text-with-chars-and-numbers.png").read_bytes(), + dpi=220, + lang='eng+por', + config=[("tessedit_char_whitelist", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")] +) +``` + +> For more details on Tesseract best practices and the aiopytesseract, see the folder: `docs`. ## Examples diff --git a/aiopytesseract/__init__.py b/aiopytesseract/__init__.py index 63cf00b..b4cd8ff 100644 --- a/aiopytesseract/__init__.py +++ b/aiopytesseract/__init__.py @@ -16,7 +16,7 @@ ) from .models import OSD, Box, Data, Parameter -__version__ = "0.13.0" +__version__ = "0.14.0" __all__ = [ "__version__", "OSD", diff --git a/aiopytesseract/base_command.py b/aiopytesseract/base_command.py index 7036400..f01db7a 100644 --- a/aiopytesseract/base_command.py +++ b/aiopytesseract/base_command.py @@ -51,6 +51,8 @@ async def execute( user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, + config: Union[None, List[Tuple[str, str]]] = None, + encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> bytes: raise NotImplementedError @@ -67,6 +69,8 @@ async def _( user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, + config: Union[None, List[Tuple[str, str]]] = None, + encoding: str = 
AIOPYTESSERACT_DEFAULT_ENCODING, ) -> bytes: await file_exists(image) response: bytes = await execute( @@ -80,6 +84,8 @@ async def _( user_words=user_words, user_patterns=user_patterns, tessdata_dir=tessdata_dir, + config=config, + encoding=encoding, ) return response @@ -89,13 +95,14 @@ async def _( image: bytes, output_format: str, dpi: int, - lang: Union[None, str], psm: int, oem: int, timeout: float, + lang: Union[None, str] = None, user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, + config: Union[None, List[Tuple[str, str]]] = None, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> bytes: cmd_args = await _build_cmd_args( @@ -103,10 +110,11 @@ async def _( dpi=dpi, psm=psm, oem=oem, + lang=lang, user_words=user_words, user_patterns=user_patterns, tessdata_dir=tessdata_dir, - lang=lang, + config=config, ) try: proc = await asyncio.wait_for( @@ -142,6 +150,7 @@ async def execute_multi_output_cmd( user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, + config: Union[None, List[Tuple[str, str]]] = None, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> Tuple[str, ...]: cmd_args = await _build_cmd_args( @@ -154,6 +163,7 @@ async def execute_multi_output_cmd( tessdata_dir=tessdata_dir, lang=lang, output=output_file, + config=config, ) try: proc = await asyncio.wait_for( @@ -187,6 +197,7 @@ async def _build_cmd_args( tessdata_dir: Union[None, str] = None, lang: Union[None, str] = None, output: str = "stdout", + config: Union[None, List[Tuple[str, str]]] = None, ) -> List[str]: await asyncio.gather(psm_is_valid(psm), oem_is_valid(oem)) # OCR options must occur before any configfile. 
@@ -212,6 +223,11 @@ async def _build_cmd_args( cmd_args.append("-l") cmd_args.append(lang) + if config: + for option, value in config: + cmd_args.append("-c") + cmd_args.append(f"{option}={value} ") + extension = reversed(output_extension.split()) for ext in extension: cmd_args.append(ext) diff --git a/aiopytesseract/commands.py b/aiopytesseract/commands.py index 8f1ebee..c275662 100644 --- a/aiopytesseract/commands.py +++ b/aiopytesseract/commands.py @@ -176,32 +176,36 @@ async def tesseract_parameters( [param.group(1), param.group(2), param.group(3)], Parameter # type: ignore ) ) - return params + return sorted(params, key=lambda p: p.name) @singledispatch async def image_to_string( image: Any, - user_words: Union[None, str] = None, - user_patterns: Union[None, str] = None, - tessdata_dir: Union[None, str] = None, dpi: int = AIOPYTESSERACT_DEFAULT_DPI, lang: str = AIOPYTESSERACT_DEFAULT_LANGUAGE, psm: int = AIOPYTESSERACT_DEFAULT_PSM, oem: int = AIOPYTESSERACT_DEFAULT_OEM, + encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, + user_words: Union[None, str] = None, + user_patterns: Union[None, str] = None, + tessdata_dir: Union[None, str] = None, + config: Union[None, List[Tuple[str, str]]] = None, ) -> str: """Extract string from an image. :param image: image input to tesseract. (valid values: str, bytes) - :param user_words: location of user words file. (default: None) - :param user_patterns: location of user patterns file. (default: None) - :param tessdata_dir: location of tessdata path. (default: None) :param dpi: image dots per inch (DPI). (default: 300) :param lang: tesseract language. (default: eng, format: eng, eng+por, eng+por+fra) :param psm: page segmentation modes. (default: 3) :param oem: ocr engine modes. (default: 3) + :param encoding: encoding. (default: UTF-8) :param timeout: command timeout. (default: 30) + :param user_words: location of user words file. 
(default: None) + :param user_patterns: location of user patterns file. (default: None) + :param tessdata_dir: location of tessdata path. (default: None) + :param config: set value for config variables. (default: None) """ raise NotImplementedError(f"Type {type(image)} not supported.") @@ -213,11 +217,12 @@ async def _( lang: str = AIOPYTESSERACT_DEFAULT_LANGUAGE, psm: int = AIOPYTESSERACT_DEFAULT_PSM, oem: int = AIOPYTESSERACT_DEFAULT_OEM, + encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, - encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, + config: Union[None, List[Tuple[str, str]]] = None, ) -> str: image_text: bytes = await execute( image, @@ -230,6 +235,7 @@ async def _( user_words=user_words, user_patterns=user_patterns, tessdata_dir=tessdata_dir, + config=config, ) return image_text.decode(encoding) @@ -241,11 +247,12 @@ async def _( lang: str = AIOPYTESSERACT_DEFAULT_LANGUAGE, psm: int = AIOPYTESSERACT_DEFAULT_PSM, oem: int = AIOPYTESSERACT_DEFAULT_OEM, + encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, - encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, + config: Union[None, List[Tuple[str, str]]] = None, ) -> str: image_text: bytes = await execute( image, @@ -258,6 +265,7 @@ async def _( user_words=user_words, user_patterns=user_patterns, tessdata_dir=tessdata_dir, + config=config, ) return image_text.decode(encoding) @@ -645,6 +653,7 @@ async def run( user_words: Union[None, str] = None, user_patterns: Union[None, str] = None, tessdata_dir: Union[None, str] = None, + config: Union[None, List[Tuple[str, str]]] = None, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> AsyncGenerator[Tuple[str, ...], None]: """Run Tesseract-OCR 
with multiple analysis. @@ -665,6 +674,7 @@ async def run( :param user_words: location of user words file. (default: None) :param user_patterns: location of user patterns file. (default: None) :param tessdata_dir: location of tessdata path. (default: None) + :param config: set value for config variables. (default: None) :param encoding: decode bytes to string. (default: utf-8) """ if not isinstance(image, bytes): @@ -682,6 +692,7 @@ async def run( user_words=user_words, user_patterns=user_patterns, tessdata_dir=tessdata_dir, + config=config, encoding=encoding, ) yield resp