From f9ed17c492b420d36421741735f73944ef759f8e Mon Sep 17 00:00:00 2001 From: alexandre menezes Date: Fri, 22 Apr 2022 13:48:00 -0300 Subject: [PATCH] updated timeout manager --- aiopytesseract/__init__.py | 2 +- aiopytesseract/base_command.py | 72 ++++++++++++++++++++-------------- aiopytesseract/commands.py | 50 ++++++++++++++++------- aiopytesseract/constants.py | 1 + aiopytesseract/validators.py | 8 ++-- setup.cfg | 1 - tests/test_base_command.py | 2 +- tests/test_commands.py | 31 ++++++++++++++- tests/test_validators.py | 32 ++++++--------- 9 files changed, 127 insertions(+), 72 deletions(-) diff --git a/aiopytesseract/__init__.py b/aiopytesseract/__init__.py index 067ee67..68dcad7 100644 --- a/aiopytesseract/__init__.py +++ b/aiopytesseract/__init__.py @@ -4,7 +4,7 @@ image_to_string, languages, run, tesseract_parameters, tesseract_version) -__version__ = "0.6.0" +__version__ = "0.7.0" __all__ = [ "__version__", "confidence", diff --git a/aiopytesseract/base_command.py b/aiopytesseract/base_command.py index d35a374..4f03a7d 100644 --- a/aiopytesseract/base_command.py +++ b/aiopytesseract/base_command.py @@ -1,12 +1,13 @@ import asyncio import shlex from collections import deque -from functools import singledispatch +from functools import lru_cache, singledispatch from pathlib import Path from typing import Any, List, Optional, Tuple from ._logger import logger from .constants import ( + AIOPYTESSERACT_DEFAULT_BUILD_CMD_CACHE, AIOPYTESSERACT_DEFAULT_ENCODING, AIOPYTESSERACT_DEFAULT_TIMEOUT, OUTPUT_FILE_EXTENSIONS, @@ -63,7 +64,7 @@ async def _( user_patterns: Optional[str] = None, tessdata_dir: Optional[str] = None, ) -> bytes: - await file_exists(image) + file_exists(image) response: bytes = await execute( Path(image).read_bytes(), output_format, @@ -93,7 +94,7 @@ async def _( tessdata_dir: Optional[str] = None, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> bytes: - cmd_args = await _build_cmd_args( + cmd_args = _build_cmd_args( output_extension=output_format, dpi=dpi, psm=psm, @@ -103,17 +104,23 @@ async def _( tessdata_dir=tessdata_dir, lang=lang, ) - proc = await asyncio.wait_for( - asyncio.create_subprocess_exec( - TESSERACT_CMD, - *cmd_args, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ), - timeout=timeout, - ) - stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout) + try: + proc = await asyncio.wait_for( + asyncio.create_subprocess_exec( + TESSERACT_CMD, + *cmd_args, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ), + timeout=timeout, + ) + stdout, stderr = await asyncio.wait_for( + proc.communicate(image), timeout=timeout + ) + except asyncio.TimeoutError: + proc.kill() + raise RuntimeError("Tesseract process timeout") if proc.returncode != ReturnCode.SUCCESS: raise TesseractRuntimeError(stderr.decode(encoding)) return stdout @@ -133,7 +140,7 @@ async def execute_multi_output_cmd( tessdata_dir: Optional[str] = None, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> Tuple[str, ...]: - cmd_args = await _build_cmd_args( + cmd_args = _build_cmd_args( output_extension=output_format, dpi=dpi, psm=psm, @@ -144,17 +151,21 @@ async def execute_multi_output_cmd( lang=lang, output=output_file, ) - proc = await asyncio.wait_for( - asyncio.create_subprocess_exec( - TESSERACT_CMD, - *cmd_args, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ), - timeout=timeout, - ) - _, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout) + try: + proc = await asyncio.wait_for( + asyncio.create_subprocess_exec( + TESSERACT_CMD, + *cmd_args, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ), + timeout=timeout, + ) + _, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout) + except asyncio.TimeoutError: + proc.kill() + raise RuntimeError("Tesseract process timeout") if proc.returncode != ReturnCode.SUCCESS: raise TesseractRuntimeError(stderr.decode(encoding)) return tuple( @@ -162,7 +173,8 @@ async def execute_multi_output_cmd( ) -async def _build_cmd_args( +@lru_cache(maxsize=AIOPYTESSERACT_DEFAULT_BUILD_CMD_CACHE) +def _build_cmd_args( output_extension: str, dpi: int, psm: int, @@ -173,7 +185,9 @@ async def _build_cmd_args( lang: Optional[str] = None, output: str = "stdout", ) -> List[str]: - await asyncio.gather(psm_is_valid(psm), oem_is_valid(oem)) + psm_is_valid(psm) + oem_is_valid(oem) + cmd_args = deque( ["stdin", f"{output}", "--dpi", f"{dpi}", "--psm", f"{psm}", "--oem", f"{oem}"] ) @@ -190,7 +204,7 @@ async def _build_cmd_args( cmd_args.append(tessdata_dir) if lang: - await language_is_valid(lang) + language_is_valid(lang) cmd_args.append("-l") cmd_args.append(lang) diff --git a/aiopytesseract/commands.py b/aiopytesseract/commands.py index dc8af0a..ac9e016 100644 --- a/aiopytesseract/commands.py +++ b/aiopytesseract/commands.py @@ -76,17 +76,22 @@ async def confidence( :param oem: ocr engine modes (default: 3) :param timeout: command timeout (default: 30) """ - proc = await execute_cmd(f"stdin stdout -l {lang} --dpi {dpi} --psm 0 --oem {oem}") - stdout, _ = await asyncio.wait_for( - proc.communicate(Path(image).read_bytes()), timeout=timeout - ) try: + proc = await execute_cmd( + f"stdin stdout -l {lang} --dpi {dpi} --psm 0 --oem {oem}" + ) + stdout, _ = await asyncio.wait_for( + proc.communicate(Path(image).read_bytes()), timeout=timeout + ) confidence_value = float( re.search( # type: ignore r"(Script.confidence:.(\d{1,10}.\d{1,10})$)", stdout.decode(encoding), ).group(2) ) + except asyncio.TimeoutError: + proc.kill() + raise RuntimeError("Tesseract process timeout") except AttributeError: confidence_value = 0.0 return confidence_value @@ -108,17 +113,20 @@ async def deskew( :param lang: tesseract language. (Format: eng, eng+por, eng+por+fra) :param timeout: command timeout (default: 30) """ - proc = await execute_cmd( - f"{image} stdout -l {lang} --dpi {dpi} --psm 2 --oem {oem}" - ) - data = await asyncio.wait_for(proc.stderr.read(), timeout=timeout) try: + proc = await execute_cmd( + f"{image} stdout -l {lang} --dpi {dpi} --psm 2 --oem {oem}" + ) + data = await asyncio.wait_for(proc.stderr.read(), timeout=timeout) deskew_value = float( re.search( # type: ignore r"(Deskew.angle:.)(\d{1,10}.\d{1,10}$)", data.decode(encoding), ).group(2) ) + except asyncio.TimeoutError: + proc.kill() + raise RuntimeError("Tesseract process timeout") except AttributeError: deskew_value = 0.0 return deskew_value @@ -404,7 +412,7 @@ async def image_to_boxes( @image_to_boxes.register(str) async def _(image: str, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT) -> List[Box]: - await file_exists(image) + file_exists(image) boxes = await image_to_boxes(Path(image).read_bytes(), timeout) return boxes @@ -415,8 +423,14 @@ async def _( timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> List[Box]: - proc = await execute_cmd("stdin stdout batch.nochop makebox") - stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout) + try: + proc = await execute_cmd("stdin stdout batch.nochop makebox") + stdout, stderr = await asyncio.wait_for( + proc.communicate(image), timeout=timeout + ) + except asyncio.TimeoutError: + proc.kill() + raise RuntimeError("Tesseract process timeout") if proc.returncode != ReturnCode.SUCCESS: raise TesseractRuntimeError(stderr.decode(encoding)) data = stdout.decode(encoding) @@ -447,7 +461,7 @@ async def _( dpi: int = AIOPYTESSERACT_DEFAULT_DPI, timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, ) -> List[Data]: - await file_exists(image) + file_exists(image) data_values = await image_to_data(Path(image).read_bytes(), dpi, timeout) return data_values @@ -459,8 +473,14 @@ async def _( timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> List[Data]: - proc = await execute_cmd(f"stdin stdout -c tessedit_create_tsv=1 --dpi {dpi}") - stdout, stderr = await asyncio.wait_for(proc.communicate(image), timeout=timeout) + try: + proc = await execute_cmd(f"stdin stdout -c tessedit_create_tsv=1 --dpi {dpi}") + stdout, stderr = await asyncio.wait_for( + proc.communicate(image), timeout=timeout + ) + except asyncio.TimeoutError: + proc.kill() + raise RuntimeError("Tesseract process timeout") if proc.returncode != ReturnCode.SUCCESS: raise TesseractRuntimeError(stderr.decode(encoding)) data: str = stdout.decode(encoding) @@ -498,7 +518,7 @@ async def _( timeout: float = AIOPYTESSERACT_DEFAULT_TIMEOUT, encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING, ) -> OSD: - await file_exists(image) + file_exists(image) osd = await image_to_osd(Path(image).read_bytes(), dpi, oem, timeout, encoding) return osd diff --git a/aiopytesseract/constants.py b/aiopytesseract/constants.py index 079645d..4e8339f 100644 --- a/aiopytesseract/constants.py +++ b/aiopytesseract/constants.py @@ -161,6 +161,7 @@ AIOPYTESSERACT_DEFAULT_DPI: int = 200 AIOPYTESSERACT_DEFAULT_PSM: int = 3 AIOPYTESSERACT_DEFAULT_OEM: int = 3 +AIOPYTESSERACT_DEFAULT_BUILD_CMD_CACHE: int = 1 OUTPUT_FILE_EXTENSIONS = { FileFormat.ALTO: ".xml", diff --git a/aiopytesseract/validators.py b/aiopytesseract/validators.py index 7080aaa..e50de69 100644 --- a/aiopytesseract/validators.py +++ b/aiopytesseract/validators.py @@ -6,22 +6,22 @@ OEMInvalidException, PSMInvalidException) -async def psm_is_valid(psm: int) -> None: +def psm_is_valid(psm: int) -> None: if psm not in PAGE_SEGMENTATION_MODES.keys(): raise PSMInvalidException -async def oem_is_valid(oem: int) -> None: +def oem_is_valid(oem: int) -> None: if oem not in OCR_ENGINE_MODES.keys(): raise OEMInvalidException -async def file_exists(file_path: str) -> None: +def file_exists(file_path: str) -> None: if not Path(file_path).exists(): raise NoSuchFileException(f"No such file: '{file_path}'") -async def language_is_valid(language: str) -> None: +def language_is_valid(language: str) -> None: for lang in language.split("+"): if lang not in TESSERACT_LANGUAGES: raise LanguageInvalidException( diff --git a/setup.cfg b/setup.cfg index 7bbfaf0..6527428 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,6 @@ ignore = E501 # line too long D103 # missing docstring in public function D105 # missing docstring in magic method D107 # missing docstring in __init__ - W503 # line break before binary operator verbose = 2 doctests = True show_source = True diff --git a/tests/test_base_command.py b/tests/test_base_command.py index 5c02330..a7b5fae 100644 --- a/tests/test_base_command.py +++ b/tests/test_base_command.py @@ -14,7 +14,7 @@ async def test_execute_unsupported(input_data): @pytest.mark.asyncio async def test_build_cmd_args_with_user_patterns(): - command = await aiopytesseract.base_command._build_cmd_args( + command = aiopytesseract.base_command._build_cmd_args( "stdout", 200, 3, diff --git a/tests/test_commands.py b/tests/test_commands.py index 68295b9..6c01a8e 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -26,7 +26,6 @@ async def test_tesseract_version(func): assert len(version) > 0 -# run @pytest.mark.asyncio async def test_run_with_type_not_supported(): with pytest.raises(NotImplementedError): @@ -84,3 +83,33 @@ async def test_tesseract_parameters(): parameters = await aiopytesseract.tesseract_parameters() assert isinstance(parameters, list) assert isinstance(parameters[0], Parameter) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "func, timeout", + [ + (aiopytesseract.image_to_string, 0.1), + (aiopytesseract.image_to_hocr, 0.1), + (aiopytesseract.image_to_osd, 0.1), + (aiopytesseract.image_to_pdf, 0.1), + (aiopytesseract.image_to_data, 0.1), + (aiopytesseract.image_to_boxes, 0.1), + (aiopytesseract.deskew, 0.01), + (aiopytesseract.confidence, 0.1), + ], +) +async def test_method_timeout(func, timeout): + with pytest.raises(RuntimeError): + await func("tests/samples/file-sample_150kB.png", timeout=timeout) + + +async def test_run_timeout(): + with pytest.raises(RuntimeError): + async with aiopytesseract.run( + Path("tests/samples/file-sample_150kB.png").read_bytes(), + "xxx", + "alto tsv txt", + timeout=0.1, + ) as out: + print(out) diff --git a/tests/test_validators.py b/tests/test_validators.py index f70b875..2742596 100644 --- a/tests/test_validators.py +++ b/tests/test_validators.py @@ -3,52 +3,44 @@ from aiopytesseract import constants, exceptions, validators -@pytest.mark.asyncio async def test_valid_psm(): for psm in constants.PAGE_SEGMENTATION_MODES.keys(): - await validators.psm_is_valid(psm) + validators.psm_is_valid(psm) -@pytest.mark.asyncio @pytest.mark.parametrize("psm", [-1, 14, "1"]) -async def test_invalid_psm(psm): +def test_invalid_psm(psm): with pytest.raises(exceptions.PSMInvalidException): - await validators.psm_is_valid(psm) + validators.psm_is_valid(psm) -@pytest.mark.asyncio -async def test_valid_oem(): +def test_valid_oem(): for oem in constants.OCR_ENGINE_MODES.keys(): - await validators.oem_is_valid(oem) + validators.oem_is_valid(oem) -@pytest.mark.asyncio @pytest.mark.parametrize("oem", [-1, 4, "1"]) -async def test_invalid_oem(oem): +def test_invalid_oem(oem): with pytest.raises(exceptions.OEMInvalidException): - await validators.oem_is_valid(oem) + validators.oem_is_valid(oem) -@pytest.mark.asyncio async def test_file_exists(): - await validators.file_exists("tests/samples/file-sample_150kB.png") + validators.file_exists("tests/samples/file-sample_150kB.png") -@pytest.mark.asyncio -async def test_file_does_not_exist(): +def test_file_does_not_exist(): with pytest.raises(exceptions.NoSuchFileException): - await validators.file_exists("tests/samples/file-sample_150kB.jpeg") + validators.file_exists("tests/samples/file-sample_150kB.jpeg") -@pytest.mark.asyncio @pytest.mark.parametrize("lang", ["por", "por+eng", "por+eng+fra"]) async def test_language_is_valid(lang): - resp = await validators.language_is_valid(lang) + resp = validators.language_is_valid(lang) assert resp is None -@pytest.mark.asyncio @pytest.mark.parametrize("lang", ["por eng", "por:eng", "por-eng", "por+zuul"]) async def test_language_is_invalid(lang): with pytest.raises(exceptions.LanguageInvalidException): - await validators.language_is_valid(lang) + validators.language_is_valid(lang)