diff --git a/doctr/io/pdf.py b/doctr/io/pdf.py index ced6262b7..025239028 100644 --- a/doctr/io/pdf.py +++ b/doctr/io/pdf.py @@ -4,7 +4,7 @@ # See LICENSE or go to for full license details. from pathlib import Path -from typing import Any, List, Optional +from typing import Any, Iterator, Optional import numpy as np import pypdfium2 as pdfium @@ -19,7 +19,7 @@ def read_pdf( scale: float = 2, password: Optional[str] = None, **kwargs: Any, -) -> List[np.ndarray]: +) -> Iterator[np.ndarray]: """Read a PDF file and convert it into an image in numpy format >>> from doctr.documents import read_pdf @@ -42,4 +42,5 @@ def read_pdf( # Rasterise pages to PIL images with pypdfium2 and convert to numpy ndarrays with pdfium.PdfDocument(file, password=password) as pdf: - return [np.asarray(img) for img in pdf.render_topil(scale=scale, **kwargs)] + for img in pdf.render_topil(scale=scale, **kwargs): + yield np.asarray(img) diff --git a/doctr/io/reader.py b/doctr/io/reader.py index e98859328..37cf36a74 100644 --- a/doctr/io/reader.py +++ b/doctr/io/reader.py @@ -34,7 +34,7 @@ def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]: the list of pages decoded as numpy ndarray of shape H x W x 3 """ - return read_pdf(file, **kwargs) + return list(read_pdf(file, **kwargs)) @classmethod def from_url(cls, url: str, **kwargs) -> List[np.ndarray]: diff --git a/tests/common/test_io.py b/tests/common/test_io.py index 8e4f2afea..23738f430 100644 --- a/tests/common/test_io.py +++ b/tests/common/test_io.py @@ -15,20 +15,20 @@ def _check_doc_content(doc_tensors, num_pages): def test_read_pdf(mock_pdf): - doc = io.read_pdf(mock_pdf) + doc = list(io.read_pdf(mock_pdf)) _check_doc_content(doc, 2) with open(mock_pdf, "rb") as f: - doc = io.read_pdf(f.read()) + doc = list(io.read_pdf(f.read())) _check_doc_content(doc, 2) # Wrong input type with pytest.raises(TypeError): - _ = io.read_pdf(123) + _ = list(io.read_pdf(123)) # Wrong path with pytest.raises(FileNotFoundError): - _ = io.read_pdf("my_imaginary_file.pdf") + _ = list(io.read_pdf("my_imaginary_file.pdf")) def test_read_img_as_numpy(tmpdir_factory, mock_pdf):