From 8b194a03a926a5b8cd5653d483b68b237e101c3e Mon Sep 17 00:00:00 2001 From: mara004 Date: Fri, 7 Oct 2022 15:36:29 +0200 Subject: [PATCH] Use pypdfium2's new range-based text extractor get_text() was boundary-based, which is not well suited to the use case of simply extracting all the text of a page. I believe the new get_text_range() function might both yield better results and be more performant. This can be merged once pypdfium2 3.3 is released. --- benchmark.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/benchmark.py b/benchmark.py index a7b4e72..589e7d4 100644 --- a/benchmark.py +++ b/benchmark.py @@ -149,10 +149,7 @@ def pdfium_get_text(data: bytes) -> str: for i in range(len(pdf)): page = pdf.get_page(i) textpage = page.get_textpage() - text += textpage.get_text() - text += "\n" - [g.close() for g in (textpage, page)] - pdf.close() + text += textpage.get_text_range() + "\n" return text