From 8b194a03a926a5b8cd5653d483b68b237e101c3e Mon Sep 17 00:00:00 2001
From: mara004 <geisserml@gmail.com>
Date: Fri, 7 Oct 2022 15:36:29 +0200
Subject: [PATCH] Use pypdfium2's new range-based text extractor

get_text() was boundary-based, which is not well suited to the use case of simply extracting all text of a page.
I believe the new get_text_range() function might both yield better results and be more performant.

This can be merged once pypdfium2 3.3 is released.
---
 benchmark.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index a7b4e72..589e7d4 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -149,10 +149,7 @@ def pdfium_get_text(data: bytes) -> str:
     for i in range(len(pdf)):
         page = pdf.get_page(i)
         textpage = page.get_textpage()
-        text += textpage.get_text()
-        text += "\n"
-        [g.close() for g in (textpage, page)]
-    pdf.close()
+        text += textpage.get_text_range() + "\n"
     return text