Fix bug, add alternate OCR func

VikParuchuri · Dec 1, 2023 · 0146964 · 0146964
1 parent b47629d
commit 0146964
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 4 deletions.
diff --git a/marker/extract_text.py b/marker/extract_text.py
@@ -3,7 +3,7 @@
 
 from spellchecker import SpellChecker
 
-from marker.ocr.page import ocr_entire_page_ocrmp
+from marker.ocr.page import ocr_entire_page_ocrmp, ocr_entire_page_tess
 from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
 from marker.settings import settings
 from marker.schema import Span, Line, Block, Page

diff --git a/marker/ocr/page.py b/marker/ocr/page.py
@@ -12,6 +12,24 @@
 ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)
 
 
+def ocr_entire_page_tess(page, lang: str, spellchecker: SpellChecker | None = None) -> List[Block]:  # OCR a single page through page.get_textpage_ocr; yields [] whenever OCR fails or looks bad
+    try:  # all OCR/extraction calls live inside the try, so a RuntimeError from any of them ends in the handler below
+        full_tp = page.get_textpage_ocr(flags=settings.TEXT_FLAGS, dpi=settings.OCR_DPI, full=True, language=lang)  # OCR text page built at settings.OCR_DPI for language `lang`
+        blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)["blocks"]  # structured "blocks" entry read back from the OCR text page
+        full_text = page.get_text("text", sort=True, flags=settings.TEXT_FLAGS, textpage=full_tp)  # plain text of the same text page; only used for the checks below
+
+        if len(full_text) == 0:  # OCR produced no text at all
+            return []
+
+        # Validate the OCR text with detect_bad_ocr; if it flags the text, return an empty list.
+        # OCR can fail, for example, on a scanned blank page that carries only faint text impressions.
+        if detect_bad_ocr(full_text, spellchecker):
+            return []
+    except RuntimeError:  # NOTE(review): presumably raised by the OCR backend when Tesseract/tessdata is unavailable - confirm
+        return []
+    return blocks  # NOTE(review): annotated List[Block], but this is the "blocks" value of get_text("dict") - confirm callers expect that
+
+
 def ocr_entire_page_ocrmp(page, lang: str, spellchecker: SpellChecker | None = None) -> List[Block]:
     # Use ocrmypdf to get OCR text for the whole page
     src = page.parent  # the page's document

diff --git a/marker/ordering.py b/marker/ordering.py
@@ -82,7 +82,7 @@ def batch_inference(rgb_images, bboxes, words, model):
 
 
 def add_column_counts(doc, doc_blocks, model, batch_size):
-    for i in range(0, len(doc), batch_size):
+    for i in range(0, len(doc_blocks), batch_size):
         batch = range(i, min(i + batch_size, len(doc_blocks)))
         rgb_images = []
         bboxes = []

diff --git a/marker/settings.py b/marker/settings.py
@@ -29,8 +29,7 @@ class Settings(BaseSettings):
 
     # OCR
     INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
-    DPI: int = 800
-    SEGMENT_DPI: int = 1200
+    OCR_DPI: int = 400
     TESSDATA_PREFIX: str = ""
     TESSERACT_LANGUAGES: Dict = {
         "English": "eng",