Skip to content

Commit

Permalink
Fix bug, add alternate OCR func
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Dec 1, 2023
1 parent b47629d commit 0146964
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 4 deletions.
2 changes: 1 addition & 1 deletion marker/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from spellchecker import SpellChecker

from marker.ocr.page import ocr_entire_page_ocrmp
from marker.ocr.page import ocr_entire_page_ocrmp, ocr_entire_page_tess
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
from marker.settings import settings
from marker.schema import Span, Line, Block, Page
Expand Down
18 changes: 18 additions & 0 deletions marker/ocr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,24 @@
ocrmypdf.configure_logging(verbosity=ocrmypdf.Verbosity.quiet)


def ocr_entire_page_tess(page, lang: str, spellchecker: SpellChecker | None = None) -> List[Block]:
    """OCR an entire page via the page's built-in OCR text page and return its blocks.

    Alternate to ``ocr_entire_page_ocrmp``: instead of going through ocrmypdf,
    this asks the page object itself for an OCR text page (presumably a PyMuPDF
    page backed by Tesseract, going by the function name -- confirm with caller).

    Args:
        page: Page object exposing ``get_textpage_ocr`` and ``get_text``.
        lang: Language string forwarded to ``get_textpage_ocr`` as ``language``.
        spellchecker: Optional spellchecker handed to ``detect_bad_ocr``.

    Returns:
        The ``"blocks"`` entry of the page's ``"dict"`` text extraction, or an
        empty list when the OCR text is empty, when ``detect_bad_ocr`` flags
        it, or when a ``RuntimeError`` is raised along the way.

    NOTE(review): the returned items are whatever ``get_text("dict")`` puts
    under ``"blocks"``; confirm they match the annotated ``List[Block]``.
    """
    try:
        text_flags = settings.TEXT_FLAGS
        # full=True requests OCR over the whole page rather than only parts of it.
        ocr_textpage = page.get_textpage_ocr(flags=text_flags, dpi=settings.OCR_DPI, full=True, language=lang)

        # Pull both views from the same OCR text page: the structured blocks
        # to return, and the flat text used only to judge OCR quality.
        page_dict = page.get_text("dict", sort=True, flags=text_flags, textpage=ocr_textpage)
        ocr_blocks = page_dict["blocks"]
        plain_text = page.get_text("text", sort=True, flags=text_flags, textpage=ocr_textpage)

        # Nothing recognised, or the text looks like failed OCR (e.g. a scanned
        # blank page with faint text impressions): report no blocks.
        if len(plain_text) == 0 or detect_bad_ocr(plain_text, spellchecker):
            return []
    except RuntimeError:
        # Best-effort: an OCR failure on this page yields no blocks.
        return []
    else:
        return ocr_blocks


def ocr_entire_page_ocrmp(page, lang: str, spellchecker: SpellChecker | None = None) -> List[Block]:
# Use ocrmypdf to get OCR text for the whole page
src = page.parent # the page's document
Expand Down
2 changes: 1 addition & 1 deletion marker/ordering.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def batch_inference(rgb_images, bboxes, words, model):


def add_column_counts(doc, doc_blocks, model, batch_size):
for i in range(0, len(doc), batch_size):
for i in range(0, len(doc_blocks), batch_size):
batch = range(i, min(i + batch_size, len(doc_blocks)))
rgb_images = []
bboxes = []
Expand Down
3 changes: 1 addition & 2 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ class Settings(BaseSettings):

# OCR
INVALID_CHARS: List[str] = [chr(0xfffd), "�"]
DPI: int = 800
SEGMENT_DPI: int = 1200
OCR_DPI: int = 400
TESSDATA_PREFIX: str = ""
TESSERACT_LANGUAGES: Dict = {
"English": "eng",
Expand Down

0 comments on commit 0146964

Please sign in to comment.