From 4ae3d7407a346a63e6cbd125ed83f40393d8f8bb Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Tue, 22 Oct 2024 21:55:15 -0400
Subject: [PATCH] Run layout/order after OCR; drop zero-area OCR polygons

---
 marker/convert.py         |  9 ++++++---
 marker/ocr/recognition.py | 12 ++++++++++--
 pyproject.toml            |  2 +-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/marker/convert.py b/marker/convert.py
index 0a7cd7d0..8cf6a015 100644
--- a/marker/convert.py
+++ b/marker/convert.py
@@ -93,8 +93,6 @@ def convert_single_pdf(
 
     # Identify text lines, layout, reading order
     surya_detection(lowres_images, pages, detection_model, batch_multiplier=batch_multiplier)
-    surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier)
-    surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier)
 
     # OCR pages as needed
     pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages)
@@ -105,12 +103,17 @@ def convert_single_pdf(
         print(f"Could not extract any text blocks for {fname}")
         return "", {}, out_meta
 
+    surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier)
+
     # Find headers and footers
     bad_span_ids = filter_header_footer(pages)
     out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}
 
-    # Add block types from layout and sort from reading order
+    # Add block types from layout
     annotate_block_types(pages)
+
+    # Sort from reading order
+    surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier)
     sort_blocks_in_reading_order(pages)
 
     # Dump debug data if flags are set
diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py
index 0ebfe060..7105fc40 100644
--- a/marker/ocr/recognition.py
+++ b/marker/ocr/recognition.py
@@ -79,10 +79,18 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
     polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results])
 
     # Scale polygons to get correct image slices
-    for poly in polygons:
-        for p in poly:
+    for j, poly in enumerate(polygons):
+        skip_idxs = []
+        for z, p in enumerate(poly):
             for i in range(len(p)):
                 p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)]
+            x_coords = [p[i][0] for i in range(len(p))]
+            y_coords = [p[i][1] for i in range(len(p))]
+            bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
+            if (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) == 0:
+                skip_idxs.append(z)
+        if len(skip_idxs) > 0:
+            polygons[j] = [p for i, p in enumerate(poly) if i not in skip_idxs]
 
     results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
 
diff --git a/pyproject.toml b/pyproject.toml
index 15f00803..3ce651eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.3.4"
+version = "0.3.5"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"