From 4ae3d7407a346a63e6cbd125ed83f40393d8f8bb Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 22 Oct 2024 21:55:15 -0400 Subject: [PATCH] Fix bugs --- marker/convert.py | 9 ++++++--- marker/ocr/recognition.py | 12 ++++++++++-- pyproject.toml | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/marker/convert.py b/marker/convert.py index 0a7cd7d0..8cf6a015 100644 --- a/marker/convert.py +++ b/marker/convert.py @@ -93,8 +93,6 @@ def convert_single_pdf( # Identify text lines, layout, reading order surya_detection(lowres_images, pages, detection_model, batch_multiplier=batch_multiplier) - surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier) - surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier) # OCR pages as needed pages, ocr_stats = run_ocr(doc, pages, langs, ocr_model, batch_multiplier=batch_multiplier, ocr_all_pages=ocr_all_pages) @@ -105,12 +103,17 @@ def convert_single_pdf( print(f"Could not extract any text blocks for {fname}") return "", {}, out_meta + surya_layout(lowres_images, pages, layout_model, batch_multiplier=batch_multiplier) + # Find headers and footers bad_span_ids = filter_header_footer(pages) out_meta["block_stats"] = {"header_footer": len(bad_span_ids)} - # Add block types from layout and sort from reading order + # Add block types from layout annotate_block_types(pages) + + # Sort from reading order + surya_order(lowres_images, pages, order_model, batch_multiplier=batch_multiplier) sort_blocks_in_reading_order(pages) # Dump debug data if flags are set diff --git a/marker/ocr/recognition.py b/marker/ocr/recognition.py index 0ebfe060..7105fc40 100644 --- a/marker/ocr/recognition.py +++ b/marker/ocr/recognition.py @@ -79,10 +79,18 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P polygons = deepcopy([[b.polygon for b in bboxes] for bboxes in detection_results]) # Scale polygons to get correct image slices - for poly in polygons: - for p in poly: + for j, poly in enumerate(polygons): + skip_idxs = [] + for z, p in enumerate(poly): for i in range(len(p)): p[i] = [int(p[i][0] * box_scale), int(p[i][1] * box_scale)] + x_coords = [p[i][0] for i in range(len(p))] + y_coords = [p[i][1] for i in range(len(p))] + bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)] + if (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) == 0: + skip_idxs.append(z) + if len(skip_idxs) > 0: + polygons[j] = [p for i, p in enumerate(poly) if i not in skip_idxs] results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier)) diff --git a/pyproject.toml b/pyproject.toml index 15f00803..3ce651eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.3.4" +version = "0.3.5" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"