Skip to content

Commit

Permalink
Char level splicing of inline math into provider lines
Browse files Browse the repository at this point in the history
  • Loading branch information
tarun-menta committed Feb 2, 2025
1 parent 34381e6 commit ce867b8
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 5 deletions.
67 changes: 63 additions & 4 deletions marker/builders/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ class LineBuilder(BaseBuilder):
float,
"The minimum overlap of a span with an inline math box to consider for removal"
] = .5
char_inline_math_overlap_threshold: Annotated[
float,
"The minimum overlap of a character with an inline math box to consider for removal"
] = .5
line_inline_math_overlap_threshold: Annotated[
float,
"The minimum overlap of a provider line with an inline math box to consider as a match"
Expand Down Expand Up @@ -267,13 +271,68 @@ def merge_provider_lines_inline_math(self, document_page_id, provider_lines, inl
def _reconstruct_provider_line(self, provider_line, math_line_polygon):
    """Strip the parts of ``provider_line`` that are covered by an inline math box.

    ``provider_line.spans`` (and ``provider_line.chars`` when the provider
    surfaces characters) are replaced in place; nothing is returned.

    * Without character data (``provider_line.chars is None``) a span is kept
      only when its ``intersection_pct`` with ``math_line_polygon`` is below
      ``self.span_inline_math_overlap_threshold``.
    * With character data, spans that do not intersect the math box are kept
      untouched. Every intersecting span is rebuilt from its surviving
      characters: characters whose ``intersection_pct`` with the math box
      exceeds ``self.char_inline_math_overlap_threshold`` are dropped, and the
      rest are grouped into a "left" and a "right" span depending on which
      side of the math box's horizontal center their own center lies.

    Args:
        provider_line: Provider output exposing ``spans`` and ``chars``
            (one list of characters per span, or ``None``).
        math_line_polygon: Polygon of the inline math line to cut out.

    Raises:
        ValueError: If character data is present but does not provide exactly
            one character list per span.
    """
    spans = provider_line.spans

    # For providers which do not surface characters: drop whole spans.
    if provider_line.chars is None:
        provider_line.spans = [
            span for span in spans
            if span.polygon.intersection_pct(math_line_polygon) < self.span_inline_math_overlap_threshold
        ]
        return

    # Explicit check instead of ``assert``: asserts are stripped under ``-O``
    # and ``zip`` below would then silently truncate mismatched inputs.
    if len(spans) != len(provider_line.chars):
        raise ValueError(
            f"Provider line has {len(spans)} spans but "
            f"{len(provider_line.chars)} character groups"
        )

    # Only needed on the character path, so resolved after the early return.
    SpanClass = get_block_class(BlockTypes.Span)
    math_line_center_x = math_line_polygon.center[0]

    def build_span(source_span, chars):
        # New span covering ``chars``, inheriting the styling of ``source_span``.
        return SpanClass(
            text=fix_text(''.join(c.char for c in chars)),
            formats=source_span.formats,
            page_id=source_span.page_id,
            polygon=chars[0].polygon.merge([c.polygon for c in chars]),
            minimum_position=chars[0].char_idx,
            maximum_position=chars[-1].char_idx,
            font=source_span.font,
            font_weight=source_span.font_weight,
            font_size=source_span.font_size,
            # Keep the link of the span being split; falls back to None when
            # the source span carries no ``url`` attribute.
            url=getattr(source_span, "url", None),
        )

    # For providers which surface characters: split spans around the math box.
    spans_to_keep = []
    chars_to_keep = []
    for span, span_chars in zip(spans, provider_line.chars):
        if span.polygon.intersection_area(math_line_polygon) == 0:
            spans_to_keep.append(span)
            chars_to_keep.append(span_chars)
            continue

        left_chars, right_chars = [], []
        for char in span_chars:
            if char.polygon.intersection_pct(math_line_polygon) > self.char_inline_math_overlap_threshold:
                continue  # Character belongs to the math box; drop it.

            # Each group keeps the incoming character order, so only the
            # side of the math box has to be decided here.
            if char.polygon.center[0] < math_line_center_x:
                left_chars.append(char)
            else:
                right_chars.append(char)

        # Left part first so the rebuilt spans stay in reading order.
        for side_chars in (left_chars, right_chars):
            if side_chars:
                spans_to_keep.append(build_span(span, side_chars))
                chars_to_keep.append(side_chars)

    provider_line.spans = spans_to_keep
    provider_line.chars = chars_to_keep


def check_layout_coverage(
Expand Down
6 changes: 6 additions & 0 deletions marker/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,15 @@
from marker.schema.text.line import Line
from marker.util import assign_config

class Char(BaseModel):
    """A single character surfaced by a provider, together with its location."""

    # Text of the character.
    char: str
    # Box locating the character.
    polygon: PolygonBox
    # Index of the character; presumably its position in the provider's
    # character stream (TODO confirm against the provider that fills it).
    char_idx: int

class ProviderOutput(BaseModel):
line: Line
spans: List[Span]
chars: Optional[List[List[Char]]] = None

@property
def raw_text(self):
Expand Down
7 changes: 6 additions & 1 deletion marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from PIL import Image
from pypdfium2 import PdfiumError

from marker.providers import BaseProvider, ProviderOutput, ProviderPageLines
from marker.providers import BaseProvider, ProviderOutput, Char, ProviderPageLines
from marker.providers.utils import alphanum_ratio
from marker.schema import BlockTypes
from marker.schema.polygon import PolygonBox
Expand Down Expand Up @@ -191,6 +191,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
for block in page["blocks"]:
for line in block["lines"]:
spans: List[Span] = []
chars: List[List[Char]] = []
for span in line["spans"]:
if not span["text"]:
continue
Expand All @@ -199,6 +200,7 @@ def pdftext_extraction(self) -> ProviderPageLines:
font_weight = span["font"]["weight"] or 0
font_size = span["font"]["size"] or 0
polygon = PolygonBox.from_bbox(span["bbox"], ensure_nonzero_area=True)
span_chars = [Char(char=c['char'], polygon=PolygonBox.from_bbox(c['bbox'], ensure_nonzero_area=True), char_idx=c['char_idx']) for c in span["chars"]]
spans.append(
SpanClass(
polygon=polygon,
Expand All @@ -214,11 +216,14 @@ def pdftext_extraction(self) -> ProviderPageLines:
url=span.get("url"),
)
)
chars.append(span_chars)
polygon = PolygonBox.from_bbox(line["bbox"], ensure_nonzero_area=True)
assert len(spans) == len(chars)
lines.append(
ProviderOutput(
line=LineClass(polygon=polygon, page_id=page_id),
spans=spans,
chars=chars
)
)
if self.check_line_spans(lines):
Expand Down

0 comments on commit ce867b8

Please sign in to comment.