diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 841e786..ec62ffe 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -36,7 +36,7 @@ from docling_core.types.base import _JSON_POINTER_REGEX from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc.base import ImageRefMode -from docling_core.types.doc.labels import DocItemLabel, GroupLabel +from docling_core.types.doc.labels import DocItemLabel, GroupLabel, InvisibleTextLabel from docling_core.types.legacy_doc.tokens import DocumentToken from docling_core.utils.file import relative_path @@ -640,6 +640,13 @@ def export_to_document_tokens( return body +class InvisibleTextItem(TextItem): + """Text item labeled INVISIBLE_TEXT, refined by a category.""" + + label: typing.Literal[DocItemLabel.INVISIBLE_TEXT] = DocItemLabel.INVISIBLE_TEXT + category: InvisibleTextLabel + + class SectionHeaderItem(TextItem): """SectionItem.""" diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index f5b5c32..9cfd8a9 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -28,12 +28,22 @@ class DocItemLabel(str, Enum): # Additional labels for markup-based formats (e.g. 
HTML, Word) PARAGRAPH = "paragraph" # explicitly a paragraph and not arbitrary text REFERENCE = "reference" + INVISIBLE_TEXT = "invisible_text" def __str__(self): """Get string value.""" return str(self.value) +class InvisibleTextLabel(str, Enum): + """Category values for InvisibleTextItem.category.""" + + UNSPECIFIED = "unspecified" + + INVISIBLE_TEXT = "invisible_text" + AUTHOR_NOTE = "author_note" + + class GroupLabel(str, Enum): """GroupLabel.""" diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 7772b84..a3c7de8 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -192,7 +192,8 @@ "form", "key_value_region", "paragraph", - "reference" + "reference", + "invisible_text" ], "title": "DocItemLabel", "type": "string" diff --git a/test/data/docling_document/unit/InvisibleTextItem.yaml b/test/data/docling_document/unit/InvisibleTextItem.yaml new file mode 100644 index 0000000..37c227d --- /dev/null +++ b/test/data/docling_document/unit/InvisibleTextItem.yaml @@ -0,0 +1,8 @@ +category: author_note +children: [] +label: invisible_text +orig: whatever +parent: null +prov: [] +self_ref: '#' +text: whatever diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index e12abb8..ddabc5d 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -19,6 +19,7 @@ DocumentOrigin, FloatingItem, ImageRef, + InvisibleTextItem, KeyValueItem, ListItem, PictureItem, @@ -30,7 +31,7 @@ TableItem, TextItem, ) -from docling_core.types.doc.labels import DocItemLabel, GroupLabel +from docling_core.types.doc.labels import DocItemLabel, GroupLabel, InvisibleTextLabel GENERATE = False @@ -138,6 +139,14 @@ def verify(dc, obj): ) verify(dc, obj) + elif dc is InvisibleTextItem: + obj = dc( + text="whatever", + orig="whatever", + self_ref="#", + category=InvisibleTextLabel.AUTHOR_NOTE, + ) + verify(dc, obj) else: # print(f"{dc.__name__} is not known") assert False, "new derived class detected {dc.__name__}: {e}"