Merge branch 'main' of github.com:DS4SD/docling into cau/picture-content-example
DS4SD · Jan 20, 2025 · 1598971 · 1598971
2 parents 687c469 + c49b352
commit 1598971
Show file tree

Hide file tree

Showing 43 changed files with 3,000 additions and 1,649 deletions.
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -14,7 +14,10 @@ jobs:
         - uses: ./.github/actions/setup-poetry
         - name: Build docs
           run: poetry run mkdocs build --verbose --clean
+        - name: Make docs LLM ready
+          if: inputs.deploy
+          uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b
         - name: Build and push docs
           if: inputs.deploy
-          run: poetry run mkdocs gh-deploy --force
-
+          run: poetry run mkdocs gh-deploy --force --dirty
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,36 @@
+## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10
+
+### Fix
+
+* Improve OCR results, stricten criteria before dropping bitmap areas ([#719](https://github.com/DS4SD/docling/issues/719)) ([`5a060f2`](https://github.com/DS4SD/docling/commit/5a060f237d1decd0ff9db9e73478978419315778))
+* Allow earlier requests versions ([#716](https://github.com/DS4SD/docling/issues/716)) ([`e64b5a2`](https://github.com/DS4SD/docling/commit/e64b5a2f628acc340a6d94ee6f1ada2aa267cecc))
+
+### Documentation
+
+* Add pointers to LangChain-side docs ([#718](https://github.com/DS4SD/docling/issues/718)) ([`9a6b5c8`](https://github.com/DS4SD/docling/commit/9a6b5c8c8debc81e0ddcbe91df6afbbeb29e97e6))
+* Add LangChain docs ([#717](https://github.com/DS4SD/docling/issues/717)) ([`4fa8028`](https://github.com/DS4SD/docling/commit/4fa8028bd8120d7557e1d45ba31e200e130af698))
+
+## [v2.15.0](https://github.com/DS4SD/docling/releases/tag/v2.15.0) - 2025-01-08
+
+### Feature
+
+* Added http header support for document converter and cli ([#642](https://github.com/DS4SD/docling/issues/642)) ([`0ee849e`](https://github.com/DS4SD/docling/commit/0ee849e8bc8cf24d1c5597af3fe20a7fa19a29e0))
+
+### Fix
+
+* Correct scaling of debug visualizations, tune OCR ([#700](https://github.com/DS4SD/docling/issues/700)) ([`5cb4cf6`](https://github.com/DS4SD/docling/commit/5cb4cf6f19f91e6c87141e93400c4b54b93aa5d7))
+* Let BeautifulSoup detect the HTML encoding ([#695](https://github.com/DS4SD/docling/issues/695)) ([`42856fd`](https://github.com/DS4SD/docling/commit/42856fdf79559188ec4617bc5d3a007286f114d2))
+* **mspowerpoint:** Handle invalid images in PowerPoint slides ([#650](https://github.com/DS4SD/docling/issues/650)) ([`d49650c`](https://github.com/DS4SD/docling/commit/d49650c54ffa60bc6d6106970e104071689bc7b0))
+
+### Documentation
+
+* Specify docstring types ([#702](https://github.com/DS4SD/docling/issues/702)) ([`ead396a`](https://github.com/DS4SD/docling/commit/ead396ab407f6bbd43176abd6ed2bed7ed8c7c43))
+* Add link to rag with granite ([#698](https://github.com/DS4SD/docling/issues/698)) ([`6701f34`](https://github.com/DS4SD/docling/commit/6701f34c855992c52918b210c65a2edb1c827c01))
+* Add integrations, revamp docs ([#693](https://github.com/DS4SD/docling/issues/693)) ([`2d24fae`](https://github.com/DS4SD/docling/commit/2d24faecd96bfa656b2b8c80f25cdf251a50526a))
+* Add OpenContracts as an integration ([#679](https://github.com/DS4SD/docling/issues/679)) ([`569038d`](https://github.com/DS4SD/docling/commit/569038df4205703f87517ea58da7902d143e7699))
+* Add Weaviate RAG recipe notebook ([#451](https://github.com/DS4SD/docling/issues/451)) ([`2b591f9`](https://github.com/DS4SD/docling/commit/2b591f98726ed0d883236dd0550201b95203eebb))
+* Document Haystack & Vectara support ([#628](https://github.com/DS4SD/docling/issues/628)) ([`fc645ea`](https://github.com/DS4SD/docling/commit/fc645ea531ddc67959640b428007851d641c923e))
+
 ## [v2.14.0](https://github.com/DS4SD/docling/releases/tag/v2.14.0) - 2024-12-18
 
 ### Feature

diff --git a/README.md b/README.md
@@ -29,7 +29,7 @@ Docling parses documents and exports them to the desired format with ease and sp
 * 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images)
 * 📑 Advanced PDF document understanding including page layout, reading order & table structures
 * 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
-* 🤖 Easy integration with 🦙 LlamaIndex & 🦜🔗 LangChain for powerful RAG / QA applications
+* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 OCR support for scanned PDFs
 * 💻 Simple and convenient CLI
 
@@ -39,7 +39,6 @@ Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty
 
 * ♾️ Equation & code extraction
 * 📝 Metadata extraction, including title, authors, references & language
-* 🦜🔗 Native LangChain extension
 
 ## Installation
 

diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
@@ -132,7 +132,7 @@ def draw_clusters_and_cells():
         return cells
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
 
         for i in range(len(self._dpage["images"])):
             bitmap = self._dpage["images"][i]
@@ -163,7 +163,7 @@ def get_page_image(
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t
 

diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
@@ -140,7 +140,7 @@ def draw_clusters_and_cells():
         return cells
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
 
         images = self._dpage["sanitized"]["images"]["data"]
         images_header = self._dpage["sanitized"]["images"]["header"]
@@ -178,7 +178,7 @@ def get_page_image(
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t
 

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
@@ -37,10 +37,10 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
 
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                text_stream = self.path_or_stream.getvalue().decode("utf-8")
+                text_stream = self.path_or_stream.getvalue()
                 self.soup = BeautifulSoup(text_stream, "html.parser")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, "rb") as f:
                     html_content = f.read()
                     self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:

diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
@@ -16,7 +16,7 @@
     TableCell,
     TableData,
 )
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 
@@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
         bullet_type = "None"
         list_text = ""
         list_label = GroupLabel.LIST
+        doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip())
 
         # Identify if shape contains lists
@@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc):
         im_dpi, _ = image.dpi
 
         # Open it with PIL
-        pil_image = Image.open(BytesIO(image_bytes))
-
-        # shape has picture
-        prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(
-            parent=parent_slide,
-            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
-            caption=None,
-            prov=prov,
-        )
+        try:
+            pil_image = Image.open(BytesIO(image_bytes))
+
+            # shape has picture
+            prov = self.generate_prov(shape, slide_ind, "")
+            doc.add_picture(
+                parent=parent_slide,
+                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+                caption=None,
+                prov=prov,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return
 
     def handle_tables(self, shape, parent_slide, slide_ind, doc):

diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
@@ -39,7 +39,7 @@ def is_valid(self) -> bool:
         return self.valid
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 32 * 32
+        AREA_THRESHOLD = 0  # 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
             pos = obj.get_pos()
             cropbox = BoundingBox.from_tuple(
@@ -210,7 +210,7 @@ def get_page_image(
                 l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
             )
         else:
-            padbox = cropbox.to_bottom_left_origin(page_size.height)
+            padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy()
             padbox.r = page_size.width - padbox.r
             padbox.t = page_size.height - padbox.t
 

diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -164,6 +164,11 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
     image_export_mode: Annotated[
         ImageRefMode,
         typer.Option(
@@ -279,12 +284,19 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
 
+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
+
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
             try:
                 # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                 input_doc_paths.append(source)
             except FileNotFoundError:
                 err_console.print(
@@ -390,7 +402,7 @@ def convert(
         start_time = time.time()
 
         conv_results = doc_converter.convert_all(
-            input_doc_paths, raises_on_error=abort_on_error
+            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
         )
 
         output.mkdir(parents=True, exist_ok=True)

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -4,6 +4,7 @@
 from docling_core.types.doc import (
     BoundingBox,
     DocItemLabel,
+    NodeItem,
     PictureDataType,
     Size,
     TableCell,
@@ -201,6 +202,13 @@ class AssembledUnit(BaseModel):
     headers: List[PageElement] = []
 
 
+class ItemAndImageEnrichmentElement(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    item: NodeItem
+    image: Image
+
+
 class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -219,12 +227,28 @@ class Page(BaseModel):
         {}
     )  # Cache of images in different scales. By default it is cleared during assembling.
 
-    def get_image(self, scale: float = 1.0) -> Optional[Image]:
+    def get_image(
+        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+    ) -> Optional[Image]:
         if self._backend is None:
             return self._image_cache.get(scale, None)
+
         if not scale in self._image_cache:
-            self._image_cache[scale] = self._backend.get_page_image(scale=scale)
-        return self._image_cache[scale]
+            if cropbox is None:
+                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
+            else:
+                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
+
+        if cropbox is None:
+            return self._image_cache[scale]
+        else:
+            page_im = self._image_cache[scale]
+            assert self.size is not None
+            return page_im.crop(
+                cropbox.to_top_left_origin(page_height=self.size.height)
+                .scaled(scale=scale)
+                .as_tuple()
+            )
 
     @property
     def image(self) -> Optional[Image]:

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -227,13 +227,18 @@ def unload(self):
 class _DocumentConversionInput(BaseModel):
 
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
 
     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):
 
     use_gpu: Optional[bool] = None
 
-    confidence_threshold: float = 0.65
+    confidence_threshold: float = 0.5
 
     model_storage_directory: Optional[str] = None
     recog_network: Optional[str] = "standard"

diff --git a/docling/document_converter.py b/docling/document_converter.py
@@ -176,6 +176,7 @@ def initialize_pipeline(self, format: InputFormat):
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -185,13 +186,15 @@ def convert(
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            headers=headers,
         )
         return next(all_res)
 
     @validate_call(config=ConfigDict(strict=True))
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -201,8 +204,7 @@ def convert_all(
             max_file_size=max_file_size,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)