Merge pull request #192 from ServiceNow/dtremblay/add_docling

Use `docling` package with default PdfConverter
ServiceNow · Mar 6, 2025 · 07b660f · 07b660f
2 parents 62dc4da + 6abb531
commit 07b660f
Show file tree

Hide file tree

Showing 6 changed files with 444 additions and 38 deletions.
diff --git a/conf/convert_document.yaml b/conf/convert_document.yaml
@@ -0,0 +1,52 @@
+defaults:
+  - _self_
+  - llm: gpt4o
+
+environment:
+  _target_: tapeagents.environment.ToolCollectionEnvironment
+  tools:
+    - _target_: tapeagents.tools.document_reader.DocumentReader
+      # `get_class` resolves the dotted path to the converter class itself
+      # (not an instance); DocumentReader.preferred_pdf_converter is typed
+      # as a class (`type[PdfConverter | PdfMinerConverter]`).
+      preferred_pdf_converter:
+        _target_: hydra.utils.get_class
+        path: tapeagents.tools.converters.PdfConverter
+
+agent:
+  _target_: tapeagents.agent.Agent
+  name: document_agent
+  max_iterations: 2
+  llms:
+    default: ${llm}
+  templates:
+    system_prompt: |
+      You will help the user to extract information from files.
+      Use as many relevant tools as possible to include more details and facts in your responses.
+    allowed_tools: |
+      You have access to the following tools:
+      {tools_description}
+    thought_format: |
+      Important! Respond with the plain text, do not include any JSON or code.
+      Do not output anything besides what I asked in this message.
+    allowed_steps: |
+      You have access to the following tools:
+      {tools_description}
+      You are allowed to produce ONLY steps with the following JSON schemas:
+      {allowed_steps}
+      Do not reproduce the schema when producing steps; use it as a reference.
+    format: >
+      Output only a single JSON dict. 
+      DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON. 
+      It will break the system that processes the output.
+  nodes:
+    # Single node named `act`; `next_node: act` points it back at itself.
+    - _target_: tapeagents.nodes.StandardNode
+      name: act
+      system_prompt: ${agent.templates.system_prompt}
+      # The guidance ends by interpolating the shared `format` template.
+      guidance: |
+        You have access to tools to read and convert files that contain useful information. Never call the same tool twice.
+        The first step should be to simply read the data in the file.
+        The second step should be to return the data to the user.
+        ${agent.templates.format}
+      steps_prompt: ${agent.templates.allowed_steps}
+      # Only AssistantAnswer is listed explicitly as a step class here.
+      steps:
+        - tapeagents.dialog_tape.AssistantAnswer
+      use_known_actions: true
+      next_node: act
diff --git a/examples/convert_document.py b/examples/convert_document.py
@@ -0,0 +1,32 @@
+import argparse
+
+from hydra import compose, initialize
+from omegaconf import DictConfig
+
+from tapeagents.dialog_tape import DialogTape, UserStep
+from tapeagents.orchestrator import get_agent_and_env_from_config, main_loop
+
+
+def main(cfg: DictConfig, path: str) -> None:
+    """Run the configured agent on one document and print what happens.
+
+    Args:
+        cfg: Composed Hydra config used to build the agent and environment.
+        path: Document location, interpolated verbatim into the user request.
+    """
+    agent, env = get_agent_and_env_from_config(cfg)
+
+    print("Run the agent!")
+    # Seed the tape with a single user request naming the document to read.
+    for event in main_loop(
+        agent,
+        DialogTape() + [UserStep(content=f"Read and convert the document at `{path}` and return its results to me")],
+        env,
+    ):
+        # Print every agent step and every observation as indented JSON.
+        if ae := event.agent_event:
+            if ae.step:
+                print(ae.step.model_dump_json(indent=2))
+        if event.observation:
+            print(event.observation.model_dump_json(indent=2))
+
+
+# Script entry point: parse the document path, compose the config, run main().
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-path", "-i", type=str, required=True, help="Document to convert")
+    args = parser.parse_args()
+    # Compose the `convert_document` config from the `../conf` directory.
+    with initialize(version_base=None, config_path="../conf"):
+        cfg = compose(config_name="convert_document")
+    main(cfg, path=args.input_path)
diff --git a/pyproject.toml b/pyproject.toml
@@ -72,6 +72,7 @@ dev = [
 [project.optional-dependencies]
 converters = [
   "beautifulsoup4~=4.12",
+  "docling==2.15.0",
   "easyocr~=1.7",
   "ffmpeg-python~=0.2",
   "lxml[html-clean]~=5.2",
@@ -85,7 +86,7 @@ converters = [
   "puremagic~=1.26",
   "pydub~=0.25",
   "pyparsing~=3.1",
-  "python-pptx~=0.6",
+  "python-pptx~=1.0.2",
   "readability-lxml>=0.8",
   "webvtt-py~=0.5",
   "xlrd~=2.0",

diff --git a/tapeagents/tools/converters.py b/tapeagents/tools/converters.py
@@ -18,6 +18,7 @@
 import copy
 import html
 import json
+import logging
 import mimetypes
 import os
 import re
@@ -39,24 +40,36 @@
 from bs4 import BeautifulSoup
 from readability import Document
 
+logger = logging.getLogger(__name__)
+
 # Optional PDF support
-IS_PDF_CAPABLE = False
+IS_PDF_MINER_CAPABLE = False
 try:
     import pdfminer
     import pdfminer.high_level
 
-    IS_PDF_CAPABLE = True
-except ModuleNotFoundError:
-    pass
+    IS_PDF_MINER_CAPABLE = True
+except ModuleNotFoundError as e:
+    logger.warning(f"PDF conversion support via `pdfminer` not available: {str(e)}")
+
+IS_PDF_DOCLING_CAPABLE = False
+try:
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, TableStructureOptions
+    from docling.document_converter import DocumentConverter as DoclingDocumentConverter, PdfFormatOption
+
+    IS_PDF_DOCLING_CAPABLE = True
+except ModuleNotFoundError as e:
+    logger.warning(f"PDF conversion support via `docling` not available: {str(e)}")
 
 # Optional YouTube transcription support
 IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
 try:
     from youtube_transcript_api import YouTubeTranscriptApi
 
     IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
-except ModuleNotFoundError:
-    pass
+except ModuleNotFoundError as e:
+    logger.warning(f"YouTube transcript support via `youtube_transcript_api` not available: {str(e)}")
 
 
 class DocumentConverterResult:
@@ -307,7 +320,7 @@ def _findKey(self, json, key):
         return None
 
 
-class PdfConverter(DocumentConverter):
+class PdfMinerConverter(DocumentConverter):
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a PDF
         extension = kwargs.get("file_extension", "")
@@ -320,6 +333,26 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         )
 
 
+class PdfConverter(DocumentConverter):
+    """Convert PDF files to Markdown using `docling`.
+
+    Uses the docling names imported at module level inside a try/except;
+    they are only bound when that import succeeded (`IS_PDF_DOCLING_CAPABLE`).
+    """
+    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+        """Return the Markdown export of the PDF at `local_path`.
+
+        Args:
+            local_path: Path of the file, handed to docling's converter.
+            **kwargs: Only `file_extension` is read here.
+
+        Returns:
+            None when `file_extension` is not `.pdf` (case-insensitive);
+            otherwise a result with no title and the Markdown as text content.
+        """
+        # Bail if not a PDF
+        extension = kwargs.get("file_extension", "")
+        if extension.lower() != ".pdf":
+            return None
+        # Turn on table structure recognition with TableFormer's ACCURATE mode.
+        pipeline_options = PdfPipelineOptions(
+            do_table_structure=True, table_structure_options=TableStructureOptions(mode=TableFormerMode.ACCURATE)
+        )
+        # A new docling converter is built on every call.
+        converter = DoclingDocumentConverter(
+            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
+        )
+        result = converter.convert(local_path)
+        markdown = result.document.export_to_markdown()
+        return DocumentConverterResult(
+            title=None,
+            text_content=markdown,
+        )
+
+
 class DocxConverter(HtmlConverter):
     def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         # Bail if not a DOCX
@@ -606,6 +639,7 @@ def __init__(
         self,
         requests_session: Optional[requests.Session] = None,
         mlm_client: Optional[Any] = None,
+        preferred_pdf_converter: Optional[type[PdfConverter | PdfMinerConverter]] = PdfConverter,
     ):
         if requests_session is None:
             self._requests_session = requests.Session()
@@ -630,8 +664,10 @@ def __init__(
         self.register_page_converter(Mp3Converter())
         self.register_page_converter(ImageConverter())
 
-        if IS_PDF_CAPABLE:
+        if IS_PDF_DOCLING_CAPABLE and preferred_pdf_converter == PdfConverter:
             self.register_page_converter(PdfConverter())
+        elif IS_PDF_MINER_CAPABLE and preferred_pdf_converter == PdfMinerConverter:
+            self.register_page_converter(PdfMinerConverter())
 
     def convert(self, source, **kwargs):
         """

diff --git a/tapeagents/tools/document_reader.py b/tapeagents/tools/document_reader.py
@@ -1,17 +1,26 @@
-from typing import Literal
+from typing import Literal, Optional
 
 from pydantic import Field
 
 from tapeagents.core import Action, Observation
 from tapeagents.tools.base import Tool
-from tapeagents.tools.converters import FileConversionException, FileConverter, UnsupportedFormatException
-
-
-def read_document(path: str) -> tuple[str, str | None]:
+from tapeagents.tools.converters import (
+    FileConversionException,
+    FileConverter,
+    PdfConverter,
+    PdfMinerConverter,
+    UnsupportedFormatException,
+)
+
+
+def read_document(
+    path: str, preferred_pdf_converter: Optional[type[PdfConverter | PdfMinerConverter]] = PdfConverter
+) -> tuple[str, str | None]:
+    """Read a document, file or image and convert it to Markdown."""
     try:
         text = ""
         error = None
-        text = FileConverter().convert(path).text_content
+        text = FileConverter(preferred_pdf_converter=preferred_pdf_converter).convert(path).text_content
     except UnsupportedFormatException as e:
         error = f"Failed to read document {path}: {e}"
     except FileConversionException as e:
@@ -43,7 +52,8 @@ class DocumentReader(Tool):
 
     action: type[Action] = ReadLocalDocumentAction
     observation: type[Observation] = DocumentObservation
+    preferred_pdf_converter: Optional[type[PdfConverter | PdfMinerConverter]] = PdfConverter
 
     def execute_action(self, action: ReadLocalDocumentAction) -> DocumentObservation:
-        text, error = read_document(action.path)
+        text, error = read_document(action.path, self.preferred_pdf_converter)
         return DocumentObservation(text=text, error=error)