Skip to content

Commit

Permalink
Merge pull request #192 from ServiceNow/dtremblay/add_docling
Browse files Browse the repository at this point in the history
Use `docling` package with default PdfConverter
  • Loading branch information
danieltremblay authored Mar 6, 2025
2 parents 62dc4da + 6abb531 commit 07b660f
Show file tree
Hide file tree
Showing 6 changed files with 444 additions and 38 deletions.
52 changes: 52 additions & 0 deletions conf/convert_document.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
defaults:
- _self_
- llm: gpt4o

environment:
_target_: tapeagents.environment.ToolCollectionEnvironment
tools:
- _target_: tapeagents.tools.document_reader.DocumentReader
preferred_pdf_converter:
_target_: hydra.utils.get_class
path: tapeagents.tools.converters.PdfConverter

agent:
_target_: tapeagents.agent.Agent
name: document_agent
max_iterations: 2
llms:
default: ${llm}
templates:
system_prompt: |
You will help the user to extract information from files.
Use as many relevant tools as possible to include more details and facts in your responses.
allowed_tools: |
You have access to the following tools:
{tools_description}
thought_format: |
Important! Respond with the plain text, do not include any JSON or code.
Do not output anything besides what I asked in this message.
allowed_steps: |
You have access to the following tools:
{tools_description}
You are allowed to produce ONLY steps with the following JSON schemas:
{allowed_steps}
Do not reproduce the schema when producing steps; use it as a reference.
format: >
Output only a single JSON dict.
DO NOT OUTPUT ANYTHING BESIDES THE JSON! DO NOT PLACE ANY COMMENTS INSIDE THE JSON.
It will break the system that processes the output.
nodes:
- _target_: tapeagents.nodes.StandardNode
name: act
system_prompt: ${agent.templates.system_prompt}
guidance: |
You have access to tools to read and convert files that contain useful information. Never call the same tool twice.
The first step should be to simply read the data in the file.
The second step should be to return the data to the user.
${agent.templates.format}
steps_prompt: ${agent.templates.allowed_steps}
steps:
- tapeagents.dialog_tape.AssistantAnswer
use_known_actions: true
next_node: act
32 changes: 32 additions & 0 deletions examples/convert_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import argparse

from hydra import compose, initialize
from omegaconf import DictConfig

from tapeagents.dialog_tape import DialogTape, UserStep
from tapeagents.orchestrator import get_agent_and_env_from_config, main_loop


def main(cfg: DictConfig, path: str) -> None:
    """Run the document agent on a single file and print everything it produces.

    Args:
        cfg: Configuration handed to ``get_agent_and_env_from_config`` to build
            the agent and its environment.
        path: Location of the document; it is interpolated into the user
            request that seeds the tape.
    """
    agent, env = get_agent_and_env_from_config(cfg)

    print("Run the agent!")
    request = UserStep(content=f"Read and convert the document at `{path}` and return its results to me")
    start_tape = DialogTape() + [request]
    for event in main_loop(agent, start_tape, env):
        # Agent-side events: dump the step, when the event carries one.
        agent_event = event.agent_event
        if agent_event and agent_event.step:
            print(agent_event.step.model_dump_json(indent=2))
        # Environment-side events: dump the observation.
        if event.observation:
            print(event.observation.model_dump_json(indent=2))


if __name__ == "__main__":
    # Script entry point: the document path is the only (required) CLI option.
    cli = argparse.ArgumentParser()
    cli.add_argument("--input-path", "-i", type=str, required=True, help="Document to convert")
    options = cli.parse_args()

    # Compose the `convert_document` config from the `../conf` directory and
    # run the agent while the Hydra context is active.
    with initialize(version_base=None, config_path="../conf"):
        config = compose(config_name="convert_document")
        main(config, path=options.input_path)
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dev = [
[project.optional-dependencies]
converters = [
"beautifulsoup4~=4.12",
"docling==2.15.0",
"easyocr~=1.7",
"ffmpeg-python~=0.2",
"lxml[html-clean]~=5.2",
Expand All @@ -85,7 +86,7 @@ converters = [
"puremagic~=1.26",
"pydub~=0.25",
"pyparsing~=3.1",
"python-pptx~=0.6",
"python-pptx~=1.0.2",
"readability-lxml>=0.8",
"webvtt-py~=0.5",
"xlrd~=2.0",
Expand Down
52 changes: 44 additions & 8 deletions tapeagents/tools/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import copy
import html
import json
import logging
import mimetypes
import os
import re
Expand All @@ -39,24 +40,36 @@
from bs4 import BeautifulSoup
from readability import Document

logger = logging.getLogger(__name__)

# Optional PDF support
IS_PDF_CAPABLE = False
IS_PDF_MINER_CAPABLE = False
try:
import pdfminer
import pdfminer.high_level

IS_PDF_CAPABLE = True
except ModuleNotFoundError:
pass
IS_PDF_MINER_CAPABLE = True
except ModuleNotFoundError as e:
logger.warning(f"PDF conversion support via `pdfminer` not available: {str(e)}")

IS_PDF_DOCLING_CAPABLE = False
try:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode, TableStructureOptions
from docling.document_converter import DocumentConverter as DoclingDocumentConverter, PdfFormatOption

IS_PDF_DOCLING_CAPABLE = True
except ModuleNotFoundError as e:
logger.warning(f"PDF conversion support via `docling` not available: {str(e)}")

# Optional YouTube transcription support
IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
try:
from youtube_transcript_api import YouTubeTranscriptApi

IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
pass
except ModuleNotFoundError as e:
logger.warning(f"YouTube transcript support via `youtube_transcript_api` not available: {str(e)}")


class DocumentConverterResult:
Expand Down Expand Up @@ -307,7 +320,7 @@ def _findKey(self, json, key):
return None


class PdfConverter(DocumentConverter):
class PdfMinerConverter(DocumentConverter):
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
Expand All @@ -320,6 +333,26 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
)


class PdfConverter(DocumentConverter):
    """PDF-to-Markdown converter backed by the `docling` document converter."""

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert the PDF at ``local_path`` to Markdown.

        Args:
            local_path: Path of the file to convert.
            **kwargs: Only ``file_extension`` is read here; it decides whether
                this converter handles the file at all.

        Returns:
            ``None`` when ``file_extension`` is not ``.pdf`` (compared
            case-insensitively); otherwise a ``DocumentConverterResult`` with
            no title and the Markdown export as its text content.
        """
        # Not a PDF: decline the file.
        if kwargs.get("file_extension", "").lower() != ".pdf":
            return None

        # Table structure recognition is enabled with the ACCURATE TableFormer mode.
        table_options = TableStructureOptions(mode=TableFormerMode.ACCURATE)
        pdf_options = PdfFormatOption(
            pipeline_options=PdfPipelineOptions(do_table_structure=True, table_structure_options=table_options)
        )
        docling_converter = DoclingDocumentConverter(format_options={InputFormat.PDF: pdf_options})

        document = docling_converter.convert(local_path).document
        return DocumentConverterResult(title=None, text_content=document.export_to_markdown())


class DocxConverter(HtmlConverter):
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
Expand Down Expand Up @@ -606,6 +639,7 @@ def __init__(
self,
requests_session: Optional[requests.Session] = None,
mlm_client: Optional[Any] = None,
preferred_pdf_converter: Optional[type[PdfConverter | PdfMinerConverter]] = PdfConverter,
):
if requests_session is None:
self._requests_session = requests.Session()
Expand All @@ -630,8 +664,10 @@ def __init__(
self.register_page_converter(Mp3Converter())
self.register_page_converter(ImageConverter())

if IS_PDF_CAPABLE:
if IS_PDF_DOCLING_CAPABLE and preferred_pdf_converter == PdfConverter:
self.register_page_converter(PdfConverter())
elif IS_PDF_MINER_CAPABLE and preferred_pdf_converter == PdfMinerConverter:
self.register_page_converter(PdfMinerConverter())

def convert(self, source, **kwargs):
"""
Expand Down
24 changes: 17 additions & 7 deletions tapeagents/tools/document_reader.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
from typing import Literal
from typing import Literal, Optional

from pydantic import Field

from tapeagents.core import Action, Observation
from tapeagents.tools.base import Tool
from tapeagents.tools.converters import FileConversionException, FileConverter, UnsupportedFormatException


def read_document(path: str) -> tuple[str, str | None]:
from tapeagents.tools.converters import (
FileConversionException,
FileConverter,
PdfConverter,
PdfMinerConverter,
UnsupportedFormatException,
)


def read_document(
path: str, preferred_pdf_converter: Optional[type[PdfConverter | PdfMinerConverter]] = PdfConverter
) -> tuple[str, str | None]:
"""Read a document, file or image and and convert it to Markdown."""
try:
text = ""
error = None
text = FileConverter().convert(path).text_content
text = FileConverter(preferred_pdf_converter=preferred_pdf_converter).convert(path).text_content
except UnsupportedFormatException as e:
error = f"Failed to read document {path}: {e}"
except FileConversionException as e:
Expand Down Expand Up @@ -43,7 +52,8 @@ class DocumentReader(Tool):

action: type[Action] = ReadLocalDocumentAction
observation: type[Observation] = DocumentObservation
preferred_pdf_converter: Optional[type[PdfConverter | PdfMinerConverter]] = PdfConverter

def execute_action(self, action: ReadLocalDocumentAction) -> DocumentObservation:
    """Read the document referenced by ``action`` and wrap the outcome in an observation.

    Args:
        action: Action whose ``path`` attribute names the document to read.

    Returns:
        A ``DocumentObservation`` carrying the ``text`` and ``error`` values
        returned by ``read_document``.
    """
    # Convert exactly once, honoring the tool's configured PDF converter.
    # A leftover earlier call without the converter argument converted the
    # document a second time and its result was immediately discarded.
    text, error = read_document(action.path, self.preferred_pdf_converter)
    return DocumentObservation(text=text, error=error)
Loading

0 comments on commit 07b660f

Please sign in to comment.