From 88ca8172d6eedd715ff1f2bb8a6fbfbb14169a47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Vall=C3=A9s?= Date: Fri, 7 Feb 2025 09:14:08 +0100 Subject: [PATCH] chore(document): capture logs from PDF python scripts --- .../execution/docling_pdf_to_md_converter.py | 40 ++++-- .../pdfplumber_pdf_to_md_converter.py | 125 ++++++++++-------- 2 files changed, 93 insertions(+), 72 deletions(-) diff --git a/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py b/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py index 05252ed7d..e85be8c0f 100644 --- a/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py +++ b/pkg/component/operator/document/v0/transformer/execution/docling_pdf_to_md_converter.py @@ -4,13 +4,27 @@ import base64 import sys import re -from contextlib import redirect_stdout +import logging + +# Docling imports from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling_core.types.doc import ImageRefMode, PictureItem if __name__ == "__main__": + # Capture warnings and errors. These are printed to stderr by default, which + # will prevent clients from unmarshalling the response. + conversion_logs = StringIO() + log_handler = logging.StreamHandler(conversion_logs) + log_handler.setLevel(logging.WARNING) + + # Remove any existing handlers to avoid duplicate logging + logging.getLogger().handlers = [] + + # Add the handler to capture warnings/errors + logging.getLogger().addHandler(log_handler) + json_str = sys.stdin.buffer.read().decode('utf-8') params = json.loads(json_str) display_image_tag = params["display-image-tag"] @@ -55,18 +69,16 @@ ) # Process the PDF document - conversion_logs = StringIO() - with redirect_stdout(conversion_logs): - doc = converter.convert(source) - - # Extract the markdown text per page - markdown_pages = [ - doc.document.export_to_markdown( - page_no=i + 1, - image_mode=ImageRefMode.PLACEHOLDER - ) - for i in range(doc.document.num_pages()) - ] + doc = converter.convert(source) + + # Extract the markdown text per page + markdown_pages = [ + doc.document.export_to_markdown( + page_no=i + 1, + image_mode=ImageRefMode.PLACEHOLDER + ) + for i in range(doc.document.num_pages()) + ] # Format the image placeholder according to current convention image_counter = [0] @@ -113,4 +125,4 @@ def replace_image(match): } print(json.dumps(output)) except Exception as e: - print(json.dumps({"system_error": str(e)})) + print(json.dumps({"system_error": str(e)}), file=sys.stderr) diff --git a/pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py b/pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py index 552bab6cd..9d7fb5bf6 100644 --- a/pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py +++ b/pkg/component/operator/document/v0/transformer/execution/pdfplumber_pdf_to_md_converter.py @@ -1,8 +1,8 @@ -from io import BytesIO, StringIO -from contextlib import redirect_stdout -import json import base64 +import json +import logging import sys +from io import BytesIO, StringIO # TODO chuang8511: # Deal with the import error when running the code in the docker container. @@ -12,67 +12,76 @@ if __name__ == "__main__": - json_str = sys.stdin.buffer.read().decode('utf-8') - params = json.loads(json_str) - display_image_tag = params["display-image-tag"] - display_all_page_image = params["display-all-page-image"] - pdf_string = params["PDF"] - if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None: - resolution = params["resolution"] - else: - resolution = 300 - decoded_bytes = base64.b64decode(pdf_string) - pdf_file_obj = BytesIO(decoded_bytes) - pdf = PDFTransformer(pdf_file_obj, display_image_tag) + # Capture warnings and errors. These are printed to stderr by default, which + # will prevent clients from unmarshalling the response. + conversion_logs = StringIO() + log_handler = logging.StreamHandler(conversion_logs) + log_handler.setLevel(logging.WARNING) + + # Remove any existing handlers to avoid duplicate logging + logging.getLogger().handlers = [] + + # Add the handler to capture warnings/errors + logging.getLogger().addHandler(log_handler) - result = "" - images = [] - separator_number = 30 - image_index = 0 - errors = [] - all_page_images = [] - markdowns = [] + json_str = sys.stdin.buffer.read().decode('utf-8') + params = json.loads(json_str) + display_image_tag = params["display-image-tag"] + display_all_page_image = params["display-all-page-image"] + pdf_string = params["PDF"] + if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None: + resolution = params["resolution"] + else: + resolution = 300 + decoded_bytes = base64.b64decode(pdf_string) + pdf_file_obj = BytesIO(decoded_bytes) + pdf = PDFTransformer(pdf_file_obj, display_image_tag) - try: - times = len(pdf.raw_pages) // separator_number + 1 - for i in range(times): - pdf = PDFTransformer(x=pdf_file_obj, display_image_tag=display_image_tag, image_index=image_index, resolution=resolution) - if i == times - 1: - pdf.pages = pdf.raw_pages[i*separator_number:] - else: - pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number] + result = "" + images = [] + separator_number = 30 + image_index = 0 + errors = [] + all_page_images = [] + markdowns = [] - conversion_logs = StringIO() - with redirect_stdout(conversion_logs): - pdf.preprocess() - image_index = pdf.image_index - result += pdf.execute() + try: + times = len(pdf.raw_pages) // separator_number + 1 + for i in range(times): + pdf = PDFTransformer(x=pdf_file_obj, display_image_tag=display_image_tag, image_index=image_index, resolution=resolution) + if i == times - 1: + pdf.pages = pdf.raw_pages[i*separator_number:] + else: + pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number] - for image in pdf.base64_images: - images.append(image) + pdf.preprocess() + image_index = pdf.image_index + result += pdf.execute() - if display_all_page_image: - raw_pages = pdf.raw_pages + for image in pdf.base64_images: + images.append(image) - for page_number in pdf.page_numbers_with_images: - page = raw_pages[page_number - 1] - page_image = page.to_image(resolution=resolution) - encoded_image = PageImageProcessor.encode_image(page_image) - all_page_images.append(encoded_image) + if display_all_page_image: + raw_pages = pdf.raw_pages - errors += pdf.errors + for page_number in pdf.page_numbers_with_images: + page = raw_pages[page_number - 1] + page_image = page.to_image(resolution=resolution) + encoded_image = PageImageProcessor.encode_image(page_image) + all_page_images.append(encoded_image) - markdowns += pdf.markdowns + errors += pdf.errors + markdowns += pdf.markdowns - output = { - "body": result, - "images": images, - "parsing_error": errors, - "all_page_images": all_page_images, - "display_all_page_image": display_all_page_image, - "markdowns": markdowns, - "logs": conversion_logs.getvalue().splitlines(), - } - print(json.dumps(output)) - except Exception as e: - print(json.dumps({"system_error": str(e)})) + output = { + "body": result, + "images": images, + "parsing_error": errors, + "all_page_images": all_page_images, + "display_all_page_image": display_all_page_image, + "markdowns": markdowns, + "logs": conversion_logs.getvalue().splitlines(), + } + print(json.dumps(output)) + except Exception as e: + print(json.dumps({"system_error": str(e)}), file=sys.stderr)