Skip to content

Commit

Permalink
chore(document): capture logs from PDF python scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
jvallesm committed Feb 7, 2025
1 parent 113c4de commit f4df57d
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,27 @@
import base64
import sys
import re
from contextlib import redirect_stdout
import logging

# Docling imports
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode, PictureItem

if __name__ == "__main__":
# Capture log records at WARNING level and above in an in-memory buffer.
# By default these are printed to stderr, which will prevent clients from
# unmarshalling the response.
conversion_logs = StringIO()
log_handler = logging.StreamHandler(conversion_logs)
log_handler.setLevel(logging.WARNING)

# Remove any existing handlers to avoid duplicate logging
logging.getLogger().handlers = []

# Add the handler to capture warnings/errors
logging.getLogger().addHandler(log_handler)

json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
display_image_tag = params["display-image-tag"]
Expand Down Expand Up @@ -55,17 +69,15 @@
)

# Process the PDF document
conversion_logs = StringIO()
with redirect_stdout(conversion_logs):
doc = converter.convert(source)

# Extract the markdown text per page
markdown_pages = [
doc.document.export_to_markdown(
page_no=i + 1,
image_mode=ImageRefMode.PLACEHOLDER
)
for i in range(doc.document.num_pages())
doc = converter.convert(source)

# Extract the markdown text per page
markdown_pages = [
doc.document.export_to_markdown(
page_no=i + 1,
image_mode=ImageRefMode.PLACEHOLDER
)
for i in range(doc.document.num_pages())
]

# Format the image placeholder according to current convention
Expand Down Expand Up @@ -113,4 +125,4 @@ def replace_image(match):
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"system_error": str(e)}))
print(json.dumps({"system_error": str(e)}), file=sys.stderr)
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from io import BytesIO, StringIO
from contextlib import redirect_stdout
import json
# Standard library imports
import base64
import json
import logging
import sys
from io import BytesIO, StringIO

# TODO chuang8511:
# Deal with the import error when running the code in the docker container.
Expand All @@ -12,11 +13,23 @@


if __name__ == "__main__":
json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
display_image_tag = params["display-image-tag"]
display_all_page_image = params["display-all-page-image"]
pdf_string = params["PDF"]
# Capture log records at WARNING level and above in an in-memory buffer.
# By default these are printed to stderr, which will prevent clients from
# unmarshalling the response.
conversion_logs = StringIO()
log_handler = logging.StreamHandler(conversion_logs)
log_handler.setLevel(logging.WARNING)

# Remove any existing handlers to avoid duplicate logging
logging.getLogger().handlers = []

# Add the handler to capture warnings/errors
logging.getLogger().addHandler(log_handler)

json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
display_image_tag = params["display-image-tag"]
display_all_page_image = params["display-all-page-image"]
pdf_string = params["PDF"]
if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None:
resolution = params["resolution"]
else:
Expand All @@ -42,11 +55,9 @@
else:
pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number]

conversion_logs = StringIO()
with redirect_stdout(conversion_logs):
pdf.preprocess()
image_index = pdf.image_index
result += pdf.execute()
pdf.preprocess()
image_index = pdf.image_index
result += pdf.execute()

for image in pdf.base64_images:
images.append(image)
Expand Down Expand Up @@ -75,4 +86,4 @@
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"system_error": str(e)}))
print(json.dumps({"system_error": str(e)}), file=sys.stderr)

0 comments on commit f4df57d

Please sign in to comment.