Skip to content

Commit

Permalink
chore(document): capture logs from PDF python scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
jvallesm committed Feb 7, 2025
1 parent 0b3d3b8 commit 88ca817
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,27 @@
import base64
import sys
import re
from contextlib import redirect_stdout
import logging

# Docling imports
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.types.doc import ImageRefMode, PictureItem

if __name__ == "__main__":
# Capture warnings and errors. These are printed to stderr by default, which
# will prevent clients from unmarshalling the response.
conversion_logs = StringIO()
log_handler = logging.StreamHandler(conversion_logs)
log_handler.setLevel(logging.WARNING)

# Remove any existing handlers to avoid duplicate logging
logging.getLogger().handlers = []

# Add the handler to capture warnings/errors
logging.getLogger().addHandler(log_handler)

json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
display_image_tag = params["display-image-tag"]
Expand Down Expand Up @@ -55,18 +69,16 @@
)

# Process the PDF document
conversion_logs = StringIO()
with redirect_stdout(conversion_logs):
doc = converter.convert(source)

# Extract the markdown text per page
markdown_pages = [
doc.document.export_to_markdown(
page_no=i + 1,
image_mode=ImageRefMode.PLACEHOLDER
)
for i in range(doc.document.num_pages())
]
doc = converter.convert(source)

# Extract the markdown text per page
markdown_pages = [
doc.document.export_to_markdown(
page_no=i + 1,
image_mode=ImageRefMode.PLACEHOLDER
)
for i in range(doc.document.num_pages())
]

# Format the image placeholder according to current convention
image_counter = [0]
Expand Down Expand Up @@ -113,4 +125,4 @@ def replace_image(match):
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"system_error": str(e)}))
print(json.dumps({"system_error": str(e)}), file=sys.stderr)
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from io import BytesIO, StringIO
from contextlib import redirect_stdout
import json
import base64
import json
import logging
import sys
from io import BytesIO, StringIO

# TODO chuang8511:
# Deal with the import error when running the code in the docker container.
Expand All @@ -12,67 +12,76 @@


if __name__ == "__main__":
json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
display_image_tag = params["display-image-tag"]
display_all_page_image = params["display-all-page-image"]
pdf_string = params["PDF"]
if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None:
resolution = params["resolution"]
else:
resolution = 300
decoded_bytes = base64.b64decode(pdf_string)
pdf_file_obj = BytesIO(decoded_bytes)
pdf = PDFTransformer(pdf_file_obj, display_image_tag)
# Capture warnings and errors. These are printed to stderr by default, which
# will prevent clients from unmarshalling the response.
conversion_logs = StringIO()
log_handler = logging.StreamHandler(conversion_logs)
log_handler.setLevel(logging.WARNING)

# Remove any existing handlers to avoid duplicate logging
logging.getLogger().handlers = []

# Add the handler to capture warnings/errors
logging.getLogger().addHandler(log_handler)

result = ""
images = []
separator_number = 30
image_index = 0
errors = []
all_page_images = []
markdowns = []
json_str = sys.stdin.buffer.read().decode('utf-8')
params = json.loads(json_str)
display_image_tag = params["display-image-tag"]
display_all_page_image = params["display-all-page-image"]
pdf_string = params["PDF"]
if "resolution" in params and params["resolution"] != 0 and params["resolution"] != None:
resolution = params["resolution"]
else:
resolution = 300
decoded_bytes = base64.b64decode(pdf_string)
pdf_file_obj = BytesIO(decoded_bytes)
pdf = PDFTransformer(pdf_file_obj, display_image_tag)

try:
times = len(pdf.raw_pages) // separator_number + 1
for i in range(times):
pdf = PDFTransformer(x=pdf_file_obj, display_image_tag=display_image_tag, image_index=image_index, resolution=resolution)
if i == times - 1:
pdf.pages = pdf.raw_pages[i*separator_number:]
else:
pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number]
result = ""
images = []
separator_number = 30
image_index = 0
errors = []
all_page_images = []
markdowns = []

conversion_logs = StringIO()
with redirect_stdout(conversion_logs):
pdf.preprocess()
image_index = pdf.image_index
result += pdf.execute()
try:
times = len(pdf.raw_pages) // separator_number + 1
for i in range(times):
pdf = PDFTransformer(x=pdf_file_obj, display_image_tag=display_image_tag, image_index=image_index, resolution=resolution)
if i == times - 1:
pdf.pages = pdf.raw_pages[i*separator_number:]
else:
pdf.pages = pdf.raw_pages[i*separator_number:(i+1)*separator_number]

for image in pdf.base64_images:
images.append(image)
pdf.preprocess()
image_index = pdf.image_index
result += pdf.execute()

if display_all_page_image:
raw_pages = pdf.raw_pages
for image in pdf.base64_images:
images.append(image)

for page_number in pdf.page_numbers_with_images:
page = raw_pages[page_number - 1]
page_image = page.to_image(resolution=resolution)
encoded_image = PageImageProcessor.encode_image(page_image)
all_page_images.append(encoded_image)
if display_all_page_image:
raw_pages = pdf.raw_pages

errors += pdf.errors
for page_number in pdf.page_numbers_with_images:
page = raw_pages[page_number - 1]
page_image = page.to_image(resolution=resolution)
encoded_image = PageImageProcessor.encode_image(page_image)
all_page_images.append(encoded_image)

markdowns += pdf.markdowns
errors += pdf.errors
markdowns += pdf.markdowns

output = {
"body": result,
"images": images,
"parsing_error": errors,
"all_page_images": all_page_images,
"display_all_page_image": display_all_page_image,
"markdowns": markdowns,
"logs": conversion_logs.getvalue().splitlines(),
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"system_error": str(e)}))
output = {
"body": result,
"images": images,
"parsing_error": errors,
"all_page_images": all_page_images,
"display_all_page_image": display_all_page_image,
"markdowns": markdowns,
"logs": conversion_logs.getvalue().splitlines(),
}
print(json.dumps(output))
except Exception as e:
print(json.dumps({"system_error": str(e)}), file=sys.stderr)

0 comments on commit 88ca817

Please sign in to comment.