From 7023df52531fe9dc69064f5f21ac1877e59652d5 Mon Sep 17 00:00:00 2001
From: Pieter Marsman
Date: Tue, 9 Jul 2024 07:12:19 +0200
Subject: [PATCH] Explicitly parse rectangles to prevent errors downstream

---
 pdfminer/pdfpage.py | 15 +++++++++------
 pdfminer/utils.py   | 13 +++++++++++++
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/pdfminer/pdfpage.py b/pdfminer/pdfpage.py
index 7df4cf6b..cbc33aeb 100644
--- a/pdfminer/pdfpage.py
+++ b/pdfminer/pdfpage.py
@@ -2,12 +2,12 @@
 import logging
 from typing import BinaryIO, Container, Dict, Iterator, List, Optional, Tuple, Any
 
-from pdfminer.utils import Rect
+from pdfminer.utils import parse_rect
 from . import settings
 from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed, PDFNoPageLabels
 from .pdfparser import PDFParser
 from .pdftypes import dict_value
-from .pdfexceptions import PDFObjectNotFound
+from .pdfexceptions import PDFObjectNotFound, PDFValueError
 from .pdftypes import int_value
 from .pdftypes import list_value
 from .pdftypes import resolve1
@@ -63,11 +63,14 @@ def __init__(
         mediabox_params: List[Any] = [
             resolve1(mediabox_param) for mediabox_param in self.attrs["MediaBox"]
         ]
-        self.mediabox: Rect = resolve1(mediabox_params)
+        self.mediabox = parse_rect(resolve1(mediabox_params))
+        self.cropbox = self.mediabox
         if "CropBox" in self.attrs:
-            self.cropbox: Rect = resolve1(self.attrs["CropBox"])
-        else:
-            self.cropbox = self.mediabox
+            try:
+                self.cropbox = parse_rect(resolve1(self.attrs["CropBox"]))
+            except PDFValueError:
+                pass
+
         self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
         self.annots = self.attrs.get("Annots")
         self.beads = self.attrs.get("B")
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index fae1f643..439fb8df 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -238,6 +238,19 @@ def apply_png_predictor(
 MATRIX_IDENTITY: Matrix = (1, 0, 0, 1, 0, 0)
 
 
+def parse_rect(o: Any) -> Rect:
+    """Convert a four-element sequence into a rectangle of floats.
+
+    Raises PDFValueError when `o` cannot be unpacked into exactly four
+    values or when one of the values cannot be converted to float.
+    """
+    try:
+        (x0, y0, x1, y1) = o
+        return float(x0), float(y0), float(x1), float(y1)
+    except (TypeError, ValueError) as err:
+        raise PDFValueError("Could not parse rectangle") from err
+
+
 def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
     (a1, b1, c1, d1, e1, f1) = m1
     (a0, b0, c0, d0, e0, f0) = m0