From b9b75ff85877b7cd373539c79014cbde39508969 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Sun, 7 Jul 2024 17:06:01 +0200 Subject: [PATCH] Safely cast object id to integer in PDFParser to prevent TypeError and ValueError. (#972) --- CHANGELOG.md | 6 ++++++ pdfminer/casting.py | 8 ++++++++ pdfminer/pdfparser.py | 27 +++++++++++++-------------- pdfminer/pdftypes.py | 21 ++++++++++++++++++++- 4 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 pdfminer/casting.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 55760958..7055a726 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,12 @@ All notable changes in pdfminer.six will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [Unreleased] + +### Fixed + +- `TypeError` when PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972)) + ## [20240706] ### Added diff --git a/pdfminer/casting.py b/pdfminer/casting.py new file mode 100644 index 00000000..c73dfa6b --- /dev/null +++ b/pdfminer/casting.py @@ -0,0 +1,8 @@ +from typing import Optional, Any + + +def safe_int(o: Any) -> Optional[int]: + try: + return int(o) + except (TypeError, ValueError): + return None diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py index b4f2d572..d8772918 100644 --- a/pdfminer/pdfparser.py +++ b/pdfminer/pdfparser.py @@ -3,13 +3,14 @@ from typing import BinaryIO, TYPE_CHECKING, Optional, Union from . 
import settings +from .casting import safe_int from .pdftypes import PDFObjRef from .pdfexceptions import PDFException from .pdftypes import PDFStream from .pdftypes import dict_value from .pdftypes import int_value from .psparser import KWD -from .psexceptions import PSEOF, PSSyntaxError +from .psexceptions import PSEOF from .psparser import PSKeyword from .psparser import PSStackParser @@ -73,14 +74,12 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None: elif token is self.KEYWORD_R: # reference to indirect object if len(self.curstack) >= 2: - try: - ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type] - assert self.doc is not None - obj = PDFObjRef(self.doc, objid, genno) + (_, _object_id), _ = self.pop(2) + object_id = safe_int(_object_id) + if object_id is not None: + obj = PDFObjRef(self.doc, object_id) self.push((pos, obj)) - except PSSyntaxError: - pass + elif token is self.KEYWORD_STREAM: # stream object ((_, dic),) = self.pop(1) @@ -157,19 +156,19 @@ def flush(self) -> None: def do_keyword(self, pos: int, token: PSKeyword) -> None: if token is self.KEYWORD_R: # reference to indirect object - try: - ((_, objid), (_, genno)) = self.pop(2) - (objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type] - obj = PDFObjRef(self.doc, objid, genno) + (_, _object_id), _ = self.pop(2) + object_id = safe_int(_object_id) + if object_id is not None: + obj = PDFObjRef(self.doc, object_id) self.push((pos, obj)) - except PSSyntaxError: - pass return + elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ): if settings.STRICT: # See PDF Spec 3.4.6: Only the object values are stored in the # stream; the obj and endobj keywords are not used. 
raise PDFSyntaxError("Keyword endobj found in stream") return + # others self.push((pos, token)) diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index 1fb5be38..f062ab5e 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -13,6 +13,7 @@ Tuple, cast, ) +from warnings import warn from . import settings, pdfexceptions from .ascii85 import ascii85decode @@ -66,12 +67,30 @@ class PDFObject(PSObject): PDFObjectNotFound = pdfexceptions.PDFObjectNotFound PDFNotImplementedError = pdfexceptions.PDFNotImplementedError +_DEFAULT = object() + class PDFObjRef(PDFObject): - def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object) -> None: + def __init__( + self, doc: Optional["PDFDocument"], objid: int, _: Any = _DEFAULT + ) -> None: + """Reference to a PDF object. + + :param doc: The PDF document. + :param objid: The object number. + :param _: Unused argument for backwards compatibility. + """ + if _ is not _DEFAULT: + warn( + "The third argument of PDFObjRef is unused and will be removed after " + "2024", + DeprecationWarning, + ) + if objid == 0: if settings.STRICT: raise PDFValueError("PDF object id cannot be 0.") + self.doc = doc self.objid = objid