pdfminer · pietermarsman · Jul 7, 2024 · Jul 6, 2024 · Jul 7, 2024 · Jul 7, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+### Fixed
+
+- `TypeError` when PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))
+
+## [20240706]
+
 ### Added
 
 - Support for zipped jpeg's ([#938](https://github.com/pdfminer/pdfminer.six/pull/938))

diff --git a/pdfminer/casting.py b/pdfminer/casting.py
@@ -0,0 +1,8 @@
+from typing import Optional, Any
+
+
+def safe_int(o: Any) -> Optional[int]:
+    try:
+        return int(o)
+    except (TypeError, ValueError):
+        return None
diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
@@ -3,13 +3,14 @@
 from typing import BinaryIO, TYPE_CHECKING, Optional, Union
 
 from . import settings
+from .casting import safe_int
 from .pdftypes import PDFObjRef
 from .pdfexceptions import PDFException
 from .pdftypes import PDFStream
 from .pdftypes import dict_value
 from .pdftypes import int_value
 from .psparser import KWD
-from .psexceptions import PSEOF, PSSyntaxError
+from .psexceptions import PSEOF
 from .psparser import PSKeyword
 from .psparser import PSStackParser
 
@@ -73,14 +74,12 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
         elif token is self.KEYWORD_R:
             # reference to indirect object
             if len(self.curstack) >= 2:
-                try:
-                    ((_, objid), (_, genno)) = self.pop(2)
-                    (objid, genno) = (int(objid), int(genno))  # type: ignore[arg-type]
-                    assert self.doc is not None
-                    obj = PDFObjRef(self.doc, objid, genno)
+                (_, _object_id), _ = self.pop(2)
+                object_id = safe_int(_object_id)
+                if object_id is not None:
+                    obj = PDFObjRef(self.doc, object_id)
                     self.push((pos, obj))
-                except PSSyntaxError:
-                    pass
+
         elif token is self.KEYWORD_STREAM:
             # stream object
             ((_, dic),) = self.pop(1)
@@ -157,19 +156,19 @@ def flush(self) -> None:
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
         if token is self.KEYWORD_R:
             # reference to indirect object
-            try:
-                ((_, objid), (_, genno)) = self.pop(2)
-                (objid, genno) = (int(objid), int(genno))  # type: ignore[arg-type]
-                obj = PDFObjRef(self.doc, objid, genno)
+            (_, _object_id), _ = self.pop(2)
+            object_id = safe_int(_object_id)
+            if object_id is not None:
+                obj = PDFObjRef(self.doc, object_id)
                 self.push((pos, obj))
-            except PSSyntaxError:
-                pass
             return
+
         elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
             if settings.STRICT:
                 # See PDF Spec 3.4.6: Only the object values are stored in the
                 # stream; the obj and endobj keywords are not used.
                 raise PDFSyntaxError("Keyword endobj found in stream")
             return
+
         # others
         self.push((pos, token))
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
@@ -13,6 +13,7 @@
     Tuple,
     cast,
 )
+from warnings import warn
 
 from . import settings, pdfexceptions
 from .ascii85 import ascii85decode
@@ -66,12 +67,30 @@ class PDFObject(PSObject):
 PDFObjectNotFound = pdfexceptions.PDFObjectNotFound
 PDFNotImplementedError = pdfexceptions.PDFNotImplementedError
 
+_DEFAULT = object()
+
 
 class PDFObjRef(PDFObject):
-    def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object) -> None:
+    def __init__(
+        self, doc: Optional["PDFDocument"], objid: int, _: Any = _DEFAULT
+    ) -> None:
+        """Reference to a PDF object.
+
+        :param doc: The PDF document.
+        :param objid: The object number.
+        :param _: Unused argument for backwards compatibility.
+        """
+        if _ is not _DEFAULT:
+            warn(
+                "The third argument of PDFObjRef is unused and will be removed after "
+                "2024",
+                DeprecationWarning,
+            )
+
         if objid == 0:
             if settings.STRICT:
                 raise PDFValueError("PDF object id cannot be 0.")
+
         self.doc = doc
         self.objid = objid