Skip to content

Commit

Permalink
Safely cast object id to integer in PDFParser to prevent TypeError an…
Browse files Browse the repository at this point in the history
…d ValueError. (#972)
  • Loading branch information
pietermarsman authored Jul 7, 2024
1 parent fcfbcd3 commit b9b75ff
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 15 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ All notable changes in pdfminer.six will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

### Fixed

- `TypeError` when PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))

## [20240706]

### Added
Expand Down
8 changes: 8 additions & 0 deletions pdfminer/casting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from typing import Optional, Any


def safe_int(o: Any) -> Optional[int]:
    """Convert *o* to an ``int``, returning ``None`` when that is impossible.

    :param o: Any value; it is handed to the built-in ``int()`` unchanged.
    :return: ``int(o)`` on success, otherwise ``None``.

    Floats are truncated towards zero by ``int()``, so ``safe_int(3.9)``
    is ``3``.
    """
    try:
        return int(o)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None or an arbitrary object).
        # ValueError: unparsable text, or float NaN.
        # OverflowError: float infinity, which int() cannot represent.
        return None
27 changes: 13 additions & 14 deletions pdfminer/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
from typing import BinaryIO, TYPE_CHECKING, Optional, Union

from . import settings
from .casting import safe_int
from .pdftypes import PDFObjRef
from .pdfexceptions import PDFException
from .pdftypes import PDFStream
from .pdftypes import dict_value
from .pdftypes import int_value
from .psparser import KWD
from .psexceptions import PSEOF, PSSyntaxError
from .psexceptions import PSEOF
from .psparser import PSKeyword
from .psparser import PSStackParser

Expand Down Expand Up @@ -73,14 +74,12 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
elif token is self.KEYWORD_R:
# reference to indirect object
if len(self.curstack) >= 2:
try:
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
assert self.doc is not None
obj = PDFObjRef(self.doc, objid, genno)
(_, _object_id), _ = self.pop(2)
object_id = safe_int(_object_id)
if object_id is not None:
obj = PDFObjRef(self.doc, object_id)
self.push((pos, obj))
except PSSyntaxError:
pass

elif token is self.KEYWORD_STREAM:
# stream object
((_, dic),) = self.pop(1)
Expand Down Expand Up @@ -157,19 +156,19 @@ def flush(self) -> None:
def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_R:
# reference to indirect object
try:
((_, objid), (_, genno)) = self.pop(2)
(objid, genno) = (int(objid), int(genno)) # type: ignore[arg-type]
obj = PDFObjRef(self.doc, objid, genno)
(_, _object_id), _ = self.pop(2)
object_id = safe_int(_object_id)
if object_id is not None:
obj = PDFObjRef(self.doc, object_id)
self.push((pos, obj))
except PSSyntaxError:
pass
return

elif token in (self.KEYWORD_OBJ, self.KEYWORD_ENDOBJ):
if settings.STRICT:
# See PDF Spec 3.4.6: Only the object values are stored in the
# stream; the obj and endobj keywords are not used.
raise PDFSyntaxError("Keyword endobj found in stream")
return

# others
self.push((pos, token))
21 changes: 20 additions & 1 deletion pdfminer/pdftypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Tuple,
cast,
)
from warnings import warn

from . import settings, pdfexceptions
from .ascii85 import ascii85decode
Expand Down Expand Up @@ -66,12 +67,30 @@ class PDFObject(PSObject):
PDFObjectNotFound = pdfexceptions.PDFObjectNotFound
PDFNotImplementedError = pdfexceptions.PDFNotImplementedError

_DEFAULT = object()


class PDFObjRef(PDFObject):
def __init__(self, doc: Optional["PDFDocument"], objid: int, _: object) -> None:
def __init__(
self, doc: Optional["PDFDocument"], objid: int, _: Any = _DEFAULT
) -> None:
"""Reference to a PDF object.
:param doc: The PDF document.
:param objid: The object number.
:param _: Unused argument for backwards compatibility.
"""
if _ is not _DEFAULT:
warn(
"The third argument of PDFObjRef is unused and will be removed after "
"2024",
DeprecationWarning,
)

if objid == 0:
if settings.STRICT:
raise PDFValueError("PDF object id cannot be 0.")

self.doc = doc
self.objid = objid

Expand Down

0 comments on commit b9b75ff

Please sign in to comment.