diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd210ff3..b9c39d2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `ValueError` when corrupt PDF specifies a negative xref location ([#980](http://github.com/pdfminer/pdfminer.six/pull/980))
 - `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
+- `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
 
 ### Removed
 
diff --git a/pdfminer/casting.py b/pdfminer/casting.py
index a8df21da..ac6bac5a 100644
--- a/pdfminer/casting.py
+++ b/pdfminer/casting.py
@@ -6,3 +6,10 @@ def safe_int(o: Any) -> Optional[int]:
         return int(o)
     except (TypeError, ValueError):
         return None
+
+
+def safe_float(o: Any) -> Optional[float]:
+    try:
+        return float(o)
+    except (TypeError, ValueError):
+        return None
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index 8ad06a7a..869b6719 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -4,10 +4,11 @@
 from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
 
 from pdfminer import settings
+from pdfminer.casting import safe_float
 from pdfminer.cmapdb import CMap, CMapBase, CMapDB
 from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
 from pdfminer.pdfdevice import PDFDevice, PDFTextSeq
-from pdfminer.pdfexceptions import PDFException
+from pdfminer.pdfexceptions import PDFException, PDFValueError
 from pdfminer.pdffont import (
     PDFCIDFont,
     PDFFont,
@@ -791,20 +792,44 @@ def do_Ts(self, rise: PDFStackT) -> None:
         self.textstate.rise = cast(float, rise)
 
     def do_Td(self, tx: PDFStackT, ty: PDFStackT) -> None:
-        """Move text position"""
-        tx = cast(float, tx)
-        ty = cast(float, ty)
-        (a, b, c, d, e, f) = self.textstate.matrix
-        self.textstate.matrix = (a, b, c, d, tx * a + ty * c + e, tx * b + ty * d + f)
+        """Move to the start of the next line.
+
+        Offset from the start of the current line by (tx, ty).
+        """
+        tx_ = safe_float(tx)
+        ty_ = safe_float(ty)
+        if tx_ is not None and ty_ is not None:
+            (a, b, c, d, e, f) = self.textstate.matrix
+            e_new = tx_ * a + ty_ * c + e
+            f_new = tx_ * b + ty_ * d + f
+            self.textstate.matrix = (a, b, c, d, e_new, f_new)
+
+        elif settings.STRICT:
+            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for Td")
+
         self.textstate.linematrix = (0, 0)
 
     def do_TD(self, tx: PDFStackT, ty: PDFStackT) -> None:
-        """Move text position and set leading"""
-        tx = cast(float, tx)
-        ty = cast(float, ty)
-        (a, b, c, d, e, f) = self.textstate.matrix
-        self.textstate.matrix = (a, b, c, d, tx * a + ty * c + e, tx * b + ty * d + f)
-        self.textstate.leading = ty
+        """Move to the start of the next line.
+
+        Offset from the start of the current line by (tx, ty). As a side effect, this
+        operator sets the leading parameter in the text state.
+        """
+        tx_ = safe_float(tx)
+        ty_ = safe_float(ty)
+
+        if tx_ is not None and ty_ is not None:
+            (a, b, c, d, e, f) = self.textstate.matrix
+            e_new = tx_ * a + ty_ * c + e
+            f_new = tx_ * b + ty_ * d + f
+            self.textstate.matrix = (a, b, c, d, e_new, f_new)
+
+        elif settings.STRICT:
+            raise PDFValueError(f"Invalid offset ({tx!r}, {ty!r}) for TD")
+
+        if ty_ is not None:
+            self.textstate.leading = ty_
+
         self.textstate.linematrix = (0, 0)
 
     def do_Tm(
@@ -961,7 +986,7 @@ def execute(self, streams: Sequence[object]) -> None:
         except PSEOF:
             # empty page
             return
-        while 1:
+        while True:
             try:
                 (_, obj) = parser.nextobject()
             except PSEOF: