diff --git a/CHANGELOG.md b/CHANGELOG.md
index 78eeea95..8a1f128a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,8 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- `TypeError` when PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))])
-- `TypeError` when PDF literal cannot be converted to str ([#978](https://github.com/pdfminer/pdfminer.six/pull/978))
+- `TypeError` when corrupt PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))
+- `TypeError` when corrupt PDF literal cannot be converted to str ([#978](https://github.com/pdfminer/pdfminer.six/pull/978))
+- `ValueError` when corrupt PDF specifies a negative xref location ([#980](https://github.com/pdfminer/pdfminer.six/pull/980))
 
 ### Removed
 
diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
index bcacae6b..84713516 100644
--- a/pdfminer/pdfdocument.py
+++ b/pdfminer/pdfdocument.py
@@ -950,19 +950,26 @@ def get_dest(self, name: Union[str, bytes]) -> Any:
     def find_xref(self, parser: PDFParser) -> int:
         """Internal function used to locate the first XRef."""
         # search the last xref table by scanning the file backwards.
-        prev = None
+        prev = b""
         for line in parser.revreadlines():
             line = line.strip()
             log.debug("find_xref: %r", line)
+
             if line == b"startxref":
-                break
+                log.debug("xref found: pos=%r", prev)
+
+                # bytes.isdigit() accepts only a non-empty run of ASCII digits,
+                # so a missing, signed (negative) or otherwise corrupt offset
+                # is rejected here and int(prev) below is always >= 0.
+                if not prev.isdigit():
+                    raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
+
+                return int(prev)
+
             if line:
                 prev = line
-        else:
-            raise PDFNoValidXRef("Unexpected EOF")
-        log.debug("xref found: pos=%r", prev)
-        assert prev is not None
-        return int(prev)
+
+        raise PDFNoValidXRef("Unexpected EOF")
 
     # read xref table
     def read_xref_from(
diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
index 0839f1f0..8bebaf55 100755
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import io
 
 # -*- coding: utf-8 -*-
 
@@ -260,7 +261,7 @@ def revreadlines(self) -> Iterator[bytes]:
 
         This is used to locate the trailers at the end of a file.
         """
-        self.fp.seek(0, 2)
+        self.fp.seek(0, io.SEEK_END)
         pos = self.fp.tell()
         buf = b""
         while 0 < pos: