diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9c39d2c..5425c5d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
+- Inline image parsing failure when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
 
 ### Removed
 
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index 869b6719..1e2c1fa4 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -25,6 +25,7 @@
     list_value,
     resolve1,
     stream_value,
+    LITERALS_ASCII85_DECODE,
 )
 from pdfminer.psexceptions import PSEOF, PSTypeError
 from pdfminer.psparser import (
@@ -331,11 +332,26 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 if len(objs) % 2 != 0:
                     error_msg = f"Invalid dictionary construct: {objs!r}"
                     raise PSTypeError(error_msg)
-                d = {literal_name(k): v for (k, v) in choplist(2, objs)}
-                (pos, data) = self.get_inline_data(pos + len(b"ID "))
+                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
+                # Inline data normally ends at the EI keyword, but ASCII85 data
+                # ends at its own "~>" marker, so that is searched for instead.
+                eos = b"EI"
+                first_filter = d.get("F")
+                if isinstance(first_filter, list):
+                    # Filters are applied in array order when decoding, so only
+                    # the first one describes the raw bytes that follow ID.
+                    first_filter = first_filter[0] if first_filter else None
+                if first_filter in LITERALS_ASCII85_DECODE:
+                    eos = b"~>"
+                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
+                if eos != b"EI":
+                    # Keep the "~>" marker: it may be necessary for decoding.
+                    data += eos
                 obj = PDFStream(d, data)
                 self.push((pos, obj))
-                self.push((pos, self.KEYWORD_EI))
+                if eos == b"EI":
+                    # Otherwise EI is still in the stream.
+                    self.push((pos, self.KEYWORD_EI))
             except PSTypeError:
                 if settings.STRICT:
                     raise
diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
index 80ec1e18..67311174 100755
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -450,6 +450,10 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
             return i + 1
 
         elif self.oct:
-            self._curtoken += bytes((int(self.oct, 8),))
+            chrcode = int(self.oct, 8)
+            if chrcode >= 256:
+                # Explicit check rather than assert, which is stripped under -O.
+                raise ValueError("Invalid octal %s (%d)" % (self.oct, chrcode))
+            self._curtoken += bytes((chrcode,))
             self._parse1 = self._parse_string
             return i
diff --git a/samples/contrib/issue-1008-inline-ascii85.pdf b/samples/contrib/issue-1008-inline-ascii85.pdf
new file mode 100644
index 00000000..b23edc05
Binary files /dev/null and b/samples/contrib/issue-1008-inline-ascii85.pdf differ
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index ff1e0cb6..e80a5e69 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
         filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
         image_files = self.extract_images(filepath)
         assert image_files[0].endswith("jpg")
+
+    def test_contrib_issue_1008_inline(self):
+        """Test for parsing and extracting inline images"""
+        filepath = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
+        image_files = self.extract_images(filepath)
+        assert len(image_files) == 23
+        assert all(x.endswith(".bmp") for x in image_files)