diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9c39d2c..5425c5d3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
+- inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
 
 ### Removed
 
diff --git a/pdfminer/image.py b/pdfminer/image.py
index 898f2707..355c7fb7 100644
--- a/pdfminer/image.py
+++ b/pdfminer/image.py
@@ -16,6 +16,8 @@
     LITERAL_DEVICE_CMYK,
     LITERAL_DEVICE_GRAY,
     LITERAL_DEVICE_RGB,
+    LITERAL_INLINE_DEVICE_GRAY,
+    LITERAL_INLINE_DEVICE_RGB,
 )
 from pdfminer.pdfexceptions import PDFValueError
 from pdfminer.pdftypes import (
@@ -125,10 +127,16 @@ def export_image(self, image: LTImage) -> str:
         elif image.bits == 1:
             name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)
 
-        elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
+        elif image.bits == 8 and (
+            LITERAL_DEVICE_RGB in image.colorspace
+            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
+        ):
             name = self._save_bmp(image, width, height, width * 3, image.bits * 3)
 
-        elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
+        elif image.bits == 8 and (
+            LITERAL_DEVICE_GRAY in image.colorspace
+            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
+        ):
             name = self._save_bmp(image, width, height, width, image.bits)
 
         elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py
index da402715..6344b01f 100644
--- a/pdfminer/pdfcolor.py
+++ b/pdfminer/pdfcolor.py
@@ -6,6 +6,10 @@
 LITERAL_DEVICE_GRAY = LIT("DeviceGray")
 LITERAL_DEVICE_RGB = LIT("DeviceRGB")
 LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
+# Abbreviations for inline images
+LITERAL_INLINE_DEVICE_GRAY = LIT("G")
+LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
+LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")
 
 
 class PDFColorSpace:
diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
index 869b6719..ae1c46a7 100644
--- a/pdfminer/pdfinterp.py
+++ b/pdfminer/pdfinterp.py
@@ -19,6 +19,7 @@
 )
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdftypes import (
+    LITERALS_ASCII85_DECODE,
     PDFObjRef,
     PDFStream,
     dict_value,
@@ -331,11 +332,24 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 if len(objs) % 2 != 0:
                     error_msg = f"Invalid dictionary construct: {objs!r}"
                     raise PSTypeError(error_msg)
-                d = {literal_name(k): v for (k, v) in choplist(2, objs)}
-                (pos, data) = self.get_inline_data(pos + len(b"ID "))
+                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
+                # Inline data is scanned up to "EI" by default; when the first
+                # filter is ASCII85, scan up to its "~>" marker instead.
+                eos = b"EI"
+                filters = d.get("F")
+                if isinstance(filters, PSLiteral):
+                    filters = [filters]
+                # A missing, empty or non-list /F falls back to the "EI" default.
+                first = filters[0] if isinstance(filters, list) and filters else None
+                if first in LITERALS_ASCII85_DECODE:
+                    eos = b"~>"
+                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
+                if eos != b"EI":  # it may be necessary for decoding
+                    data += eos
                 obj = PDFStream(d, data)
                 self.push((pos, obj))
-                self.push((pos, self.KEYWORD_EI))
+                if eos == b"EI":  # otherwise it is still in the stream
+                    self.push((pos, self.KEYWORD_EI))
             except PSTypeError:
                 if settings.STRICT:
                     raise
diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
index 80ec1e18..b4869560 100755
--- a/pdfminer/psparser.py
+++ b/pdfminer/psparser.py
@@ -450,7 +450,9 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
             return i + 1
 
         elif self.oct:
-            self._curtoken += bytes((int(self.oct, 8),))
+            chrcode = int(self.oct, 8)
+            assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
+            self._curtoken += bytes((chrcode,))
             self._parse1 = self._parse_string
             return i
 
diff --git a/samples/contrib/issue-1008-inline-ascii85.pdf b/samples/contrib/issue-1008-inline-ascii85.pdf
new file mode 100644
index 00000000..b23edc05
Binary files /dev/null and b/samples/contrib/issue-1008-inline-ascii85.pdf differ
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
index ff1e0cb6..e80a5e69 100644
--- a/tests/test_tools_pdf2txt.py
+++ b/tests/test_tools_pdf2txt.py
@@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
         filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
         image_files = self.extract_images(filepath)
         assert image_files[0].endswith("jpg")
+
+    def test_contrib_issue_1008_inline(self):
+        """Test for parsing and extracting inline images"""
+        filepath = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
+        image_files = self.extract_images(filepath)
+        assert len(image_files) == 23
+        assert all(x.endswith(".bmp") for x in image_files)