fix: correctly support ASCII85 in inline images (fixes: #1008)

pdfminer · Jul 10, 2024 · f39109e
1 parent 27771e0
commit f39109e
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
+- Inline image parsing fails when the stream data contains `EI\n` ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
 
 ### Removed
 

diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
@@ -25,6 +25,7 @@
     list_value,
     resolve1,
     stream_value,
+    LITERALS_ASCII85_DECODE,
 )
 from pdfminer.psexceptions import PSEOF, PSTypeError
 from pdfminer.psparser import (
@@ -331,11 +332,22 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 if len(objs) % 2 != 0:
                     error_msg = f"Invalid dictionary construct: {objs!r}"
                     raise PSTypeError(error_msg)
-                d = {literal_name(k): v for (k, v) in choplist(2, objs)}
-                (pos, data) = self.get_inline_data(pos + len(b"ID "))
+                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
+                eos = b"EI"
+                filter = d.get("F", None)
+                if filter is not None:
+                    if isinstance(filter, PSLiteral):
+                        filter = [filter]
+                    for f in LITERALS_ASCII85_DECODE:
+                        if f in filter:
+                            eos = b"~>"
+                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
+                if eos != b"EI":  # it may be necessary for decoding
+                    data += eos
                 obj = PDFStream(d, data)
                 self.push((pos, obj))
-                self.push((pos, self.KEYWORD_EI))
+                if eos == b"EI":  # otherwise it is still in the stream
+                    self.push((pos, self.KEYWORD_EI))
             except PSTypeError:
                 if settings.STRICT:
                     raise

diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
@@ -450,6 +450,8 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
             return i + 1
 
         elif self.oct:
+            chrcode = int(self.oct, 8)
+            assert chrcode < 256, "Invalid octal %s (%d)" % (self.oct, chrcode)
             self._curtoken += bytes((int(self.oct, 8),))
             self._parse1 = self._parse_string
             return i

diff --git a/samples/contrib/issue-1008-inline-ascii85.pdf b/samples/contrib/issue-1008-inline-ascii85.pdf
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
@@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
         filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
         image_files = self.extract_images(filepath)
         assert image_files[0].endswith("jpg")
+
+    def test_contrib_issue_1008_inline(self):
+        """Test for parsing and extracting inline images"""
+        filepath = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
+        image_files = self.extract_images(filepath)
+        assert len(image_files) == 23
+        assert all(x.endswith(".bmp") for x in image_files)