Skip to content

Commit

Permalink
fix: correctly support ASCII85 in inline images (fixes: #1008)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Jul 10, 2024
1 parent 27771e0 commit f39109e
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
- `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
- `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
- Inline image parsing fails when stream data contains `"EI\n"` ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))

### Removed

Expand Down
18 changes: 15 additions & 3 deletions pdfminer/pdfinterp.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
list_value,
resolve1,
stream_value,
LITERALS_ASCII85_DECODE,
)
from pdfminer.psexceptions import PSEOF, PSTypeError
from pdfminer.psparser import (
Expand Down Expand Up @@ -331,11 +332,22 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
if len(objs) % 2 != 0:
error_msg = f"Invalid dictionary construct: {objs!r}"
raise PSTypeError(error_msg)
d = {literal_name(k): v for (k, v) in choplist(2, objs)}
(pos, data) = self.get_inline_data(pos + len(b"ID "))
d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
eos = b"EI"
filter = d.get("F", None)
if filter is not None:
if isinstance(filter, PSLiteral):
filter = [filter]
for f in LITERALS_ASCII85_DECODE:
if f in filter:
eos = b"~>"
(pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
if eos != b"EI": # it may be necessary for decoding
data += eos
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
if eos == b"EI": # otherwise it is still in the stream
self.push((pos, self.KEYWORD_EI))
except PSTypeError:
if settings.STRICT:
raise
Expand Down
2 changes: 2 additions & 0 deletions pdfminer/psparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,8 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
return i + 1

elif self.oct:
chrcode = int(self.oct, 8)
assert chrcode < 256, "Invalid octal %s (%d)" % (self.oct, chrcode)
self._curtoken += bytes((int(self.oct, 8),))
self._parse1 = self._parse_string
return i
Expand Down
Binary file added samples/contrib/issue-1008-inline-ascii85.pdf
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
image_files = self.extract_images(filepath)
assert image_files[0].endswith("jpg")

def test_contrib_issue_1008_inline(self):
    """Extract the inline images from the issue #1008 sample PDF.

    The sample is expected to yield exactly 23 image files, each of
    which must be written out with a ``.bmp`` extension.
    """
    sample_pdf = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
    extracted = self.extract_images(sample_pdf)
    assert len(extracted) == 23
    # Check each file individually so a failure points at the offending name.
    for image_name in extracted:
        assert image_name.endswith(".bmp")

0 comments on commit f39109e

Please sign in to comment.