Skip to content

Commit

Permalink
Do not crash on ASCII85 in inline images and properly support their colorspaces (#1010)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines authored Jul 15, 2024
1 parent 88139ad commit 1a8bd2f
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
- `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
- `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
- Inline image parsing fails when stream data contains `EI\n` ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))

### Removed

Expand Down
12 changes: 10 additions & 2 deletions pdfminer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
LITERAL_DEVICE_CMYK,
LITERAL_DEVICE_GRAY,
LITERAL_DEVICE_RGB,
LITERAL_INLINE_DEVICE_GRAY,
LITERAL_INLINE_DEVICE_RGB,
)
from pdfminer.pdfexceptions import PDFValueError
from pdfminer.pdftypes import (
Expand Down Expand Up @@ -125,10 +127,16 @@ def export_image(self, image: LTImage) -> str:
elif image.bits == 1:
name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)

elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
elif image.bits == 8 and (
LITERAL_DEVICE_RGB in image.colorspace
or LITERAL_INLINE_DEVICE_RGB in image.colorspace
):
name = self._save_bmp(image, width, height, width * 3, image.bits * 3)

elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
elif image.bits == 8 and (
LITERAL_DEVICE_GRAY in image.colorspace
or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
):
name = self._save_bmp(image, width, height, width, image.bits)

elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:
Expand Down
4 changes: 4 additions & 0 deletions pdfminer/pdfcolor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
# Full names of the device color spaces.
LITERAL_DEVICE_GRAY = LIT("DeviceGray")
LITERAL_DEVICE_RGB = LIT("DeviceRGB")
LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
# Abbreviations for inline images: short names for the same device color
# spaces (G = DeviceGray, RGB = DeviceRGB, CMYK = DeviceCMYK).
LITERAL_INLINE_DEVICE_GRAY = LIT("G")
LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")


class PDFColorSpace:
Expand Down
17 changes: 14 additions & 3 deletions pdfminer/pdfinterp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import (
LITERALS_ASCII85_DECODE,
PDFObjRef,
PDFStream,
dict_value,
Expand Down Expand Up @@ -331,11 +332,21 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
if len(objs) % 2 != 0:
error_msg = f"Invalid dictionary construct: {objs!r}"
raise PSTypeError(error_msg)
d = {literal_name(k): v for (k, v) in choplist(2, objs)}
(pos, data) = self.get_inline_data(pos + len(b"ID "))
d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
eos = b"EI"
filter = d.get("F", None)
if filter is not None:
if isinstance(filter, PSLiteral):
filter = [filter]
if filter[0] in LITERALS_ASCII85_DECODE:
eos = b"~>"
(pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
if eos != b"EI": # it may be necessary for decoding
data += eos
obj = PDFStream(d, data)
self.push((pos, obj))
self.push((pos, self.KEYWORD_EI))
if eos == b"EI": # otherwise it is still in the stream
self.push((pos, self.KEYWORD_EI))
except PSTypeError:
if settings.STRICT:
raise
Expand Down
4 changes: 3 additions & 1 deletion pdfminer/psparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,9 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
return i + 1

elif self.oct:
self._curtoken += bytes((int(self.oct, 8),))
chrcode = int(self.oct, 8)
assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
self._curtoken += bytes((chrcode,))
self._parse1 = self._parse_string
return i

Expand Down
Binary file added samples/contrib/issue-1008-inline-ascii85.pdf
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
image_files = self.extract_images(filepath)
assert image_files[0].endswith("jpg")

def test_contrib_issue_1008_inline(self):
    """Extract the inline images from the issue #1008 sample document.

    The sample is expected to yield exactly 23 images, every one of them
    written out as a ``.bmp`` file.
    """
    sample = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
    extracted = self.extract_images(sample)
    assert len(extracted) == 23
    # Collect any offenders so that none may remain for the test to pass.
    non_bmp = [name for name in extracted if not name.endswith(".bmp")]
    assert not non_bmp

0 comments on commit 1a8bd2f

Please sign in to comment.