Skip to content

Commit

Permalink
Rewrite PSBaseParser and add an optimized in-memory version
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Sep 19, 2024
1 parent 1a8bd2f commit 0ad48c6
Show file tree
Hide file tree
Showing 10 changed files with 969 additions and 400 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ Pipfile.lock
.vscode/
poetry.lock
.eggs
*~
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed

- Using absolute instead of relative imports ([[#995](https://github.com/pdfminer/pdfminer.six/pull/995)])
- Reimplement optimized parsers (really lexers) for file versus in-memory input

### Deprecated

Expand Down
5 changes: 2 additions & 3 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import sys
from typing import (
Any,
BinaryIO,
Dict,
Iterable,
Iterator,
Expand Down Expand Up @@ -278,8 +277,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:


class CMapParser(PSStackParser[PSKeyword]):
def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
def __init__(self, cmap: CMapBase, data: bytes) -> None:
super().__init__(data)
self.cmap = cmap
# some ToUnicode maps don't have "begincmap" keyword.
self._in_cmap = True
Expand Down
7 changes: 5 additions & 2 deletions pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,7 @@ def getobj(self, objid: int) -> object:
if objid in self._cached_objs:
(obj, genno) = self._cached_objs[objid]
else:
obj = None
for xref in self.xrefs:
try:
(strmid, index, genno) = xref.get_pos(objid)
Expand All @@ -856,7 +857,7 @@ def getobj(self, objid: int) -> object:
break
except (PSEOF, PDFSyntaxError):
continue
else:
if obj is None:
raise PDFObjectNotFound(objid)
log.debug("register: objid=%r: %r", objid, obj)
if self.caching:
Expand Down Expand Up @@ -891,7 +892,9 @@ def get_page_labels(self) -> Iterator[str]:
If the document includes page labels, generates strings, one per page.
If not, raises PDFNoPageLabels.
The resulting iteration is unbounded.
The resulting iterator is unbounded, so it is recommended to
zip it with the iterator over actual pages returned by `get_pages`.
"""
assert self.catalog is not None

Expand Down
10 changes: 5 additions & 5 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ class Type1FontHeaderParser(PSStackParser[int]):
KEYWORD_READONLY = KWD(b"readonly")
KEYWORD_FOR = KWD(b"for")

def __init__(self, data: BinaryIO) -> None:
PSStackParser.__init__(self, data)
def __init__(self, data: bytes) -> None:
super().__init__(data)
self._cid2unicode: Dict[int, str] = {}

def get_encoding(self) -> Dict[int, str]:
Expand Down Expand Up @@ -969,7 +969,7 @@ def __init__(
if "ToUnicode" in spec:
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
CMapParser(self.unicode_map, strm.get_data()).run()
PDFFont.__init__(self, descriptor, widths)

def to_unichr(self, cid: int) -> str:
Expand Down Expand Up @@ -1009,7 +1009,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No
self.fontfile = stream_value(descriptor.get("FontFile"))
length1 = int_value(self.fontfile["Length1"])
data = self.fontfile.get_data()[:length1]
parser = Type1FontHeaderParser(BytesIO(data))
parser = Type1FontHeaderParser(data)
self.cid2unicode = parser.get_encoding()

def __repr__(self) -> str:
Expand Down Expand Up @@ -1080,7 +1080,7 @@ def __init__(
if isinstance(spec["ToUnicode"], PDFStream):
strm = stream_value(spec["ToUnicode"])
self.unicode_map = FileUnicodeMap()
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
CMapParser(self.unicode_map, strm.get_data()).run()
else:
cmap_name = literal_name(spec["ToUnicode"])
encoding = literal_name(spec["Encoding"])
Expand Down
138 changes: 62 additions & 76 deletions pdfminer/pdfinterp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import logging
import re
from io import BytesIO
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast

from pdfminer import settings
Expand All @@ -18,6 +16,7 @@
PDFType3Font,
)
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError
from pdfminer.pdftypes import (
LITERALS_ASCII85_DECODE,
PDFObjRef,
Expand All @@ -31,6 +30,7 @@
from pdfminer.psparser import (
KWD,
LIT,
PSBaseParserToken,
PSKeyword,
PSLiteral,
PSStackParser,
Expand Down Expand Up @@ -248,85 +248,52 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
return font


KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")


class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
def __init__(self, streams: Sequence[object]) -> None:
self.streams = streams
self.istream = 0
# PSStackParser.__init__(fp=None) is safe only because we've overloaded
# all the methods that would attempt to access self.fp without first
# calling self.fillfp().
PSStackParser.__init__(self, None) # type: ignore[arg-type]

def fillfp(self) -> None:
if not self.fp:
if self.istream < len(self.streams):
strm = stream_value(self.streams[self.istream])
self.istream += 1
else:
raise PSEOF("Unexpected EOF, file truncated?")
self.fp = BytesIO(strm.get_data())
"""Parse the concatenation of multiple content streams, as
described in the spec (PDF 1.7, p.86):
...the effect shall be as if all of the streams in the array were
concatenated, in order, to form a single stream. Conforming
writers can create image objects and other resources as they
occur, even though they interrupt the content stream. The division
between streams may occur only at the boundaries between lexical
tokens (see 7.2, "Lexical Conventions") but shall be unrelated to
the page’s logical content or organization.
"""

def seek(self, pos: int) -> None:
self.fillfp()
PSStackParser.seek(self, pos)
def __init__(self, streams: Sequence[object]) -> None:
self.streamiter = iter(streams)
try:
stream = stream_value(next(self.streamiter))
except StopIteration:
raise PSEOF
log.debug("PDFContentParser starting stream %r", stream)
super().__init__(stream.get_data())

def fillbuf(self) -> None:
if self.charpos < len(self.buf):
return
while 1:
self.fillfp()
self.bufpos = self.fp.tell()
self.buf = self.fp.read(self.BUFSIZ)
if self.buf:
break
self.fp = None # type: ignore[assignment]
self.charpos = 0

def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
self.seek(pos)
i = 0
data = b""
while i <= len(target):
self.fillbuf()
if i:
ci = self.buf[self.charpos]
c = bytes((ci,))
data += c
self.charpos += 1
if (
len(target) <= i
and c.isspace()
or i < len(target)
and c == (bytes((target[i],)))
):
i += 1
else:
i = 0
else:
try:
j = self.buf.index(target[0], self.charpos)
data += self.buf[self.charpos : j + 1]
self.charpos = j + 1
i = 1
except ValueError:
data += self.buf[self.charpos :]
self.charpos = len(self.buf)
data = data[: -(len(target) + 1)] # strip the last part
data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
return (pos, data)
def __next__(self) -> Tuple[int, PSBaseParserToken]:
while True:
try:
return super().__next__()
except StopIteration:
# Will also raise StopIteration if there are no more streams,
# which is exactly what we want
stream = stream_value(next(self.streamiter))
log.debug("PDFContentParser starting stream %r", stream)
self.reinit(stream.get_data())

def flush(self) -> None:
self.add_results(*self.popall())

KEYWORD_BI = KWD(b"BI")
KEYWORD_ID = KWD(b"ID")
KEYWORD_EI = KWD(b"EI")

def do_keyword(self, pos: int, token: PSKeyword) -> None:
if token is self.KEYWORD_BI:
if token is KEYWORD_BI:
# inline image within a content stream
self.start_type(pos, "inline")
elif token is self.KEYWORD_ID:
elif token is KEYWORD_ID:
try:
(_, objs) = self.end_type("inline")
if len(objs) % 2 != 0:
Expand All @@ -340,13 +307,32 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
filter = [filter]
if filter[0] in LITERALS_ASCII85_DECODE:
eos = b"~>"
(pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
if eos != b"EI": # it may be necessary for decoding
data += eos
# PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode
# or ASCII85Decode as one of its filters, the ID
# operator shall be followed by a single white-space
# character, and the next character shall be
# interpreted as the first byte of image data.
if eos == b"EI":
self.seek(pos + len(token.name) + 1)
(pos, data) = self.get_inline_data(target=eos)
# FIXME: it is totally unspecified what to do with
# a newline between the end of the data and "EI",
# since there is no explicit stream length. (PDF
# 1.7 p. 756: There should be an end-of-line
# marker after the data and before endstream; this
# marker shall not be included in the stream
# length.)
data = data[: -len(eos)]
else:
self.seek(pos + len(token.name))
(pos, data) = self.get_inline_data(target=eos)
if pos == -1:
raise PDFSyntaxError("End of inline stream %r not found" % eos)
obj = PDFStream(d, data)
self.push((pos, obj))
if eos == b"EI": # otherwise it is still in the stream
self.push((pos, self.KEYWORD_EI))
# This was included in the data but we need to "parse" it
if eos == b"EI":
self.push((pos, KEYWORD_EI))
except PSTypeError:
if settings.STRICT:
raise
Expand Down
12 changes: 5 additions & 7 deletions pdfminer/pdfparser.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
from io import BytesIO
from typing import TYPE_CHECKING, BinaryIO, Optional, Union

from pdfminer import settings
Expand Down Expand Up @@ -36,8 +35,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
"""

def __init__(self, fp: BinaryIO) -> None:
PSStackParser.__init__(self, fp)
def __init__(self, data: Union[BinaryIO, bytes]) -> None:
super().__init__(data)
self.doc: Optional[PDFDocument] = None
self.fallback = False

Expand Down Expand Up @@ -92,10 +91,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
raise PDFSyntaxError("Unexpected EOF")
return
pos += len(line)
self.fp.seek(pos)
data = bytearray(self.fp.read(objlen))
data = bytearray(self.read(pos, objlen))
self.seek(pos + objlen)
while 1:
while True:
try:
(linepos, line) = self.nextline()
except PSEOF:
Expand Down Expand Up @@ -138,7 +136,7 @@ class PDFStreamParser(PDFParser):
"""

def __init__(self, data: bytes) -> None:
PDFParser.__init__(self, BytesIO(data))
super().__init__(data)

def flush(self) -> None:
self.add_results(*self.popall())
Expand Down
10 changes: 0 additions & 10 deletions pdfminer/pdftypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
Union,
cast,
)
from warnings import warn

from pdfminer import pdfexceptions, settings
from pdfminer.ascii85 import ascii85decode, asciihexdecode
Expand Down Expand Up @@ -74,21 +73,12 @@ def __init__(
self,
doc: Optional["PDFDocument"],
objid: int,
_: Any = _DEFAULT,
) -> None:
"""Reference to a PDF object.
:param doc: The PDF document.
:param objid: The object number.
:param _: Unused argument for backwards compatibility.
"""
if _ is not _DEFAULT:
warn(
"The third argument of PDFObjRef is unused and will be removed after "
"2024",
DeprecationWarning,
)

if objid == 0:
if settings.STRICT:
raise PDFValueError("PDF object id cannot be 0.")
Expand Down
Loading

0 comments on commit 0ad48c6

Please sign in to comment.