Rewrite PSBaseParser and add an optimized in-memory version

pdfminer · Sep 19, 2024 · df0867a · df0867a
1 parent 1a8bd2f
commit df0867a
Show file tree

Hide file tree

Showing 12 changed files with 978 additions and 395 deletions.
diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,4 @@ Pipfile.lock
 .vscode/
 poetry.lock
 .eggs
+*~
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Changed
 
 - Using absolute instead of relative imports ([[#995](https://github.com/pdfminer/pdfminer.six/pull/995)])
+- Reimplement the parsers (really lexers), with separate optimized versions for file and in-memory input
 
 ### Deprecated
 

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -18,7 +18,6 @@
 import sys
 from typing import (
     Any,
-    BinaryIO,
     Dict,
     Iterable,
     Iterator,
@@ -278,8 +277,8 @@ def get_unicode_map(cls, name: str, vertical: bool = False) -> UnicodeMap:
 
 
 class CMapParser(PSStackParser[PSKeyword]):
-    def __init__(self, cmap: CMapBase, fp: BinaryIO) -> None:
-        PSStackParser.__init__(self, fp)
+    def __init__(self, cmap: CMapBase, data: bytes) -> None:
+        super().__init__(data)
         self.cmap = cmap
         # some ToUnicode maps don't have "begincmap" keyword.
         self._in_cmap = True

diff --git a/pdfminer/image.py b/pdfminer/image.py
@@ -8,7 +8,7 @@
     from typing import Literal
 except ImportError:
     # Literal was introduced in Python 3.8
-    from typing_extensions import Literal  # type: ignore[assignment]
+    from typing_extensions import Literal  # type: ignore
 
 from pdfminer.jbig2 import JBIG2StreamReader, JBIG2StreamWriter
 from pdfminer.layout import LTImage

diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
@@ -837,6 +837,7 @@ def getobj(self, objid: int) -> object:
         if objid in self._cached_objs:
             (obj, genno) = self._cached_objs[objid]
         else:
+            obj = None
             for xref in self.xrefs:
                 try:
                     (strmid, index, genno) = xref.get_pos(objid)
@@ -856,7 +857,7 @@ def getobj(self, objid: int) -> object:
                     break
                 except (PSEOF, PDFSyntaxError):
                     continue
-            else:
+            if obj is None:
                 raise PDFObjectNotFound(objid)
             log.debug("register: objid=%r: %r", objid, obj)
             if self.caching:
@@ -891,7 +892,9 @@ def get_page_labels(self) -> Iterator[str]:
         If the document includes page labels, generates strings, one per page.
         If not, raises PDFNoPageLabels.
 
-        The resulting iteration is unbounded.
+        The resulting iterator is unbounded, so it is recommended to
+        zip it with the iterator over actual pages returned by `get_pages`.
+
         """
         assert self.catalog is not None
 

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -115,8 +115,8 @@ class Type1FontHeaderParser(PSStackParser[int]):
     KEYWORD_READONLY = KWD(b"readonly")
     KEYWORD_FOR = KWD(b"for")
 
-    def __init__(self, data: BinaryIO) -> None:
-        PSStackParser.__init__(self, data)
+    def __init__(self, data: bytes) -> None:
+        super().__init__(data)
         self._cid2unicode: Dict[int, str] = {}
 
     def get_encoding(self) -> Dict[int, str]:
@@ -969,7 +969,7 @@ def __init__(
         if "ToUnicode" in spec:
             strm = stream_value(spec["ToUnicode"])
             self.unicode_map = FileUnicodeMap()
-            CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
+            CMapParser(self.unicode_map, strm.get_data()).run()
         PDFFont.__init__(self, descriptor, widths)
 
     def to_unichr(self, cid: int) -> str:
@@ -1009,7 +1009,7 @@ def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> No
             self.fontfile = stream_value(descriptor.get("FontFile"))
             length1 = int_value(self.fontfile["Length1"])
             data = self.fontfile.get_data()[:length1]
-            parser = Type1FontHeaderParser(BytesIO(data))
+            parser = Type1FontHeaderParser(data)
             self.cid2unicode = parser.get_encoding()
 
     def __repr__(self) -> str:
@@ -1080,7 +1080,7 @@ def __init__(
             if isinstance(spec["ToUnicode"], PDFStream):
                 strm = stream_value(spec["ToUnicode"])
                 self.unicode_map = FileUnicodeMap()
-                CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
+                CMapParser(self.unicode_map, strm.get_data()).run()
             else:
                 cmap_name = literal_name(spec["ToUnicode"])
                 encoding = literal_name(spec["Encoding"])

diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
@@ -1,6 +1,4 @@
 import logging
-import re
-from io import BytesIO
 from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, cast
 
 from pdfminer import settings
@@ -18,6 +16,7 @@
     PDFType3Font,
 )
 from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PDFSyntaxError
 from pdfminer.pdftypes import (
     LITERALS_ASCII85_DECODE,
     PDFObjRef,
@@ -31,6 +30,7 @@
 from pdfminer.psparser import (
     KWD,
     LIT,
+    PSBaseParserToken,
     PSKeyword,
     PSLiteral,
     PSStackParser,
@@ -248,85 +248,52 @@ def get_font(self, objid: object, spec: Mapping[str, object]) -> PDFFont:
         return font
 
 
+KEYWORD_BI = KWD(b"BI")
+KEYWORD_ID = KWD(b"ID")
+KEYWORD_EI = KWD(b"EI")
+
+
 class PDFContentParser(PSStackParser[Union[PSKeyword, PDFStream]]):
-    def __init__(self, streams: Sequence[object]) -> None:
-        self.streams = streams
-        self.istream = 0
-        # PSStackParser.__init__(fp=None) is safe only because we've overloaded
-        # all the methods that would attempt to access self.fp without first
-        # calling self.fillfp().
-        PSStackParser.__init__(self, None)  # type: ignore[arg-type]
-
-    def fillfp(self) -> None:
-        if not self.fp:
-            if self.istream < len(self.streams):
-                strm = stream_value(self.streams[self.istream])
-                self.istream += 1
-            else:
-                raise PSEOF("Unexpected EOF, file truncated?")
-            self.fp = BytesIO(strm.get_data())
+    """Parse the concatenation of multiple content streams, as
+    described in the spec (PDF 1.7, p.86):
+
+    ...the effect shall be as if all of the streams in the array were
+    concatenated, in order, to form a single stream.  Conforming
+    writers can create image objects and other resources as they
+    occur, even though they interrupt the content stream. The division
+    between streams may occur only at the boundaries between lexical
+    tokens (see 7.2, "Lexical Conventions") but shall be unrelated to
+    the page’s logical content or organization.
+    """
 
-    def seek(self, pos: int) -> None:
-        self.fillfp()
-        PSStackParser.seek(self, pos)
+    def __init__(self, streams: Sequence[object]) -> None:
+        self.streamiter = iter(streams)
+        try:
+            stream = stream_value(next(self.streamiter))
+        except StopIteration:
+            raise PSEOF
+        log.debug("PDFContentParser starting stream %r", stream)
+        super().__init__(stream.get_data())
 
-    def fillbuf(self) -> None:
-        if self.charpos < len(self.buf):
-            return
-        while 1:
-            self.fillfp()
-            self.bufpos = self.fp.tell()
-            self.buf = self.fp.read(self.BUFSIZ)
-            if self.buf:
-                break
-            self.fp = None  # type: ignore[assignment]
-        self.charpos = 0
-
-    def get_inline_data(self, pos: int, target: bytes = b"EI") -> Tuple[int, bytes]:
-        self.seek(pos)
-        i = 0
-        data = b""
-        while i <= len(target):
-            self.fillbuf()
-            if i:
-                ci = self.buf[self.charpos]
-                c = bytes((ci,))
-                data += c
-                self.charpos += 1
-                if (
-                    len(target) <= i
-                    and c.isspace()
-                    or i < len(target)
-                    and c == (bytes((target[i],)))
-                ):
-                    i += 1
-                else:
-                    i = 0
-            else:
-                try:
-                    j = self.buf.index(target[0], self.charpos)
-                    data += self.buf[self.charpos : j + 1]
-                    self.charpos = j + 1
-                    i = 1
-                except ValueError:
-                    data += self.buf[self.charpos :]
-                    self.charpos = len(self.buf)
-        data = data[: -(len(target) + 1)]  # strip the last part
-        data = re.sub(rb"(\x0d\x0a|[\x0d\x0a])$", b"", data)
-        return (pos, data)
+    def __next__(self) -> Tuple[int, PSBaseParserToken]:
+        while True:
+            try:
+                return super().__next__()
+            except StopIteration:
+                # The current stream is exhausted; advance to the next one.
+                # When none remain, next() raises StopIteration, ending iteration.
+                stream = stream_value(next(self.streamiter))
+                log.debug("PDFContentParser starting stream %r", stream)
+                self.reinit(stream.get_data())
 
     def flush(self) -> None:
         self.add_results(*self.popall())
 
-    KEYWORD_BI = KWD(b"BI")
-    KEYWORD_ID = KWD(b"ID")
-    KEYWORD_EI = KWD(b"EI")
-
     def do_keyword(self, pos: int, token: PSKeyword) -> None:
-        if token is self.KEYWORD_BI:
+        if token is KEYWORD_BI:
             # inline image within a content stream
             self.start_type(pos, "inline")
-        elif token is self.KEYWORD_ID:
+        elif token is KEYWORD_ID:
             try:
                 (_, objs) = self.end_type("inline")
                 if len(objs) % 2 != 0:
@@ -340,13 +307,32 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                         filter = [filter]
                     if filter[0] in LITERALS_ASCII85_DECODE:
                         eos = b"~>"
-                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
-                if eos != b"EI":  # it may be necessary for decoding
-                    data += eos
+                # PDF 1.7 p. 215: Unless the image uses ASCIIHexDecode
+                # or ASCII85Decode as one of its filters, the ID
+                # operator shall be followed by a single white-space
+                # character, and the next character shall be
+                # interpreted as the first byte of image data.
+                if eos == b"EI":
+                    self.seek(pos + len(token.name) + 1)
+                    (pos, data) = self.get_inline_data(target=eos)
+                    # FIXME: it is totally unspecified what to do with
+                    # a newline between the end of the data and "EI",
+                    # since there is no explicit stream length.  (PDF
+                    # 1.7 p. 756: There should be an end-of-line
+                    # marker after the data and before endstream; this
+                    # marker shall not be included in the stream
+                    # length.)
+                    data = data[: -len(eos)]
+                else:
+                    self.seek(pos + len(token.name))
+                    (pos, data) = self.get_inline_data(target=eos)
+                if pos == -1:
+                    raise PDFSyntaxError("End of inline stream %r not found" % eos)
                 obj = PDFStream(d, data)
                 self.push((pos, obj))
-                if eos == b"EI":  # otherwise it is still in the stream
-                    self.push((pos, self.KEYWORD_EI))
+                # "EI" was consumed with the image data, so push the keyword ourselves
+                if eos == b"EI":
+                    self.push((pos, KEYWORD_EI))
             except PSTypeError:
                 if settings.STRICT:
                     raise

diff --git a/pdfminer/pdfparser.py b/pdfminer/pdfparser.py
@@ -1,5 +1,4 @@
 import logging
-from io import BytesIO
 from typing import TYPE_CHECKING, BinaryIO, Optional, Union
 
 from pdfminer import settings
@@ -36,8 +35,8 @@ class PDFParser(PSStackParser[Union[PSKeyword, PDFStream, PDFObjRef, None]]):
 
     """
 
-    def __init__(self, fp: BinaryIO) -> None:
-        PSStackParser.__init__(self, fp)
+    def __init__(self, data: Union[BinaryIO, bytes]) -> None:
+        super().__init__(data)
         self.doc: Optional[PDFDocument] = None
         self.fallback = False
 
@@ -92,10 +91,9 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                     raise PDFSyntaxError("Unexpected EOF")
                 return
             pos += len(line)
-            self.fp.seek(pos)
-            data = bytearray(self.fp.read(objlen))
+            data = bytearray(self.read(pos, objlen))
             self.seek(pos + objlen)
-            while 1:
+            while True:
                 try:
                     (linepos, line) = self.nextline()
                 except PSEOF:
@@ -138,7 +136,7 @@ class PDFStreamParser(PDFParser):
     """
 
     def __init__(self, data: bytes) -> None:
-        PDFParser.__init__(self, BytesIO(data))
+        super().__init__(data)
 
     def flush(self) -> None:
         self.add_results(*self.popall())