diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 25186bae..47887ea8 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -20,9 +20,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip, Install nox @@ -38,9 +38,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip, Install nox @@ -56,9 +56,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip, Install nox @@ -75,12 +75,12 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Determine pip cache directory @@ -88,7 +88,7 @@ jobs: run: | echo "::set-output name=dir::$(pip cache dir)" - name: Cache pip cache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip${{ matrix.python-version }} @@ -105,9 +105,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: 
actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip and install nox @@ -129,19 +129,9 @@ jobs: - build-docs steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install dependencies run: python -m pip install wheel - - name: Set version - run: | - if [[ "${{ github.ref }}" == "refs/tags/"* ]] - then - VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,' | sed -e 's/^v//') - else - VERSION=$(date +%Y%m%d).$(date +%H%M%S) - fi - echo ${VERSION} - sed -i "s/__VERSION__/${VERSION}/g" pdfminer/__init__.py - name: Build package run: python setup.py sdist bdist_wheel - name: Generate changelog @@ -161,4 +151,4 @@ jobs: body_path: ${{ github.workspace }}-CHANGELOG.md files: | dist/*.tar.gz - dist/*.whl \ No newline at end of file + dist/*.whl diff --git a/.gitignore b/.gitignore index b155fbbd..c1642e11 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ Pipfile.lock .vscode/ pyproject.toml poetry.lock +.eggs diff --git a/CHANGELOG.md b/CHANGELOG.md index 89009d44..20ee7d61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,24 +7,48 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added +- Adds `contains` method to `LTComponent` to check whether it contains another `LTComponent`. + +## [20231228] + +### Removed +- Support for Python 3.6 and 3.7 ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) + +### Added + - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651)) - Font name aliases for Arial, Courier New and Times New Roman ([#790](https://github.com/pdfminer/pdfminer.six/pull/790)) -- Adds `contains` method to `LTComponent` to check whether it contains another `LTComponent`. 
+- Documentation on why special characters can sometimes not be extracted ([#829](https://github.com/pdfminer/pdfminer.six/pull/829)) +- Storing Bezier path and dashing style of line in LTCurve ([#801](https://github.com/pdfminer/pdfminer.six/pull/801)) ### Fixed +- Broken CI/CD pipeline by setting upper version limit for black, mypy, pip and setuptools ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) +- `flake8` failures ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) - `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773)) - `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766)) - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) - `TypeError` when getting default width of font ([#720](https://github.com/pdfminer/pdfminer.six/issues/720)) - Installing typing-extensions on Python 3.6 and 3.7 ([#775](https://github.com/pdfminer/pdfminer.six/pull/775)) - `TypeError` in cmapdb.py when parsing null characters ([#768](https://github.com/pdfminer/pdfminer.six/pull/768)) -- Color "convenience operators" now (per spec) also set color space ([#779](https://github.com/pdfminer/pdfminer.six/issues/779)) +- Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794)) +- `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827)) +- Small typo's and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828)) +- Ignore non-Unicode cmaps in TrueType fonts ([#806](https://github.com/pdfminer/pdfminer.six/pull/806)) + +### Changed + +- Using non-hardcoded version string and setuptools-git-versioning to enable installation from source and building on Python 3.12 
([#922](https://github.com/pdfminer/pdfminer.six/issues/922)) + ### Deprecated - Usage of `if __name__ == "__main__"` where it was only intended for testing purposes ([#756](https://github.com/pdfminer/pdfminer.six/pull/756)) +### Removed + +- Support for Python 3.6 and 3.7 because they are end-of-life ([#923](https://github.com/pdfminer/pdfminer.six/pull/923)) + ## [20220524] ### Fixed diff --git a/README.md b/README.md index b8c25422..0015bb08 100644 --- a/README.md +++ b/README.md @@ -39,8 +39,8 @@ Features How to use ---------- -* Install Python 3.6 or newer. -* Install +* Install Python 3.8 or newer. +* Install pdfminer.six. `pip install pdfminer.six` @@ -48,9 +48,18 @@ How to use `pip install 'pdfminer.six[image]'` -* Use command-line interface to extract text from pdf: +* Use the command-line interface to extract text from pdf. - `python pdf2txt.py samples/simple1.pdf` + `pdf2txt.py example.pdf` + +* Or use it with Python. + +```python +from pdfminer.high_level import extract_text + +text = extract_text("example.pdf") +print(text) +``` Contributing ------------ diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 5a742d64..b209c807 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -7,11 +7,11 @@ Why is it called pdfminer.six? ============================== Pdfminer.six is a fork of the `original pdfminer created by Euske -`_. Almost all of the code and architecture is in -fact created by Euske. But, for a long time this original pdfminer did not +`_. Almost all of the code and architecture are in +-fact created by Euske. But, for a long time, this original pdfminer did not support Python 3. Until 2020 the original pdfminer only supported Python 2. The original goal of pdfminer.six was to add support for Python 3. This was -done with the six package. The six package helps to write code that is +done with the `six` package. The `six` package helps to write code that is compatible with both Python 2 and Python 3. Hence, pdfminer.six. 
As of 2020, pdfminer.six dropped the support for Python 2 because it was @@ -27,15 +27,42 @@ also equal to six feet. How does pdfminer.six compare to other forks of pdfminer? ========================================================== -Pdfminer.six is now an independent and community maintained package for -extracting text from PDF's with Python. We actively fix bugs (also for PDF's +Pdfminer.six is now an independent and community-maintained package for +extracting text from PDFs with Python. We actively fix bugs (also for PDFs that don't strictly follow the PDF Reference), add new features and improve the usability of pdfminer.six. This community separates pdfminer.six from the other forks of the original pdfminer. PDF as a format is very diverse and there are countless deviations from the official format. The only way to -support all the PDF's out there is to have a community that actively uses and +support all the PDFs out there is to have a community that actively uses and improves pdfminer. Since 2020, the original pdfminer is `dormant `_, and pdfminer.six is the fork which Euske recommends if you need an actively maintained version of pdfminer. + +Why are there `(cid:x)` values in the textual output? +===================================================== + +One of the most common issues with pdfminer.six is that the textual output +contains raw character id's `(cid:x)`. This is often experienced as confusing +because the text is shown fine in a PDF viewer and other text from the same +PDF is extracted properly. + +The underlying problem is that a PDF has two different representations +of each character. Each character is mapped to a glyph that determines +how the character is shown in a PDF viewer. And each character is also +mapped to its unicode value that is used when copy-pasting the character. +Some PDF's have incomplete unicode mappings and therefore it is impossible +to convert the character to unicode. 
In these cases pdfminer.six defaults +to showing the raw character id `(cid:x)`. + +A quick test to see if pdfminer.six should be able to do better is to +copy-paste the text from a PDF viewer to a text editor. If the result +is proper text, pdfminer.six should also be able to extract proper text. +If the result is gibberish, pdfminer.six will also not be able to convert +the characters to unicode. + +References: + +#. `Chapter 5: Text, PDF Reference 1.7 `_ +#. `Text: PDF, Wikipedia `_ diff --git a/docs/source/howto/acro_forms.rst b/docs/source/howto/acro_forms.rst index 276dccff..c4932c34 100644 --- a/docs/source/howto/acro_forms.rst +++ b/docs/source/howto/acro_forms.rst @@ -65,7 +65,7 @@ Only AcroForm interactive forms are supported, XFA forms are not supported. print(name, values) -This code snippet will print all the fields name and value and save them in the "data" dictionary. +This code snippet will print all the fields' names and values and save them in the "data" dictionary. How it works: @@ -77,9 +77,9 @@ How it works: parser = PDFParser(fp) doc = PDFDocument(parser) -- Get the catalog +- Get the Catalog - (the catalog contains references to other objects defining the document structure, see section 7.7.2 of PDF 32000-1:2008 specs: https://www.adobe.com/devnet/pdf/pdf_reference.html) + (the catalog contains references to other objects defining the document structure, see section 7.7.2 of PDF 32000-1:2008 specs: https://opensource.adobe.com/dc-acrobat-sdk-docs/pdflsdk/index.html#pdf-reference) .. code-block:: python @@ -122,7 +122,7 @@ How it works: - Call the value(s) decoding method as needed - (a single field can hold multiple values, for example a combo box can hold more than one value at time) + (a single field can hold multiple values, for example, a combo box can hold more than one value at a time) ..
code-block:: python @@ -131,7 +131,7 @@ How it works: else: values = decode_value(values) -(the decode_value method takes care of decoding the fields value returning a string) +(the decode_value method takes care of decoding the field's value, returning a string) - Decode PSLiteral and PSKeyword field values diff --git a/docs/source/index.rst b/docs/source/index.rst index a6e666eb..8650b5d5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -59,18 +59,31 @@ Features Installation instructions ========================= -Before using it, you must install it using Python 3.6 or newer. +* Install Python 3.8 or newer. +* Install pdfminer.six. :: + $ pip install pdfminer.six - $ pip install pdfminer.six +* (Optionally) install extra dependencies for extracting images. +:: + $ pip install 'pdfminer.six[image]' -Optionally install extra dependencies that are needed to extract jpg images. +* Use the command-line interface to extract text from pdf. :: + $ pdf2txt.py example.pdf + +* Or use it with Python. + +.. code-block:: python + + from pdfminer.high_level import extract_text + + text = extract_text("example.pdf") + print(text) - $ pip install 'pdfminer.six[image]' Contributing diff --git a/docs/source/topic/converting_pdf_to_text.rst b/docs/source/topic/converting_pdf_to_text.rst index 5194b114..18c1cba0 100644 --- a/docs/source/topic/converting_pdf_to_text.rst +++ b/docs/source/topic/converting_pdf_to_text.rst @@ -3,7 +3,7 @@ Converting a PDF file to text ***************************** -Most PDF files look like they contain well structured text. But the reality is +Most PDF files look like they contain well-structured text. But the reality is that a PDF file does not contain anything that resembles paragraphs, sentences or even words. When it comes to text, a PDF file is only aware of the characters and their placement. @@ -14,7 +14,7 @@ compose the table, the page footer or the description of a figure.
Unlike other document formats, like a `.txt` file or a word document, the PDF format does not contain a stream of text. -A PDF document does consists of a collection of objects that together describe +A PDF document consists of a collection of objects that together describe the appearance of one or more pages, possibly accompanied by additional interactive elements and higher-level application data. A PDF file contains the objects making up a PDF document along with associated structural @@ -53,7 +53,7 @@ uses these bounding boxes to decide which characters belong together. Characters that are both horizontally and vertically close are grouped onto one line. How close they should be is determined by the `char_margin` -(M in figure) and the `line_overlap` (not in figure) parameter. The horizontal +(M in the figure) and the `line_overlap` (not in figure) parameter. The horizontal *distance* between the bounding boxes of two characters should be smaller than the `char_margin` and the vertical *overlap* between the bounding boxes should be smaller than the `line_overlap`. @@ -76,7 +76,7 @@ be separated by a space. The result of this stage is a list of lines. Each line consists of a list of characters. These characters are either original `LTChar` characters that -originate from the PDF file, or inserted `LTAnno` characters that +originate from the PDF file or inserted `LTAnno` characters that represent spaces between words or newlines at the end of each line. Grouping lines into boxes @@ -91,7 +91,7 @@ Lines that are both horizontally overlapping and vertically close are grouped. How vertically close the lines should be is determined by the `line_margin`. This margin is specified relative to the height of the bounding box. 
Lines are close if the gap between the tops (see L :sub:`1` in the figure) and bottoms -(see L :sub:`2`) in the figure) of the bounding boxes is closer together +(see L :sub:`2` in the figure) of the bounding boxes is smaller than the absolute line margin, i.e. the `line_margin` multiplied by the height of the bounding box. @@ -120,7 +120,7 @@ Working with rotated characters The algorithm described above assumes that all characters have the same orientation. However, any writing direction is possible in a PDF. To -accommodate for this, pdfminer.six allows to detect vertical writing with the +accommodate for this, pdfminer.six allows detecting vertical writing with the `detect_vertical` parameter. This will apply all the grouping steps as if the pdf was rotated 90 (or 270) degrees diff --git a/docs/source/tutorial/commandline.rst b/docs/source/tutorial/commandline.rst index 5aa352da..f780d36a 100644 --- a/docs/source/tutorial/commandline.rst +++ b/docs/source/tutorial/commandline.rst @@ -18,7 +18,7 @@ pdf2txt.py :: - $ python tools/pdf2txt.py example.pdf + $ pdf2txt.py example.pdf all the text from the pdf appears on the command line The :ref:`api_pdf2txt` tool extracts all the text from a PDF. It uses layout @@ -29,7 +29,7 @@ dumppdf.py :: - $ python tools/dumppdf.py -a example.pdf + $ dumppdf.py -a example.pdf ...
diff --git a/noxfile.py b/noxfile.py index f55bbadb..52995e1b 100644 --- a/noxfile.py +++ b/noxfile.py @@ -3,13 +3,13 @@ import nox -PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] +PYTHON_ALL_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"] PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"] @nox.session def format(session): - session.install("black") + session.install("black<23") # Format files locally with black, but only check in cicd if "CI" in os.environ: session.run("black", "--check", *PYTHON_MODULES) @@ -25,7 +25,7 @@ def lint(session): @nox.session def types(session): - session.install("mypy") + session.install("mypy<1") session.run( "mypy", "--install-types", @@ -37,12 +37,16 @@ def types(session): @nox.session(python=PYTHON_ALL_VERSIONS) def tests(session): + session.install("pip") + session.install("setuptools") session.install("-e", ".[dev]") session.run("pytest") @nox.session def docs(session): + session.install("pip") + session.install("setuptools") session.install("-e", ".[docs]") session.run( "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html" diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index e8e5221f..5bd4d50a 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -1,4 +1,10 @@ -__version__ = "__VERSION__" # auto replaced with tag in github actions +from importlib.metadata import version, PackageNotFoundError + +try: + __version__ = version("pdfminer.six") +except PackageNotFoundError: + # package is not installed, return default + __version__ = "0.0" if __name__ == "__main__": print(__version__) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 01306ed2..f0c43ab7 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -195,15 +195,20 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. 
assert isinstance(code.name, str) - self.cid2unichr[cid] = name2unicode(code.name) + unichr = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. - self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore") + unichr = code.decode("UTF-16BE", "ignore") elif isinstance(code, int): - self.cid2unichr[cid] = chr(code) + unichr = chr(code) else: raise TypeError(code) + # A0 = non-breaking space, some weird fonts can have a collision on a cid here. + if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ": + return + self.cid2unichr[cid] = unichr + class PyCMap(CMap): def __init__(self, name: str, module: Any) -> None: diff --git a/pdfminer/converter.py b/pdfminer/converter.py index 6b367aa2..8e48d86a 100644 --- a/pdfminer/converter.py +++ b/pdfminer/converter.py @@ -138,6 +138,19 @@ def paint_path( ] pts = [apply_matrix_pt(self.ctm, pt) for pt in raw_pts] + operators = [str(operation[0]) for operation in path] + transformed_points = [ + [ + apply_matrix_pt(self.ctm, (float(operand1), float(operand2))) + for operand1, operand2 in zip(operation[1::2], operation[2::2]) + ] + for operation in path + ] + transformed_path = [ + cast(PathSegment, (o, *p)) + for o, p in zip(operators, transformed_points) + ] + if shape in {"mlh", "ml"}: # single line segment # @@ -152,6 +165,8 @@ def paint_path( evenodd, gstate.scolor, gstate.ncolor, + original_path=transformed_path, + dashing_style=gstate.dash, ) self.cur_item.add(line) @@ -171,6 +186,8 @@ def paint_path( evenodd, gstate.scolor, gstate.ncolor, + transformed_path, + gstate.dash, ) self.cur_item.add(rect) else: @@ -182,9 +199,10 @@ def paint_path( evenodd, gstate.scolor, gstate.ncolor, + transformed_path, + gstate.dash, ) self.cur_item.add(curve) - else: curve = LTCurve( gstate.linewidth, @@ -194,6 +212,8 @@ def paint_path( evenodd, gstate.scolor, gstate.ncolor, + transformed_path, + gstate.dash, ) self.cur_item.add(curve) diff --git a/pdfminer/high_level.py b/pdfminer/high_level.py index 
94be9d42..6587fdee 100644 --- a/pdfminer/high_level.py +++ b/pdfminer/high_level.py @@ -195,7 +195,7 @@ def extract_pages( :param caching: If resources should be cached :param laparams: An LAParams object from pdfminer.layout. If None, uses some default settings that often work well. - :return: + :return: LTPage objects """ if laparams is None: laparams = LAParams() diff --git a/pdfminer/image.py b/pdfminer/image.py index 54b14929..d72a10cd 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -8,7 +8,7 @@ from typing import Literal except ImportError: # Literal was introduced in Python 3.8 - from typing_extensions import Literal # type: ignore[misc] + from typing_extensions import Literal # type: ignore[assignment] from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter from .layout import LTImage @@ -225,20 +225,24 @@ def _save_bytes(self, image: LTImage) -> str: with open(path, "wb") as fp: try: from PIL import Image # type: ignore[import] + from PIL import ImageOps except ImportError: raise ImportError(PIL_ERROR_MESSAGE) - mode: Literal["1", "8", "RGB", "CMYK"] + mode: Literal["1", "L", "RGB", "CMYK"] if image.bits == 1: mode = "1" elif image.bits == 8 and channels == 1: - mode = "8" + mode = "L" elif image.bits == 8 and channels == 3: mode = "RGB" elif image.bits == 8 and channels == 4: mode = "CMYK" img = Image.frombytes(mode, image.srcsize, image.stream.get_data(), "raw") + if mode == "L": + img = ImageOps.invert(img) + img.save(fp) return name diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 907916bb..6eca8f5e 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -20,7 +20,7 @@ from .pdfinterp import Color from .pdfinterp import PDFGraphicState from .pdftypes import PDFStream -from .utils import INF +from .utils import INF, PathSegment from .utils import LTComponentT from .utils import Matrix from .utils import Plane @@ -225,7 +225,14 @@ def contains(self, obj: "LTComponent") -> bool: class LTCurve(LTComponent): - """A generic Bezier 
curve""" + """ + A generic Bezier curve + + The parameter `original_path` contains the original + pathing information from the pdf (e.g. for reconstructing Bezier Curves). + + `dashing_style` contains the Dashing information if any. + """ def __init__( self, @@ -236,6 +243,8 @@ def __init__( evenodd: bool = False, stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, + original_path: Optional[List[PathSegment]] = None, + dashing_style: Optional[Tuple[object, object]] = None, ) -> None: LTComponent.__init__(self, get_bound(pts)) self.pts = pts @@ -245,6 +254,8 @@ def __init__( self.evenodd = evenodd self.stroking_color = stroking_color self.non_stroking_color = non_stroking_color + self.original_path = original_path + self.dashing_style = dashing_style def get_pts(self) -> str: return ",".join("%.3f,%.3f" % p for p in self.pts) @@ -266,6 +277,8 @@ def __init__( evenodd: bool = False, stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, + original_path: Optional[List[PathSegment]] = None, + dashing_style: Optional[Tuple[object, object]] = None, ) -> None: LTCurve.__init__( self, @@ -276,6 +289,8 @@ def __init__( evenodd, stroking_color, non_stroking_color, + original_path, + dashing_style, ) @@ -294,6 +309,8 @@ def __init__( evenodd: bool = False, stroking_color: Optional[Color] = None, non_stroking_color: Optional[Color] = None, + original_path: Optional[List[PathSegment]] = None, + dashing_style: Optional[Tuple[object, object]] = None, ) -> None: (x0, y0, x1, y1) = bbox LTCurve.__init__( @@ -305,6 +322,8 @@ def __init__( evenodd, stroking_color, non_stroking_color, + original_path, + dashing_style, ) @@ -1026,8 +1045,8 @@ def analyze(self, laparams: LAParams) -> None: class LTPage(LTLayoutContainer): """Represents an entire page. - May contain child objects like LTTextBox, LTFigure, LTImage, LTRect, - LTCurve and LTLine. 
+ Like any other LTLayoutContainer, an LTPage can be iterated to obtain child + objects like LTTextBox, LTFigure, LTImage, LTRect, LTCurve and LTLine. """ def __init__(self, pageid: int, bbox: Rect, rotate: float = 0) -> None: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 13629c77..63826b96 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -755,7 +755,11 @@ def create_unicode_map(self) -> FileUnicodeMap: ) char2gid: Dict[int, int] = {} # Only supports subtable type 0, 2 and 4. - for (_1, _2, st_offset) in subtables: + for (platform_id, encoding_id, st_offset) in subtables: + # Skip non-Unicode cmaps. + # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap + if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])): + continue fp.seek(base_offset + st_offset) (fmttype, fmtlen, fmtlang) = cast( Tuple[int, int, int], struct.unpack(">HHH", fp.read(6)) @@ -824,6 +828,8 @@ def create_unicode_map(self) -> FileUnicodeMap: char2gid[c] = (c + idd) & 0xFFFF else: assert False, str(("Unhandled", fmttype)) + if not char2gid: + raise TrueTypeFont.CMapNotFound # create unicode map unicode_map = FileUnicodeMap() for (char, gid) in char2gid.items(): diff --git a/samples/contrib/issue-791-non-unicode-cmap.pdf b/samples/contrib/issue-791-non-unicode-cmap.pdf new file mode 100644 index 00000000..8595bd6f Binary files /dev/null and b/samples/contrib/issue-791-non-unicode-cmap.pdf differ diff --git a/setup.py b/setup.py index 8f257c3f..516e6af6 100644 --- a/setup.py +++ b/setup.py @@ -1,24 +1,23 @@ -import sys from pathlib import Path - from setuptools import setup -from os import path - -sys.path.append(str(Path(__file__).parent)) -import pdfminer as package # noqa: E402 -with open(path.join(path.abspath(path.dirname(__file__)), "README.md")) as f: +root_dir = Path(__file__).parent +with open(root_dir / "README.md", "rt") as f: readme = f.read() setup( name="pdfminer.six", - version=package.__version__, + 
setuptools_git_versioning={ + "enabled": True, + }, + setup_requires=["setuptools-git-versioning<2"], packages=["pdfminer"], package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]}, install_requires=[ "charset-normalizer >= 2.0.0", "cryptography >= 36.0.0", 'typing_extensions; python_version < "3.8"', + 'importlib_metadata; python_version < "3.8"', ], extras_require={ "dev": ["pytest", "nox", "black", "mypy == 0.931"], @@ -45,10 +44,11 @@ python_requires=">=3.6", classifiers=[ "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", "Development Status :: 5 - Production/Stable", "Environment :: Console", diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_converter.py b/tests/test_converter.py index 80de019d..17d280cb 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -1,11 +1,11 @@ import io from tempfile import TemporaryFile -from helpers import absolute_sample_path from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTContainer, LTRect, LTLine, LTCurve from pdfminer.pdfinterp import PDFGraphicState +from tests.helpers import absolute_sample_path class TestPaintPath: @@ -173,7 +173,7 @@ def get_types(path): # they all have shape 'ml' not 'mlh' ml_pdf = extract_pages("samples/contrib/pr-00530-ml-lines.pdf") ml_pdf_page = list(ml_pdf)[0] - assert sum(type(item) == LTLine for item in ml_pdf_page) == 6 + assert sum(type(item) is LTLine for item in ml_pdf_page) == 6 def _get_analyzer(self): analyzer = PDFLayoutAnalyzer(None) @@ -216,6 +216,45 @@ def parse(path): 
(71.41, 434.89), ] + def test_paint_path_beziers_check_raw(self): + """See section 4.4, table 4.9 of the PDF reference manual""" + + def parse(path): + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 1000, 0, 1000]) + analyzer.paint_path(PDFGraphicState(), False, False, False, path) + return analyzer.cur_item._objs + + # "c" operator + assert parse( + [ + ("m", 72.41, 433.89), + ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89), + ] + )[0].original_path == [ + ("m", (72.41, 433.89)), + ("c", (72.41, 434.45), (71.96, 434.89), (71.41, 434.89)), + ] + + def test_paint_path_dashed(self): + """See section 4.4, table 4.9 of the PDF reference manual""" + + def parse(path): + analyzer = self._get_analyzer() + analyzer.cur_item = LTContainer([0, 1000, 0, 1000]) + graphicstate = PDFGraphicState() + graphicstate.dash = ([1, 1], 0) + analyzer.paint_path(graphicstate, False, False, False, path) + return analyzer.cur_item._objs + + # "c" operator + assert parse( + [ + ("m", 72.41, 433.89), + ("c", 72.41, 434.45, 71.96, 434.89, 71.41, 434.89), + ] + )[0].dashing_style == ([1, 1], 0) + def test_paint_path_without_starting_m(self): gs = PDFGraphicState() analyzer = self._get_analyzer() diff --git a/tests/test_font_size.py b/tests/test_font_size.py index fca808c3..cac5b753 100644 --- a/tests/test_font_size.py +++ b/tests/test_font_size.py @@ -1,6 +1,6 @@ -from helpers import absolute_sample_path from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTTextBox +from tests.helpers import absolute_sample_path def test_font_size(): diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 842459d2..cd7d8bfc 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -1,8 +1,8 @@ import unittest -from helpers import absolute_sample_path -from pdfminer.high_level import extract_text, extract_pages +from pdfminer.high_level import extract_pages, extract_text from 
pdfminer.layout import LAParams, LTTextContainer +from tests.helpers import absolute_sample_path def run_with_string(sample_path, laparams=None): @@ -39,6 +39,7 @@ def run_with_file(sample_path): "contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03", + "contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech", } @@ -120,6 +121,11 @@ def test_issue_625_identity_cmap(self): self.assertEqual(lines[6], test_strings[test_file]) + def test_issue_791_non_unicode_cmap(self): + test_file = "contrib/issue-791-non-unicode-cmap.pdf" + s = run_with_file(test_file) + self.assertEqual(s.strip(), test_strings[test_file]) + class TestExtractPages(unittest.TestCase): def _get_test_file_path(self): diff --git a/tests/test_layout.py b/tests/test_layout.py index fd393a4e..85058cf3 100644 --- a/tests/test_layout.py +++ b/tests/test_layout.py @@ -10,7 +10,7 @@ LTTextBoxVertical, ) from pdfminer.utils import Plane -from helpers import absolute_sample_path +from tests.helpers import absolute_sample_path class TestGroupTextLines(unittest.TestCase): diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index 3c1f2430..c57126fb 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -2,10 +2,10 @@ import pytest -from helpers import absolute_sample_path from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels from pdfminer.pdfparser import PDFParser from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value +from tests.helpers import absolute_sample_path class TestPdfDocument(object): diff --git a/tests/test_pdfpage.py b/tests/test_pdfpage.py index 0d3109f1..c3fe86c2 100644 --- a/tests/test_pdfpage.py +++ b/tests/test_pdfpage.py @@ -1,7 +1,7 @@ -from helpers import absolute_sample_path from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import 
PDFParser +from tests.helpers import absolute_sample_path class TestPdfPage(object): diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py index 84e3111c..971c3d07 100644 --- a/tests/test_tools_dumppdf.py +++ b/tests/test_tools_dumppdf.py @@ -2,8 +2,8 @@ import pytest -from helpers import absolute_sample_path -from tempfilepath import TemporaryFilePath +from tests.helpers import absolute_sample_path +from tests.tempfilepath import TemporaryFilePath from tools import dumppdf diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index abd53074..f6eeefcf 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -1,11 +1,11 @@ +import filecmp import os from shutil import rmtree from tempfile import mkdtemp -import filecmp import tools.pdf2txt as pdf2txt -from helpers import absolute_sample_path -from tempfilepath import TemporaryFilePath +from tests.helpers import absolute_sample_path +from tests.tempfilepath import TemporaryFilePath def run(sample_path, options=None): diff --git a/tests/test_utils.py b/tests/test_utils.py index 062a9733..160b02b4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,15 +2,15 @@ import pytest -from helpers import absolute_sample_path from pdfminer.layout import LTComponent from pdfminer.utils import ( - open_filename, Plane, - shorten_str, - format_int_roman, format_int_alpha, + format_int_roman, + open_filename, + shorten_str, ) +from tests.helpers import absolute_sample_path class TestOpenFilename: