From a737311c8dbff6a8bea600c9277fe65dc8cd32eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B6=D0=BE=D0=BD=2C=20=D0=BF=D1=80=D0=BE=D1=81?= =?UTF-8?q?=D1=82=D0=BE=20=D0=94=D0=B6=D0=BE=D0=BD?= <1506905+sudodoki@users.noreply.github.com> Date: Fri, 29 Dec 2023 16:39:16 +0100 Subject: [PATCH 1/5] Small change: fix code comment for color type (#816) Co-authored-by: Pieter Marsman --- pdfminer/pdfinterp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py index c1a85973..0ac8e5ad 100644 --- a/pdfminer/pdfinterp.py +++ b/pdfminer/pdfinterp.py @@ -115,8 +115,8 @@ def reset(self) -> None: Color = Union[ float, # Greyscale Tuple[float, float, float], # R, G, B - Tuple[float, float, float, float], -] # C, M, Y, K + Tuple[float, float, float, float], # C, M, Y, K +] class PDFGraphicState: From 17a36176c7b356c6aac618e0185781a2fce11944 Mon Sep 17 00:00:00 2001 From: pettzilla1 <97901734+pettzilla1@users.noreply.github.com> Date: Fri, 29 Dec 2023 17:25:02 +0000 Subject: [PATCH 2/5] Fix #800 (#819) Co-authored-by: Pieter Marsman --- CONTRIBUTING.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 493610c1..18800296 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,7 +17,7 @@ Any contribution is appreciated! You might want to: * Help others by sharing your thoughs in comments on issues and pull requests. * Join the chat on [gitter](https://gitter.im/pdfminer-six/Lobby) -## Guidelines for creating issues +## Guideline for creating issues * Search previous issues, as yours might be a duplicate. * When creating a new issue for a bug, include a minimal reproducible example. @@ -37,7 +37,7 @@ Any contribution is appreciated! You might want to: * Check spelling and grammar. * Don't forget to update the [CHANGELOG.md](CHANGELOG.md#[Unreleased]). 
-## Guidelines for posting comments +## Guideline for posting comments * [Be cordial and positive](https://www.kennethreitz.org/essays/be-cordial-or-be-on-your-way) @@ -45,6 +45,12 @@ Any contribution is appreciated! You might want to: * Publishing is automated. Add a YYYYMMDD version tag and GitHub workflows will do the rest. +## Guideline for dependencies + +* This package is distributed under the [MIT license](LICENSE). +* All dependencies should be compatible with this license. +* Use [licensecheck](https://pypi.org/project/licensecheck/) to validate if new packages are compatible. + ## Getting started 1. Clone the repository From 7715f88be78e9bea702f319949cbcb89d4c27fcd Mon Sep 17 00:00:00 2001 From: Diego Miguel Lozano <22967053+dmlls@users.noreply.github.com> Date: Fri, 29 Dec 2023 21:06:54 +0100 Subject: [PATCH 3/5] Add instructions to build documentation (#822) Co-authored-by: Pieter Marsman --- docs/README.md | 34 ++++++++++++++++++++++++++++++++++ docs/requirements.txt | 1 - 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 docs/README.md delete mode 100644 docs/requirements.txt diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..91e7ebcf --- /dev/null +++ b/docs/README.md @@ -0,0 +1,34 @@ +# Working on documentation + +The pdfminer.six docs are generated with [Sphinx](https://www.sphinx-doc.org/en/master/), using +[reStructuredText](https://docutils.sourceforge.io/rst.html). + +The documentation is hosted on https://pdfminersix.readthedocs.io/. + +## Deploying new documentation + +New documentation is deployed automatically when PRs are merged. + +## Building documentation locally + +You can build the documentation locally on your machine using the following steps. + +1. (Recommended) create and activate a Python virtual environment. + + ```console + python -m venv .venv + source .venv/bin/activate + ``` + +2. 
With the virtual environment activated, install the dependencies for building the documentation. + + ```console + pip install '.[docs]' + ``` + +3. Build the documentation. + + ```console + make clean && make html + ``` + diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 1e76fdad..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -sphinx-argparse \ No newline at end of file From c85e9a38245fbe7d32fbc18a388536c957ad1a5c Mon Sep 17 00:00:00 2001 From: Diego Miguel Lozano <22967053+dmlls@users.noreply.github.com> Date: Fri, 29 Dec 2023 21:11:16 +0100 Subject: [PATCH 4/5] Fix links in PR template (#823) Co-authored-by: Pieter Marsman --- .github/pull_request_template.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 09f14a34..f7238f4e 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -8,8 +8,8 @@ Please *remove* this paragraph with a description of how this PR has been tested **Checklist** -- [ ] I have read [CONTRIBUTING.md](../CONTRIBUTING.md). -- [ ] I have added a concise human-readable description of the change to [CHANGELOG.md](../CHANGELOG.md). +- [ ] I have read [CONTRIBUTING.md](https://github.com/pdfminer/pdfminer.six/blob/master/CONTRIBUTING.md). +- [ ] I have added a concise human-readable description of the change to [CHANGELOG.md](https://github.com/pdfminer/pdfminer.six/blob/master/CHANGELOG.md). - [ ] I have tested that this fix is effective or that this feature works. - [ ] I have added docstrings to newly created methods and classes. -- [ ] I have updated the [README.md](../README.md) and the [readthedocs](../docs/source) documentation. Or verified that this is not necessary. 
+- [ ] I have updated the [README.md](https://github.com/pdfminer/pdfminer.six/blob/master/README.md) and the [readthedocs](https://github.com/pdfminer/pdfminer.six/tree/master/docs/source) documentation. Or verified that this is not necessary. From adf95a4bb92078d9962166f8864448a01eeab5cd Mon Sep 17 00:00:00 2001 From: Diego Miguel Lozano <22967053+dmlls@users.noreply.github.com> Date: Fri, 29 Dec 2023 21:43:44 +0100 Subject: [PATCH 5/5] Add how to extract font name, size and color to docs (#824) Co-authored-by: Pieter Marsman --- docs/source/howto/character_properties.rst | 162 +++++++++++++++++++++ docs/source/howto/index.rst | 1 + 2 files changed, 163 insertions(+) create mode 100644 docs/source/howto/character_properties.rst diff --git a/docs/source/howto/character_properties.rst b/docs/source/howto/character_properties.rst new file mode 100644 index 00000000..83945779 --- /dev/null +++ b/docs/source/howto/character_properties.rst @@ -0,0 +1,162 @@ +.. _character_properties: + +How to extract font names and sizes from PDFs +****************************************************** + +Before you start, make sure you have :ref:`installed pdfminer.six`. + +The following code sample shows how to extract font names and sizes for each of the characters. This uses the +`simple1.pdf <https://raw.githubusercontent.com/pdfminer/pdfminer.six/master/samples/simple1.pdf>`_. + +.. 
code-block:: python +from pathlib import Path +from typing import Iterable, Any + +from pdfminer.high_level import extract_pages + + +def show_ltitem_hierarchy(o: Any, depth=0): + """Show name, font info, stroking color and text of LTItem and all its descendants""" + if depth == 0: + print('element font stroking color text') + print('------------------------------ --------------------- -------------- ----------') + + print( + f'{get_indented_name(o, depth):<30.30s} ' + f'{get_optional_fontinfo(o):<20.20s} ' + f'{get_optional_color(o):<17.17s}' + f'{get_optional_text(o)}' + ) + + if isinstance(o, Iterable): + for i in o: + show_ltitem_hierarchy(i, depth=depth + 1) + + +def get_indented_name(o: Any, depth: int) -> str: + """Indented name of class""" + return ' ' * depth + o.__class__.__name__ + + +def get_optional_fontinfo(o: Any) -> str: + """Font info of LTChar if available, otherwise empty string""" + if hasattr(o, 'fontname') and hasattr(o, 'size'): + return f'{o.fontname} {round(o.size)}pt' + return '' + +def get_optional_color(o: Any) -> str: + """Stroking color of LTChar if available, otherwise empty string""" + if hasattr(o, 'graphicstate'): + return f'{o.graphicstate.scolor}' + return '' + + +def get_optional_text(o: Any) -> str: + """Text of LTItem if available, otherwise empty string""" + if hasattr(o, 'get_text'): + return o.get_text().strip() + return '' + + +path = Path('samples/simple1.pdf').expanduser() +pages = extract_pages(path) +show_ltitem_hierarchy(pages) +.. note:: + +The output looks like below. Note that it shows the hierarchical structure of the layout elements. The layout algorithm +groups characters into lines and lines into boxes. And boxes appear on a page. The pages, boxes and lines do not have +font information because this can change for each character. The stroking color is always ``None`` in this example, but +it will contain the color if the PDF does specify colors. 
+ +``` +element font stroking color text +------------------------------ --------------------- -------------- ---------- +generator + LTPage + LTTextBoxHorizontal Hello + LTTextLineHorizontal Hello + LTChar Helvetica 24pt None H + LTChar Helvetica 24pt None e + LTChar Helvetica 24pt None l + LTChar Helvetica 24pt None l + LTChar Helvetica 24pt None o + LTChar Helvetica 24pt None + LTAnno + LTTextBoxHorizontal World + LTTextLineHorizontal World + LTChar Helvetica 24pt None W + LTChar Helvetica 24pt None o + LTChar Helvetica 24pt None r + LTChar Helvetica 24pt None l + LTChar Helvetica 24pt None d + LTAnno + LTTextBoxHorizontal Hello + LTTextLineHorizontal Hello + LTChar Helvetica 24pt None H + LTChar Helvetica 24pt None e + LTChar Helvetica 24pt None l + LTChar Helvetica 24pt None l + LTChar Helvetica 24pt None o + LTChar Helvetica 24pt None + LTAnno + LTTextBoxHorizontal World + LTTextLineHorizontal World + LTChar Helvetica 24pt None W + LTChar Helvetica 24pt None o + LTChar Helvetica 24pt None r + LTChar Helvetica 24pt None l + LTChar Helvetica 24pt None d + LTAnno + LTTextBoxHorizontal H e l l o + LTTextLineHorizontal H e l l o + LTChar Helvetica 24pt None H + LTAnno + LTChar Helvetica 24pt None e + LTAnno + LTChar Helvetica 24pt None l + LTAnno + LTChar Helvetica 24pt None l + LTAnno + LTChar Helvetica 24pt None o + LTAnno + LTChar Helvetica 24pt None + LTAnno + LTTextBoxHorizontal W o r l d + LTTextLineHorizontal W o r l d + LTChar Helvetica 24pt None W + LTAnno + LTChar Helvetica 24pt None o + LTAnno + LTChar Helvetica 24pt None r + LTAnno + LTChar Helvetica 24pt None l + LTAnno + LTChar Helvetica 24pt None d + LTAnno + LTTextBoxHorizontal H e l l o + LTTextLineHorizontal H e l l o + LTChar Helvetica 24pt None H + LTAnno + LTChar Helvetica 24pt None e + LTAnno + LTChar Helvetica 24pt None l + LTAnno + LTChar Helvetica 24pt None l + LTAnno + LTChar Helvetica 24pt None o + LTAnno + LTChar Helvetica 24pt None + LTAnno + LTTextBoxHorizontal W o r l d + 
LTTextLineHorizontal W o r l d + LTChar Helvetica 24pt None W + LTAnno + LTChar Helvetica 24pt None o + LTAnno + LTChar Helvetica 24pt None r + LTAnno + LTChar Helvetica 24pt None l + LTAnno + LTChar Helvetica 24pt None d + LTAnno +``` diff --git a/docs/source/howto/index.rst b/docs/source/howto/index.rst index 9d3269aa..59033e36 100644 --- a/docs/source/howto/index.rst +++ b/docs/source/howto/index.rst @@ -10,3 +10,4 @@ How-to guides help you to solve specific problems with pdfminer.six. images acro_forms + character_properties