diff --git a/CHANGELOG.md b/CHANGELOG.md index 84d0959e..d9c16648 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +### Removed +- Support for Python 3.6 and 3.7 ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) + ### Added - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651)) @@ -14,7 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed -- Minimally fix CI by freezing tool versions ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) +- Broken CI/CD pipeline by setting upper version limit for black, mypy, pip and setuptools ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) +- `flake8` failures ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) - `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773)) - `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766)) - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) @@ -24,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794)) - `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827)) - Small typo's and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828)) +- Ignore non-Unicode cmaps in TrueType fonts ([#806](https://github.com/pdfminer/pdfminer.six/pull/806)) ### Changed diff --git a/noxfile.py b/noxfile.py index 12489fb0..24e91c29 100644 --- a/noxfile.py +++ b/noxfile.py @@ -37,12 +37,16 @@ def types(session): @nox.session(python=PYTHON_ALL_VERSIONS) def tests(session): + session.install("pip<23") + session.install("setuptools<58") session.install("-e", ".[dev]") session.run("pytest") @nox.session def docs(session): + session.install("pip<23") + session.install("setuptools<58") session.install("-e", ".[docs]") session.run( "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html" diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 01306ed2..f0c43ab7 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -195,15 +195,20 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. assert isinstance(code.name, str) - self.cid2unichr[cid] = name2unicode(code.name) + unichr = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. - self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore") + unichr = code.decode("UTF-16BE", "ignore") elif isinstance(code, int): - self.cid2unichr[cid] = chr(code) + unichr = chr(code) else: raise TypeError(code) + # A0 = non-breaking space, some weird fonts can have a collision on a cid here. + if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ": + return + self.cid2unichr[cid] = unichr + class PyCMap(CMap): def __init__(self, name: str, module: Any) -> None: diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 13629c77..63826b96 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -755,7 +755,11 @@ def create_unicode_map(self) -> FileUnicodeMap: ) char2gid: Dict[int, int] = {} # Only supports subtable type 0, 2 and 4. - for (_1, _2, st_offset) in subtables: + for (platform_id, encoding_id, st_offset) in subtables: + # Skip non-Unicode cmaps. + # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap + if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])): + continue fp.seek(base_offset + st_offset) (fmttype, fmtlen, fmtlang) = cast( Tuple[int, int, int], struct.unpack(">HHH", fp.read(6)) @@ -824,6 +828,8 @@ def create_unicode_map(self) -> FileUnicodeMap: char2gid[c] = (c + idd) & 0xFFFF else: assert False, str(("Unhandled", fmttype)) + if not char2gid: + raise TrueTypeFont.CMapNotFound # create unicode map unicode_map = FileUnicodeMap() for (char, gid) in char2gid.items(): diff --git a/samples/contrib/issue-791-non-unicode-cmap.pdf b/samples/contrib/issue-791-non-unicode-cmap.pdf new file mode 100644 index 00000000..8595bd6f Binary files /dev/null and b/samples/contrib/issue-791-non-unicode-cmap.pdf differ diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 842459d2..1ea0e7f2 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -39,6 +39,7 @@ def run_with_file(sample_path): "contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03", + "contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech", } @@ -120,6 +121,11 @@ def test_issue_625_identity_cmap(self): self.assertEqual(lines[6], test_strings[test_file]) + def test_issue_791_non_unicode_cmap(self): + test_file = "contrib/issue-791-non-unicode-cmap.pdf" + s = run_with_file(test_file) + self.assertEqual(s.strip(), test_strings[test_file]) + class TestExtractPages(unittest.TestCase): def _get_test_file_path(self):