Merge branch 'master' into fix_version

pdfminer · Dec 22, 2023 · d90720c · d90720c
2 parents 48c1397 + 997424d
commit d90720c
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+### Removed
+- Support for Python 3.6 and 3.7 ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
+
 ### Added
 
 - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
@@ -14,7 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- Minimally fix CI by freezing tool versions ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
+- Broken CI/CD pipeline by setting upper version limit for black, mypy, pip and setuptools ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
+- `flake8` failures ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
 - `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
 - `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766))
 - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760))
@@ -24,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794))
 - `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827))
 - Small typos and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828))
+- Ignore non-Unicode cmaps in TrueType fonts ([#806](https://github.com/pdfminer/pdfminer.six/pull/806))
 
 ### Changed
 

diff --git a/noxfile.py b/noxfile.py
@@ -37,12 +37,16 @@ def types(session):
 
 @nox.session(python=PYTHON_ALL_VERSIONS)
 def tests(session):
+    session.install("pip<23")
+    session.install("setuptools<58")
     session.install("-e", ".[dev]")
     session.run("pytest")
 
 
 @nox.session
 def docs(session):
+    session.install("pip<23")
+    session.install("setuptools<58")
     session.install("-e", ".[docs]")
     session.run(
         "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -195,15 +195,20 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
         if isinstance(code, PSLiteral):
             # Interpret as an Adobe glyph name.
             assert isinstance(code.name, str)
-            self.cid2unichr[cid] = name2unicode(code.name)
+            unichr = name2unicode(code.name)
         elif isinstance(code, bytes):
             # Interpret as UTF-16BE.
-            self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore")
+            unichr = code.decode("UTF-16BE", "ignore")
         elif isinstance(code, int):
-            self.cid2unichr[cid] = chr(code)
+            unichr = chr(code)
         else:
             raise TypeError(code)
 
+        # Some fonts map one cid to both " " and U+00A0 (no-break space); keep the " ".
+        if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ":
+            return
+        self.cid2unichr[cid] = unichr
+
 
 class PyCMap(CMap):
     def __init__(self, name: str, module: Any) -> None:

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -755,7 +755,11 @@ def create_unicode_map(self) -> FileUnicodeMap:
             )
         char2gid: Dict[int, int] = {}
         # Only supports subtable type 0, 2 and 4.
-        for (_1, _2, st_offset) in subtables:
+        for (platform_id, encoding_id, st_offset) in subtables:
+            # Skip non-Unicode cmaps.
+            # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
+            if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
+                continue
             fp.seek(base_offset + st_offset)
             (fmttype, fmtlen, fmtlang) = cast(
                 Tuple[int, int, int], struct.unpack(">HHH", fp.read(6))
@@ -824,6 +828,8 @@ def create_unicode_map(self) -> FileUnicodeMap:
                             char2gid[c] = (c + idd) & 0xFFFF
             else:
                 assert False, str(("Unhandled", fmttype))
+        if not char2gid:
+            raise TrueTypeFont.CMapNotFound
         # create unicode map
         unicode_map = FileUnicodeMap()
         for (char, gid) in char2gid.items():

diff --git a/samples/contrib/issue-791-non-unicode-cmap.pdf b/samples/contrib/issue-791-non-unicode-cmap.pdf
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -39,6 +39,7 @@ def run_with_file(sample_path):
     "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
     "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
     "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
+    "contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech",
 }
 
 
@@ -120,6 +121,11 @@ def test_issue_625_identity_cmap(self):
 
         self.assertEqual(lines[6], test_strings[test_file])
 
+    def test_issue_791_non_unicode_cmap(self):
+        test_file = "contrib/issue-791-non-unicode-cmap.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+
 
 class TestExtractPages(unittest.TestCase):
     def _get_test_file_path(self):