Skip to content

Commit

Permalink
Merge branch 'master' into fix_version
Browse files Browse the repository at this point in the history
  • Loading branch information
pietermarsman committed Dec 22, 2023
2 parents 48c1397 + 997424d commit d90720c
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 5 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

### Removed
- Support for Python 3.6 and 3.7 ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))

### Added

- Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651))
Expand All @@ -14,7 +17,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Fixed

- Minimally fix CI by freezing tool versions ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
- Broken CI/CD pipeline by setting upper version limit for black, mypy, pip and setuptools ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
- `flake8` failures ([#921](https://github.com/pdfminer/pdfminer.six/pull/921))
- `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773))
- `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766))
- Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760))
Expand All @@ -24,6 +28,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794))
- `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827))
- Small typos and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828))
- Ignore non-Unicode cmaps in TrueType fonts ([#806](https://github.com/pdfminer/pdfminer.six/pull/806))

### Changed

Expand Down
4 changes: 4 additions & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,16 @@ def types(session):

@nox.session(python=PYTHON_ALL_VERSIONS)
def tests(session):
    # Runs pytest once per interpreter listed in PYTHON_ALL_VERSIONS.
    # The packaging tools are pinned first, in this order, so that the
    # editable install of the "dev" extra happens with the pinned versions.
    for pinned_tool in ("pip<23", "setuptools<58"):
        session.install(pinned_tool)
    session.install("-e", ".[dev]")
    session.run("pytest")


@nox.session
def docs(session):
session.install("pip<23")
session.install("setuptools<58")
session.install("-e", ".[docs]")
session.run(
"python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html"
Expand Down
11 changes: 8 additions & 3 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,15 +195,20 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
if isinstance(code, PSLiteral):
# Interpret as an Adobe glyph name.
assert isinstance(code.name, str)
self.cid2unichr[cid] = name2unicode(code.name)
unichr = name2unicode(code.name)
elif isinstance(code, bytes):
# Interpret as UTF-16BE.
self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore")
unichr = code.decode("UTF-16BE", "ignore")
elif isinstance(code, int):
self.cid2unichr[cid] = chr(code)
unichr = chr(code)
else:
raise TypeError(code)

# U+00A0 is the non-breaking space. Some fonts map one cid to both a regular
# space and U+00A0; keep the regular space if it is already mapped.
if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ":
return
self.cid2unichr[cid] = unichr


class PyCMap(CMap):
def __init__(self, name: str, module: Any) -> None:
Expand Down
8 changes: 7 additions & 1 deletion pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,11 @@ def create_unicode_map(self) -> FileUnicodeMap:
)
char2gid: Dict[int, int] = {}
# Only supports subtable type 0, 2 and 4.
for (_1, _2, st_offset) in subtables:
for (platform_id, encoding_id, st_offset) in subtables:
# Skip non-Unicode cmaps.
# https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
continue
fp.seek(base_offset + st_offset)
(fmttype, fmtlen, fmtlang) = cast(
Tuple[int, int, int], struct.unpack(">HHH", fp.read(6))
Expand Down Expand Up @@ -824,6 +828,8 @@ def create_unicode_map(self) -> FileUnicodeMap:
char2gid[c] = (c + idd) & 0xFFFF
else:
assert False, str(("Unhandled", fmttype))
if not char2gid:
raise TrueTypeFont.CMapNotFound
# create unicode map
unicode_map = FileUnicodeMap()
for (char, gid) in char2gid.items():
Expand Down
Binary file added samples/contrib/issue-791-non-unicode-cmap.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def run_with_file(sample_path):
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
"contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
"contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech",
}


Expand Down Expand Up @@ -120,6 +121,11 @@ def test_issue_625_identity_cmap(self):

self.assertEqual(lines[6], test_strings[test_file])

def test_issue_791_non_unicode_cmap(self):
    # The text extracted from the issue-791 sample, with surrounding
    # whitespace removed, must equal the expected string in test_strings.
    sample = "contrib/issue-791-non-unicode-cmap.pdf"
    extracted = run_with_file(sample).strip()
    self.assertEqual(extracted, test_strings[sample])


class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self):
Expand Down

0 comments on commit d90720c

Please sign in to comment.