From b08ef24149454621471d96c0add47bb367769106 Mon Sep 17 00:00:00 2001 From: Patrick Maupin Date: Fri, 7 Apr 2017 22:50:40 -0500 Subject: [PATCH] Simplify pdfwriter in preparation for major changes - 'encoded' attribute may be None. This allows upcoming code to require that 'encoded' be defined in the base class. - Simplifications using encoded unfortunately changed the dict sort order in a very few cases (and changed checksums). - findobjs.py no longer imports user_fmt from pdfwriter. - New update_expected.py tool in test directory allows some changes to expected checksums to be made in bulk. --- pdfrw/findobjs.py | 3 +- pdfrw/objects/pdfname.py | 1 + pdfrw/pdfwriter.py | 8 ++-- pdfrw/toreportlab.py | 2 +- tests/expected.txt | 10 ++--- tests/update_expected.py | 84 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 96 insertions(+), 12 deletions(-) create mode 100755 tests/update_expected.py diff --git a/pdfrw/findobjs.py b/pdfrw/findobjs.py index f19ebdf..67d33a0 100644 --- a/pdfrw/findobjs.py +++ b/pdfrw/findobjs.py @@ -8,7 +8,6 @@ ''' from .objects import PdfDict, PdfArray, PdfName -from .pdfwriter import user_fmt def find_objects(source, valid_types=(PdfName.XObject, None), @@ -81,7 +80,7 @@ def wrap_object(obj, width, margin): iw, ih = float(obj.Width), float(obj.Height) ch = 1.0 * cw / iw * ih height = ch + margin[1] + margin[3] - p = tuple(user_fmt(x) for x in (cw, ch, xoffset, yoffset)) + p = tuple(('%.9f' % x).rstrip('0').rstrip('.') for x in (cw, ch, xoffset, yoffset)) contents.stream = fmt % p resources = PdfDict(XObject=PdfDict(MyImage=obj)) mbox = PdfArray((0, 0, width, height)) diff --git a/pdfrw/objects/pdfname.py b/pdfrw/objects/pdfname.py index 1fdf5b5..28a1464 100644 --- a/pdfrw/objects/pdfname.py +++ b/pdfrw/objects/pdfname.py @@ -23,6 +23,7 @@ class BasePdfName(str): ''' indirect = False + encoded = None whitespace = '\x00 \t\f\r\n' delimiters = '()<>{}[]/%' diff --git a/pdfrw/pdfwriter.py b/pdfrw/pdfwriter.py index a7d4af4..3c887ba 100755 --- a/pdfrw/pdfwriter.py +++ b/pdfrw/pdfwriter.py @@ -137,11 +137,11 @@ def format_obj(obj): elif isinstance(obj, PdfDict): if compress and obj.stream: do_compress([obj]) - pairs = sorted((x, y, getattr(x, 'encoded', x)) + pairs = sorted((getattr(x, 'encoded', None) or x, y) for (x, y) in obj.iteritems()) myarray = [] - for key, value, encoding in pairs: - myarray.append(encoding) + for key, value in pairs: + myarray.append(key) myarray.append(add(value)) result = format_array(myarray, '<<%s>>') stream = obj.stream @@ -155,7 +155,7 @@ def format_obj(obj): # We assume that an object with an indirect # attribute knows how to represent itself to us. if hasattr(obj, 'indirect'): - return str(getattr(obj, 'encoded', obj)) + return str(getattr(obj, 'encoded', None) or obj) return user_fmt(obj) def format_deferred(): diff --git a/pdfrw/toreportlab.py b/pdfrw/toreportlab.py index 9f77d26..3434fbf 100644 --- a/pdfrw/toreportlab.py +++ b/pdfrw/toreportlab.py @@ -108,7 +108,7 @@ def _makearray(rldoc, pdfobj): def _makestr(rldoc, pdfobj): assert isinstance(pdfobj, (float, int, str)), repr(pdfobj) # TODO: Add fix for float like in pdfwriter - return str(getattr(pdfobj, 'encoded', pdfobj)) + return str(getattr(pdfobj, 'encoded', None) or pdfobj) def makerl_recurse(rldoc, pdfobj): diff --git a/tests/expected.txt b/tests/expected.txt index 8c6c5f1..b1b7cca 100644 --- a/tests/expected.txt +++ b/tests/expected.txt @@ -11,7 +11,7 @@ examples/subset_b1c400de699af29ea3f1983bb26870ab_1-3_5 880a9578197130273ccb examples/unspread_d711b74110eefb4e9e6bf1a5bea16bfe 780a9abe26a9de0b5b95ee22c4835e4b examples/cat_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c 62bb9b746ff5932d3f1b88942d36a81d -examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 841c980dfadf2cc47ad86e4649ca69b6 +examples/rotate_707e3e2d17cbe9ec2273414b3b63f333_270_1-4_7-8_10-50_52-56 7633ba56641115050ba098ecbef8d331 examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c fe2330d42b3bfc06212415f295752f0e examples/watermark_b1c400de699af29ea3f1983bb26870ab_06c86654f9a77e82f9adaa0086fc391c_-u e43e3ac0afe1cc242549424755dbf612 @@ -88,7 +88,7 @@ repaginate/5f265db2736850782aeaba2571a3c749.pdf 2e3046813ce6e40a39bd759a3c8a3c8c repaginate/6a42c8c79b807bf164d31071749e07b0.pdf bf00d5e44869ae59eb859860d7d5373f repaginate/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 612cdd84eeac797a1c42fc91756b6d9e repaginate/7037a992b80b60f0294016037baa9292.pdf dd41b0104f185206b51e7ffe5b07d261 -repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf 6c65526ab372d72cb185933e3d2584ef +repaginate/707e3e2d17cbe9ec2273414b3b63f333.pdf df4d756e2230c333f0c58ad354b5b51c repaginate/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 repaginate/72eb207b8f882618899aa7a65d3cecda.pdf 0b64f19a8a39fadfa2a3eec3f1a01233 repaginate/97ba0a239cefa0dc727c2f1be050ec6c.pdf a94fe7183ce8979174b2ac16dcd9b1ea @@ -127,7 +127,7 @@ simple/5f265db2736850782aeaba2571a3c749.pdf d4d2e93ab22e866c86e32da84421f6f9 simple/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 simple/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf fe8dd16dd7fef40338140e0610d0cbbf simple/7037a992b80b60f0294016037baa9292.pdf 6a2ef24e5f74dd74969ff8cefdfc6a05 -simple/707e3e2d17cbe9ec2273414b3b63f333.pdf 4bdf1e57a96ce42717110b4e55098c1a +simple/707e3e2d17cbe9ec2273414b3b63f333.pdf fb6a8eb3cdc2fbef125babe8815f3b70 simple/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 simple/72eb207b8f882618899aa7a65d3cecda.pdf 4ce7ff29531cc417c26389af28dc1c5e simple/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb @@ -167,7 +167,7 @@ decompress/5f265db2736850782aeaba2571a3c749.pdf 836abcf6e6e1d39ad96481eb20e9b149 decompress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 decompress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 226773cac79e1a5fed1379a0501a5df0 decompress/7037a992b80b60f0294016037baa9292.pdf c9a3602b26d82ae145d9f5822125a158 -decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf f6d960e75480aa4f729059388dcedd71 +decompress/707e3e2d17cbe9ec2273414b3b63f333.pdf 3250a56e14a9855eccd67bb347808d24 decompress/71a751ce2d93a6a5d6ff21735b701fb7.pdf a825f06c934319b93474902fcf300cd2 decompress/72eb207b8f882618899aa7a65d3cecda.pdf a4366874fb6db1d9a0c998361ea32b8d decompress/97ba0a239cefa0dc727c2f1be050ec6c.pdf c24873bab85b8ecc7c5433d8d802bceb @@ -208,7 +208,7 @@ compress/5f265db2736850782aeaba2571a3c749.pdf bb4898beac50171de7502f13925af80c compress/6a42c8c79b807bf164d31071749e07b0.pdf 221fec351c925a43f5f409fe03d90013 compress/6f3a4de5c68ba3b5093e9b54b7c4e9f4.pdf 1c3fbae41e7cad7deca13fab93514bc7 compress/7037a992b80b60f0294016037baa9292.pdf 9182a9765544e4a91404db65a6f951d7 -compress/707e3e2d17cbe9ec2273414b3b63f333.pdf bde552c97872c5a4eeafab3b8b38f703 +compress/707e3e2d17cbe9ec2273414b3b63f333.pdf 0e75dda73bf18d9968499277ab1a367e compress/71a751ce2d93a6a5d6ff21735b701fb7.pdf faa7eb31789a3789f65de30a4e58e594 compress/72eb207b8f882618899aa7a65d3cecda.pdf 0155549fc04357220cc6be541dda7bc1 compress/97ba0a239cefa0dc727c2f1be050ec6c.pdf 067bfee3b2bd9c250e7c4157ff543a81 diff --git a/tests/update_expected.py b/tests/update_expected.py new file mode 100755 index 0000000..bed5331 --- /dev/null +++ b/tests/update_expected.py @@ -0,0 +1,84 @@ +#! /usr/bin/env python2 +""" +Put old (good) results in ramdisk/reference, +then generate new (unknown) test results in ramdisk/tmp_results, +THEN SWITCH BACK TO KNOWN GOOD SYSTEM, and finally: + +run this to update any checksums in expected.txt where both versions +parse to same PDFs. +""" + +import os +import hashlib +from pdfrw import PdfReader, PdfWriter, PdfArray, PdfDict, PdfObject + + +def make_canonical(trailer): + ''' Canonicalizes a PDF. Assumes everything + is a Pdf object already. + ''' + visited = set() + workitems = list(trailer.values()) + while workitems: + obj = workitems.pop() + objid = id(obj) + if objid in visited: + continue + visited.add(objid) + obj.indirect = True + if isinstance(obj, (PdfArray, PdfDict)): + if isinstance(obj, PdfArray): + workitems += obj + else: + workitems += obj.values() + return trailer + +with open('expected.txt', 'rb') as f: + expected = f.read() + +def get_digest(fname): + with open(fname, 'rb') as f: + data = f.read() + if data: + return hashlib.md5(data).hexdigest() + +tmp = '_temp.pdf' +count = 0 +goodcount = 0 + +changes = [] +for (srcpath, _, filenames) in os.walk('ramdisk/reference'): + for name in filenames: + if not name.endswith('.pdf'): + continue + src = os.path.join(srcpath, name) + dst = src.replace('/reference/', '/tmp_results/') + if not os.path.exists(dst): + continue + src_digest = get_digest(src) + if not src_digest or src_digest not in expected: + continue + print src + count += 1 + trailer = make_canonical(PdfReader(src)) + out = PdfWriter(tmp) + out.write(trailer=trailer) + match_digest = get_digest(tmp) + if not match_digest: + continue + trailer = make_canonical(PdfReader(dst)) + out = PdfWriter(tmp) + out.write(trailer=trailer) + if get_digest(tmp) != match_digest: + continue + goodcount += 1 + print "OK" + changes.append((src_digest, get_digest(dst))) + +print count, goodcount + +for stuff in changes: + expected = expected.replace(*stuff) + +with open('expected.txt', 'wb') as f: + f.write(expected)