

Merge pull request #83 from pmaupin/enhance_tokenizer
Optimize tokenizer token memoization
pmaupin authored Apr 1, 2017
2 parents a1f4cd9 + 73bebf2 commit d49024b
Showing 2 changed files with 17 additions and 22 deletions.
5 changes: 5 additions & 0 deletions pdfrw/py23_diffs.py
@@ -46,3 +46,8 @@ def from_array(a):
     xrange = xrange
 except NameError:
     xrange = range
+
+try:
+    intern = intern
+except NameError:
+    from sys import intern
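
Background on the shim: Python 2 exposes intern() as a builtin, while Python 3 moved it to sys.intern, so the try/except above binds whichever exists. Below is a minimal standalone sketch of the same pattern and of what interning buys; it is illustrative, not pdfrw code:

try:
    intern = intern          # Python 2: intern is a builtin
except NameError:
    from sys import intern   # Python 3: intern lives in sys

# Interned equal strings share a single object, so repeated tokens
# can be compared by identity and hashed cheaply.
a = intern('endobj')
b = intern('endobj')
assert a is b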
34 changes: 12 additions & 22 deletions pdfrw/tokens.py
Expand Up @@ -15,7 +15,7 @@
from .objects import PdfString, PdfObject
from .objects.pdfname import BasePdfName
from .errors import log, PdfParseError
from .py23_diffs import nextattr
from .py23_diffs import nextattr, intern


def linepos(fdata, loc):
@@ -64,19 +64,7 @@ class PdfTokens(object):
     findparen = re.compile('(%s)[%s]*' % (p_literal_string_extend,
                                           whitespace), re.DOTALL).finditer
 
-    def _cacheobj(cache, obj, constructor):
-        ''' This caching relies on the constructors
-            returning something that will compare as
-            equal to the original obj.  This works
-            fine with our PDF objects.
-        '''
-        result = cache.get(obj)
-        if result is None:
-            result = constructor(obj)
-            cache[result] = result
-        return result
-
-    def _gettoks(self, startloc, cacheobj=_cacheobj,
+    def _gettoks(self, startloc, intern=intern,
                  delimiters=delimiters, findtok=findtok,
                  findparen=findparen, PdfString=PdfString,
                  PdfObject=PdfObject, BasePdfName=BasePdfName):
@@ -95,24 +83,23 @@ def _gettoks(self, startloc, cacheobj=_cacheobj,
         fdata = self.fdata
         current = self.current = [(startloc, startloc)]
         cache = {}
+        get_cache = cache.get
         while 1:
             for match in findtok(fdata, current[0][1]):
                 current[0] = tokspan = match.span()
                 token = match.group(1)
                 firstch = token[0]
+                toktype = intern
                 if firstch not in delimiters:
-                    token = cacheobj(cache, token, PdfObject)
+                    toktype = PdfObject
                 elif firstch in '/<(%':
                     if firstch == '/':
                         # PDF Name
-                        encoded = token
-                        token = cache.get(encoded)
-                        if token is None:
-                            token = cache[token] = BasePdfName(encoded)
+                        toktype = BasePdfName
                     elif firstch == '<':
                         # << dict delim, or < hex string >
                         if token[1:2] != '<':
-                            token = cacheobj(cache, token, PdfString)
+                            toktype = PdfString
                     elif firstch == '(':
                         # Literal string
                         # It's probably simple, but maybe not
@@ -145,7 +132,7 @@ def _gettoks(self, startloc, cacheobj=_cacheobj,
                             loc, ends, nest = ends
                             token = fdata[m_start:loc] + ')' * nest
                             current[0] = m_start, ends
-                    token = cacheobj(cache, token, PdfString)
+                    toktype = PdfString
                 elif firstch == '%':
                     # Comment
                     if self.strip_comments:
@@ -154,7 +141,10 @@ def _gettoks(self, startloc, cacheobj=_cacheobj,
                     self.exception(('Tokenizer logic incorrect -- '
                                     'should never get here'))
 
-                yield token
+                newtok = get_cache(token)
+                if newtok is None:
+                    newtok = cache[token] = toktype(token)
+                yield newtok
                 if current[0] is not tokspan:
                     break
             else:
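
The net effect of the tokens.py change: each branch now only selects a constructor (toktype) instead of calling the _cacheobj helper, and one shared lookup at the bottom of the loop builds and memoizes the token. Here is a runnable sketch of that pattern, using invented stand-ins (tokenize, Name) rather than pdfrw's real classes:

from sys import intern

class Name(str):
    # Stand-in for BasePdfName: instances compare equal to the raw
    # token string, which is what makes a single shared cache sound.
    pass

def tokenize(raw_tokens):
    cache = {}
    get_cache = cache.get        # bind the method once, as the commit does
    for token in raw_tokens:
        toktype = intern         # default: plain tokens are interned
        if token.startswith('/'):
            toktype = Name       # name-like tokens get a wrapper class
        newtok = get_cache(token)
        if newtok is None:
            # One construction per distinct token; repeats hit the cache.
            newtok = cache[token] = toktype(token)
        yield newtok

toks = list(tokenize(['/Type', 'obj', '/Type', 'obj']))
assert toks[0] is toks[2] and toks[1] is toks[3]

Because every constructor returns an object that compares equal to the raw string, one dictionary serves all token types, which is the invariant the deleted _cacheobj docstring described.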
