Skip to content

Commit

Permalink
Fix performance issues caused by 0.4.0
Browse files Browse the repository at this point in the history
  • Loading branch information
icaropires committed Aug 24, 2020
1 parent 289c594 commit d14b8b7
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 29 deletions.
20 changes: 17 additions & 3 deletions pdf2dataset/extract_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class ExtractTask(ABC):
fixed_featues = ('path')
_feature_prefix = 'get_' # Optional

_helper_list = None
_features_list = None

def __init__(self, path, file_bin=None, sel_features='all'):
self.path = path
self.file_bin = file_bin
Expand All @@ -57,6 +60,9 @@ def __init__(self, path, file_bin=None, sel_features='all'):

@classmethod
def list_helper_features(cls):
if cls._helper_list is not None:
return cls._helper_list

prefix = cls._feature_prefix

def is_helper(name, method):
Expand All @@ -65,10 +71,16 @@ def is_helper(name, method):

class_routines = getmembers(cls, predicate=isroutine)

return [n[len(prefix):] for n, m in class_routines if is_helper(n, m)]
cls._helper_list = [n[len(prefix):]
for n, m in class_routines if is_helper(n, m)]

return cls._helper_list

@classmethod
def list_features(cls, *, exclude_fixed=True):
if cls._features_list is not None:
return cls._features_list

def include(name, method):
helper_features = [cls._get_feature_methodname(f)
for f in cls.list_helper_features()]
Expand All @@ -80,8 +92,10 @@ def include(name, method):

class_routines = getmembers(cls, predicate=isroutine)

return [n[len(cls._feature_prefix):]
for n, m in class_routines if include(n, m)]
cls._features_list = [n[len(cls._feature_prefix):]
for n, m in class_routines if include(n, m)]

return cls._features_list

@classmethod
def get_schema(cls, features=()):
Expand Down
45 changes: 21 additions & 24 deletions pdf2dataset/extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#!/bin/env python3

import io
import itertools as it
import logging
Expand Down Expand Up @@ -27,6 +25,25 @@
# TODO: Substitute most (all?) prints for logs


def get_pages_range(path, doc_bin=None):
    '''
    Return the 1-based page numbers of a PDF document.

    The document is read from ``doc_bin`` (raw bytes) when it is given and
    non-empty; otherwise it is opened from ``path`` in binary mode.

    Returns ``range(1, num_pages + 1)`` on success, or the list ``[-1]``
    as an error flag when pdftotext raises ``pdftotext.Error``.
    '''
    # pdftotext is used only to count pages; as noted by the original
    # author, it extracts lazily, so the text itself is not processed here.
    try:
        stream = io.BytesIO(doc_bin) if doc_bin else path.open('rb')

        with stream as pdf_file:
            page_count = len(pdftotext.PDF(pdf_file))
    except pdftotext.Error:
        # Flag value: callers treat page -1 as "document could not be read"
        return [-1]

    return range(1, page_count + 1)


class Extraction:
_path_pat = r'(?P<path>.+)_(?P<feature>(\w|-)+)_(?P<page>-?\d+)\.txt'

Expand Down Expand Up @@ -171,25 +188,6 @@ def _append_to_df(self, results):
schema=schema, append=exists, engine='pyarrow'
)

@staticmethod
def _get_pages_range(path, doc_bin=None):
    '''
    Compute the page numbers (starting at 1) available in a PDF.

    When ``doc_bin`` is truthy the PDF is parsed from those in-memory
    bytes, otherwise the file at ``path`` is opened for binary reading.

    Returns a ``range`` covering every page, or ``[-1]`` when the
    document fails to load with ``pdftotext.Error``.
    '''

    def count_pages(handle):
        # Only the length is requested; per the original note, pdftotext
        # extracts lazily, so no page text is processed at this point.
        return len(pdftotext.PDF(handle))

    try:
        if doc_bin:
            with io.BytesIO(doc_bin) as buffer:
                total = count_pages(buffer)
        else:
            with path.open('rb') as pdf_file:
                total = count_pages(pdf_file)
    except pdftotext.Error:
        # -1 is the sentinel page used to mark an unreadable document
        return [-1]

    return range(1, total + 1)

def _gen_tasks(self, docs):
'''
Returns tasks to be processed.
Expand All @@ -203,17 +201,16 @@ def _gen_tasks(self, docs):
tqdm(desc='Counting pages', unit='pages') as pbar:

results = pool.imap(
self._get_pages_range, docs, chunksize=chunksize
get_pages_range, docs, chunksize=chunksize
)

for path, range_pages in zip(docs, results):

new_tasks = [
self.task_class(path, p, **self.task_params)
for p in range_pages
]
tasks += new_tasks
pbar.update(len(new_tasks))
pbar.update(len(range_pages))

return tasks

Expand Down
4 changes: 2 additions & 2 deletions pdf2dataset/extraction_memory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path

from .extraction import Extraction
from .extraction import Extraction, get_pages_range


class ExtractionFromMemory(Extraction):
Expand Down Expand Up @@ -41,7 +41,7 @@ def uniform(task):
f"Document '{doc}' name must ends with '.pdf'"
)

range_pages = self._get_pages_range(doc, doc_bin=doc_bin)
range_pages = get_pages_range(doc, doc_bin=doc_bin)

# -1 specifically because of the flag used by get_pages_range
if page in range_pages and not page == -1:
Expand Down

0 comments on commit d14b8b7

Please sign in to comment.