Fix performance issues caused by 0.4.0

icaropires · Aug 24, 2020 · d14b8b7 · d14b8b7
1 parent 289c594
commit d14b8b7
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 29 deletions.
diff --git a/pdf2dataset/extract_task.py b/pdf2dataset/extract_task.py
@@ -45,6 +45,9 @@ class ExtractTask(ABC):
     fixed_featues = ('path')
     _feature_prefix = 'get_'  # Optional
 
+    _helper_list = None
+    _features_list = None
+
     def __init__(self, path, file_bin=None, sel_features='all'):
         self.path = path
         self.file_bin = file_bin
@@ -57,6 +60,9 @@ def __init__(self, path, file_bin=None, sel_features='all'):
 
     @classmethod
     def list_helper_features(cls):
+        if cls._helper_list is not None:
+            return cls._helper_list
+
         prefix = cls._feature_prefix
 
         def is_helper(name, method):
@@ -65,10 +71,16 @@ def is_helper(name, method):
 
         class_routines = getmembers(cls, predicate=isroutine)
 
-        return [n[len(prefix):] for n, m in class_routines if is_helper(n, m)]
+        cls._helper_list = [n[len(prefix):]
+                            for n, m in class_routines if is_helper(n, m)]
+
+        return cls._helper_list
 
     @classmethod
     def list_features(cls, *, exclude_fixed=True):
+        if cls._features_list is not None:
+            return cls._features_list
+
         def include(name, method):
             helper_features = [cls._get_feature_methodname(f)
                                for f in cls.list_helper_features()]
@@ -80,8 +92,10 @@ def include(name, method):
 
         class_routines = getmembers(cls, predicate=isroutine)
 
-        return [n[len(cls._feature_prefix):]
-                for n, m in class_routines if include(n, m)]
+        cls._features_list = [n[len(cls._feature_prefix):]
+                              for n, m in class_routines if include(n, m)]
+
+        return cls._features_list
 
     @classmethod
     def get_schema(cls, features=()):

diff --git a/pdf2dataset/extraction.py b/pdf2dataset/extraction.py
@@ -1,5 +1,3 @@
-#!/bin/env python3
-
 import io
 import itertools as it
 import logging
@@ -27,6 +25,25 @@
 # TODO: Substitute most (all?) prints for logs
 
 
+def get_pages_range(path, doc_bin=None):
+    # Using pdftotext to get num_pages because it's the best way I know of.
+    # pdftotext extracts lazily, so this won't process the text
+
+    try:
+        if not doc_bin:
+            with path.open('rb') as f:
+                num_pages = len(pdftotext.PDF(f))
+        else:
+            with io.BytesIO(doc_bin) as f:
+                num_pages = len(pdftotext.PDF(f))
+
+        pages = range(1, num_pages+1)
+    except pdftotext.Error:
+        pages = [-1]
+
+    return pages
+
+
 class Extraction:
     _path_pat = r'(?P<path>.+)_(?P<feature>(\w|-)+)_(?P<page>-?\d+)\.txt'
 
@@ -171,25 +188,6 @@ def _append_to_df(self, results):
                 schema=schema, append=exists, engine='pyarrow'
             )
 
-    @staticmethod
-    def _get_pages_range(path, doc_bin=None):
-        # Using pdftotext to get num_pages because it's the best way I know
-        # pdftotext extracts lazy, so this won't process the text
-
-        try:
-            if not doc_bin:
-                with path.open('rb') as f:
-                    num_pages = len(pdftotext.PDF(f))
-            else:
-                with io.BytesIO(doc_bin) as f:
-                    num_pages = len(pdftotext.PDF(f))
-
-            pages = range(1, num_pages+1)
-        except pdftotext.Error:
-            pages = [-1]
-
-        return pages
-
     def _gen_tasks(self, docs):
         '''
         Returns tasks to be processed.
@@ -203,17 +201,16 @@ def _gen_tasks(self, docs):
                 tqdm(desc='Counting pages', unit='pages') as pbar:
 
             results = pool.imap(
-                self._get_pages_range, docs, chunksize=chunksize
+                get_pages_range, docs, chunksize=chunksize
             )
 
             for path, range_pages in zip(docs, results):
-
                 new_tasks = [
                     self.task_class(path, p, **self.task_params)
                     for p in range_pages
                 ]
                 tasks += new_tasks
-                pbar.update(len(new_tasks))
+                pbar.update(len(range_pages))
 
         return tasks
 

diff --git a/pdf2dataset/extraction_memory.py b/pdf2dataset/extraction_memory.py
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from .extraction import Extraction
+from .extraction import Extraction, get_pages_range
 
 
 class ExtractionFromMemory(Extraction):
@@ -41,7 +41,7 @@ def uniform(task):
                     f"Document '{doc}' name must ends with '.pdf'"
                 )
 
-            range_pages = self._get_pages_range(doc, doc_bin=doc_bin)
+            range_pages = get_pages_range(doc, doc_bin=doc_bin)
 
             # -1 specifically because of the flag used by _get_pages_range
             if page in range_pages and not page == -1: