Merge pull request #23 from icaropires/fix-custom-task

Fix custom task mechanism
icaropires · Sep 13, 2020 · b070d65 · b070d65
2 parents bfde2d1 + 900231b
commit b070d65
Show file tree

Hide file tree

Showing 6 changed files with 228 additions and 139 deletions.
diff --git a/pdf2dataset/extract_task.py b/pdf2dataset/extract_task.py
@@ -20,12 +20,13 @@ def decorator(feature_method):
         feature_method.is_feature = True
         feature_method.is_helper = is_helper
 
-        type_ = getattr(pa, pyarrow_type)(**type_args)
+        if pyarrow_type is not None:
+            type_ = getattr(pa, pyarrow_type)(**type_args)
 
-        if isinstance(type_, pa.DataType):
-            feature_method.pyarrow_type = type_
-        else:
-            raise ValueError(f'Invalid PyArrow type {pyarrow_type}!')
+            if isinstance(type_, pa.DataType):
+                feature_method.pyarrow_type = type_
+            else:
+                raise ValueError(f'Invalid PyArrow type {pyarrow_type}!')
 
         @wraps(feature_method)
         def inner(*args, **kwargs):
@@ -45,13 +46,9 @@ def inner(*args, **kwargs):
 # TODO: Eventually, I'll make this a new lib
 class ExtractTask(ABC):
 
-    fixed_featues = ('path')
+    fixed_featues = ('path',)
     _feature_prefix = 'get_'  # Optional
 
-    # Memoization
-    _helper_list = None
-    _features_list = {}
-
     def __init__(self, path, file_bin=None, sel_features='all'):
         self.path = path
         self.file_bin = file_bin
@@ -62,6 +59,11 @@ def __init__(self, path, file_bin=None, sel_features='all'):
 
         self._init_all_features()
 
+    def __init_subclass__(cls, **kwargs):
+        # Memoization: reset the caches so each subclass gets its own
+        cls._helper_list = None
+        cls._features_list = {}
+
     @classmethod
     def list_helper_features(cls):
         if cls._helper_list is not None:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdf2dataset"
-version = "0.5.2"
+version = "0.5.3"
 readme = "README.md"
 description = "Easily convert a subdirectory with big volume of PDF documents into a dataset, supports extracting text and images"
 authors = ["Ícaro Pires <[email protected]>"]

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,47 @@
+import pytest
+from pathlib import Path
+import pandas as pd
+
+
+SAMPLES_DIR = Path('tests/samples')
+SAMPLE_IMAGE = SAMPLES_DIR / 'single_page1_1.jpeg'
+PARQUET_ENGINE = 'pyarrow'
+
+
+@pytest.fixture
+def complete_df():
+
+    def read_image(path, page):
+        if page == -1:
+            return None
+
+        path = Path(path).with_suffix('')
+        image_name = f'{path}_{page}.jpeg'
+        image_path = Path(SAMPLES_DIR) / image_name
+
+        with open(image_path, 'rb') as f:
+            image_bin = f.read()
+
+        return image_bin
+
+    rows = [
+        ['path', 'page', 'text', 'error_bool'],
+
+        ['multi_page1.pdf', 1, 'First page', False],
+        ['multi_page1.pdf', 2, 'Second page', False],
+        ['multi_page1.pdf', 3, 'Third page', False],
+        ['sub1/copy_multi_page1.pdf', 1, 'First page', False],
+        ['sub1/copy_multi_page1.pdf', 2, 'Second page', False],
+        ['sub1/copy_multi_page1.pdf', 3, 'Third page', False],
+        ['single_page1.pdf', 1, 'My beautiful sample!', False],
+        ['sub2/copy_single_page1.pdf', 1, 'My beautiful sample!', False],
+        ['invalid1.pdf', -1, None, True]
+    ]
+
+    names = rows.pop(0)
+    expected_dict = {n: r for n, r in zip(names, zip(*rows))}
+
+    df = pd.DataFrame(expected_dict)
+    df['image'] = df.apply(lambda row: read_image(row.path, row.page), axis=1)
+
+    return df
diff --git a/tests/test_extract_task.py b/tests/test_extract_task.py
@@ -0,0 +1,105 @@
+import pytest
+from pathlib import Path
+
+import pyarrow as pa
+import numpy as np
+from PIL import Image
+from pdf2dataset import (
+    PdfExtractTask,
+    extract,
+    feature,
+    image_to_bytes,
+    image_from_bytes,
+)
+
+from .conftest import SAMPLES_DIR, SAMPLE_IMAGE
+
+
+class MyCustomTask(PdfExtractTask):
+
+    @feature('bool_')
+    def get_is_page_even(self):
+        return self.page % 2 == 0
+
+    @feature(is_helper=True)
+    def get_doc_first_bytes(self):
+        return self.file_bin[:10]
+
+    @feature('list_', value_type=pa.string())
+    def get_list(self):
+        return ['E0', 'E1', 'My super string!']
+
+    @feature('string', exceptions=[ValueError])
+    def get_wrong(self):
+        raise ValueError("There was a problem!")
+
+
+@pytest.fixture
+def image():
+    return Image.open(SAMPLE_IMAGE)
+
+
+@pytest.fixture
+def image_bytes():
+    with open(SAMPLE_IMAGE, 'rb') as f:
+        bytes_ = f.read()
+
+    return bytes_
+
+
+def test_imagefrombytes(image, image_bytes):
+
+    assert image_from_bytes(image_bytes) == image
+
+
+def test_imagetobytes(image, image_bytes):
+    # Use PNG because JPEG encoding changes the pixel values
+    calculated = image_from_bytes(image_to_bytes(image, 'png'))
+
+    assert (np.array(calculated) == np.array(image)).all()
+
+
+def test_list_features():
+    inherited_features = PdfExtractTask.list_features()
+    custom_features = MyCustomTask.list_features()
+
+    # 3 because MyCustomTask defines three custom (non-helper) features
+    expected_num_features = len(inherited_features) + 3
+    assert expected_num_features == len(custom_features)
+
+    assert set(inherited_features) < set(custom_features)
+
+    assert set(['is_page_even', 'wrong', 'list']) < set(custom_features)
+
+
+def test_list_helper_features():
+    inherited_features = PdfExtractTask.list_helper_features()
+    custom_features = MyCustomTask.list_helper_features()
+
+    # 1 because MyCustomTask defines one custom helper feature
+    expected_num_features = len(inherited_features) + 1
+    assert expected_num_features == len(custom_features)
+
+    assert set(inherited_features) < set(custom_features)
+
+    assert set(['doc_first_bytes']) < set(custom_features)
+
+
+def test_saving_to_disk(tmp_path):
+    out_file = tmp_path / 'my_df.parquet.gzip'
+    extract(SAMPLES_DIR, out_file, task_class=MyCustomTask)
+
+    assert Path(out_file).exists()
+
+
+def test_columns_present():
+    df = extract('tests/samples', small=True, task_class=MyCustomTask)
+    assert set(MyCustomTask.list_features()) < set(df.columns)
+
+
+def test_error_recorded():
+    df = extract('tests/samples', small=True, task_class=MyCustomTask)
+    error_feature, error_msg = 'wrong', 'There was a problem'
+
+    assert error_msg in df.iloc[0].error
+    assert f'{error_feature}:' in df.iloc[0].error