Skip to content

Commit

Permalink
Merge pull request #23 from icaropires/fix-custom-task
Browse files Browse the repository at this point in the history
Fix custom task mechanism
  • Loading branch information
icaropires authored Sep 13, 2020
2 parents bfde2d1 + 900231b commit b070d65
Show file tree
Hide file tree
Showing 6 changed files with 228 additions and 139 deletions.
22 changes: 12 additions & 10 deletions pdf2dataset/extract_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ def decorator(feature_method):
feature_method.is_feature = True
feature_method.is_helper = is_helper

type_ = getattr(pa, pyarrow_type)(**type_args)
if pyarrow_type is not None:
type_ = getattr(pa, pyarrow_type)(**type_args)

if isinstance(type_, pa.DataType):
feature_method.pyarrow_type = type_
else:
raise ValueError(f'Invalid PyArrow type {pyarrow_type}!')
if isinstance(type_, pa.DataType):
feature_method.pyarrow_type = type_
else:
raise ValueError(f'Invalid PyArrow type {pyarrow_type}!')

@wraps(feature_method)
def inner(*args, **kwargs):
Expand All @@ -45,13 +46,9 @@ def inner(*args, **kwargs):
# TODO: Eventually, I'll make this a new lib
class ExtractTask(ABC):

fixed_featues = ('path')
fixed_featues = ('path',)
_feature_prefix = 'get_' # Optional

# Memoization
_helper_list = None
_features_list = {}

def __init__(self, path, file_bin=None, sel_features='all'):
self.path = path
self.file_bin = file_bin
Expand All @@ -62,6 +59,11 @@ def __init__(self, path, file_bin=None, sel_features='all'):

self._init_all_features()

def __init_subclass__(cls, **kwargs):
    """Give each subclass its own, initially empty, memoization caches.

    The caches are assigned on ``cls`` (the class being created), so a
    subclass starts with ``_helper_list = None`` and an empty
    ``_features_list`` instead of seeing values stored on a parent class.

    Args:
        **kwargs: Class keyword arguments, forwarded unchanged to the next
            ``__init_subclass__`` in the MRO.
    """
    # Cooperative hook: forward to the next class in the MRO so mixins that
    # define their own __init_subclass__ still run, and so unexpected class
    # keyword arguments are reported instead of being silently dropped.
    super().__init_subclass__(**kwargs)

    # Memoization
    cls._helper_list = None
    cls._features_list = {}

@classmethod
def list_helper_features(cls):
if cls._helper_list is not None:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdf2dataset"
version = "0.5.2"
version = "0.5.3"
readme = "README.md"
description = "Easily convert a subdirectory with big volume of PDF documents into a dataset, supports extracting text and images"
authors = ["Ícaro Pires <[email protected]>"]
Expand Down
47 changes: 47 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest
from pathlib import Path
import pandas as pd


SAMPLES_DIR = Path('tests/samples')
SAMPLE_IMAGE = SAMPLES_DIR / 'single_page1_1.jpeg'
PARQUET_ENGINE = 'pyarrow'


@pytest.fixture
def complete_df():
    """Build the reference DataFrame for the sample documents.

    The frame has the columns ``path``, ``page``, ``text`` and
    ``error_bool`` plus an ``image`` column holding the raw bytes of the
    matching sample JPEG (``None`` for rows whose page is ``-1``).
    """

    def read_image(path, page):
        # Rows with page == -1 have no image to load
        if page == -1:
            return None

        # '<path without suffix>_<page>.jpeg', resolved under SAMPLES_DIR
        stem = Path(path).with_suffix('')
        image_path = Path(SAMPLES_DIR) / f'{stem}_{page}.jpeg'

        return image_path.read_bytes()

    names = ['path', 'page', 'text', 'error_bool']

    rows = [
        ['multi_page1.pdf', 1, 'First page', False],
        ['multi_page1.pdf', 2, 'Second page', False],
        ['multi_page1.pdf', 3, 'Third page', False],
        ['sub1/copy_multi_page1.pdf', 1, 'First page', False],
        ['sub1/copy_multi_page1.pdf', 2, 'Second page', False],
        ['sub1/copy_multi_page1.pdf', 3, 'Third page', False],
        ['single_page1.pdf', 1, 'My beautiful sample!', False],
        ['sub2/copy_single_page1.pdf', 1, 'My beautiful sample!', False],
        ['invalid1.pdf', -1, None, True],
    ]

    # Transpose the row-wise table into one tuple of values per column
    expected_dict = dict(zip(names, zip(*rows)))

    df = pd.DataFrame(expected_dict)
    df['image'] = df.apply(lambda row: read_image(row.path, row.page), axis=1)

    return df
105 changes: 105 additions & 0 deletions tests/test_extract_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import pytest
from pathlib import Path

import pyarrow as pa
import numpy as np
from PIL import Image
from pdf2dataset import (
PdfExtractTask,
extract,
feature,
image_to_bytes,
image_from_bytes,
)

from .conftest import SAMPLES_DIR, SAMPLE_IMAGE


class MyCustomTask(PdfExtractTask):
    """Custom task used by the tests in this module.

    Adds three regular features and one helper feature on top of
    ``PdfExtractTask``.
    """

    @feature('bool_')
    def get_is_page_even(self):
        """Tell whether ``self.page`` is an even number."""
        is_even = self.page % 2 == 0
        return is_even

    @feature(is_helper=True)
    def get_doc_first_bytes(self):
        """Helper feature: the first 10 items of ``self.file_bin``."""
        head = self.file_bin[:10]
        return head

    @feature('list_', value_type=pa.string())
    def get_list(self):
        """Return a fixed list of three strings."""
        items = ['E0', 'E1', 'My super string!']
        return items

    @feature('string', exceptions=[ValueError])
    def get_wrong(self):
        """Always fail with ``ValueError``.

        NOTE(review): the ``exceptions`` argument presumably makes the
        extraction record this error instead of aborting -- confirm
        against the ``feature`` decorator.
        """
        raise ValueError("There was a problem!")


@pytest.fixture
def image():
    """Open the sample image file with PIL and return the image object."""
    sample = Image.open(SAMPLE_IMAGE)
    return sample


@pytest.fixture
def image_bytes():
    """Return the raw binary content of the sample image file."""
    return Path(SAMPLE_IMAGE).read_bytes()


def test_imagefrombytes(image, image_bytes):
    """Decoding the sample's raw bytes must equal the reference image."""
    decoded = image_from_bytes(image_bytes)

    assert decoded == image


def test_imagetobytes(image, image_bytes):
    """An image encoded to bytes and decoded again keeps every pixel."""
    # PNG is used for the round trip because JPEG encoding changes
    # pixel values
    encoded = image_to_bytes(image, 'png')
    calculated = image_from_bytes(encoded)

    pixels_match = np.array(calculated) == np.array(image)
    assert pixels_match.all()


def test_list_features():
    """The custom task lists the inherited features plus its own three."""
    base_features = PdfExtractTask.list_features()
    task_features = MyCustomTask.list_features()
    task_set = set(task_features)

    # 3 is the number of custom (non-helper) features defined on
    # MyCustomTask
    assert len(base_features) + 3 == len(task_features)

    # Everything inherited is still listed, and the custom task has more
    assert set(base_features) < task_set

    assert {'is_page_even', 'wrong', 'list'} < task_set


def test_list_helper_features():
    """The custom task lists the inherited helpers plus its own one."""
    inherited_features = PdfExtractTask.list_helper_features()
    custom_features = MyCustomTask.list_helper_features()

    # 1 because MyCustomTask defines exactly one custom helper feature
    expected_num_features = len(inherited_features) + 1
    assert expected_num_features == len(custom_features)

    # Proper subset is intended here: the custom task must add something
    assert set(inherited_features) < set(custom_features)

    # Plain membership check. A proper-subset comparison (<) would fail
    # whenever the parent task defines no helper features, because the two
    # one-element sets would then be equal.
    assert {'doc_first_bytes'} <= set(custom_features)


def test_saving_to_disk(tmp_path):
    """Extracting with an output path must create that file on disk."""
    destination = tmp_path / 'my_df.parquet.gzip'

    extract(SAMPLES_DIR, destination, task_class=MyCustomTask)

    saved = Path(destination)
    assert saved.exists()


def test_columns_present():
    """Every feature listed by the custom task appears as a column."""
    df = extract('tests/samples', small=True, task_class=MyCustomTask)

    expected = set(MyCustomTask.list_features())
    present = set(df.columns)

    # Proper subset: the frame must also carry at least one extra column
    assert expected < present


def test_error_recorded():
    """The failing feature's name and message show up in the first row."""
    error_feature, error_msg = 'wrong', 'There was a problem'

    df = extract('tests/samples', small=True, task_class=MyCustomTask)
    first_row_error = df.iloc[0].error

    assert error_msg in first_row_error
    assert f'{error_feature}:' in first_row_error
Loading

0 comments on commit b070d65

Please sign in to comment.