diff --git a/pdf2dataset/extract_task.py b/pdf2dataset/extract_task.py index dd425e6..6f190d5 100644 --- a/pdf2dataset/extract_task.py +++ b/pdf2dataset/extract_task.py @@ -20,12 +20,13 @@ def decorator(feature_method): feature_method.is_feature = True feature_method.is_helper = is_helper - type_ = getattr(pa, pyarrow_type)(**type_args) + if pyarrow_type is not None: + type_ = getattr(pa, pyarrow_type)(**type_args) - if isinstance(type_, pa.DataType): - feature_method.pyarrow_type = type_ - else: - raise ValueError(f'Invalid PyArrow type {pyarrow_type}!') + if isinstance(type_, pa.DataType): + feature_method.pyarrow_type = type_ + else: + raise ValueError(f'Invalid PyArrow type {pyarrow_type}!') @wraps(feature_method) def inner(*args, **kwargs): @@ -45,13 +46,9 @@ def inner(*args, **kwargs): # TODO: Eventually, I'll make this a new lib class ExtractTask(ABC): - fixed_featues = ('path') + fixed_featues = ('path',) _feature_prefix = 'get_' # Optional - # Memoization - _helper_list = None - _features_list = {} - def __init__(self, path, file_bin=None, sel_features='all'): self.path = path self.file_bin = file_bin @@ -62,6 +59,11 @@ def __init__(self, path, file_bin=None, sel_features='all'): self._init_all_features() + def __init_subclass__(cls, **kwargs): + # Memoization + cls._helper_list = None + cls._features_list = {} + @classmethod def list_helper_features(cls): if cls._helper_list is not None: diff --git a/pyproject.toml b/pyproject.toml index 23f9492..e28b473 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pdf2dataset" -version = "0.5.2" +version = "0.5.3" readme = "README.md" description = "Easily convert a subdirectory with big volume of PDF documents into a dataset, supports extracting text and images" authors = ["Ícaro Pires "] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..0ec440e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,47 @@ +import pytest +from pathlib import Path 
+import pandas as pd + + +SAMPLES_DIR = Path('tests/samples') +SAMPLE_IMAGE = SAMPLES_DIR / 'single_page1_1.jpeg' +PARQUET_ENGINE = 'pyarrow' + + +@pytest.fixture +def complete_df(): + + def read_image(path, page): + if page == -1: + return None + + path = Path(path).with_suffix('') + image_name = f'{path}_{page}.jpeg' + image_path = Path(SAMPLES_DIR) / image_name + + with open(image_path, 'rb') as f: + image_bin = f.read() + + return image_bin + + rows = [ + ['path', 'page', 'text', 'error_bool'], + + ['multi_page1.pdf', 1, 'First page', False], + ['multi_page1.pdf', 2, 'Second page', False], + ['multi_page1.pdf', 3, 'Third page', False], + ['sub1/copy_multi_page1.pdf', 1, 'First page', False], + ['sub1/copy_multi_page1.pdf', 2, 'Second page', False], + ['sub1/copy_multi_page1.pdf', 3, 'Third page', False], + ['single_page1.pdf', 1, 'My beautiful sample!', False], + ['sub2/copy_single_page1.pdf', 1, 'My beautiful sample!', False], + ['invalid1.pdf', -1, None, True] + ] + + names = rows.pop(0) + expected_dict = {n: r for n, r in zip(names, zip(*rows))} + + df = pd.DataFrame(expected_dict) + df['image'] = df.apply(lambda row: read_image(row.path, row.page), axis=1) + + return df diff --git a/tests/test_extract_task.py b/tests/test_extract_task.py new file mode 100644 index 0000000..5a6e16c --- /dev/null +++ b/tests/test_extract_task.py @@ -0,0 +1,105 @@ +import pytest +from pathlib import Path + +import pyarrow as pa +import numpy as np +from PIL import Image +from pdf2dataset import ( + PdfExtractTask, + extract, + feature, + image_to_bytes, + image_from_bytes, +) + +from .conftest import SAMPLES_DIR, SAMPLE_IMAGE + + +class MyCustomTask(PdfExtractTask): + + @feature('bool_') + def get_is_page_even(self): + return self.page % 2 == 0 + + @feature(is_helper=True) + def get_doc_first_bytes(self): + return self.file_bin[:10] + + @feature('list_', value_type=pa.string()) + def get_list(self): + return ['E0', 'E1', 'My super string!'] + + @feature('string', 
exceptions=[ValueError]) + def get_wrong(self): + raise ValueError("There was a problem!") + + +@pytest.fixture +def image(): + return Image.open(SAMPLE_IMAGE) + + +@pytest.fixture +def image_bytes(): + with open(SAMPLE_IMAGE, 'rb') as f: + bytes_ = f.read() + + return bytes_ + + +def test_imagefrombytes(image, image_bytes): + + assert image_from_bytes(image_bytes) == image + + +def test_imagetobytes(image, image_bytes): + # PNG because JPEG is lossy and changes pixel values + calculated = image_from_bytes(image_to_bytes(image, 'png')) + + assert (np.array(calculated) == np.array(image)).all() + + +def test_list_features(): + inherited_features = PdfExtractTask.list_features() + custom_features = MyCustomTask.list_features() + + # 3 because that is the number of custom (non-helper) features defined + expected_num_features = len(inherited_features) + 3 + assert expected_num_features == len(custom_features) + + assert set(inherited_features) < set(custom_features) + + assert set(['is_page_even', 'wrong', 'list']) < set(custom_features) + + +def test_list_helper_features(): + inherited_features = PdfExtractTask.list_helper_features() + custom_features = MyCustomTask.list_helper_features() + + # 1 because I've defined one custom helper feature + expected_num_features = len(inherited_features) + 1 + assert expected_num_features == len(custom_features) + + assert set(inherited_features) < set(custom_features) + + assert set(['doc_first_bytes']) < set(custom_features) + + +def test_saving_to_disk(tmp_path): + out_file = tmp_path / 'my_df.parquet.gzip' + extract(SAMPLES_DIR, out_file, task_class=MyCustomTask) + + assert Path(out_file).exists() + + +def test_columns_present(): + df = extract('tests/samples', small=True, task_class=MyCustomTask) + assert set(MyCustomTask.list_features()) < set(df.columns) + + +def test_error_recorded(): + df = extract('tests/samples', small=True, task_class=MyCustomTask) + error_feature, error_msg = 'wrong', 'There was a problem' + + assert error_msg in 
df.iloc[0].error + assert f'{error_feature}:' in df.iloc[0].error diff --git a/tests/test_extraction.py b/tests/test_extraction.py index 72c37ca..7732f6e 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -4,76 +4,17 @@ import pytest import pandas as pd -import numpy as np from PIL import Image from pdf2dataset import ( ExtractionFromMemory, PdfExtractTask, extract, - extract_text, - image_to_bytes, - image_from_bytes, + extract_text ) from .testing_dataframe import check_and_compare - - -SAMPLES_DIR = Path('tests/samples') -TEST_IMAGE = SAMPLES_DIR / 'single_page1_1.jpeg' -PARQUET_ENGINE = 'pyarrow' - - -@pytest.fixture -def expected_all(): - - def read_image(path, page): - if page == -1: - return None - - path = Path(path).with_suffix('') - image_name = f'{path}_{page}.jpeg' - image_path = Path(SAMPLES_DIR) / image_name - - with open(image_path, 'rb') as f: - image_bin = f.read() - - return image_bin - - rows = [ - ['path', 'page', 'text', 'error_bool'], - - ['multi_page1.pdf', 1, 'First page', False], - ['multi_page1.pdf', 2, 'Second page', False], - ['multi_page1.pdf', 3, 'Third page', False], - ['sub1/copy_multi_page1.pdf', 1, 'First page', False], - ['sub1/copy_multi_page1.pdf', 2, 'Second page', False], - ['sub1/copy_multi_page1.pdf', 3, 'Third page', False], - ['single_page1.pdf', 1, 'My beautiful sample!', False], - ['sub2/copy_single_page1.pdf', 1, 'My beautiful sample!', False], - ['invalid1.pdf', -1, None, True] - ] - - names = rows.pop(0) - expected_dict = {n: r for n, r in zip(names, zip(*rows))} - - df = pd.DataFrame(expected_dict) - df['image'] = df.apply(lambda row: read_image(row.path, row.page), axis=1) - - return df - - -@pytest.fixture -def image(): - return Image.open(TEST_IMAGE) - - -@pytest.fixture -def image_bytes(): - with open(TEST_IMAGE, 'rb') as f: - bytes_ = f.read() - - return bytes_ +from .conftest import SAMPLES_DIR, PARQUET_ENGINE class TestExtractionCore: @@ -82,7 +23,7 @@ class TestExtractionCore: True, 
False, )) - def test_extraction_big(self, tmp_path, is_ocr, expected_all): + def test_extraction_big(self, tmp_path, is_ocr, complete_df): result_path = tmp_path / 'result.parquet.gzip' extract(SAMPLES_DIR, result_path, @@ -93,9 +34,9 @@ def test_extraction_big(self, tmp_path, is_ocr, expected_all): if is_ocr: df['text'] = df['text'].str.strip() - check_and_compare(df, expected_all, is_ocr=is_ocr) + check_and_compare(df, complete_df, is_ocr=is_ocr) - def test_append_result(self, tmp_path, expected_all): + def test_append_result(self, tmp_path, complete_df): result_path = tmp_path / 'result.parquet.gzip' extract(SAMPLES_DIR, result_path, saving_interval=1, features='all') @@ -103,9 +44,9 @@ def test_append_result(self, tmp_path, expected_all): # Small 'chunk_df_size' to append to result multiple times df = pd.read_parquet(result_path, engine=PARQUET_ENGINE) - check_and_compare(df, expected_all) + check_and_compare(df, complete_df) - def test_passing_paths_list(self, tmp_path, expected_all): + def test_passing_paths_list(self, tmp_path, complete_df): result_path = tmp_path / 'result.parquet.gzip' files_list = Path(SAMPLES_DIR).rglob('*.pdf') @@ -114,11 +55,11 @@ def test_passing_paths_list(self, tmp_path, expected_all): df = extract(files_list, result_path, small=True) - # Paths will be relative to pwd, so adapting expected_all - expected_all['path'] = expected_all['path'].apply( + # Paths will be relative to pwd, so adapting complete_df + complete_df['path'] = complete_df['path'].apply( lambda p: str(SAMPLES_DIR / p) ) - check_and_compare(df, expected_all) + check_and_compare(df, complete_df) def test_filter_processed(self, tmp_path): with open(SAMPLES_DIR / 'single_page1.pdf', 'rb') as f: @@ -166,13 +107,13 @@ class TestExtractionSmall: True, False, )) - def test_extraction_small(self, is_ocr, expected_all): + def test_extraction_small(self, is_ocr, complete_df): df = extract(SAMPLES_DIR, small=True, ocr_lang='eng', ocr=is_ocr) if is_ocr: df['text'] = 
df['text'].str.strip() - check_and_compare(df, expected_all, is_ocr=is_ocr) + check_and_compare(df, complete_df, is_ocr=is_ocr) def test_return_list(self): def sort(doc): @@ -226,30 +167,30 @@ def hash_images(doc): class TestParams: - def test_features_as_list(self, expected_all): + def test_features_as_list(self, complete_df): df = extract(SAMPLES_DIR, small=True, features=['text', 'image']) - check_and_compare(df, expected_all) + check_and_compare(df, complete_df) @pytest.mark.parametrize('excluded', [ 'text', 'image', ]) - def test_exclude_feature(self, excluded, expected_all): + def test_exclude_feature(self, excluded, complete_df): features = PdfExtractTask.list_features() features.remove(excluded) df = extract(SAMPLES_DIR, small=True, features=features) - columns = list(expected_all.columns) + columns = list(complete_df.columns) columns.remove(excluded) - check_and_compare(df, expected_all[columns]) + check_and_compare(df, complete_df[columns]) - def test_empty_feature(self, expected_all): + def test_empty_feature(self, complete_df): df = extract(SAMPLES_DIR, small=True, features='') columns = list(PdfExtractTask.fixed_featues) + ['error_bool'] - check_and_compare(df, expected_all[columns]) + check_and_compare(df, complete_df[columns]) @pytest.mark.parametrize('size', ( ('10x10'), @@ -285,7 +226,7 @@ def test_image_format(self, format_): (200, True), (2000, False), )) - def test_low_ocr_image(self, expected_all, ocr_image_size, is_low): + def test_low_ocr_image(self, complete_df, ocr_image_size, is_low): df = extract_text( SAMPLES_DIR, small=True, ocr=True, ocr_image_size=ocr_image_size, ocr_lang='eng' @@ -294,7 +235,7 @@ def test_low_ocr_image(self, expected_all, ocr_image_size, is_low): df = df.dropna(subset=['text']) serie = df.iloc[0] - expected = expected_all.dropna(subset=['text']) + expected = complete_df.dropna(subset=['text']) expected = expected[(expected.path == serie.path) & (expected.page == serie.page)] @@ -304,51 +245,3 @@ def 
test_low_ocr_image(self, expected_all, ocr_image_size, is_low): assert serie.text.strip() != expected_serie.text.strip() else: assert serie.text.strip() == expected_serie.text.strip() - - def test_imagefrombytes(self, image, image_bytes): - - assert image_from_bytes(image_bytes) == image - - def test_imagetobytes(self, image, image_bytes): - # png because jpeg change pixel values - calculated = image_from_bytes(image_to_bytes(image, 'png')) - - assert (np.array(calculated) == np.array(image)).all() - - -class TestExtractionFromMemory: - - @pytest.mark.parametrize('small', ( - True, - False, - )) - def test_passing_tasks(self, tmp_path, small): - with open(SAMPLES_DIR / 'single_page1.pdf', 'rb') as f: - pdf1_bin = f.read() - - with open(SAMPLES_DIR / 'multi_page1.pdf', 'rb') as f: - pdf2_bin = f.read() - - tasks = [ - ('doc1.pdf', pdf1_bin), # All pages - ('2.pdf', pdf2_bin, 2), # Just page 2 - ('pdf2.pdf', pdf2_bin, 3), # Just page 3 - ] - - expected_dict = { - 'path': ['pdf2.pdf', '2.pdf', 'doc1.pdf'], - 'page': [3, 2, 1], - 'text': ['Third page', 'Second page', 'My beautiful sample!'], - 'error': [None, None, None], - } - expected = pd.DataFrame(expected_dict) - - if small: - df = extract_text(tasks=tasks, small=small) - else: - result_path = tmp_path / 'result.parquet.gzip' - extract_text(tasks, result_path) - - df = pd.read_parquet(result_path, engine=PARQUET_ENGINE) - - check_and_compare(df, expected, list(expected.columns)) diff --git a/tests/test_extraction_memory.py b/tests/test_extraction_memory.py new file mode 100644 index 0000000..c18130c --- /dev/null +++ b/tests/test_extraction_memory.py @@ -0,0 +1,42 @@ +import pytest +import pandas as pd +from pdf2dataset import extract_text + +from .testing_dataframe import check_and_compare +from .conftest import SAMPLES_DIR, PARQUET_ENGINE + + +@pytest.mark.parametrize('small', ( + True, + False, +)) +def test_passing_tasks(tmp_path, small): + with open(SAMPLES_DIR / 'single_page1.pdf', 'rb') as f: + pdf1_bin = 
f.read() + + with open(SAMPLES_DIR / 'multi_page1.pdf', 'rb') as f: + pdf2_bin = f.read() + + tasks = [ + ('doc1.pdf', pdf1_bin), # All pages + ('2.pdf', pdf2_bin, 2), # Just page 2 + ('pdf2.pdf', pdf2_bin, 3), # Just page 3 + ] + + expected_dict = { + 'path': ['pdf2.pdf', '2.pdf', 'doc1.pdf'], + 'page': [3, 2, 1], + 'text': ['Third page', 'Second page', 'My beautiful sample!'], + 'error': [None, None, None], + } + expected = pd.DataFrame(expected_dict) + + if small: + df = extract_text(tasks=tasks, small=small) + else: + result_path = tmp_path / 'result.parquet.gzip' + extract_text(tasks, result_path) + + df = pd.read_parquet(result_path, engine=PARQUET_ENGINE) + + check_and_compare(df, expected, list(expected.columns))