-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #23 from icaropires/fix-custom-task
Fix custom task mechanism
- Loading branch information
Showing
6 changed files
with
228 additions
and
139 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "pdf2dataset" | ||
version = "0.5.2" | ||
version = "0.5.3" | ||
readme = "README.md" | ||
description = "Easily convert a subdirectory with big volume of PDF documents into a dataset, supports extracting text and images" | ||
authors = ["Ícaro Pires <[email protected]>"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import pytest | ||
from pathlib import Path | ||
import pandas as pd | ||
|
||
|
||
# Directory with the sample files used by the tests (a relative path, so it
# is resolved against the directory the tests are run from).
SAMPLES_DIR = Path('tests/samples')
# One sample image stored inside SAMPLES_DIR.
SAMPLE_IMAGE = SAMPLES_DIR.joinpath('single_page1_1.jpeg')
# Parquet engine name; not referenced by the code visible in this file.
PARQUET_ENGINE = 'pyarrow'
|
||
|
||
@pytest.fixture
def complete_df():
    """Build the reference DataFrame for the sample documents.

    The frame has one row per page with the columns ``path``, ``page``,
    ``text`` and ``error_bool``, plus an ``image`` column holding the raw
    bytes of the matching JPEG found under ``SAMPLES_DIR`` (``None`` for
    rows whose ``page`` is ``-1``).
    """

    def read_image(path, page):
        """Return the bytes of ``<path without suffix>_<page>.jpeg``."""
        # A page of -1 marks a row that has no image to load.
        if page == -1:
            return None

        stem = Path(path).with_suffix('')
        image_path = Path(SAMPLES_DIR) / f'{stem}_{page}.jpeg'

        return image_path.read_bytes()

    # The first entry is the header; every following entry is one row.
    names, *rows = [
        ['path', 'page', 'text', 'error_bool'],

        ['multi_page1.pdf', 1, 'First page', False],
        ['multi_page1.pdf', 2, 'Second page', False],
        ['multi_page1.pdf', 3, 'Third page', False],
        ['sub1/copy_multi_page1.pdf', 1, 'First page', False],
        ['sub1/copy_multi_page1.pdf', 2, 'Second page', False],
        ['sub1/copy_multi_page1.pdf', 3, 'Third page', False],
        ['single_page1.pdf', 1, 'My beautiful sample!', False],
        ['sub2/copy_single_page1.pdf', 1, 'My beautiful sample!', False],
        ['invalid1.pdf', -1, None, True]
    ]

    # Transpose the rows so each column name maps to a tuple of its values.
    expected_dict = dict(zip(names, zip(*rows)))

    df = pd.DataFrame(expected_dict)
    df['image'] = [
        read_image(path, page)
        for path, page in zip(df['path'], df['page'])
    ]

    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
import pytest | ||
from pathlib import Path | ||
|
||
import pyarrow as pa | ||
import numpy as np | ||
from PIL import Image | ||
from pdf2dataset import ( | ||
PdfExtractTask, | ||
extract, | ||
feature, | ||
image_to_bytes, | ||
image_from_bytes, | ||
) | ||
|
||
from .conftest import SAMPLES_DIR, SAMPLE_IMAGE | ||
|
||
|
||
class MyCustomTask(PdfExtractTask):
    """``PdfExtractTask`` subclass declaring extra features for these tests.

    It adds three regular features (``is_page_even``, ``list`` and
    ``wrong``) and one helper feature (``doc_first_bytes``).
    """

    @feature('bool_')
    def get_is_page_even(self):
        """Tell whether ``self.page`` is an even number."""
        remainder = self.page % 2
        return remainder == 0

    @feature(is_helper=True)
    def get_doc_first_bytes(self):
        """Return the first ten items of ``self.file_bin``."""
        head = self.file_bin[:10]
        return head

    @feature('list_', value_type=pa.string())
    def get_list(self):
        """Return a fixed list of three strings."""
        values = ['E0', 'E1', 'My super string!']
        return values

    @feature('string', exceptions=[ValueError])
    def get_wrong(self):
        """Always fail with a ``ValueError``.

        ``ValueError`` is listed in the decorator's ``exceptions``;
        presumably that makes the task record the failure instead of
        propagating it -- confirm against the ``feature`` decorator.
        """
        raise ValueError("There was a problem!")
|
||
|
||
@pytest.fixture
def image():
    """Provide the sample image as an opened PIL image.

    ``Image.open`` keeps the underlying file open, so the image is yielded
    from inside a ``with`` block: it stays usable for the whole test and
    its file handle is closed during fixture teardown instead of leaking.
    """
    with Image.open(SAMPLE_IMAGE) as sample:
        yield sample
|
||
|
||
@pytest.fixture
def image_bytes():
    """Provide the raw bytes of the sample image file."""
    return Path(SAMPLE_IMAGE).read_bytes()
|
||
|
||
def test_imagefrombytes(image, image_bytes):
    """Decoding the file's bytes must give an image equal to the opened one."""
    decoded = image_from_bytes(image_bytes)

    assert decoded == image
|
||
|
||
def test_imagetobytes(image, image_bytes):
    """Round-trip the image through ``image_to_bytes``/``image_from_bytes``."""
    # PNG is used for the round trip because JPEG changes pixel values.
    calculated = image_from_bytes(image_to_bytes(image, 'png'))

    # array_equal also compares shapes, so a size mismatch fails the test
    # instead of being subject to broadcasting in an elementwise ``==``.
    assert np.array_equal(np.array(calculated), np.array(image))
|
||
|
||
def test_list_features():
    """The custom task must list every inherited feature plus its own."""
    inherited = PdfExtractTask.list_features()
    extended = MyCustomTask.list_features()

    # MyCustomTask declares three features that are not helpers.
    assert len(inherited) + 3 == len(extended)

    extended_names = set(extended)
    assert set(inherited) < extended_names
    assert {'is_page_even', 'wrong', 'list'} < extended_names
|
||
|
||
def test_list_helper_features():
    """The custom task must list every inherited helper plus its own."""
    inherited = PdfExtractTask.list_helper_features()
    extended = MyCustomTask.list_helper_features()

    # MyCustomTask declares exactly one helper feature.
    assert len(inherited) + 1 == len(extended)

    extended_names = set(extended)
    assert set(inherited) < extended_names
    # NOTE(review): strict subset, so this only holds when the base task
    # has at least one helper feature of its own -- confirm.
    assert {'doc_first_bytes'} < extended_names
|
||
|
||
def test_saving_to_disk(tmp_path):
    """``extract`` must create the requested output file."""
    target = tmp_path / 'my_df.parquet.gzip'
    extract(SAMPLES_DIR, target, task_class=MyCustomTask)

    assert Path(target).exists()
|
||
|
||
def test_columns_present():
    """Every feature listed by the custom task must be a result column."""
    # Use the shared SAMPLES_DIR constant rather than repeating its path.
    df = extract(SAMPLES_DIR, small=True, task_class=MyCustomTask)

    assert set(MyCustomTask.list_features()) < set(df.columns)
|
||
|
||
def test_error_recorded():
    """The failure raised by ``get_wrong`` must show up in ``error``."""
    # Use the shared SAMPLES_DIR constant rather than repeating its path.
    df = extract(SAMPLES_DIR, small=True, task_class=MyCustomTask)
    error_feature, error_msg = 'wrong', 'There was a problem'

    # Only the first row is inspected, as in the original check.
    first_error = df.iloc[0].error
    assert error_msg in first_error
    assert f'{error_feature}:' in first_error
Oops, something went wrong.