Skip to content

Commit

Permalink
Merge pull request #16 from icaropires/small-fixes
Browse files: browse the repository at this point in the history
Add small fixes to usability
  • Loading branch information
icaropires authored Sep 1, 2020
2 parents c7c7278 + b97d2f5 commit c3f3472
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 20 deletions.
36 changes: 20 additions & 16 deletions pdf2dataset/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def __init__(
self.max_files_memory = max_files_memory
self.files_pattern = files_pattern

self.num_skipped = None

self.task_class = task_class
self.task_params = {
'sel_features': features,
Expand Down Expand Up @@ -173,10 +175,11 @@ def get_pages_range(file_path, file_bin=None):

return pages

@staticmethod
def _get_processing_bar(num_tasks, iterable=None):
def _get_processing_bar(self, num_tasks, iterable=None):
num_skipped = self.num_skipped or 0

return tqdm(
iterable, total=num_tasks,
iterable, total=num_tasks, initial=num_skipped,
desc='Processing pages', unit='pages', dynamic_ncols=True
)

Expand All @@ -203,10 +206,10 @@ def _ray_process_aux(self, tasks, results_queue):
chunks = ichunked(tasks, int(self.chunksize))
num_initial = int(ray.available_resources()['CPU'])

futures = [self._submit_chunk_ray(c)
for c in it.islice(chunks, num_initial)]

with self._get_processing_bar(len(tasks)) as progress_bar:
futures = [self._submit_chunk_ray(c)
for c in it.islice(chunks, num_initial)]

while futures:
(finished, *_), rest = ray.wait(futures, num_returns=1)

Expand Down Expand Up @@ -294,16 +297,6 @@ def filter_processed_tasks(self, tasks):
return tasks

def _process_tasks(self, tasks):
num_total_tasks = len(tasks)
tasks = self.filter_processed_tasks(tasks)
num_skipped = num_total_tasks - len(tasks)

if num_skipped:
logging.warning(
"'%s' have already %d processed pages, skipping these...",
self.out_file, num_skipped
)

if self.chunksize is None:
chunk_by_cpu = (len(tasks)/self.num_cpus) / 100
max_chunksize = self.max_files_memory // self.num_cpus
Expand All @@ -318,4 +311,15 @@ def _process_tasks(self, tasks):

def apply(self):
# Entry point visible in this hunk: generate the tasks, drop the ones that
# filter_processed_tasks() removes, and return whatever _process_tasks()
# returns for the remaining ones.
tasks = self.gen_tasks()

# The number of skipped tasks is the difference in length before and after
# filtering.
num_total_tasks = len(tasks)
tasks = self.filter_processed_tasks(tasks)
# Stored on the instance (rather than kept local) because
# _get_processing_bar() reads self.num_skipped and passes it to tqdm as
# `initial`.
self.num_skipped = num_total_tasks - len(tasks)

# Warn only when at least one task was filtered out.
if self.num_skipped:
logging.warning(
"'%s' have already %d processed pages, skipping these...",
self.out_file, self.num_skipped
)

return self._process_tasks(tasks)
7 changes: 5 additions & 2 deletions pdf2dataset/pdf_extract_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pdf2image import convert_from_bytes
from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError
from PIL import Image as PilImage
from PIL.Image import DecompressionBombError

from .extract_task import ExtractTask, feature

Expand Down Expand Up @@ -117,8 +118,10 @@ def _extract_text_native(self):

return text

@feature('binary', is_helper=True,
exceptions=(PDFPageCountError, PDFSyntaxError))
@feature(
'binary', is_helper=True,
exceptions=(PDFPageCountError, PDFSyntaxError, DecompressionBombError)
)
def get_image_original(self):
images = convert_from_bytes(
self.file_bin, first_page=self.page,
Expand Down
2 changes: 1 addition & 1 deletion pdf2dataset/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def get(self):
def is_tasks_processed(self, tasks):
def gen_is_processed(df):
num_checked = 0
all_tasks = set(tuple(task) for _, *task in df.itertuples())
all_tasks = set(tuple(task) for task in df.itertuples(index=False))

for task in tasks:
if num_checked == len(tasks):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdf2dataset"
version = "0.5.0"
version = "0.5.1"
readme = "README.md"
description = "Easily convert a subdirectory with big volume of PDF documents into a dataset, supports extracting text and images"
authors = ["Ícaro Pires <[email protected]>"]
Expand Down

0 comments on commit c3f3472

Please sign in to comment.