Skip to content

Commit

Permalink
Add parameter to control max calculated chunksize and avoid memory problems with ray
Browse files Browse the repository at this point in the history

and bump version
  • Loading branch information
icaropires committed Aug 2, 2020
1 parent d4618be commit bcc17af
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 3 deletions.
7 changes: 5 additions & 2 deletions pdf2dataset/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class TextExtraction:
def __init__(
self, input_dir, results_file='', *,
tmp_dir='', lang='por', ocr=False, small=False,
chunksize=None, chunk_df_size=10000, check_inputdir=True, **ray_params
chunksize=None, chunk_df_size=10000, check_inputdir=True,
max_docs_memory=3000, **ray_params
):

self.input_dir = Path(input_dir).resolve()
Expand Down Expand Up @@ -62,6 +63,7 @@ def __init__(
self.small = small
self.lang = lang
self.ocr = ocr
self.max_docs_memory = max_docs_memory

self._df_lock = threading.Lock()
self.chunk_df_size = chunk_df_size
Expand Down Expand Up @@ -374,7 +376,8 @@ def _apply_tasks(self, tasks):

if self.chunksize is None:
chunk_by_cpu = (len(not_processed)/self.num_cpus) / 100
self.chunksize = int(max(1, chunk_by_cpu))
max_chunksize = self.max_docs_memory // self.num_cpus
self.chunksize = int(max(1, min(chunk_by_cpu, max_chunksize)))

if len(processed):
logging.warning(
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdf2dataset"
version = "0.3.1"
version = "0.3.2"
readme = "README.md"
description = "Easily convert a big folder with PDFs into a dataset, with extracted text using OCR"
authors = ["Ícaro Pires <[email protected]>"]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
fastparquet==0.4.0
more-itertools==8.4.0
opencv-python==4.2.0.34
packaging==20.4
pdf2image==1.13.1
Expand Down

0 comments on commit bcc17af

Please sign in to comment.