Skip to content

Commit

Permalink
Add patch to raise exceptions for invalid page numbers when specifying tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
icaropires committed Jul 29, 2020
1 parent 483c3cb commit 0466cc9
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
14 changes: 10 additions & 4 deletions pdf2dataset/extract_not_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,12 @@ def _gen_extrationtasks(self, tasks):
'''

def uniform(task):
range_pages = None
page = None

if len(task) == 2:
doc, doc_bin = task
elif len(task) == 3:
doc, doc_bin, page = task
range_pages = [page]
else:
raise RuntimeError(
'Wrong task format, it must be'
Expand All @@ -42,8 +41,15 @@ def uniform(task):
f"Document '{doc}' name must ends with '.pdf'"
)

if not range_pages:
range_pages = self._get_pages_range(doc, doc_bin=doc_bin)
range_pages = self._get_pages_range(doc, doc_bin=doc_bin)

# -1 specifically because of the flag used by _get_pages_range
if page in range_pages and not page == -1:
range_pages = [page]
elif page is not None:
raise RuntimeError(
f"Page {page} doesn't exist in document {doc}!"
)

return Path(doc).resolve(), doc_bin, range_pages

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdf2dataset"
version = "0.3.0"
version = "0.3.1"
readme = "README.md"
description = "Easily convert a big folder with PDFs into a dataset, with extracted text using OCR"
authors = ["Ícaro Pires <[email protected]>"]
Expand Down

0 comments on commit 0466cc9

Please sign in to comment.