Skip to content

Commit

Permalink
Big files download (#49)
Browse files Browse the repository at this point in the history
* Download big outputs

* [B]: Fix processing split PDFs
  • Loading branch information
akshowhini authored May 6, 2022
1 parent b3847ba commit a8d6b26
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 22 deletions.
11 changes: 4 additions & 7 deletions ExtractTable/FileOperations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,17 @@ class CheckFile:
def __init__(self, filepath: ty.Union[os.PathLike, str]):
self.filepath = filepath
self.type_error()
self.size_error()
self.is_big = self.is_big_size()

def type_error(self) -> ty.Union[Exception, None]:
"""To check file extension"""
if self.filepath.lower().endswith(self.__SUPPORTED_EXTENSIONS__):
return
raise ClientFileTypeError(Message=f"Allowed file types are {self.__SUPPORTED_EXTENSIONS__}")

def size_error(self) -> ty.Union[Exception, None]:
def is_big_size(self) -> bool:
# 1027 to create some buffer
if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
return
raise ClientFileSizeError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
return os.stat(self.filepath).st_size > self.__THRESHOLD_SIZE__*1027*1027


class PrepareInput:
Expand All @@ -55,11 +53,10 @@ def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
print("[Info]: Aggregating user defined pages..", self.pages)
gather_pages = self._get_pages(self.filepath, pages)
self.filepath = self.pdf_separator(gather_pages)
CheckFile(self.filepath)

def pdf_separator(self, gather_pages: set):
"""PDF Splitter"""
merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath))
merged_pdf = os.path.join(self.temp_dir, str(self.pages) + "_" + os.path.basename(self.filepath))
with open(merged_pdf, 'wb') as out_file:
pdf_reader = PyPDF2.PdfFileReader(self.filepath)
pdf_writer = PyPDF2.PdfFileWriter()
Expand Down
23 changes: 14 additions & 9 deletions ExtractTable/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import requests as rq

from .FileOperations import PrepareInput
from .FileOperations import PrepareInput, CheckFile
from .config import HOST, JobStatus
from .parsers import ValidateResponse
from .common import ConvertTo
Expand Down Expand Up @@ -98,6 +98,10 @@ def get_result(self, job_id: str, wait_time: int = 10, max_wait_time: int = 300)
time.sleep(max(10, int(wait_time)))
max_wait_time -= wait_time
resp = self._make_request('get', HOST.RESULT, params=params)

if resp.get('DownloadUrl', ''):
self.ServerResponse = rq.get(resp['DownloadUrl'])
self.server_response = resp = self.ServerResponse.json()

return resp

Expand Down Expand Up @@ -171,15 +175,16 @@ def process_file(
# To use the reference when saving the output
self.__setattr__('input_filename', os.path.basename(filepath))

try:
with PrepareInput(filepath, pages=pages) as infile:
with open(infile.filepath, 'rb') as fp:
with PrepareInput(filepath, pages=pages) as infile:
with open(infile.filepath, 'rb') as fp:
is_big_file = CheckFile(infile.filepath).is_big
if not is_big_file:
trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
except ClientFileSizeError:
big_gen = self.bigfile_upload(filepath=os.path.basename(filepath))
with open(filepath, 'rb') as ifile:
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs)
else:
big_gen = self.bigfile_upload(filepath=os.path.basename(infile.filepath))
with open(infile.filepath, 'rb') as ifile:
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs)

for _type, _obj in trigger_resp.items():
self.__setattr__(_type, _obj)
Expand Down
2 changes: 1 addition & 1 deletion ExtractTable/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = (2, 2, 0)
VERSION = (2, 3, 1)
PRERELEASE = None # "alpha", "beta" or "rc"
REVISION = None

Expand Down
3 changes: 0 additions & 3 deletions requirements.txt

This file was deleted.

3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
with open('README.md', 'r') as f:
readme = f.read()

with open("requirements.txt") as fh:
requires = [x.strip() for x in fh.readlines()]
requires = ['requests>=2.21', 'pandas>=0.24', 'PyPDF2>=1.26']


def setup_package():
Expand Down

0 comments on commit a8d6b26

Please sign in to comment.