From a8d6b264193ab809c65328194046603f3aa4a91c Mon Sep 17 00:00:00 2001 From: akshowhini <33936764+akshowhini@users.noreply.github.com> Date: Fri, 6 May 2022 17:09:29 -0400 Subject: [PATCH] Big files download (#49) * Download big outputs * [B]: Fix processing splitted PDFs --- ExtractTable/FileOperations/__init__.py | 11 ++++------- ExtractTable/__init__.py | 23 ++++++++++++++--------- ExtractTable/__version__.py | 2 +- requirements.txt | 3 --- setup.py | 3 +-- 5 files changed, 20 insertions(+), 22 deletions(-) delete mode 100644 requirements.txt diff --git a/ExtractTable/FileOperations/__init__.py b/ExtractTable/FileOperations/__init__.py index 9e3b878..419301b 100644 --- a/ExtractTable/FileOperations/__init__.py +++ b/ExtractTable/FileOperations/__init__.py @@ -19,7 +19,7 @@ class CheckFile: def __init__(self, filepath: ty.Union[os.PathLike, str]): self.filepath = filepath self.type_error() - self.size_error() + self.is_big = self.is_big_size() def type_error(self) -> ty.Union[Exception, None]: """To check file extension""" @@ -27,11 +27,9 @@ def type_error(self) -> ty.Union[Exception, None]: return raise ClientFileTypeError(Message=f"Allowed file types are {self.__SUPPORTED_EXTENSIONS__}") - def size_error(self) -> ty.Union[Exception, None]: + def is_big_size(self) -> bool: # 1027 to create some buffer - if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027: - return - raise ClientFileSizeError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.") + return os.stat(self.filepath).st_size > self.__THRESHOLD_SIZE__*1027*1027 class PrepareInput: @@ -55,11 +53,10 @@ def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str): print("[Info]: Aggregating user defined pages..", self.pages) gather_pages = self._get_pages(self.filepath, pages) self.filepath = self.pdf_separator(gather_pages) - CheckFile(self.filepath) def pdf_separator(self, gather_pages: set): """PDF Splitter""" - merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath)) + merged_pdf = os.path.join(self.temp_dir, str(self.pages) + "_" + os.path.basename(self.filepath)) with open(merged_pdf, 'wb') as out_file: pdf_reader = PyPDF2.PdfFileReader(self.filepath) pdf_writer = PyPDF2.PdfFileWriter() diff --git a/ExtractTable/__init__.py b/ExtractTable/__init__.py index 4639cff..3e698ae 100644 --- a/ExtractTable/__init__.py +++ b/ExtractTable/__init__.py @@ -11,7 +11,7 @@ import requests as rq -from .FileOperations import PrepareInput +from .FileOperations import PrepareInput, CheckFile from .config import HOST, JobStatus from .parsers import ValidateResponse from .common import ConvertTo @@ -98,6 +98,10 @@ def get_result(self, job_id: str, wait_time: int = 10, max_wait_time: int = 300) time.sleep(max(10, int(wait_time))) max_wait_time -= wait_time resp = self._make_request('get', HOST.RESULT, params=params) + + if resp.get('DownloadUrl', ''): + self.ServerResponse = rq.get(resp['DownloadUrl']) + self.server_response = resp = self.ServerResponse.json() return resp @@ -171,15 +175,16 @@ def process_file( # To use the reference when saving the output self.__setattr__('input_filename', os.path.basename(filepath)) - try: - with PrepareInput(filepath, pages=pages) as infile: - with open(infile.filepath, 'rb') as fp: + with PrepareInput(filepath, pages=pages) as infile: + with open(infile.filepath, 'rb') as fp: + is_big_file = CheckFile(infile.filepath).is_big + if not is_big_file: trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs) - except ClientFileSizeError: - big_gen = self.bigfile_upload(filepath=os.path.basename(filepath)) - with open(filepath, 'rb') as ifile: - rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile}) - trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs) + else: + big_gen = self.bigfile_upload(filepath=os.path.basename(infile.filepath)) + with open(infile.filepath, 'rb') as ifile: + rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile}) + trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs) for _type, _obj in trigger_resp.items(): self.__setattr__(_type, _obj) diff --git a/ExtractTable/__version__.py b/ExtractTable/__version__.py index 8916c84..6ec7765 100644 --- a/ExtractTable/__version__.py +++ b/ExtractTable/__version__.py @@ -1,4 +1,4 @@ -VERSION = (2, 2, 0) +VERSION = (2, 3, 1) PRERELEASE = None # "alpha", "beta" or "rc" REVISION = None diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a22737e..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests>=2.21 -pandas>=0.24 -PyPDF2>=1.26 diff --git a/setup.py b/setup.py index 7945a94..604b965 100644 --- a/setup.py +++ b/setup.py @@ -10,8 +10,7 @@ with open('README.md', 'r') as f: readme = f.read() -with open("requirements.txt") as fh: - requires = [x.strip() for x in fh.readlines()] +requires = ['requests>=2.21', 'pandas>=0.24', 'PyPDF2>=1.26'] def setup_package():