Skip to content

Commit

Permalink
HARMONY-1714: Use work item id in downloaded filename to avoid collisions
Browse files Browse the repository at this point in the history
  • Loading branch information
vinnyinverso committed Mar 22, 2024
1 parent 5d35a0d commit 51e95c9
Showing 1 changed file with 36 additions and 9 deletions.
45 changes: 36 additions & 9 deletions harmony/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from tabnanny import check
import time
import platform
from uuid import UUID
from requests import Response
from requests.exceptions import JSONDecodeError
import requests.models
Expand Down Expand Up @@ -1078,13 +1079,38 @@ def result_urls(self,
if link['rel'] == 'data':
yield link['href']

def _is_staged_result(self, url: str) -> bool:
    """Return whether ``url`` ends in ``<uuid4>/<numeric id>/<filename>``.

    The URL is split on ``/``; the third-from-last segment must be a
    canonically formatted version-4 UUID and the second-from-last segment
    must be numeric.

    Args:
        url: The URL to inspect.

    Returns:
        True when both segments match the expected shape, False otherwise
        (including when the URL has fewer than three segments).
    """
    url_parts = url.split('/')
    # Without at least three segments there is no room for both a UUID and
    # an item id; bail out instead of letting the negative indexes raise
    # IndexError (e.g. for a bare 'file.nc').
    if len(url_parts) < 3:
        return False
    possible_uuid, possible_item_id = url_parts[-3], url_parts[-2]
    try:
        uuid_obj = UUID(possible_uuid, version=4)
    except ValueError:
        return False
    # UUID() tolerates alternate spellings (braces, missing hyphens, a URN
    # prefix) and version=4 overwrites the version/variant bits, so only a
    # string that round-trips unchanged is a canonical v4 UUID.
    if str(uuid_obj) != possible_uuid:
        return False
    return possible_item_id.isnumeric()

def get_filename_from_url(self, url):
    """Derive the local filename to use for a download URL.

    The name is the final ``/``-separated segment of ``url``. When
    ``self._is_staged_result(url)`` reports a staged result, the
    second-from-last segment (the item id) is prepended with an
    underscore separator.

    Args:
        url: The URL of the file to be downloaded.

    Returns:
        ``'<item id>_<last segment>'`` for staged results, otherwise just
        the last segment.
    """
    segments = url.split('/')
    basename = segments[-1]
    if self._is_staged_result(url):
        # Prefix with the item id segment that precedes the filename.
        return f'{segments[-2]}_{basename}'
    return basename

def _download_file(self, url: str, directory: str = '', overwrite: bool = False) -> str:
"""Downloads data, saves it to a file, and returns the filename.
Performance should be close to native with an appropriate chunk size. This can be changed
via environment variable DOWNLOAD_CHUNK_SIZE.
Filenames are automatically determined by using the latter portion of the provided URL.
Filenames are automatically determined by using the latter portion of the provided URL
and will be prefixed by the item id generated by Harmony (when data was transformed from the original).
Args:
url: The location (URL) of the file to be downloaded
Expand All @@ -1099,26 +1125,27 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
"""
chunksize = int(self.config.DOWNLOAD_CHUNK_SIZE)
session = self._session()
filename = url.split('/')[-1]

filename = self.get_filename_from_url(url)
path = filename
if directory:
filename = os.path.join(directory, filename)
path = os.path.join(directory, filename)

verbose = os.getenv('VERBOSE', 'TRUE')
if not overwrite and os.path.isfile(filename):
if not overwrite and os.path.isfile(path):
if verbose and verbose.upper() == 'TRUE':
print(filename)
return filename
print(path)
return path
else:
headers = {
"Accept-Encoding": "identity"
}
with session.get(url, stream=True, headers=headers) as r:
with open(filename, 'wb') as f:
with open(path, 'wb') as f:
shutil.copyfileobj(r.raw, f, length=chunksize)
if verbose and verbose.upper() == 'TRUE':
print(filename)
return filename
print(path)
return path

def download(self, url: str, directory: str = '', overwrite: bool = False) -> Future:
"""Downloads data and saves it to a file asynchronously.
Expand Down

0 comments on commit 51e95c9

Please sign in to comment.