HARMONY-1714: Use work item id in downloaded filename to avoid collisions
nasa · Mar 22, 2024 · 51e95c9 · 51e95c9
1 parent 5d35a0d
commit 51e95c9
Showing 1 changed file with 36 additions and 9 deletions.
diff --git a/harmony/harmony.py b/harmony/harmony.py
@@ -26,6 +26,7 @@
 from tabnanny import check
 import time
 import platform
+from uuid import UUID
 from requests import Response
 from requests.exceptions import JSONDecodeError
 import requests.models
@@ -1078,13 +1079,38 @@ def result_urls(self,
                 if link['rel'] == 'data':
                     yield link['href']
 
+    def _is_staged_result(self, url):
+        """Return True when ``url`` looks like a staged result.
+
+        A URL is treated as staged when its last three ``/``-separated
+        segments are ``<uuid4>/<numeric item id>/<filename>``.
+
+        Args:
+            url: The URL to inspect
+
+        Returns:
+            True if the third-to-last segment is a canonically formatted
+            version 4 UUID and the second-to-last segment is numeric,
+            False otherwise (including URLs with fewer than three segments)
+        """
+        url_parts = url.split('/')
+        # Short inputs such as a bare filename have no room for the
+        # uuid/item-id pair; indexing them with [-3] would raise IndexError.
+        if len(url_parts) < 3:
+            return False
+        possible_uuid, possible_item_id = url_parts[-3], url_parts[-2]
+        try:
+            # version=4 makes UUID overwrite the version and variant bits, so
+            # the round-trip comparison below only succeeds when the input
+            # was already a lowercase, hyphenated version 4 UUID.
+            uuid_obj = UUID(possible_uuid, version=4)
+        except ValueError:
+            return False
+        if str(uuid_obj) != possible_uuid:
+            return False
+        return possible_item_id.isnumeric()
+
+    def get_filename_from_url(self, url):
+        """Return the local filename to use for a file downloaded from ``url``.
+
+        The name is the final ``/``-separated segment of ``url``. When
+        ``_is_staged_result`` reports the URL as a staged result, the
+        second-to-last segment (the item id) is prepended, joined by an
+        underscore, so that identically named outputs do not collide.
+
+        Args:
+            url: The URL of the file to be downloaded
+
+        Returns:
+            The filename, without any directory component
+        """
+        segments = url.split('/')
+        basename = segments[-1]
+        if self._is_staged_result(url):
+            return f'{segments[-2]}_{basename}'
+        return basename
+
     def _download_file(self, url: str, directory: str = '', overwrite: bool = False) -> str:
         """Downloads data, saves it to a file, and returns the filename.
 
         Performance should be close to native with an appropriate chunk size. This can be changed
         via environment variable DOWNLOAD_CHUNK_SIZE.
 
-        Filenames are automatically determined by using the latter portion of the provided URL.
+        Filenames are automatically determined by using the latter portion of the provided URL
+        and will be prefixed by the item id generated by Harmony (when data was transformed from the original).
 
         Args:
             url: The location (URL) of the file to be downloaded
@@ -1099,26 +1125,27 @@ def _download_file(self, url: str, directory: str = '', overwrite: bool = False)
         """
         chunksize = int(self.config.DOWNLOAD_CHUNK_SIZE)
         session = self._session()
-        filename = url.split('/')[-1]
 
+        filename = self.get_filename_from_url(url)
+        path = filename
         if directory:
-            filename = os.path.join(directory, filename)
+            path = os.path.join(directory, filename)
 
         verbose = os.getenv('VERBOSE', 'TRUE')
-        if not overwrite and os.path.isfile(filename):
+        if not overwrite and os.path.isfile(path):
             if verbose and verbose.upper() == 'TRUE':
-                print(filename)
-            return filename
+                print(path)
+            return path
         else:
             headers = {
                 "Accept-Encoding": "identity"
             }
             with session.get(url, stream=True, headers=headers) as r:
-                with open(filename, 'wb') as f:
+                with open(path, 'wb') as f:
                     shutil.copyfileobj(r.raw, f, length=chunksize)
             if verbose and verbose.upper() == 'TRUE':
-                print(filename)
-            return filename
+                print(path)
+            return path
 
     def download(self, url: str, directory: str = '', overwrite: bool = False) -> Future:
         """Downloads data and saves it to a file asynchronously.