Merge pull request #142 from nansencenter/fix_earthdata_cmr_dl_url

Fix earthdata cmr download URL
nansencenter · May 16, 2024 · beb00f1 · beb00f1
2 parents 3c7453e + f5df8af
commit beb00f1
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 1 deletion.
diff --git a/geospaas_harvesting/providers/earthdata_cmr.py b/geospaas_harvesting/providers/earthdata_cmr.py
@@ -112,14 +112,22 @@ def _build_request_parameters(self, search_terms=None, time_range=(None, None),
 
         return request_parameters
 
+    def _find_download_url(self, entry):
+        """Return the first 'GET DATA' URL of the entry, or its first URL if there is none"""
+        urls = entry['umm']['RelatedUrls']
+        for url in urls:
+            if url.get('Type', '').lower() == 'get data':
+                return url['URL']
+        return urls[0]['URL']
+
     def _get_datasets_info(self, page):
         """Get dataset attributes from the current page and
         adds them to self._results.
         Returns True if attributes were found, False otherwise"""
         entries = json.loads(page)['items']
 
         for entry in entries:
-            url = entry['umm']['RelatedUrls'][0]['URL']
+            url = self._find_download_url(entry)
             self.logger.debug("Adding '%s' to the list of resources.", url)
             self._results.append(DatasetInfo(url, entry))
 

diff --git a/tests/providers/test_earthdata_cmr.py b/tests/providers/test_earthdata_cmr.py
@@ -169,6 +169,32 @@ def test_build_request_parameters_with_time_range_end_only(self):
                 }
             })
 
+    def test_find_download_url(self):
+        """The first 'GET DATA' URL should be returned, even if it is not first in the list"""
+        entry = {
+            'umm': {
+                'RelatedUrls': [
+                    {'URL': 'https://foo/bar.json', 'Type': 'EXTENDED METADATA'},
+                    {'URL': 'https://foo/bar.nc', 'Type': 'GET DATA'},
+                    {'URL': 'https://baz/bar.nc', 'Type': 'GET DATA'},
+                    {'URL': 'https://foo/qux.png', 'Type': 'DIRECT DOWNLOAD'},
+                ]
+            }
+        }
+        self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc')
+
+    def test_find_download_url_no_get_data(self):
+        """When no URL has the 'GET DATA' type, the first URL in the list should be returned"""
+        entry = {
+            'umm': {
+                'RelatedUrls': [
+                    {'URL': 'https://foo/bar.nc', 'Type': 'DOWNLOAD'},
+                    {'URL': 'https://baz/bar.nc', 'Type': 'DOWNLOAD'},
+                ]
+            }
+        }
+        self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc')
+
     def test_get_datasets_info(self):
         """_get_datasets_info() should extract datasets information
         from a response page