From 810183ee424bfd1c6671baef26f99074e2f06221 Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Wed, 17 Apr 2024 13:32:06 +0200
Subject: [PATCH 1/2] EarthDataCMRCrawler: find the right dl URL, not just the
 first one

---
 geospaas_harvesting/providers/earthdata_cmr.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/geospaas_harvesting/providers/earthdata_cmr.py b/geospaas_harvesting/providers/earthdata_cmr.py
index 32da952..6e3402d 100644
--- a/geospaas_harvesting/providers/earthdata_cmr.py
+++ b/geospaas_harvesting/providers/earthdata_cmr.py
@@ -112,6 +112,14 @@ def _build_request_parameters(self, search_terms=None, time_range=(None, None),
 
         return request_parameters
 
+    def _find_download_url(self, entry):
+        """Return the first 'GET DATA' URL of the entry, or its first URL if there is none"""
+        urls = entry['umm']['RelatedUrls']
+        for url in urls:
+            if url.get('Type', '').lower() == 'get data':  # case-insensitive match
+                return url['URL']
+        return urls[0]['URL']  # no 'GET DATA' URL found: fall back to the first one
+
     def _get_datasets_info(self, page):
         """Get dataset attributes from the current page and
         adds them to self._results.
@@ -119,7 +127,7 @@ def _get_datasets_info(self, page):
         entries = json.loads(page)['items']
 
         for entry in entries:
-            url = entry['umm']['RelatedUrls'][0]['URL']
+            url = self._find_download_url(entry)
             self.logger.debug("Adding '%s' to the list of resources.", url)
             self._results.append(DatasetInfo(url, entry))
 

From f5df8af40723874a900fe89bf7969883e75ead5c Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Wed, 17 Apr 2024 12:22:00 +0000
Subject: [PATCH 2/2] EarthDataCMRCrawler: add tests for _find_download_url

---
 tests/providers/test_earthdata_cmr.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/providers/test_earthdata_cmr.py b/tests/providers/test_earthdata_cmr.py
index 2cb2d96..5a54821 100644
--- a/tests/providers/test_earthdata_cmr.py
+++ b/tests/providers/test_earthdata_cmr.py
@@ -169,6 +169,32 @@ def test_build_request_parameters_with_time_range_end_only(self):
                 }
             })
 
+    def test_find_download_url(self):
+        """_find_download_url() should return the first URL of type 'GET DATA'"""
+        entry = {
+            'umm': {
+                'RelatedUrls': [
+                    {'URL': 'https://foo/bar.json', 'Type': 'EXTENDED METADATA'},
+                    {'URL': 'https://foo/bar.nc', 'Type': 'GET DATA'},
+                    {'URL': 'https://baz/bar.nc', 'Type': 'GET DATA'},
+                    {'URL': 'https://foo/qux.png', 'Type': 'DIRECT DOWNLOAD'},
+                ]
+            }
+        }
+        self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc')
+
+    def test_find_download_url_no_get_data(self):
+        """_find_download_url() should return the first URL if none is of type 'GET DATA'"""
+        entry = {
+            'umm': {
+                'RelatedUrls': [
+                    {'URL': 'https://foo/bar.nc', 'Type': 'DOWNLOAD'},
+                    {'URL': 'https://baz/bar.nc', 'Type': 'DOWNLOAD'},
+                ]
+            }
+        }
+        self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc')
+
     def test_get_datasets_info(self):
         """_get_datasets_info() should extract datasets information
         from a response page