From 810183ee424bfd1c6671baef26f99074e2f06221 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Wed, 17 Apr 2024 13:32:06 +0200 Subject: [PATCH 1/2] EarthDataCMRCrawler: find the right dl URL, not just the first one --- geospaas_harvesting/providers/earthdata_cmr.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/geospaas_harvesting/providers/earthdata_cmr.py b/geospaas_harvesting/providers/earthdata_cmr.py index 32da952..6e3402d 100644 --- a/geospaas_harvesting/providers/earthdata_cmr.py +++ b/geospaas_harvesting/providers/earthdata_cmr.py @@ -112,6 +112,14 @@ def _build_request_parameters(self, search_terms=None, time_range=(None, None), return request_parameters + def _find_download_url(self, entry): + """Return the first 'GET DATA' URL of the entry, else its first related URL""" + urls = entry['umm']['RelatedUrls'] + for url in urls: + if url.get('Type', '').lower() == 'get data': + return url['URL'] + return urls[0]['URL'] + def _get_datasets_info(self, page): """Get dataset attributes from the current page and adds them to self._results. 
@@ -119,7 +127,7 @@ def _get_datasets_info(self, page): entries = json.loads(page)['items'] for entry in entries: - url = entry['umm']['RelatedUrls'][0]['URL'] + url = self._find_download_url(entry) self.logger.debug("Adding '%s' to the list of resources.", url) self._results.append(DatasetInfo(url, entry)) From f5df8af40723874a900fe89bf7969883e75ead5c Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Wed, 17 Apr 2024 12:22:00 +0000 Subject: [PATCH 2/2] EarthDataCMRCrawler: add tests for _find_download_url --- tests/providers/test_earthdata_cmr.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/providers/test_earthdata_cmr.py b/tests/providers/test_earthdata_cmr.py index 2cb2d96..5a54821 100644 --- a/tests/providers/test_earthdata_cmr.py +++ b/tests/providers/test_earthdata_cmr.py @@ -169,6 +169,32 @@ def test_build_request_parameters_with_time_range_end_only(self): } }) + def test_find_download_url(self): + """_find_download_url() should return the first URL of type GET DATA""" + entry = { + 'umm': { + 'RelatedUrls': [ + {'URL': 'https://foo/bar.json', 'Type': 'EXTENDED METADATA'}, + {'URL': 'https://foo/bar.nc', 'Type': 'GET DATA'}, + {'URL': 'https://baz/bar.nc', 'Type': 'GET DATA'}, + {'URL': 'https://foo/qux.png', 'Type': 'DIRECT DOWNLOAD'}, + ] + } + } + self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc') + + def test_find_download_url_no_get_data(self): + """_find_download_url() should return the first URL if no GET DATA URL is found""" + entry = { + 'umm': { + 'RelatedUrls': [ + {'URL': 'https://foo/bar.nc', 'Type': 'DOWNLOAD'}, + {'URL': 'https://baz/bar.nc', 'Type': 'DOWNLOAD'}, + ] + } + } + self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc') + def test_get_datasets_info(self): """_get_datasets_info() should extract datasets information from a response page