Skip to content

Commit

Permalink
Merge pull request #142 from nansencenter/fix_earthdata_cmr_dl_url
Browse files Browse the repository at this point in the history
Fix earthdata cmr download URL
  • Loading branch information
aperrin66 authored May 16, 2024
2 parents 3c7453e + f5df8af commit beb00f1
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 1 deletion.
10 changes: 9 additions & 1 deletion geospaas_harvesting/providers/earthdata_cmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,22 @@ def _build_request_parameters(self, search_terms=None, time_range=(None, None),

return request_parameters

def _find_download_url(self, entry):
    """Pick the download URL of a CMR entry.

    The URLs listed under entry['umm']['RelatedUrls'] are scanned in
    order and the first one whose 'Type' is 'GET DATA' (compared
    case-insensitively) is returned. If none has that type, the URL
    of the first related URL is returned instead.
    """
    related_urls = entry['umm']['RelatedUrls']
    # Lazy scan: later items are not inspected once a match is found
    get_data_urls = (
        related['URL']
        for related in related_urls
        if related.get('Type', '').lower() == 'get data'
    )
    # A sentinel is used so that any value stored under 'URL'
    # (even a falsy one) counts as a match
    no_match = object()
    download_url = next(get_data_urls, no_match)
    if download_url is no_match:
        # No 'GET DATA' URL: fall back to the first related URL
        return related_urls[0]['URL']
    return download_url

def _get_datasets_info(self, page):
"""Get dataset attributes from the current page and
adds them to self._results.
Returns True if attributes were found, False otherwise"""
entries = json.loads(page)['items']

for entry in entries:
url = entry['umm']['RelatedUrls'][0]['URL']
url = self._find_download_url(entry)
self.logger.debug("Adding '%s' to the list of resources.", url)
self._results.append(DatasetInfo(url, entry))

Expand Down
26 changes: 26 additions & 0 deletions tests/providers/test_earthdata_cmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,32 @@ def test_build_request_parameters_with_time_range_end_only(self):
}
})

def test_find_download_url(self):
"""Test finding a download URL in an entry"""
entry = {
'umm': {
'RelatedUrls': [
{'URL': 'https://foo/bar.json', 'Type': 'EXTENDED METADATA'},
{'URL': 'https://foo/bar.nc', 'Type': 'GET DATA'},
{'URL': 'https://baz/bar.nc', 'Type': 'GET DATA'},
{'URL': 'https://foo/qux.png', 'Type': 'DIRECT DOWNLOAD'},
]
}
}
self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc')

def test_find_download_url_no_get_data(self):
"""Test finding a download URL in an entry when no GET DATA type is available"""
entry = {
'umm': {
'RelatedUrls': [
{'URL': 'https://foo/bar.nc', 'Type': 'DOWNLOAD'},
{'URL': 'https://baz/bar.nc', 'Type': 'DOWNLOAD'},
]
}
}
self.assertEqual(self.crawler._find_download_url(entry), 'https://foo/bar.nc')

def test_get_datasets_info(self):
"""_get_datasets_info() should extract datasets information
from a response page
Expand Down

0 comments on commit beb00f1

Please sign in to comment.