Merge pull request #144 from nansencenter/add_aviso_provider
Add AVISO provider
aperrin66 authored Jun 11, 2024
2 parents 77de6b7 + 4316ac8 commit e74e89e
Showing 5 changed files with 105 additions and 16 deletions.
2 changes: 2 additions & 0 deletions geospaas_harvesting/config.py
@@ -1,6 +1,7 @@
"""Configuration management"""
import logging

import geospaas_harvesting.providers.aviso as providers_aviso
import geospaas_harvesting.providers.base as providers_base
import geospaas_harvesting.providers.ceda as providers_ceda
import geospaas_harvesting.providers.cmems as providers_cmems
@@ -57,6 +58,7 @@ class ProvidersArgument(DictArgument):
}
"""
provider_types = {
'aviso': providers_aviso.AVISOProvider,
'ceda': providers_ceda.CEDAProvider,
'cmems_ftp': providers_cmems.CMEMSFTPProvider,
'copernicus_scihub': providers_copernicus_scihub.CopernicusScihubProvider,
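
For illustration, a minimal sketch of what the new mapping enables, assuming the configuration supplies a provider type string plus keyword arguments; the constructor arguments shown here are taken from the unit tests below, and the actual configuration parsing lives in ProvidersArgument, which is not fully shown in this diff.

import geospaas_harvesting.providers.aviso as providers_aviso

# subset of the provider_types mapping added above
provider_types = {'aviso': providers_aviso.AVISOProvider}

# hypothetical values; in practice they come from the parsed configuration
provider = provider_types['aviso'](name='aviso', username='user', password='pass')
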
24 changes: 14 additions & 10 deletions geospaas_harvesting/crawlers.py
@@ -76,10 +76,9 @@ def set_initial_state(self):
"""
raise NotImplementedError()

- @classmethod
- def _http_get(cls, url, request_parameters=None, max_tries=5, wait_time=5):
+ def _http_get(self, url, request_parameters=None, max_tries=5, wait_time=5):
"""Sends an HTTP GET request, retry in case of failure"""
- cls.logger.debug("Getting page: '%s'", url)
+ self.logger.debug("Getting page: '%s'", url)

last_error = None
for try_index in range(max_tries):
@@ -94,8 +93,8 @@ def _http_get(cls, url, request_parameters=None, max_tries=5, wait_time=5):
raise
else:
last_error = error
- cls.logger.warning('Error while sending request to %s, %d retries left',
- url, max_tries - try_index - 1, exc_info=True)
+ self.logger.warning('Error while sending request to %s, %d retries left',
+ url, max_tries - try_index - 1, exc_info=True)
time.sleep(wait_time)
wait_time *= 2
raise RuntimeError(f"Max retries reached trying to get {url}") from last_error
@@ -336,6 +335,14 @@ def __eq__(self, other):
self.username == other.username and
self.password == other.password)

def _http_get(self, url, request_parameters=None, max_tries=5, wait_time=5):
if self.username is not None and self.password is not None:
if request_parameters is None:
request_parameters = {}
request_parameters['auth'] = (self.username, self.password)
return super()._http_get(url, request_parameters=request_parameters,
max_tries=max_tries, wait_time=wait_time)

@property
def base_url(self):
"""Get the root URL without the path"""
@@ -557,8 +564,6 @@ def _prepend_parent_path(parent_path, paths):

def _list_folder_contents(self, folder_path):
request_parameters = {}
- if self.username is not None and self.password is not None:
- request_parameters['auth'] = (self.username, self.password)
html = self._http_get(f"{self.base_url}{folder_path}", request_parameters).text
stripped_folder_path = self._strip_folder_page(folder_path)
return self._prepend_parent_path(stripped_folder_path, self._get_links(html))
@@ -647,10 +652,9 @@ def get_normalized_attributes(self, dataset_info, **kwargs):
"""
ddx_url = self.get_ddx_url(dataset_info.url)
# Get the metadata from the dataset as an XML tree
- stream = io.BytesIO(utils.http_request('GET', ddx_url, stream=True).content)
+ stream = io.BytesIO(self._http_get(ddx_url, request_parameters={'stream': True}).content)
# Get all the global attributes of the Dataset into a dictionary
- extracted_attributes = self._extract_attributes(
- ET.parse(stream).getroot())
+ extracted_attributes = self._extract_attributes(ET.parse(stream).getroot())
# add the URL to the attributes passed to metanorm
self.add_url(dataset_info.url, extracted_attributes)
# Get the parameters needed to create a geospaas catalog dataset from the global attributes
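
As an aside, a condensed standalone sketch of the pattern crawlers.py now follows, with error handling simplified: _http_get() is an instance method that retries with exponential backoff, and a subclass (standing in for DirectoryCrawler) injects HTTP basic auth credentials into every request instead of doing it only in _list_folder_contents(). The class names below are placeholders, not the real ones.

import logging
import time

import requests


class BaseCrawlerSketch:
    """Stripped-down stand-in for Crawler"""
    logger = logging.getLogger(__name__)

    def _http_get(self, url, request_parameters=None, max_tries=5, wait_time=5):
        """Send an HTTP GET request, retrying with exponential backoff"""
        self.logger.debug("Getting page: '%s'", url)
        last_error = None
        for try_index in range(max_tries):
            try:
                return requests.get(url, **(request_parameters or {}))
            except requests.RequestException as error:
                last_error = error
                self.logger.warning('Error while sending request to %s, %d retries left',
                                    url, max_tries - try_index - 1, exc_info=True)
                time.sleep(wait_time)
                wait_time *= 2
        raise RuntimeError(f"Max retries reached trying to get {url}") from last_error


class AuthenticatedCrawlerSketch(BaseCrawlerSketch):
    """Stripped-down stand-in for DirectoryCrawler's credential handling"""

    def __init__(self, username=None, password=None):
        self.username = username
        self.password = password

    def _http_get(self, url, request_parameters=None, max_tries=5, wait_time=5):
        # add basic auth to the request parameters when credentials are set
        if self.username is not None and self.password is not None:
            if request_parameters is None:
                request_parameters = {}
            request_parameters['auth'] = (self.username, self.password)
        return super()._http_get(url, request_parameters=request_parameters,
                                 max_tries=max_tries, wait_time=wait_time)
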
25 changes: 25 additions & 0 deletions geospaas_harvesting/providers/aviso.py
@@ -0,0 +1,25 @@
"""Code for searching AVISO data (https://tds.aviso.altimetry.fr/thredds)"""
from .base import Provider, TimeFilterMixin
from ..arguments import StringArgument
from ..crawlers import ThreddsCrawler


class AVISOProvider(TimeFilterMixin, Provider):
"""Provider for AVISO's Thredds"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = 'https://tds.aviso.altimetry.fr/thredds'
self.search_parameters_parser.add_arguments([
StringArgument('directory', required=True),
StringArgument('include'),
])

def _make_crawler(self, parameters):
return ThreddsCrawler(
'/'.join((self.url, parameters['directory'].lstrip('/'))),
time_range=(parameters['start_time'], parameters['end_time']),
include=parameters.get('include'),
max_threads=30,
username=self.username,
password=self.password,
)
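
A hypothetical usage sketch mirroring the unit test below; the directory value and credentials are placeholders, not a real AVISO THREDDS path or account.

from datetime import datetime, timezone

from geospaas_harvesting.providers.aviso import AVISOProvider

provider = AVISOProvider(name='aviso', username='user', password='pass')
crawler = provider._make_crawler({
    'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
    'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
    'directory': 'some/dataset/directory',  # placeholder path
    'include': r'.*\.nc',
})
# crawler is a ThreddsCrawler rooted at
# https://tds.aviso.altimetry.fr/thredds/some/dataset/directory,
# authenticated with the provider's credentials
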
30 changes: 30 additions & 0 deletions tests/providers/test_aviso.py
@@ -0,0 +1,30 @@
# pylint: disable=protected-access
"""Tests for the AVISO provider"""
import unittest
import unittest.mock as mock
from datetime import datetime, timezone

import geospaas_harvesting.crawlers as crawlers
from geospaas_harvesting.providers.aviso import AVISOProvider


class AVISOProviderTestCase(unittest.TestCase):
"""Tests for AVISOProvider"""

def test_make_crawler(self):
"""Test creating a crawler from parameters"""
provider = AVISOProvider(name='test', username='user', password='pass')
parameters = {
'start_time': datetime(2023, 1, 1, tzinfo=timezone.utc),
'end_time': datetime(2023, 1, 2, tzinfo=timezone.utc),
'directory': 'foo',
'include': '.*'
}
self.assertEqual(
provider._make_crawler(parameters),
crawlers.ThreddsCrawler(
'https://tds.aviso.altimetry.fr/thredds/foo',
include='.*',
time_range=(datetime(2023, 1, 1, tzinfo=timezone.utc),
datetime(2023, 1, 2, tzinfo=timezone.utc)),
username='user', password='pass'))
40 changes: 34 additions & 6 deletions tests/test_generic_crawlers.py
@@ -80,7 +80,7 @@ def test_http_get_retry(self):
http_500_error,
mock.Mock())
with self.assertLogs(crawlers.Crawler.logger, level=logging.WARNING):
- crawlers.Crawler._http_get('url', max_tries=5, wait_time=30)
+ crawlers.Crawler()._http_get('url', max_tries=5, wait_time=30)

self.assertEqual(len(mock_request.mock_calls), 5)
self.assertListEqual(mock_sleep.mock_calls, [mock.call(30 * (2**i)) for i in range(4)])
@@ -96,7 +96,7 @@ def test_http_get_fails_eventually(self):

with self.assertLogs(crawlers.Crawler.logger, level=logging.WARNING), \
self.assertRaises(RuntimeError):
- crawlers.Crawler._http_get('url')
+ crawlers.Crawler()._http_get('url')

self.assertEqual(len(mock_request.mock_calls), 5)
self.assertEqual(len(mock_sleep.mock_calls), 5)
@@ -108,7 +108,7 @@ def test_http_get_no_retry_error(self):
with mock.patch('geospaas_harvesting.utils.http_request') as mock_request:
mock_request.side_effect = requests.TooManyRedirects
with self.assertRaises(requests.RequestException):
- self.assertIsNone(crawlers.Crawler._http_get('url'))
+ self.assertIsNone(crawlers.Crawler()._http_get('url'))

def test_http_get_error_on_404_status(self):
"""Test that an exception is raised in case of HTTP error code"""
@@ -117,7 +117,7 @@ def test_http_get_error_on_404_status(self):
with mock.patch('geospaas_harvesting.utils.http_request') as mock_request:
mock_request.side_effect = requests.HTTPError(response=response)
with self.assertRaises(requests.HTTPError):
- crawlers.Crawler._http_get('http://foo')
+ crawlers.Crawler()._http_get('http://foo')

def test_abstract_get_normalized_attributes(self):
"""get_normalized_attributes() should raise a NotImplementedError"""
@@ -329,6 +329,31 @@ def test_equality(self):
'http://foo', (datetime(2024, 1, 2), datetime(2024, 1, 3)),
r'.*\.nc', 'user', 'password'))

def test_http_get_with_auth(self):
"""If no username and password are provided, HTTP requests
should not have an 'auth' parameter
"""
crawler = crawlers.DirectoryCrawler('', username='user', password='pass')
with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_get:
crawler._http_get('http://foo/bar')
crawler._http_get('http://foo/bar', request_parameters={'quz': 'qux'})
mock_get.assert_has_calls((
mock.call('http://foo/bar', request_parameters={'auth': ('user', 'pass')},
max_tries=5, wait_time=5),
mock.call('http://foo/bar', request_parameters={'quz': 'qux', 'auth': ('user', 'pass')},
max_tries=5, wait_time=5),
))

def test_http_get_no_auth(self):
"""If no username and password are provided, HTTP requests
should not have an 'auth' parameter
"""
crawler = crawlers.DirectoryCrawler('')
with mock.patch('geospaas_harvesting.crawlers.Crawler._http_get') as mock_get:
crawler._http_get('http://foo/bar')
mock_get.assert_called_with('http://foo/bar', request_parameters=None,
max_tries=5, wait_time=5)

def test_abstract_list_folder_contents(self):
"""
A NotImplementedError should be raised if the _list_folder_contents() method
@@ -709,7 +734,8 @@ def test_list_folder_contents_no_auth(self):
mock_http_get.return_value.text = '<html><html/>'
crawler = crawlers.HTMLDirectoryCrawler('http://foo')
crawler._list_folder_contents('/bar')
- mock_http_get.assert_called_once_with('http://foo/bar', {})
+ mock_http_get.assert_called_once_with('http://foo/bar', request_parameters={},
+ max_tries=5, wait_time=5)

def test_list_folder_contents_with_auth(self):
"""If a username and password are provided, HTTP requests
@@ -719,7 +745,9 @@ def test_list_folder_contents_with_auth(self):
mock_http_get.return_value.text = '<html><html/>'
crawler = crawlers.HTMLDirectoryCrawler('http://foo', username='user', password='pass')
crawler._list_folder_contents('/bar')
- mock_http_get.assert_called_once_with('http://foo/bar', {'auth': ('user', 'pass')})
+ mock_http_get.assert_called_once_with('http://foo/bar',
+ request_parameters={'auth': ('user', 'pass')},
+ max_tries=5, wait_time=5)

def test_get_normalized_attributes(self):
"""Test that the attributes are gotten using metanorm, and the