diff --git a/Changelog.md b/Changelog.md
index 2daea204..561b767e 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -8,6 +8,17 @@ Changes:
 
 Fixes:
 
+- __Migration__ Enable per server timeouts and timing between requests. Replaces `RETRIEVE_DATAFRAME_TIMEOUT_SECONDS`.
+- Fix invalid logging kwarg.
+- Upgraded data retrieval logging to error to send to Sentry.
+- Raise parse error when `-` is not used to split server and dataset.
+
+## 0.4.8-2 - 12/19/2022
+
+Fixes:
+
+- Gitops repo & CI working directory name mismatch
+
 ## 0.4.8 - 12/19/2022
 
 Changes:
diff --git a/app/deployments/migrations/0040_erddapserver_request_refresh_time_seconds_and_more.py b/app/deployments/migrations/0040_erddapserver_request_refresh_time_seconds_and_more.py
new file mode 100755
index 00000000..b2e2aebd
--- /dev/null
+++ b/app/deployments/migrations/0040_erddapserver_request_refresh_time_seconds_and_more.py
@@ -0,0 +1,35 @@
+# Generated by Django 4.1.3 on 2022-12-20 00:58
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("deployments", "0039_erddapserver_proxy_cors"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="erddapserver",
+            name="request_refresh_time_seconds",
+            field=models.FloatField(
+                default=0,
+                help_text=(
+                    "Minimum number of seconds to attempt to delay between requests. "
+                    "If dataset refreshes are triggered independently (e.g. via ERDDAP subscriptions) "
+                    "they might ignore this."
+                ),
+                verbose_name="Refresh request time in seconds",
+            ),
+        ),
+        migrations.AddField(
+            model_name="erddapserver",
+            name="request_timeout_seconds",
+            field=models.PositiveIntegerField(
+                default=60,
+                help_text="Seconds before requests time out.",
+                verbose_name="Request timeout in seconds",
+            ),
+        ),
+    ]
diff --git a/app/deployments/models.py b/app/deployments/models.py
index b52d92cc..c799363f 100644
--- a/app/deployments/models.py
+++ b/app/deployments/models.py
@@ -174,6 +174,20 @@ class ErddapServer(models.Model):
             "if the remote server does not support CORS"
         ),
     )
+    request_refresh_time_seconds = models.FloatField(
+        "Refresh request time in seconds",
+        default=0,
+        help_text=(
+            "Minimum number of seconds to attempt to delay between requests. "
+            "If dataset refreshes are triggered independently "
+            "(e.g. via ERDDAP subscriptions) they might ignore this."
+        ),
+    )
+    request_timeout_seconds = models.PositiveIntegerField(
+        "Request timeout in seconds",
+        default=60,
+        help_text=("Seconds before requests time out."),
+    )
 
     def __str__(self):
         if self.name:
diff --git a/app/deployments/tasks.py b/app/deployments/tasks.py
index 9609d4e2..fea3bf92 100644
--- a/app/deployments/tasks.py
+++ b/app/deployments/tasks.py
@@ -1,4 +1,5 @@
 import logging
+import time
 from datetime import timedelta
 
 import requests
@@ -39,7 +40,7 @@ def update_values_for_timeseries(timeseries):
 
         return
     except Timeout as error:
-        logger.warning(
+        logger.error(
             (
                 f"Timeout when trying to retrieve dataset {timeseries[0].dataset.name} "
                 f"with constraint {timeseries[0].constraints}: {error}"
@@ -76,7 +77,7 @@ def update_values_for_timeseries(timeseries):
                 f"Unable to find position in dataframe for {series.platform.name} - "
                 f"{series.variable}"
             )
-            logger.warning(message)
+            logger.error(message)
             return
 
         try:
@@ -112,12 +113,15 @@ def refresh_dataset(dataset_id: int, healthcheck: bool = False):
     dataset.refresh_attempted = timezone.now()
     dataset.save()
 
+    request_refresh_time_seconds = dataset.server.request_refresh_time_seconds
+
     if healthcheck:
         dataset.healthcheck_start()
 
     groups = dataset.group_timeseries_by_constraint()
 
     for constraints, timeseries in groups.items():
+        time.sleep(request_refresh_time_seconds)
         update_values_for_timeseries(timeseries)
 
     if healthcheck:
@@ -185,7 +189,7 @@ def handle_500_time_range_error(timeseries_group, compare_text: str) -> bool:
     try:
         times_str = compare_text.rpartition("actual_range:")[-1].rpartition(")")[0]
     except (AttributeError, IndexError) as e:
-        logger.warning(
+        logger.error(
            (
                f"Unable to access and attribute or index of {timeseries_group[0].dataset.name} "
                f"with constraint {timeseries_group[0].constraints}: {e}"
@@ -207,7 +211,7 @@ def handle_500_time_range_error(timeseries_group, compare_text: str) -> bool:
    try:
        end_time = times[0]
    except IndexError:
-        logger.warning(
+        logger.error(
            (
                "Unable to parse datetimes in error processing dataset "
                f"{timeseries_group[0].dataset.name} with constraint "
@@ -225,7 +229,7 @@ def handle_500_time_range_error(timeseries_group, compare_text: str) -> bool:
         ts.end_time = end_time
         ts.save()
 
-        logger.warning(
+        logger.error(
            f"Set end time for {ts} to {end_time} based on responses",
            extra=error_extra(timeseries_group, compare_text),
            exc_info=True,
@@ -257,7 +261,7 @@ def handle_500_unrecognized_constraint(timeseries_group, compare_text: str) -> b
     returns True if handled
     """
     if "Unrecognized constraint variable=" in compare_text:
-        logger.warning(
+        logger.error(
             (
                 f"Invalid constraint variable for dataset {timeseries_group[0].dataset.name} "
                 f"with constraints {timeseries_group[0].constraints}"
@@ -303,7 +307,7 @@ def handle_400_errors(timeseries_group, compare_text: str) -> bool:
 def handle_429_too_many_requests(timeseries_group, compare_text: str) -> bool:
     """Too many requests too quickly to the server"""
     if "Too Many Requests" in compare_text and "code=429" in compare_text:
-        logger.warning(
+        logger.error(
             f"Too many requests to server {timeseries_group[0].dataset.server}",
             extra=error_extra(timeseries_group, compare_text),
         )
@@ -315,7 +319,7 @@ def handle_429_too_many_requests(timeseries_group, compare_text: str) -> bool:
 def handle_400_unrecognized_variable(timeseries_group, compare_text: str) -> bool:
     """When there is an unrecognized variable requested"""
     if "Unrecognized variable=" in compare_text:
-        logger.warning(
+        logger.error(
             f"Unrecognized variable for dataset {timeseries_group[0].dataset.name}",
             extra=error_extra(timeseries_group, compare_text),
         )
@@ -343,9 +347,9 @@ def handle_404_errors(timeseries_group, compare_text: str) -> bool:
 
 
 def handle_404_dataset_file_not_found(timeseries_group, compare_text: str) -> bool:
     if "java.io.FileNotFoundException" in compare_text and "code=404" in compare_text:
-        logger.warning(
+        logger.error(
             f"{timeseries_group[0].dataset.name} does not exist on the server",
-            error=error_extra(timeseries_group, compare_text),
+            extra=error_extra(timeseries_group, compare_text),
         )
         return True
@@ -355,9 +359,9 @@ def handle_404_dataset_file_not_found(timeseries_group, compare_text: str) -> bo
 
 def handle_404_no_matching_time(timeseries_group, compare_text: str) -> bool:
     """Handle when the station does not have time for the current request"""
     if "No data matches time" in compare_text and "code=404" in compare_text:
-        logger.warning(
+        logger.error(
             f"{timeseries_group[0].dataset.name} does not currently have a valid time",
-            error=error_extra(timeseries_group, compare_text),
+            extra=error_extra(timeseries_group, compare_text),
         )
         return True
@@ -370,7 +374,7 @@ def handle_404_no_matching_station(timeseries_group, compare_text: str) -> bool:
         "Your query produced no matching results" in compare_text
         and "There are no matching stations" in compare_text
     ):
-        logger.warning(
+        logger.error(
             (
                 f"{timeseries_group[0].dataset.name} does not have a requested station. "
                 "Please check the constraints"
@@ -388,7 +392,7 @@ def handle_404_no_matching_dataset_id(timeseries_group, compare_text: str) -> bo
         "Resource not found" in compare_text
         and "Currently unknown datasetID" in compare_text
     ):
-        logger.warning(
+        logger.error(
             (
                 f"{timeseries_group[0].dataset.name} is currently unknown by the server. "
                 "Please investigate if the dataset has moved"
@@ -418,7 +422,7 @@ def handle_http_errors(timeseries_group, error: HTTPError) -> bool:
         return True
 
     if error.response.status_code == 404:
-        logger.warning(
+        logger.error(
             (
                 f"No rows found for {timeseries_group[0].dataset.name} "
                 f"with constraint {timeseries_group[0].constraints}: {error}"
diff --git a/app/deployments/utils/erddap_datasets.py b/app/deployments/utils/erddap_datasets.py
index 1a1bb651..b33398d8 100644
--- a/app/deployments/utils/erddap_datasets.py
+++ b/app/deployments/utils/erddap_datasets.py
@@ -1,10 +1,11 @@
-import os
 from datetime import datetime, timedelta
 from logging import getLogger
 
 from erddapy import ERDDAP
 from pandas import DataFrame
 
+from ..models import ErddapServer
+
 logger = getLogger(__name__)
 
 
@@ -42,7 +43,12 @@ def setup_variables(
     return server
 
 
-def retrieve_dataframe(server, dataset: str, constraints, timeseries) -> DataFrame:
+def retrieve_dataframe(
+    server: ErddapServer,
+    dataset: str,
+    constraints,
+    timeseries,
+) -> DataFrame:
     """Returns a dataframe from ERDDAP for a given dataset
 
     Attempts to sort the dataframe by time"""
@@ -53,9 +59,7 @@ def retrieve_dataframe(server, dataset: str, constraints, timeseries) -> DataFra
         constraints=constraints,
     )
 
-    e.requests_kwargs["timeout"] = float(
-        os.environ.get("RETRIEVE_DATAFRAME_TIMEOUT_SECONDS", 60),
-    )
+    e.requests_kwargs["timeout"] = server.request_timeout_seconds
 
     df = e.to_pandas(parse_dates=True)
 
diff --git a/app/deployments/views.py b/app/deployments/views.py
index 81e1de59..421a283e 100644
--- a/app/deployments/views.py
+++ b/app/deployments/views.py
@@ -8,7 +8,7 @@
 from django.views.decorators.cache import cache_page
 from rest_framework import viewsets
 from rest_framework.decorators import action
-from rest_framework.exceptions import APIException
+from rest_framework.exceptions import APIException, ParseError
 from rest_framework.response import Response
 
 from .models import ErddapDataset, ErddapServer, Platform
@@ -49,20 +49,28 @@ class DatasetViewSet(viewsets.ReadOnlyModelViewSet):
     queryset = ErddapDataset.objects.select_related("server")
     serializer_class = ErddapDatasetSerializer
 
-    def retrieve(self, request, *args, **kwargs):  # pylint: disable=unused-argument
+    def dataset(self, **kwargs) -> ErddapDataset:
+        """Retrieve the dataset from the request kwargs"""
         pk = kwargs["pk"]
 
-        server, dataset = pk.split("-", maxsplit=1)
+        try:
+            server, dataset = pk.split("-", maxsplit=1)
+        except ValueError:
+            raise ParseError(
+                detail="Invalid server-dataset key. Server and dataset must be split by `-`.",
+            )
+
         dataset = get_object_or_404(self.queryset, name=dataset, server__name=server)
+        return dataset
+
+    def retrieve(self, request, *args, **kwargs):  # pylint: disable=unused-argument
+        dataset = self.dataset(**kwargs)
         serializer = self.serializer_class(dataset, context={"request": request})
         return Response(serializer.data)
 
     @action(detail=True)
     def refresh(self, request, **kwargs):
-        pk = kwargs["pk"]
-
-        server, dataset = pk.split("-", maxsplit=1)
-        dataset = get_object_or_404(self.queryset, name=dataset, server__name=server)
+        dataset = self.dataset(**kwargs)
 
         refresh_dataset.delay(dataset.id, healthcheck=True)