Skip to content

Commit

Permalink
Clean up
Browse files Browse the repository at this point in the history
  • Loading branch information
mcarans committed Mar 4, 2025
1 parent 07de629 commit 6909f43
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 43 deletions.
33 changes: 12 additions & 21 deletions src/hdx/resource/changedetection/head_results.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import logging
from http import HTTPStatus
from typing import Dict, List, Set, Tuple
from urllib.parse import urlsplit

from hdx.resource.changedetection.utilities import status_lookup
from .utilities import log_output, status_lookup
from hdx.utilities.dateparse import parse_date
from hdx.utilities.dictandlist import (
dict_of_lists_add,
Expand All @@ -29,14 +30,16 @@ def process(self) -> None:
what_changed = []
resource = self.resources[resource_id]
http_size, http_last_modified, etag, status = result
if status != 200:
if status in (403, 429):
if status != HTTPStatus.OK:
status_str = status_lookup[status]
if status in (
HTTPStatus.FORBIDDEN,
HTTPStatus.TOO_MANY_REQUESTS,
):
# Server may not like HEAD requests or too many requests
self._resources_to_get[resource_id] = resource
status = str(status)
dict_of_lists_add(self._retrying, status, resource_id)
what_changed = status_lookup.get(status, f"status {status}")
dict_of_lists_add(self._changes, what_changed, resource_id)
dict_of_lists_add(self._retrying, status_str, resource_id)
dict_of_lists_add(self._changes, status_str, resource_id)
continue
get_resource = False
etag_unchanged = True
Expand Down Expand Up @@ -79,21 +82,9 @@ def process(self) -> None:

def output(self) -> None:
logger.info("\nChanges detected:")
for what_changed, resource_ids in self._changes.items():
count = len(resource_ids)
if count < 5:
resource_ids = ", ".join(resource_ids)
logger.info(f"{what_changed}: {resource_ids}")
else:
logger.info(f"{what_changed}: {count}")
log_output(self._changes)
logger.info("\nWill get these:")
for status, resource_ids in self._retrying.items():
count = len(resource_ids)
if count < 5:
resource_ids = ", ".join(resource_ids)
logger.info(f"{status}: {resource_ids}")
else:
logger.info(f"{status}: {count}")
log_output(self._retrying)

def get_distributed_resources_to_get(self) -> List[Tuple]:
def get_netloc(x):
Expand Down
30 changes: 16 additions & 14 deletions src/hdx/resource/changedetection/results.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import logging
from http import HTTPStatus
from typing import Dict, Tuple

from hdx.resource.changedetection.utilities import status_lookup
from .utilities import log_output, status_lookup
from hdx.utilities.dateparse import parse_date
from hdx.utilities.dictandlist import dict_of_lists_add

Expand All @@ -22,52 +23,53 @@ def process(self) -> None:
what_changed = []
resource = self.resources[resource_id]
http_size, http_last_modified, etag, status = result
if status != 0 and status != 200:
status_str = status_lookup.get(status, f"status {status}")
if status != 0 and status != HTTPStatus.OK:
status_str = status_lookup[status]
if status < 0:
if status < -90:
dict_of_lists_add(
self._changes, what_changed, resource_id
)
continue
what_changed.append(status_str)
else:
status_str = None
update = False
if etag:
if etag != resource[5]:
what_changed.append("etag")
update = True
else:
status = "no etag"
what_changed.append(status)
if http_size:
if http_size != resource[3]:
status = "size"
what_changed.append(status)
update = True
else:
what_changed.append("no size")
if http_last_modified:
http_last_modified = parse_date(http_last_modified)
if http_last_modified != resource[4]:
status = "modified"
what_changed.append(status)
update = True
else:
what_changed.append("no modified")
what_changed = "|".join(what_changed)
dict_of_lists_add(self._changes, what_changed, resource_id)
if what_changed:
if update:
self._resources_to_update[resource_id] = (
http_size,
http_last_modified,
etag,
)
what_changed = "|".join(what_changed)
if status_str:
what_changed = f"{status_str} {what_changed}"
dict_of_lists_add(self._changes, what_changed, resource_id)

def output(self) -> None:
logger.info("\nChanges detected:")
for what_changed, resource_ids in self._changes.items():
count = len(resource_ids)
if count < 5:
resource_ids = ", ".join(resource_ids)
logger.info(f"{what_changed}: {resource_ids}")
else:
logger.info(f"{what_changed}: {count}")
log_output(self._changes)

def get_resources_to_update(self) -> Dict[str, Tuple]:
    """Return the resources recorded as needing an update.

    Returns:
        Dict[str, Tuple]: The mapping stored in ``self._resources_to_update``.
        ``process`` fills it with ``(http_size, http_last_modified, etag)``
        tuples keyed by resource id.
    """
    return self._resources_to_update
32 changes: 24 additions & 8 deletions src/hdx/resource/changedetection/utilities.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,36 @@
import logging
from http import HTTPStatus
from typing import Dict

import aiohttp

logger = logging.getLogger(__name__)

status_lookup = {
-99: "Unspecified Server Error",
-100: "Resource too large",
-1: "Mimetype != HDX Format",
-2: "Signature != HDX Format",
}
class _StatusLookup(dict):
    """Status code -> label table that tolerates unknown codes.

    Callers index this table directly (``status_lookup[status]``). Servers
    can return codes that are neither registered in ``HTTPStatus`` nor one
    of the custom negative codes below, and a plain dict would raise
    ``KeyError`` for those. Unknown codes are instead rendered as
    ``"status <code>"``.
    """

    def __missing__(self, status):
        # Only reached by ``self[status]`` on an absent key; ``get`` and
        # ``in`` keep their normal dict behaviour and nothing is stored.
        return f"status {status}"


# Every registered HTTP status maps to its enum name (e.g. 200 -> "OK").
status_lookup = _StatusLookup({i.value: i.name for i in HTTPStatus})
# Negative codes are project-specific results rather than HTTP statuses.
status_lookup.update(
    {
        -99: "UNSPECIFIED SERVER ERROR",
        -100: "TOO LARGE TO HASH",
        -1: "MIMETYPE != HDX FORMAT",
        -2: "SIGNATURE != HDX FORMAT",
    }
)


def log_output(status_to_resourceids: Dict):
    """Log one summary line per status, in sorted key order.

    Args:
        status_to_resourceids (Dict): Mapping from a status label to the
            collection of resource ids recorded under it.

    A status with fewer than 5 resource ids is logged with the ids joined
    by ", "; otherwise only the number of ids is logged.
    """
    for key in sorted(status_to_resourceids):
        ids = status_to_resourceids[key]
        total = len(ids)
        if total >= 5:
            # Too many ids to list individually; report the count only.
            logger.info(f"{key}: {total}")
            continue
        logger.info(f"{key}: {', '.join(ids)}")


def is_server_error(ex: BaseException) -> bool:
if isinstance(ex, aiohttp.ServerTimeoutError):
logger.info(f"Retrying {ex.strerror} {ex.filename}")
logger.info(f"Retrying: {str(ex)}")
return True
if isinstance(ex, aiohttp.ClientResponseError) and ex.status in (
408,
Expand All @@ -27,7 +43,7 @@ def is_server_error(ex: BaseException) -> bool:
):
# These are too common to log by default
logger.debug(
f"Retrying {ex.status} {ex.message} {ex.request_info.url}"
f"Retrying: {ex.status} {ex.message} {ex.request_info.url}"
)
return True
return False

0 comments on commit 6909f43

Please sign in to comment.