Skip to content

Commit

Permalink
Add error handler
Browse files Browse the repository at this point in the history
  • Loading branch information
b-j-mills committed Feb 4, 2025
1 parent 0007240 commit f1d2687
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 128 deletions.
2 changes: 1 addition & 1 deletion .config/ruff.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
line-length = 90
line-length = 95
exclude = ["_version.py"]

[lint]
Expand Down
73 changes: 37 additions & 36 deletions src/hdx/scraper/acled/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@
from os.path import dirname, expanduser, join

from hdx.api.configuration import Configuration
from hdx.api.utilities.hdx_error_handler import HDXErrorHandler
from hdx.data.user import User
from hdx.facades.infer_arguments import facade
from hdx.utilities.dateparse import now_utc
from hdx.utilities.downloader import Download
from hdx.utilities.path import (
wheretostart_tempdir_batch,
)
from hdx.utilities.path import temp_dir
from hdx.utilities.retriever import Retrieve

from src.hdx.scraper.acled.acled import Acled
Expand All @@ -30,58 +29,60 @@
def main(
    save: bool = True,
    use_saved: bool = False,
    err_to_hdx: bool = False,
) -> None:
    """Generate datasets and create them in HDX

    Args:
        save (bool): Save downloaded data. Defaults to True.
        use_saved (bool): Use saved data. Defaults to False.
        err_to_hdx (bool): Whether to write any errors to HDX metadata. Defaults to False.

    Returns:
        None
    """
    logger.info(f"##### {_USER_AGENT_LOOKUP} ####")
    configuration = Configuration.read()
    # Fail fast if the API token cannot create datasets in the target org
    if not User.check_current_user_organization_access("hdx-hapi", "create_dataset"):
        raise PermissionError("API Token does not give access to HDX-HAPI organisation!")

    # Errors collected during the run are optionally written to HDX metadata
    # when the handler context exits (controlled by err_to_hdx).
    with HDXErrorHandler(write_to_hdx=err_to_hdx) as error_handler:
        with temp_dir(folder=_USER_AGENT_LOOKUP) as temp_folder:
            with Download() as downloader:
                retriever = Retrieve(
                    downloader=downloader,
                    fallback_dir=temp_folder,
                    saved_dir=_SAVED_DATA_DIR,
                    temp_dir=temp_folder,
                    save=save,
                    use_saved=use_saved,
                )

                acled = Acled(configuration, retriever, temp_folder, error_handler)
                acled.get_pcodes()

                # Only the current year's data is (re)downloaded
                today = now_utc()
                year = today.year
                acled.download_data(year)

                dataset = acled.generate_dataset()
                dataset.update_from_yaml(
                    path=join(dirname(__file__), "config", "hdx_dataset_static.yaml")
                )
                dataset.create_in_hdx(
                    remove_additional_resources=True,
                    match_resource_order=False,
                    hxl_update=False,
                    updated_by_script=_UPDATED_BY_SCRIPT,
                )

    logger.info("HDX Scraper ACLED pipeline completed!")


if __name__ == "__main__":
    # Entry point: facade parses CLI arguments into main()'s keyword args
    # and loads the HDX configuration before invoking it.
    facade(
        main,
        user_agent_config_yaml=join(expanduser("~"), ".useragents.yaml"),
        user_agent_lookup=_USER_AGENT_LOOKUP,
        project_config_yaml=join(dirname(__file__), "config", "project_configuration.yaml"),
    )
32 changes: 21 additions & 11 deletions src/hdx/scraper/acled/acled.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
from hdx.api.configuration import Configuration
from hdx.api.utilities.hdx_error_handler import HDXErrorHandler
from hdx.data.dataset import Dataset
from hdx.location.adminlevel import AdminLevel
from hdx.location.country import Country
Expand All @@ -17,13 +18,20 @@


class Acled:
    def __init__(
        self,
        configuration: Configuration,
        retriever: Retrieve,
        temp_dir: str,
        error_handler: HDXErrorHandler,
    ):
        """Scraper for ACLED conflict event data.

        Args:
            configuration (Configuration): HDX configuration
            retriever (Retrieve): Retriever supporting save/use_saved of downloads
            temp_dir (str): Folder for temporary files
            error_handler (HDXErrorHandler): Collector for data errors found during the run
        """
        self._configuration = configuration
        self._retriever = retriever
        self._temp_dir = temp_dir
        self._error_handler = error_handler
        self.data = {}  # keyed by date range string, e.g. "2025-2029" — see download_data
        self.dates = []
        self._admins = []  # presumably AdminLevel objects for levels 1 and 2, filled by get_pcodes() — confirm

def get_pcodes(self) -> None:
for admin_level in [1, 2]:
Expand Down Expand Up @@ -65,9 +73,9 @@ def download_data(self, current_year: int) -> None:
subset = contents[
["Admin2 Pcode", "Admin1", "Admin2", "event_type", "Month", "Year"]
]
subset.loc[contents["Admin2 Pcode"].isna(), "Admin2 Pcode"] = (
contents.loc[contents["Admin2 Pcode"].isna(), "Country"]
)
subset.loc[contents["Admin2 Pcode"].isna(), "Admin2 Pcode"] = contents.loc[
contents["Admin2 Pcode"].isna(), "Country"
]
duplicates = subset.duplicated(keep=False)
contents["error"] = None
contents.loc[duplicates, "error"] = "Duplicate row"
Expand Down Expand Up @@ -124,15 +132,19 @@ def download_data(self, current_year: int) -> None:
pcode = contents["Admin2 Pcode"][i]
if not pcode or pcode not in self._admins[1].pcodes:
admin1_name = contents["Admin1"][1]
adm1_pcode, _ = self._admins[0].get_pcode(
country_iso, admin1_name
)
adm1_pcode, _ = self._admins[0].get_pcode(country_iso, admin1_name)
admin2_name = contents["Admin2"][i]
if admin2_name:
pcode, _ = self._admins[1].get_pcode(
country_iso, admin2_name, parent=adm1_pcode
)
if not pcode:
self._error_handler.add_missing_value_message(
"ACLED",
dataset_name,
"admin 2 pcode",
admin2_name,
)
adm1_pcodes.append(None)
adm2_pcodes.append(None)
adm1_names.append(None)
Expand Down Expand Up @@ -210,9 +222,7 @@ def generate_dataset(self) -> Optional[Dataset]:
for date_range in reversed(self.data.keys()):
data = self.data[date_range].to_dict(orient="records")
resourcedata = {
"name": self._configuration["resource_name"].replace(
"date_range", date_range
),
"name": self._configuration["resource_name"].replace("date_range", date_range),
"description": self._configuration["resource_description"].replace(
"date_range", date_range
),
Expand Down
160 changes: 80 additions & 80 deletions tests/test_acled.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest
from hdx.api.configuration import Configuration
from hdx.api.utilities.hdx_error_handler import HDXErrorHandler
from hdx.data.dataset import Dataset
from hdx.utilities.downloader import Download
from hdx.utilities.path import temp_dir
Expand Down Expand Up @@ -49,86 +50,85 @@ def input_dir(self, fixtures_dir):
def config_dir(self, fixtures_dir):
    """Fixture: path to the scraper's bundled config directory (fixtures_dir unused)."""
    path_parts = ("src", "hdx", "scraper", "acled", "config")
    return join(*path_parts)

def test_acled(self, configuration, read_dataset, fixtures_dir, input_dir, config_dir):
    """End-to-end pipeline test: download saved fixture data, generate the
    dataset, and compare metadata, resources, and output CSV to fixtures."""
    with HDXErrorHandler() as error_handler:
        with temp_dir(
            "TestAcled",
            delete_on_success=True,
            delete_on_failure=False,
        ) as temp_folder:
            with Download(user_agent="test") as downloader:
                retriever = Retrieve(
                    downloader=downloader,
                    fallback_dir=temp_folder,
                    saved_dir=input_dir,
                    temp_dir=temp_folder,
                    save=False,
                    use_saved=True,
                )
                acled = Acled(configuration, retriever, temp_folder, error_handler)
                acled.get_pcodes()
                acled.download_data(2025)
                assert len(acled.dates) == 6
                assert len(acled.data) == 2
                assert len(acled.data["2025-2029"]) == 14373

                dataset = acled.generate_dataset()
                dataset.update_from_yaml(path=join(config_dir, "hdx_dataset_static.yaml"))
                assert dataset == {
                    "caveats": "HDX HAPI is refreshed daily, but the source datasets may "
                    "have different update schedules. Please refer to the source "
                    "datasets for each subcategory to verify their specific update "
                    "frequency.\n",
                    "data_update_frequency": 7,
                    "dataset_date": "[1997-01-01T00:00:00 TO 2025-01-10T23:59:59]",
                    "dataset_preview": "no_preview",
                    "dataset_source": "Armed Conflict Location & Event Data Project "
                    "(ACLED)",
                    "groups": [{"name": "world"}],
                    "license_id": "hdx-other",
                    "license_other": "By using ACLED data you agree to abide by the "
                    "[Terms of Use and Attribution Policy](https://acleddata.com/terms-of-use/).",
                    "maintainer": "aa13de36-28c5-47a7-8d0b-6d7c754ba8c8",
                    "methodology": "Registry",
                    "name": "hdx-hapi-conflict-event-test",
                    "notes": "This dataset contains data obtained from the\n"
                    "[HDX Humanitarian API](https://hapi.humdata.org/) (HDX HAPI),\n"
                    "which provides standardized humanitarian indicators designed\n"
                    "for seamless interoperability from multiple sources.\n"
                    "The data facilitates automated workflows and visualizations\n"
                    "to support humanitarian decision making.\n"
                    "For more information, please see the HDX HAPI\n"
                    "[landing page](https://data.humdata.org/hapi)\n"
                    "and\n"
                    "[documentation](https://hdx-hapi.readthedocs.io/en/latest/).\n",
                    "owner_org": "hdx-hapi",
                    "package_creator": "HDX Data Systems Team",
                    "private": False,
                    "subnational": "1",
                    "tags": [
                        {
                            "name": "conflict-violence",
                            "vocabulary_id": "b891512e-9516-4bf5-962a-7a289772a2a1",
                        }
                    ],
                    "title": "HDX HAPI - Coordination & Context: Conflict Events",
                }

                resources = dataset.get_resources()
                assert len(resources) == 2
                assert resources[0] == {
                    "description": "Conflict Event data from HDX HAPI (2025-2029), "
                    "please see [the documentation](https://hdx-hapi.readthedocs.io/en/"
                    "latest/data_usage_guides/coordination_and_context/#conflict-events) "
                    "for more information",
                    "format": "csv",
                    "name": "Global Coordination & Context: Conflict Events (2025-2029)",
                    "resource_type": "file.upload",
                    "url_type": "upload",
                }

                # Generated CSV must match the stored fixture byte-for-byte
                assert filecmp.cmp(
                    join(temp_folder, "hdx_hapi_conflict_event_global_2025-2029.csv"),
                    join(fixtures_dir, "hdx_hapi_conflict_event_global_2025-2029.csv"),
                )

0 comments on commit f1d2687

Please sign in to comment.