Skip to content

Commit

Permalink
Add support for NIST PDR DOIs
Browse files Browse the repository at this point in the history
  • Loading branch information
jat255 committed Nov 7, 2024
1 parent 8a9a47a commit 593c339
Show file tree
Hide file tree
Showing 4 changed files with 236 additions and 6 deletions.
109 changes: 109 additions & 0 deletions pooch/downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ class DOIDownloader: # pylint: disable=too-few-public-methods
* `figshare <https://www.figshare.com>`__
* `Zenodo <https://www.zenodo.org>`__
* `Dataverse <https://dataverse.org/>`__ instances
* The `NIST Public Data Repository <https://data.nist.gov/>`__
.. attention::
Expand Down Expand Up @@ -683,6 +684,7 @@ def doi_to_repository(doi):
FigshareRepository,
ZenodoRepository,
DataverseRepository,
NISTPDRRepository,
]

# Extract the DOI and the repository information
Expand Down Expand Up @@ -1161,3 +1163,110 @@ def populate_registry(self, pooch):
pooch.registry[filedata["dataFile"]["filename"]] = (
f"md5:{filedata['dataFile']['md5']}"
)


class NISTPDRRepository(DataRepository):
    """
    Data repository backed by the NIST Public Data Repository (PDR).

    File information (download URLs and checksums) is read from the record
    returned by the NIST PDR API at ``base_api_url``. Only record components
    whose ``@type`` includes ``nrdp:DataFile`` are treated as files.
    """

    base_api_url = "https://data.nist.gov/rmm/records"

    def __init__(self, doi, archive_url):
        self.archive_url = archive_url
        self.doi = doi
        # Filled in lazily by the ``api_response`` property
        self._api_response = None

    @classmethod
    def initialize(cls, doi, archive_url):
        """
        Initialize the data repository if the given URL points to a
        corresponding repository.

        Initializes a data repository object. This is done as part of
        a chain of responsibility. If the class cannot handle the given
        repository URL, it returns `None`. Otherwise a `DataRepository`
        instance is returned.

        Parameters
        ----------
        doi : str
            The DOI that identifies the repository
        archive_url : str
            The resolved URL for the DOI

        Returns
        -------
        repository : NISTPDRRepository or None
            A repository instance if the archive URL is hosted on
            ``data.nist.gov``, otherwise ``None``.
        """

        # Check whether this is a NIST PDR URL
        parsed_archive_url = parse_url(archive_url)
        if parsed_archive_url["netloc"] != "data.nist.gov":
            return None

        return cls(doi, archive_url)

    @property
    def api_response(self):
        """Cached API response from NIST PDR"""
        if self._api_response is None:
            # Lazy import requests to speed up import time
            import requests  # pylint: disable=C0415

            # The record identifier is the last path segment of the archive
            # URL. Strip trailing slashes first so that a URL ending in "/"
            # does not produce an empty identifier.
            article_id = self.archive_url.rstrip("/").split("/")[-1]
            self._api_response = requests.get(
                f"{self.base_api_url}/{article_id}",
                timeout=DEFAULT_TIMEOUT,
            )

        return self._api_response

    def _data_files(self):
        """
        Collect the data files listed in the record's components.

        Returns
        -------
        files : dict
            Maps each file's ``filepath`` to its component entry, keeping
            only the components whose ``@type`` includes ``nrdp:DataFile``.
        """
        components = self.api_response.json()["components"]
        return {
            component["filepath"]: component
            for component in components
            if "nrdp:DataFile" in component["@type"]
        }

    def download_url(self, file_name):
        """
        Use the repository API to get the download URL for a file given
        the archive URL.

        Parameters
        ----------
        file_name : str
            The name of the file in the archive that will be downloaded.
            It is matched against the ``filepath`` of each data file in
            the record.

        Returns
        -------
        download_url : str
            The HTTP URL that can be used to download the file.

        Raises
        ------
        ValueError
            If no data file with the given name is listed in the record.
        """
        files = self._data_files()
        if file_name not in files:
            raise ValueError(
                f"File '{file_name}' not found in data archive "
                f"{self.archive_url} (doi:{self.doi})."
            )
        # The record already carries the download URL for each file
        return files[file_name]["downloadURL"]

    def populate_registry(self, pooch):
        """
        Populate the registry using the data repository's API

        Parameters
        ----------
        pooch : Pooch
            The pooch instance that the registry will be added to.

        Notes
        -----
        Each registry entry is built as ``"<algorithm tag>:<hash>"`` from the
        ``checksum`` field that the record provides for the file.
        """
        for file_name, file_data in self._data_files().items():
            checksum_value = file_data["checksum"]["hash"]
            checksum_type = file_data["checksum"]["algorithm"]["tag"]
            pooch.registry[file_name] = f"{checksum_type}:{checksum_value}"
90 changes: 90 additions & 0 deletions pooch/tests/test_downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"""
Test the downloader classes and functions separately from the Pooch core.
"""
import hashlib
import os
import re
import sys
from tempfile import TemporaryDirectory

Expand All @@ -33,6 +35,7 @@
FigshareRepository,
ZenodoRepository,
DataverseRepository,
NISTPDRRepository,
doi_to_url,
)
from ..processors import Unzip
Expand All @@ -45,6 +48,8 @@
pooch_test_zenodo_url,
pooch_test_zenodo_with_slash_url,
pooch_test_dataverse_url,
pooch_test_nist_pdr_url,
pooch_test_nist_pdr_nested_file,
)


Expand Down Expand Up @@ -140,6 +145,91 @@ def test_doi_downloader(url):
check_tiny_data(outfile)


@pytest.mark.network
def test_doi_downloader_nist_pdr():
    """
    Download a top-level file from a NIST PDR record through its DOI.

    The NIST PDR has no record containing 'tiny-data.txt', so this test
    fetches the record's README.txt and checks how its text begins instead
    of reusing test_doi_downloader().
    """
    expected_start = (
        "The `labbench` python library provides tools for instrument automation"
    )
    source = pooch_test_nist_pdr_url("simple") + "README.txt"
    with TemporaryDirectory() as local_store:
        outfile = os.path.join(local_store, "README.txt")
        fetch = DOIDownloader()
        fetch(source, outfile, None)

        assert os.path.exists(outfile)
        with open(outfile, encoding="utf-8") as readme:
            text = readme.read()
        # Only the beginning of the README is compared
        assert text.strip().startswith(expected_start)


@pytest.mark.network
def test_doi_downloader_nist_pdr_file_in_collection():
    """
    Download a file stored deep inside a nested NIST PDR collection.

    The downloaded bytes are verified against the known SHA256 checksum of
    the file.
    """
    nested_file = pooch_test_nist_pdr_nested_file()
    remote_path = nested_file["filename"]
    # The checksum is stored as "sha256:<hexdigest>"; keep only the digest
    expected_digest = nested_file["checksum"].split(":", maxsplit=1)[-1]
    source = pooch_test_nist_pdr_url("nested_collection") + remote_path

    with TemporaryDirectory() as local_store:
        # Save under the file's base name, dropping the remote folders
        outfile = os.path.join(local_store, remote_path.split("/")[-1])
        fetch = DOIDownloader()
        fetch(source, outfile, None)

        assert os.path.exists(outfile)
        with open(outfile, "rb") as downloaded:
            actual_digest = hashlib.sha256(downloaded.read()).hexdigest()
        assert actual_digest == expected_digest


@pytest.mark.network
def test_doi_downloader_nist_pdr_missing_file():
    """
    Test the DOI downloader for the NIST PDR.

    Requesting a file that is not part of the record must raise a
    ValueError naming the file, the archive URL and the DOI.
    """
    with TemporaryDirectory() as local_store:
        downloader = DOIDownloader()
        file_name = "nonexistent_file.txt"
        outfile = os.path.join(local_store, file_name)
        to_download = pooch_test_nist_pdr_url("nested_collection") + file_name
        # re.escape is needed because the message contains regex
        # metacharacters (".", "(" and ")")
        with pytest.raises(
            ValueError,
            match=re.escape(
                "File 'nonexistent_file.txt' not found in data archive "
                "https://data.nist.gov/od/id/8C40CFA7931709DAE0532457068179072082 "
                "(doi:10.18434/M32082).",
            ),
        ):
            downloader(to_download, outfile, None)


@pytest.mark.network
def test_populate_registry_nist_pdr(tmp_path):
    """
    Test if population of registry is correctly done for NIST PDR.

    Marked as a network test because it resolves the DOI and queries the
    NIST PDR API.
    """
    # Create sample pooch object
    puppy = Pooch(base_url="", path=tmp_path)
    _doi = "10.18434/M32082"
    _doi_url = doi_to_url(_doi)
    # Create NIST PDR repository
    repository = NISTPDRRepository(doi=_doi, archive_url=_doi_url)
    # Populate registry
    repository.populate_registry(puppy)
    assert len(puppy.registry) == 101
    nested_file = pooch_test_nist_pdr_nested_file()
    assert puppy.registry[nested_file["filename"]] == nested_file["checksum"]


@pytest.mark.network
def test_zenodo_downloader_with_slash_in_fname():
"""
Expand Down
31 changes: 31 additions & 0 deletions pooch/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,37 @@ def pooch_test_dataverse_url():
return url


def pooch_test_nist_pdr_url(file_type="simple"):
    """
    Get the base URL for test data stored on the NIST Public Data Repository.

    Parameters
    ----------
    file_type : str
        Which test record to use: ``"simple"`` (the default) or
        ``"nested_collection"``. Any other value raises a ``KeyError``.

    Returns
    -------
    url
        The DOI-based URL (ending in "/") for pooch's test data.
    """
    doi_urls = dict(
        simple="doi:10.18434/M32122/",
        nested_collection="doi:10.18434/M32082/",
    )
    return doi_urls[file_type]


def pooch_test_nist_pdr_nested_file():
    """
    Get filename and checksum for a nested file stored on the NIST PDR.

    Returns
    -------
    dict
        Dictionary with two keys: ``"filename"``, the path of the test data
        file inside the record, and ``"checksum"``, its hash written as
        ``"sha256:<hexdigest>"``.
    """
    nested_path = (
        "EDS/Figure 5 - Point Scans/EDS Objects Supplement Raw Data/CSL/CSL_SQ.xlsx"
    )
    nested_checksum = (
        "sha256:5867c7048743c4814d68e09b3738cd9b90a5bab102cec87126c77b2924e1070b"
    )
    return dict(filename=nested_path, checksum=nested_checksum)


def pooch_test_registry():
"""
Get a registry for the test data used in Pooch itself.
Expand Down
12 changes: 6 additions & 6 deletions pooch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,11 @@ def parse_url(url):
The DOI is a special case. The protocol will be "doi", the netloc will be
the DOI, and the path is what comes after the last "/".
The only exception are Zenodo dois: the protocol will be "doi", the netloc
will be composed by the "prefix/suffix" and the path is what comes after
the second "/". This allows to support special cases of Zenodo dois where
the path contains forward slashes "/", created by the GitHub-Zenodo
integration service.
The only exceptions are Zenodo and NIST PDR DOIs: the protocol will be
"doi", the netloc will be composed of the "prefix/suffix" and the path is
what comes after the second "/". This makes it possible to support special
cases of Zenodo DOIs where the path contains forward slashes "/", created
by the GitHub-Zenodo integration service.
Parameters
----------
Expand All @@ -184,7 +184,7 @@ def parse_url(url):
if url.startswith("doi:"):
protocol = "doi"
parts = url[4:].split("/")
if "zenodo" in parts[1].lower():
if "zenodo" in parts[1].lower() or "10.18434" in parts:
netloc = "/".join(parts[:2])
path = "/" + "/".join(parts[2:])
else:
Expand Down

0 comments on commit 593c339

Please sign in to comment.