Add support for NIST PDR DOIs

fatiando · Nov 7, 2024 · 593c339 · 593c339
1 parent 8a9a47a
commit 593c339
Show file tree

Hide file tree

Showing 4 changed files with 236 additions and 6 deletions.
diff --git a/pooch/downloaders.py b/pooch/downloaders.py
@@ -531,6 +531,7 @@ class DOIDownloader:  # pylint: disable=too-few-public-methods
     * `figshare <https://www.figshare.com>`__
     * `Zenodo <https://www.zenodo.org>`__
     * `Dataverse <https://dataverse.org/>`__ instances
+    * The `NIST Public Data Repository <https://data.nist.gov/>`__
 
     .. attention::
 
@@ -683,6 +684,7 @@ def doi_to_repository(doi):
         FigshareRepository,
         ZenodoRepository,
         DataverseRepository,
+        NISTPDRRepository,
     ]
 
     # Extract the DOI and the repository information
@@ -1161,3 +1163,110 @@ def populate_registry(self, pooch):
             pooch.registry[filedata["dataFile"]["filename"]] = (
                 f"md5:{filedata['dataFile']['md5']}"
             )
+
+
+class NISTPDRRepository(DataRepository):
+    """
+    Data repository for the NIST Public Data Repository (data.nist.gov).
+
+    File paths, download URLs and checksums are read from the record that
+    the repository's API returns for the archive.
+    """
+
+    base_api_url = "https://data.nist.gov/rmm/records"
+
+    def __init__(self, doi, archive_url):
+        self.archive_url = archive_url
+        self.doi = doi
+        self._api_response = None
+
+    @classmethod
+    def initialize(cls, doi, archive_url):
+        """
+        Initialize the data repository if the given URL points to a
+        corresponding repository.
+
+        This is done as part of a chain of responsibility. If the class
+        cannot handle the given repository URL, it returns `None`. Otherwise
+        a `NISTPDRRepository` instance is returned.
+
+        Parameters
+        ----------
+        doi : str
+            The DOI that identifies the repository
+        archive_url : str
+            The resolved URL for the DOI
+        """
+        # Check whether this is a NIST PDR URL
+        parsed_archive_url = parse_url(archive_url)
+        if parsed_archive_url["netloc"] != "data.nist.gov":
+            return None
+
+        return cls(doi, archive_url)
+
+    @property
+    def api_response(self):
+        """Cached API response from NIST PDR"""
+        if self._api_response is None:
+            # Lazy import requests to speed up import time
+            import requests  # pylint: disable=C0415
+
+            # The record ID is the last path segment; ignore a trailing "/"
+            article_id = self.archive_url.rstrip("/").split("/")[-1]
+            self._api_response = requests.get(
+                f"{self.base_api_url}/{article_id}",
+                timeout=DEFAULT_TIMEOUT,
+            )
+
+        return self._api_response
+
+    def _data_files(self):
+        """Map the file path of every data file in the record to its entry"""
+        components = self.api_response.json()["components"]
+        return {
+            item["filepath"]: item
+            for item in components
+            if "nrdp:DataFile" in item.get("@type", ())
+        }
+
+    def download_url(self, file_name):
+        """
+        Use the repository API to get the download URL for a file given
+        the archive URL.
+
+        Parameters
+        ----------
+        file_name : str
+            The name of the file in the archive that will be downloaded.
+
+        Returns
+        -------
+        download_url : str
+            The HTTP URL that can be used to download the file.
+
+        Raises
+        ------
+        ValueError
+            If the record has no data file called ``file_name``.
+        """
+        files = self._data_files()
+        if file_name not in files:
+            raise ValueError(
+                f"File '{file_name}' not found in data archive "
+                f"{self.archive_url} (doi:{self.doi})."
+            )
+        return files[file_name]["downloadURL"]
+
+    def populate_registry(self, pooch):
+        """
+        Populate the registry using the data repository's API
+
+        Parameters
+        ----------
+        pooch : Pooch
+            The pooch instance that the registry will be added to.
+        """
+        for file_name, file_data in self._data_files().items():
+            checksum = file_data["checksum"]
+            tag = checksum["algorithm"]["tag"]
+            pooch.registry[file_name] = f"{tag}:{checksum['hash']}"
diff --git a/pooch/tests/test_downloaders.py b/pooch/tests/test_downloaders.py
@@ -7,7 +7,9 @@
 """
 Test the downloader classes and functions separately from the Pooch core.
 """
+import hashlib
 import os
+import re
 import sys
 from tempfile import TemporaryDirectory
 
@@ -33,6 +35,7 @@
     FigshareRepository,
     ZenodoRepository,
     DataverseRepository,
+    NISTPDRRepository,
     doi_to_url,
 )
 from ..processors import Unzip
@@ -45,6 +48,8 @@
     pooch_test_zenodo_url,
     pooch_test_zenodo_with_slash_url,
     pooch_test_dataverse_url,
+    pooch_test_nist_pdr_url,
+    pooch_test_nist_pdr_nested_file,
 )
 
 
@@ -140,6 +145,91 @@ def test_doi_downloader(url):
         check_tiny_data(outfile)
 
 
+@pytest.mark.network
+def test_doi_downloader_nist_pdr():
+    """
+    Download a README from a NIST PDR record through the DOI downloader.
+
+    The NIST PDR has no record containing 'tiny-data.txt', so the check
+    compares the start of the file against known text instead.
+    """
+    expected_start = (
+        "The `labbench` python library provides tools for instrument automation"
+    )
+    source = pooch_test_nist_pdr_url("simple") + "README.txt"
+    with TemporaryDirectory() as tmp_dir:
+        target = os.path.join(tmp_dir, "README.txt")
+        DOIDownloader()(source, target, None)
+
+        assert os.path.exists(target)
+        with open(target, encoding="utf-8") as readme:
+            assert readme.read().strip().startswith(expected_start)
+
+
+@pytest.mark.network
+def test_doi_downloader_nist_pdr_file_in_collection():
+    """
+    Test the DOI downloader for the NIST PDR.
+
+    Downloads a file nested inside a collection and verifies its SHA256 hash.
+    """
+    nested_file = pooch_test_nist_pdr_nested_file()
+    file_path = nested_file["filename"]
+    # The registered checksum has the form "sha256:<hex digest>"
+    expected_digest = nested_file["checksum"].split(":", maxsplit=1)[1]
+    source = pooch_test_nist_pdr_url("nested_collection") + file_path
+    with TemporaryDirectory() as tmp_dir:
+        target = os.path.join(tmp_dir, file_path.split("/")[-1])
+        DOIDownloader()(source, target, None)
+
+        assert os.path.exists(target)
+        with open(target, "rb") as downloaded:
+            digest = hashlib.sha256(downloaded.read()).hexdigest()
+        assert digest == expected_digest
+
+
+@pytest.mark.network
+def test_doi_downloader_nist_pdr_missing_file():
+    """
+    Test the DOI downloader for the NIST PDR.
+
+    Requesting a file that is not part of the record must raise a ValueError
+    that names the file, the archive URL, and the DOI.
+    """
+    file_name = "nonexistent_file.txt"
+    # The message is matched literally, hence the escaping
+    expected_message = re.escape(
+        "File 'nonexistent_file.txt' not found in data archive "
+        "https://data.nist.gov/od/id/8C40CFA7931709DAE0532457068179072082 "
+        "(doi:10.18434/M32082)."
+    )
+    to_download = pooch_test_nist_pdr_url("nested_collection") + file_name
+    with TemporaryDirectory() as local_store:
+        outfile = os.path.join(local_store, file_name)
+        downloader = DOIDownloader()
+        with pytest.raises(ValueError, match=expected_message):
+            downloader(to_download, outfile, None)
+
+
+@pytest.mark.network
+def test_populate_registry_nist_pdr(tmp_path):
+    """
+    Test if population of registry is correctly done for NIST PDR.
+    """
+    nested_file = pooch_test_nist_pdr_nested_file()
+    # Create sample pooch object
+    puppy = Pooch(base_url="", path=tmp_path)
+    _doi = "10.18434/M32082"
+    # Create NIST PDR repository from the DOI and the URL it maps to
+    repository = NISTPDRRepository(doi=_doi, archive_url=doi_to_url(_doi))
+    # Populate registry (this queries the NIST PDR API over the network)
+    repository.populate_registry(puppy)
+
+    # Expect all 101 data files of the record, including the nested one
+    assert len(puppy.registry) == 101
+    assert puppy.registry[nested_file["filename"]] == nested_file["checksum"]
+
+
 @pytest.mark.network
 def test_zenodo_downloader_with_slash_in_fname():
     """

diff --git a/pooch/tests/utils.py b/pooch/tests/utils.py
@@ -128,6 +128,37 @@ def pooch_test_dataverse_url():
     return url
 
 
+def pooch_test_nist_pdr_url(file_type="simple"):
+    """
+    Get the DOI base URL of a NIST Public Data Repository test record.
+
+    ``file_type`` selects the record: ``"simple"`` or ``"nested_collection"``.
+    Any other value raises a ``KeyError``.
+
+    Returns
+    -------
+    url : str
+        The DOI URL of the record, ending with a "/".
+    """
+    suffixes = {"simple": "M32122", "nested_collection": "M32082"}
+    return f"doi:10.18434/{suffixes[file_type]}/"
+
+
+def pooch_test_nist_pdr_nested_file():
+    """
+    Get the path and expected checksum of a nested NIST PDR test file.
+
+    Returns
+    -------
+    dict
+        Has a "filename" key (file path inside the record) and a "checksum"
+        key (hash prefixed by its algorithm).
+    """
+    directory = "EDS/Figure 5 - Point Scans/EDS Objects Supplement Raw Data/CSL"
+    digest = "5867c7048743c4814d68e09b3738cd9b90a5bab102cec87126c77b2924e1070b"
+    return {"filename": f"{directory}/CSL_SQ.xlsx", "checksum": f"sha256:{digest}"}
+
+
 def pooch_test_registry():
     """
     Get a registry for the test data used in Pooch itself.

diff --git a/pooch/utils.py b/pooch/utils.py
@@ -159,11 +159,11 @@ def parse_url(url):
 
     The DOI is a special case. The protocol will be "doi", the netloc will be
     the DOI, and the path is what comes after the last "/".
-    The only exception are Zenodo dois: the protocol will be "doi", the netloc
-    will be composed by the "prefix/suffix" and the path is what comes after
-    the second "/". This allows to support special cases of Zenodo dois where
-    the path contains forward slashes "/", created by the GitHub-Zenodo
-    integration service.
+    The only exceptions are Zenodo and NIST PDR DOIs: the protocol will be
+    "doi", the netloc will be composed of the "prefix/suffix" and the path is
+    what comes after the second "/". This allows supporting file paths that
+    contain forward slashes "/", such as those created for Zenodo DOIs by the
+    GitHub-Zenodo integration service.
 
     Parameters
     ----------
@@ -184,7 +184,7 @@ def parse_url(url):
     if url.startswith("doi:"):
         protocol = "doi"
         parts = url[4:].split("/")
-        if "zenodo" in parts[1].lower():
+        if "zenodo" in parts[1].lower() or "10.18434" in parts:
             netloc = "/".join(parts[:2])
             path = "/" + "/".join(parts[2:])
         else: