NCZarr Support Part I: Local Datasets #884

Closed. Wants to merge 17 commits.
2 changes: 1 addition & 1 deletion .github/workflows/default-tests.yml
@@ -26,7 +26,7 @@ jobs:
      - name: Python ${{ matrix.python-version }}
        shell: bash -l {0}
        run: |
-          conda create --name TEST python=${{ matrix.python-version }} pip "libnetcdf<4.8.0" --file requirements.txt --file test_requirements.txt --strict-channel-priority
+          conda create --name TEST python=${{ matrix.python-version }} pip --file requirements.txt --file test_requirements.txt --strict-channel-priority
           source activate TEST
           pip install -e . --no-deps --force-reinstall

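Dropping the "libnetcdf<4.8.0" pin lets the solver install a libnetcdf new enough to include NCZarr support, which first shipped in netCDF-C 4.8.0. A quick sanity check of what the solved environment actually provides (a sketch; assumes the netCDF4 Python package from requirements.txt is installed):

import netCDF4

# The linked libnetcdf version; expect "4.8.0" or later for NCZarr support
print(netCDF4.__netcdf4libversion__)
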
32 changes: 22 additions & 10 deletions compliance_checker/protocols/netcdf.py
@@ -5,6 +5,10 @@
Functions to assist in determining if the URL points to a netCDF file
"""

+import zipfile
+
+from pathlib import Path
+
import requests


@@ -22,17 +26,25 @@ def is_netcdf(url):
    if url.endswith("nc"):
        return True

-    # Brute force
-    with open(url, "rb") as f:
-        magic_number = f.read(4)
-        if len(magic_number) < 4:
-            return False
-        if is_classic_netcdf(magic_number):
-            return True
-        elif is_hdf5(magic_number):
-            return True
+    try:
+        # Brute force
+        with open(url, "rb") as f:
+            magic_number = f.read(4)
+            if len(magic_number) < 4:
+                return False
+            if is_classic_netcdf(magic_number):
+                return True
+            elif is_hdf5(magic_number):
+                return True
+    except PermissionError:
+        # open() fails on a directory and on a local file:// URL, either of
+        # which may point to a Zarr dataset directory
+        return False
+    except OSError:
+        # local file:// URL
+        return False

    return False

def is_classic_netcdf(file_buffer):
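
For context (only the signature of is_classic_netcdf survives the diff view above), a minimal sketch of what the two magic-number helpers check, assuming the standard netCDF and HDF5 file signatures; the bodies here are illustrative, not the module's exact code:

def sketch_is_classic_netcdf(file_buffer):
    # Classic netCDF files begin with b"CDF" plus a version byte
    # (\x01 classic, \x02 64-bit offset, \x05 CDF5)
    return file_buffer[:3] == b"CDF"

def sketch_is_hdf5(file_buffer):
    # netCDF-4 files are HDF5 containers; HDF5 files begin with an 8-byte
    # signature whose first four bytes are b"\x89HDF"
    return file_buffer[:4] == b"\x89HDF"
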
25 changes: 15 additions & 10 deletions compliance_checker/protocols/opendap.py
@@ -55,14 +55,19 @@ def is_opendap(url):
        das_url = url.replace("#fillmismatch", ".das")
    else:
        das_url = url + ".das"

-    response = requests.get(das_url, allow_redirects=True)
-    if "xdods-server" in response.headers:
-        return True
-    # Check if it is an access restricted ESGF thredds service
-    if (
-        response.status_code == 401
-        and "text/html" in response.headers["content-type"]
-        and "The following URL requires authentication:" in response.text
-    ):
-        return True
+    try:
+        response = requests.get(das_url, allow_redirects=True)
+
+        if "xdods-server" in response.headers:
+            return True
+        # Check if it is an access-restricted ESGF THREDDS service
+        if (
+            response.status_code == 401
+            and "text/html" in response.headers["content-type"]
+            and "The following URL requires authentication:" in response.text
+        ):
+            return True
+    except requests.exceptions.InvalidSchema:
+        return False  # not OPeNDAP: requests cannot fetch url + ".das"
    return False
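
The new try/except matters because requests only handles HTTP(S): probing a non-HTTP input such as a file:// URL raises InvalidSchema instead of returning a response. A small illustration with a hypothetical path:

import requests

try:
    requests.get("file:///home/user/dataset.zarr.das")
except requests.exceptions.InvalidSchema:
    print("not OPeNDAP")  # the branch is_opendap() now takes for such inputs
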
82 changes: 82 additions & 0 deletions compliance_checker/protocols/zarr.py
@@ -0,0 +1,82 @@
import zipfile

from pathlib import Path
from urllib.parse import urlparse
from urllib.request import url2pathname
from zipfile import ZipFile

from compliance_checker.protocols import netcdf




def is_zarr(url):
    """
    Check whether the given URL or path appears to point to a Zarr dataset.
    """

    if netcdf.is_netcdf(url):
        return False

    if ".zarr" in url:
        return True

    if urlparse(url).scheme in ("https", "s3", "file"):
        return True

    if zipfile.is_zipfile(url):
        if ".zmetadata" in ZipFile(url).namelist():
            return True

    if Path(url).is_dir():
        if (Path(url) / ".zmetadata").exists():
            return True

    return False
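
To illustrate the checks in order, hypothetical inputs and the values is_zarr() should return (paths assume the described objects exist on disk):

is_zarr("trajectory.nc")              # False: the netCDF check wins first
is_zarr("/home/user/dataset.zarr")    # True: ".zarr" appears in the path
is_zarr("https://bucket/dataset")     # True: recognized remote scheme
is_zarr("/home/user/plain_dir")       # True only if it contains .zmetadata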


def as_zarr(url):
    """
    Transform pointers to Zarr datasets into valid NCZarr URLs, as described in
    https://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in

    url: str or Path to a valid Zarr dataset

    Distinct from is_cdl etc. in that it returns the appropriate URI.

    Not tested on Windows paths at the moment, as NCZarr is not supported on Windows.

    A valid Zarr dataset could be provided in any of the following forms:
        "http://s3.amazonaws.com/bucket/dataset.zarr"
        "http://s3.amazonaws.com/bucket/dataset.zarr#mode=nczarr,s3"
        "/home/path/to/dataset.zarr"
        Path('/home/path/to/dataset.zarr')
        "file:///home/path/to/dataset.zarr"
        "file:///home/path/to/dataset.randomExt#mode=nczarr,file"
        "file:///home/path/to/dataset.zarr#mode=nczarr,zip"
    """

    pr = urlparse(str(url))

    if "mode=nczarr" in pr.fragment:
        if pr.netloc:
            return str(url)  # already a valid NCZarr URL
        elif pr.scheme == "file":
            return str(url)  # already a valid NCZarr URL

    # url2pathname is necessary to avoid a urlparse bug on Windows
    zarr_url = Path(url2pathname(pr.path)).resolve()

    if pr.netloc:
        mode = "s3"
    elif zipfile.is_zipfile(zarr_url):
        mode = "zip"
    elif zarr_url.is_dir():
        mode = "file"
    else:
        raise ValueError(
            f"Could not identify {url};\n"
            "if #mode=nczarr,zarr, please pass this explicitly.\n"
            "Valid url options are described here:\n"
            "https://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in"
        )

    url_base = url if mode == "s3" else zarr_url.as_uri()

    return f"{url_base}#mode=nczarr,{mode}"
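
Hypothetical calls and the URIs as_zarr() is expected to produce (the local path is assumed to be an existing directory):

as_zarr("http://s3.amazonaws.com/bucket/dataset.zarr")
# -> "http://s3.amazonaws.com/bucket/dataset.zarr#mode=nczarr,s3"

as_zarr("/home/user/dataset.zarr")
# -> "file:///home/user/dataset.zarr#mode=nczarr,file"

as_zarr("file:///home/user/dataset.zarr#mode=nczarr,file")
# -> returned unchanged: already a valid NCZarr URL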
14 changes: 12 additions & 2 deletions compliance_checker/suite.py
@@ -6,6 +6,7 @@
import inspect
import itertools
import os
+import platform
import re
import subprocess
import sys
@@ -16,7 +17,9 @@
from datetime import datetime, timezone
from distutils.version import StrictVersion
from operator import itemgetter
-from urllib.parse import urlparse
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+from urllib.request import url2pathname

import requests

@@ -29,7 +32,7 @@
from compliance_checker import MemoizedDataset, __version__, tempnc
from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value
from compliance_checker.cf.cf import CFBaseCheck
-from compliance_checker.protocols import cdl, erddap, netcdf, opendap
+from compliance_checker.protocols import cdl, erddap, netcdf, opendap, zarr


# Ensure output is encoded as Unicode when checker output is redirected or piped
@@ -845,6 +848,13 @@ def load_local_dataset(self, ds_str):
        if cdl.is_cdl(ds_str):
            ds_str = self.generate_dataset(ds_str)

+        if zarr.is_zarr(ds_str):
+            if platform.system() != "Linux":
+                print(
+                    f"WARNING: {platform.system()} OS detected. NCZarr is not officially supported for your OS as of when this API was written. Your mileage may vary."
+                )
+            return MemoizedDataset(zarr.as_zarr(ds_str))
+
        if netcdf.is_netcdf(ds_str):
            return MemoizedDataset(ds_str)
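
With this change, local datasets are dispatched as CDL first, then Zarr, then plain netCDF. A hypothetical session (path illustrative):

from compliance_checker.suite import CheckSuite

suite = CheckSuite()
ds = suite.load_local_dataset("/home/user/trajectory.zarr")
# On Linux this opens MemoizedDataset on
# "file:///home/user/trajectory.zarr#mode=nczarr,file";
# on other platforms the warning above is printed first.
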
7 changes: 4 additions & 3 deletions compliance_checker/tests/conftest.py
@@ -24,14 +24,15 @@ def generate_dataset(cdl_path, nc_path):
    subprocess.call(["ncgen", "-o", str(nc_path), str(cdl_path)])


+datadir = Path(resource_filename("compliance_checker", "tests/data")).resolve()
+assert datadir.exists(), f"{datadir} not found"


def static_files(cdl_stem):
    """
    Returns the Path to a valid nc dataset;
    replaces the old STATIC_FILES dict
    """
-    datadir = Path(resource_filename("compliance_checker", "tests/data")).resolve()
-    assert datadir.exists(), f"{datadir} not found"
-
    cdl_paths = glob_down(datadir, f"{cdl_stem}.cdl", 3)
    assert (
        len(cdl_paths) > 0
1 change: 1 addition & 0 deletions compliance_checker/tests/data/trajectory.zarr/.zattrs
@@ -0,0 +1 @@
{}
3 changes: 3 additions & 0 deletions compliance_checker/tests/data/trajectory.zarr/.zgroup
@@ -0,0 +1,3 @@
{
    "zarr_format": 2
}
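
These two files are the minimal metadata that mark trajectory.zarr as a Zarr v2 group with no attributes. For reference, a sketch of how an equivalent store could be created with the zarr-python package (an assumption; the fixture may have been generated differently):

import zarr

# Creates trajectory.zarr/ with a .zgroup containing {"zarr_format": 2}
root = zarr.open_group("trajectory.zarr", mode="w")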