Skip to content

Commit

Permalink
enable multi year resource handler to work with s3 files without any fancy stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
grantbuster committed Dec 20, 2024
1 parent bb83a9e commit 9139acf
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 19 deletions.
65 changes: 63 additions & 2 deletions rex/multi_time_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from glob import glob
from itertools import chain
from fnmatch import fnmatch
import logging

import numpy as np
import pandas as pd
Expand All @@ -21,6 +22,9 @@
from rex.utilities.parse_keys import parse_keys, parse_slice


logger = logging.getLogger(__name__)


class MultiTimeH5:
"""
Class to handle h5 Resources stored over multiple temporal files
Expand Down Expand Up @@ -224,11 +228,23 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None):
file_paths : list
List of filepaths for this handler to handle.
"""
import h5pyd
try:
import h5pyd
except Exception as e:
msg = (f'Tried to open hsds file path: "{h5_path}" with '
'h5pyd but could not import, try `pip install h5pyd`')
logger.error(msg)
raise ImportError(msg) from e

if hsds_kwargs is None:
hsds_kwargs = {}

if isinstance(h5_path, (list, tuple)):
msg = ('HSDS filepath must be a string, possibly with glob '
'pattern, but received list/tuple')
logger.error(msg)
raise TypeError(msg)

hsds_dir = os.path.dirname(h5_path)
fn = os.path.basename(h5_path)

Expand All @@ -237,12 +253,14 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None):
'directory name! The directory must be explicit but the '
'filename can have wildcards. This HSDS h5_path input '
'cannot be used: {}'.format(h5_path))
logger.error(msg)
raise FileNotFoundError(msg)

if not fn:
msg = ('h5_path must be a unix shell style pattern with '
'wildcard * in order to find files, but received '
'directory specification: {}'.format(h5_path))
logger.error(msg)
raise FileInputError(msg)

with h5pyd.Folder(hsds_dir + '/', **hsds_kwargs) as f:
Expand All @@ -251,6 +269,47 @@ def _get_hsds_file_paths(h5_path, hsds_kwargs=None):

return file_paths

@staticmethod
def _get_s3_file_paths(h5_path):
    """
    Get a list of h5 filepaths matching the h5_path specification from s3

    Parameters
    ----------
    h5_path : str | list | tuple
        Unix shell style pattern path with * wildcards to multi-file
        resource file sets, or an explicit list/tuple of such patterns.
        Files must have the same coordinates but can have different
        datasets or time indexes.

    Returns
    -------
    file_paths : list
        List of s3:// filepaths for this handler to handle.

    Raises
    ------
    ImportError
        If s3fs is not installed.
    TypeError
        If h5_path is not a string or a list/tuple of strings.
    """
    try:
        import s3fs
    except Exception as e:
        msg = (f'Tried to open s3 file path: "{h5_path}" with '
               'fsspec but could not import, try '
               '`pip install fsspec s3fs`')
        logger.error(msg)
        raise ImportError(msg) from e

    # anonymous access; the public NREL data buckets need no credentials
    s3 = s3fs.S3FileSystem(anon=True)

    if isinstance(h5_path, (list, tuple)):
        file_paths = [s3.glob(fp) for fp in h5_path]
        file_paths = list(chain.from_iterable(file_paths))
    elif isinstance(h5_path, str):
        file_paths = s3.glob(h5_path)
    else:
        # previously this fell through to an UnboundLocalError below
        msg = ('h5_path must be a string or a list/tuple of strings, '
               'but received: {}'.format(type(h5_path)))
        logger.error(msg)
        raise TypeError(msg)

    # s3fs glob drops the s3:// prefix for some reason; restore it so
    # downstream handlers can open the files with fsspec
    for i, fp in enumerate(file_paths):
        if not fp.startswith('s3://'):
            file_paths[i] = f's3://{fp}'

    return file_paths

@classmethod
def _get_file_paths(cls, h5_path, hsds=False, hsds_kwargs=None):
"""
Expand All @@ -277,9 +336,11 @@ def _get_file_paths(cls, h5_path, hsds=False, hsds_kwargs=None):
List of filepaths for this handler to handle.
"""

if hsds:
if Resource.is_hsds_file(h5_path) or hsds:
file_paths = cls._get_hsds_file_paths(h5_path,
hsds_kwargs=hsds_kwargs)
elif Resource.is_s3_file(h5_path):
file_paths = cls._get_s3_file_paths(h5_path)
elif isinstance(h5_path, (list, tuple)):
file_paths = list(chain.from_iterable(glob(fp) for fp in h5_path))
for fp in file_paths:
Expand Down
36 changes: 24 additions & 12 deletions rex/multi_year_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,16 @@ def __init__(self, h5_path, years=None, res_cls=Resource, hsds=False,
h5_path : str
Unix shell style pattern path with * wildcards to multi-file
resource file sets. Files must have the same coordinates
but can have different datasets or time indexes.
but can have different datasets or time indexes. Can also be a path
on HSDS starting with /nrel/ or a path on s3 starting with s3://
years : list, optional
List of integer years to access, by default None
res_cls : obj
Resource class to use to open and access resource data
hsds : bool
Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
behind HSDS
behind HSDS, by default False. This is now redundant; file paths
starting with /nrel/ will be treated as hsds=True by default
hsds_kwargs : dict, optional
Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
password, by default None
Expand Down Expand Up @@ -416,7 +418,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
resource file sets. Files must have the same coordinates
but can have different datasets or time indexes. Can also be
an explicit list of multi time files, which themselves can
contain * wildcards.
contain * wildcards. Can also be a path on HSDS starting with
/nrel/ or a path on s3 starting with s3://
years : list, optional
List of years to access, by default None
unscale : bool
Expand All @@ -428,7 +431,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
Resource handler to us to open individual .h5 files
hsds : bool, optional
Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
behind HSDS, by default False
behind HSDS, by default False. This is now redundant; file paths
starting with /nrel/ will be treated as hsds=True by default
hsds_kwargs : dict, optional
Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
password, by default None
Expand Down Expand Up @@ -469,7 +473,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
h5_path : str
Unix shell style pattern path with * wildcards to multi-file
resource file sets. Files must have the same coordinates
but can have different datasets or time indexes.
but can have different datasets or time indexes. Can also be a path
on HSDS starting with /nrel/ or a path on s3 starting with s3://
years : list, optional
List of years to access, by default None
unscale : bool
Expand All @@ -479,7 +484,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
strings. Setting this to False will speed up the meta data read.
hsds : bool, optional
Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
behind HSDS, by default False
behind HSDS, by default False. This is now redundant; file paths
starting with /nrel/ will be treated as hsds=True by default
hsds_kwargs : dict, optional
Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
password, by default None
Expand All @@ -505,7 +511,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
resource file sets. Files must have the same coordinates
but can have different datasets or time indexes. Can also be
an explicit list of multi time files, which themselves can
contain * wildcards.
contain * wildcards. Can also be a path on HSDS starting with
/nrel/ or a path on s3 starting with s3://
years : list, optional
List of years to access, by default None
unscale : bool
Expand All @@ -515,7 +522,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
strings. Setting this to False will speed up the meta data read.
hsds : bool, optional
Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
behind HSDS, by default False
behind HSDS, by default False. This is now redundant; file paths
starting with /nrel/ will be treated as hsds=True by default
hsds_kwargs : dict, optional
Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
password, by default None
Expand All @@ -541,7 +549,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
resource file sets. Files must have the same coordinates
but can have different datasets or time indexes. Can also be
an explicit list of multi time files, which themselves can
contain * wildcards.
contain * wildcards. Can also be a path on HSDS starting with
/nrel/ or a path on s3 starting with s3://
years : list, optional
List of years to access, by default None
unscale : bool
Expand All @@ -551,7 +560,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
strings. Setting this to False will speed up the meta data read.
hsds : bool, optional
Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
behind HSDS, by default False
behind HSDS, by default False. This is now redundant; file paths
starting with /nrel/ will be treated as hsds=True by default
hsds_kwargs : dict, optional
Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
password, by default None
Expand All @@ -577,7 +587,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
resource file sets. Files must have the same coordinates
but can have different datasets or time indexes. Can also be
an explicit list of multi time files, which themselves can
contain * wildcards.
contain * wildcards. Can also be a path on HSDS starting with
/nrel/ or a path on s3 starting with s3://
years : list, optional
List of years to access, by default None
unscale : bool
Expand All @@ -587,7 +598,8 @@ def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
strings. Setting this to False will speed up the meta data read.
hsds : bool, optional
Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
behind HSDS, by default False
behind HSDS, by default False. This is now redundant; file paths
starting with /nrel/ will be treated as hsds=True by default
hsds_kwargs : dict, optional
Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
password, by default None
Expand Down
52 changes: 48 additions & 4 deletions rex/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -1112,8 +1112,8 @@ def open_dataset(self, ds_name):

return ds

@staticmethod
def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None):
@classmethod
def open_file(cls, file_path, mode='r', hsds=False, hsds_kwargs=None):
"""Open a filepath to an h5, s3, or hsds nrel resource file with the
appropriate python object.
Expand All @@ -1140,7 +1140,7 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None):
file on s3 using h5py and fsspec, or the file on HSDS using h5pyd.
"""

if file_path.startswith('/nrel/') or hsds:
if cls.is_hsds_file(file_path) or hsds:
if mode != 'r':
msg = 'Cannot write to files accessed via HSDS!'
logger.error(msg)
Expand All @@ -1160,7 +1160,7 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None):
file = h5pyd.File(file_path, mode='r', use_cache=False,
**hsds_kwargs)

elif file_path.startswith('s3://'):
elif cls.is_s3_file(file_path):
if mode != 'r':
msg = 'Cannot write to files accessed via s3/fsspec!'
logger.error(msg)
Expand All @@ -1184,6 +1184,50 @@ def open_file(file_path, mode='r', hsds=False, hsds_kwargs=None):

return file

@staticmethod
def is_hsds_file(file_path):
"""Parse one or more filepath to determine if it is hsds
Parameters
----------
file_path : str | list
One or more file paths (only the first is parsed if multiple)
Returns
-------
is_hsds_file : bool
True if hsds
"""
if isinstance(file_path, (list, tuple)):
file_path = file_path[0]

if file_path.startswith('/nrel/'):
return True
else:
return False

@staticmethod
def is_s3_file(file_path):
"""Parse one or more filepath to determine if it is s3
Parameters
----------
file_path : str | list
One or more file paths (only the first is parsed if multiple)
Returns
-------
is_s3_file : bool
True if s3
"""
if isinstance(file_path, (list, tuple)):
file_path = file_path[0]

if file_path.startswith('s3://'):
return True
else:
return False

def get_attrs(self, dset=None):
"""
Get h5 attributes either from file or dataset
Expand Down
13 changes: 12 additions & 1 deletion tests/s3_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
test.
"""
import numpy as np
from rex import NSRDB, WindResource
from rex import NSRDB, WindResource, MultiYearResource


def test_nsrdb():
Expand Down Expand Up @@ -38,3 +38,14 @@ def test_sup3rcc():
temp = res['temperature_2m', 0:10, 0]
assert isinstance(dsets, list)
assert isinstance(temp, np.ndarray)


def test_multiyear():
    """Test retrieving multi year NSRDB data"""
    files = ["s3://nrel-pds-nsrdb/current/nsrdb_199*.h5"]
    with MultiYearResource(files) as res:
        ghi = res['ghi', 0:10, 0]
        dsets = res.dsets
        # two 30-minute years (1998 and 1999) -> 2 * 17520 timesteps
        assert res.shape[0] == 35040  # 2x years at 30min (1998 and 1999)
        assert isinstance(ghi, np.ndarray)
        assert isinstance(dsets, list)

0 comments on commit 9139acf

Please sign in to comment.