-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
perform flattening as necessary in test suite (#130)
* move methods for flattening netcdf and hdf group structures to separate module * feature/PODAAC-5065 (#129) * fix way xarray open granules that have as a time unit * fix pylint * change function to use original function if can parse only change units if we can not parse * make xarray override into its own function * add test for override_decode_cf_datetime function * disable pyline one line instead of global * Update podaac/subsetter/subset.py Co-authored-by: Frank Greguska <[email protected]> * add missing parameter to docstring * typo in docstring * extract netcdf opening procedure from beginning of `subset() into a new function * update tests to use netcdf opening wrapper function, to prevent errors with tempo data * /version 2.3.0-alpha.5 * update `test_specified_variables()` to use netcdf opening wrapper function in multiple places to prevent errors with tempo data * cosmetic * clean up comment and use 'decode_times'=True for test * feature/issue 126 (#131) * Add variable leading slash flexibility * Add tests back to test file * changelog added and updated * Update podaac/subsetter/subset.py Co-authored-by: Frank Greguska <[email protected]> * update Syntax * resolve conflict Co-authored-by: nlensse1 <[email protected]> Co-authored-by: Frank Greguska <[email protected]> * /version 2.3.0-alpha.6 * Update build-pipeline.yml * /version 2.3.0-alpha.7 * Merge changes from origin/develop * Merge changes from issues/127 Co-authored-by: sliu008 <[email protected]> Co-authored-by: Frank Greguska <[email protected]> Co-authored-by: l2ss-py bot <[email protected]> Co-authored-by: Nick Lenssen <[email protected]> Co-authored-by: nlensse1 <[email protected]>
- Loading branch information
1 parent
3728c7d
commit f8122d4
Showing
7 changed files
with
528 additions
and
427 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,232 @@ | ||
from shutil import copy | ||
|
||
import h5py | ||
import netCDF4 as nc | ||
import numpy as np | ||
import xarray as xr | ||
|
||
GROUP_DELIM = '__' | ||
|
||
|
||
def transform_grouped_dataset(nc_dataset, file_to_subset):
    """
    Transform a netCDF4 Dataset that has groups to an xarray compatible
    dataset. xarray does not work with groups, so this transformation
    will flatten the variables in the dataset and use the group path as
    the new variable name. For example, data_01 > km > sst would become
    'data_01__km__sst', where GROUP_DELIM is __.
    This same pattern is applied to dimensions, which are located under
    the appropriate group. They are renamed and placed in the root
    group.

    Parameters
    ----------
    nc_dataset : nc.Dataset
        netCDF4 Dataset that contains groups
    file_to_subset : str
        Path of the file backing ``nc_dataset``; the dataset is closed
        and reopened from this path in append ('r+') mode so it can be
        mutated in place.

    Returns
    -------
    nc.Dataset
        netCDF4 Dataset that does not contain groups and that has been
        flattened.
    """

    # Close the existing read-only dataset and reopen in append mode
    nc_dataset.close()
    nc_dataset = nc.Dataset(file_to_subset, 'r+')

    # Accumulates every renamed dimension encountered during the walk so
    # they can be promoted to the root group afterwards.
    dimensions = {}

    def walk(group_node, path):
        # Recursively flatten `group_node` (a mapping of subgroup name ->
        # group). `path` is the GROUP_DELIM-joined path accumulated so far
        # ('' at the root, so every flattened name starts with GROUP_DELIM).
        for key, item in group_node.items():
            group_path = f'{path}{GROUP_DELIM}{key}'

            # If there are variables in this group, copy to root group
            # and then delete from current group
            if item.variables:
                # Copy variables to root group with new name
                for var_name, var in item.variables.items():
                    var_group_name = f'{group_path}{GROUP_DELIM}{var_name}'
                    nc_dataset.variables[var_group_name] = var
                # Delete variables
                var_names = list(item.variables.keys())
                for var_name in var_names:
                    del item.variables[var_name]

            if item.dimensions:
                dims = list(item.dimensions.keys())
                for dim_name in dims:
                    new_dim_name = f'{group_path.replace("/", GROUP_DELIM)}{GROUP_DELIM}{dim_name}'
                    # NOTE(review): writing directly into the dimensions
                    # mapping before renameDimension looks redundant with
                    # the rename itself — confirm whether it is required
                    # by the netCDF4 backing store.
                    item.dimensions[new_dim_name] = item.dimensions[dim_name]
                    dimensions[new_dim_name] = item.dimensions[dim_name]
                    item.renameDimension(dim_name, new_dim_name)

            # If there are subgroups in this group, call this function
            # again on that group.
            if item.groups:
                walk(item.groups, group_path)

        # Delete non-root groups
        group_names = list(group_node.keys())
        for group_name in group_names:
            del group_node[group_name]

    # Prefix root-level variables with GROUP_DELIM too, so every flattened
    # name follows the same '__<path>' convention as grouped variables.
    for var_name in list(nc_dataset.variables.keys()):
        new_var_name = f'{GROUP_DELIM}{var_name}'
        nc_dataset.variables[new_var_name] = nc_dataset.variables[var_name]
        del nc_dataset.variables[var_name]

    walk(nc_dataset.groups, '')

    # Update the dimensions of the dataset in the root group
    nc_dataset.dimensions.update(dimensions)

    return nc_dataset
|
||
|
||
def recombine_grouped_datasets(datasets, output_file, start_date):  # pylint: disable=too-many-branches
    """
    Given a list of xarray datasets, combine those datasets into a
    single netCDF4 Dataset and write to the disk. Each dataset has been
    transformed using its group path and needs to be un-transformed and
    placed in the appropriate group.

    Parameters
    ----------
    datasets : list (xr.Dataset)
        List of xarray datasets to be combined
    output_file : str
        Name of the output file to write the resulting NetCDF file to.
    start_date : datetime-like or None
        When set, datetime variables are converted to seconds elapsed
        since this epoch while being written (see _rename_variables);
        otherwise they are CF-encoded by xarray.
    """

    base_dataset = nc.Dataset(output_file, mode='w')

    for dataset in datasets:
        # Derive the group path for each flattened variable name; a name
        # with no group prefix belongs to the root group '/'.
        # NOTE: needs logic if there is data in the top level not in a group.
        groups = {
            '/'.join(var_name.split(GROUP_DELIM)[:-1]) or '/'
            for var_name in dataset.variables.keys()
        }
        for group in groups:
            base_dataset.createGroup(group)

        # Re-create each dimension inside the group it originally came from,
        # restoring its un-prefixed name.
        for dim_name in list(dataset.dims.keys()):
            new_dim_name = dim_name.split(GROUP_DELIM)[-1]
            dim_group = _get_nested_group(base_dataset, dim_name)
            dim_group.createDimension(new_dim_name, dataset.dims[dim_name])

        # Rename variables
        _rename_variables(dataset, base_dataset, start_date)

    # Remove group vars from base dataset
    for var_name in list(base_dataset.variables.keys()):
        if GROUP_DELIM in var_name:
            del base_dataset.variables[var_name]

    # Remove group dims from base dataset
    for dim_name in list(base_dataset.dimensions.keys()):
        if GROUP_DELIM in dim_name:
            del base_dataset.dimensions[dim_name]

    # Copy global attributes
    base_dataset.setncatts(datasets[0].attrs)
    # Write and close
    base_dataset.close()
|
||
|
||
def _get_nested_group(dataset, group_path):
    """
    Walk down from `dataset` through the netCDF4 group hierarchy named by
    the GROUP_DELIM-joined `group_path`, returning the group that owns
    the final path component (the variable/dimension name itself).
    """
    path_parts = group_path.strip(GROUP_DELIM).split(GROUP_DELIM)
    current_group = dataset
    # The last component is the variable/dimension name, not a group.
    for part in path_parts[:-1]:
        current_group = current_group.groups[part]
    return current_group
|
||
|
||
def _rename_variables(dataset, base_dataset, start_date):
    """
    Copy every flattened variable from `dataset` into its original group
    in `base_dataset`, restoring the un-prefixed variable name and
    re-encoding datetime/timedelta data for netCDF output.

    Parameters
    ----------
    dataset : xr.Dataset
        Flattened dataset whose variable names are GROUP_DELIM-joined paths.
    base_dataset : nc.Dataset
        Writable output dataset whose groups have already been created.
    start_date : datetime-like or None
        When set, datetime values are rewritten as seconds elapsed since
        this epoch; otherwise xarray's CF datetime coder encodes them.
    """
    for var_name in list(dataset.variables.keys()):
        new_var_name = var_name.split(GROUP_DELIM)[-1]
        var_group = _get_nested_group(base_dataset, var_name)
        variable = dataset.variables[var_name]
        # Dimension names are flattened the same way as variable names;
        # strip the group prefix to recover the original dim names.
        var_dims = [x.split(GROUP_DELIM)[-1] for x in dataset.variables[var_name].dims]
        if np.issubdtype(
            dataset.variables[var_name].dtype, np.dtype(np.datetime64)
        ) or np.issubdtype(
            dataset.variables[var_name].dtype, np.dtype(np.timedelta64)
        ):
            if start_date:
                # Express datetimes as float seconds since start_date.
                dataset.variables[var_name].values = (dataset.variables[var_name].values - np.datetime64(start_date))/np.timedelta64(1, 's')
                variable = dataset.variables[var_name]
            else:
                # No epoch supplied: let xarray CF-encode (adds units/calendar attrs).
                cf_dt_coder = xr.coding.times.CFDatetimeCoder()
                encoded_var = cf_dt_coder.encode(dataset.variables[var_name])
                variable = encoded_var

        var_attrs = variable.attrs
        # _FillValue must be passed to createVariable, not set as a plain
        # attribute afterwards, so pull it out of the attrs dict.
        fill_value = var_attrs.get('_FillValue')
        var_attrs.pop('_FillValue', None)
        comp_args = {"zlib": True, "complevel": 1}

        if variable.dtype == object:
            # Object dtype (e.g. variable-length strings) stored as char data.
            var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args)
        elif variable.dtype == 'timedelta64[ns]':
            # Timedeltas stored as 32-bit integers.
            var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args)
        else:
            var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value, **comp_args)

        # Copy attributes
        var_group.variables[new_var_name].setncatts(var_attrs)

        # Copy data
        # Disable mask/scale so raw values (incl. fill values) are written as-is.
        var_group.variables[new_var_name].set_auto_maskandscale(False)
        var_group.variables[new_var_name][:] = variable.data
|
||
|
||
def h5file_transform(finput):
    """
    Transform a h5py Dataset that has groups to an xarray compatible
    dataset. xarray does not work with groups, so this transformation
    will flatten the variables in the dataset and use the group path as
    the new variable name. For example, data_01 > km > sst would become
    'data_01__km__sst', where GROUP_DELIM is __.

    Parameters
    ----------
    finput : str
        Path to the HDF5 file to flatten. The file is modified in place
        ('r+'), then copied to a sibling file with a '.nc' extension.

    Returns
    -------
    nc.Dataset
        netCDF4 Dataset that does not contain groups and that has been
        flattened.
    bool
        Whether the input file had groups.
    """
    data_new = h5py.File(finput, 'r+')
    # Top-level keys captured now so the original (pre-flattening) entries
    # can be deleted after the walk.
    del_group_list = list(data_new.keys())
    # NOTE(review): bool() of the root h5py group — confirm this actually
    # distinguishes grouped from flat files as intended.
    has_groups = bool(data_new['/'])

    def walk_h5py(data_new, group):
        # flattens h5py file
        for key, item in data_new[group].items():
            group_path = f'{group}{key}'
            if isinstance(item, h5py.Dataset):
                # Move the dataset to the root under its flattened name.
                new_var_name = group_path.replace('/', '__')

                data_new[new_var_name] = data_new[group_path]
                del data_new[group_path]

            elif isinstance(item, h5py.Group):
                if len(list(item.keys())) == 0:
                    # Preserve empty groups by materializing them at the root.
                    new_group_name = group_path.replace('/', '__')
                    data_new[new_group_name] = data_new[group_path]

                # Recurse into the (possibly non-empty) subgroup.
                walk_h5py(data_new, data_new[group_path].name + '/')

    walk_h5py(data_new, data_new.name)

    # Remove the original top-level entries; only flattened names remain.
    for del_group in del_group_list:
        del data_new[del_group]

    # Same path with the extension swapped to '.nc'.
    finputnc = '.'.join(finput.split('.')[:-1]) + '.nc'

    data_new.close()  # close the h5py dataset
    copy(finput, finputnc)  # copy to a nc file

    nc_dataset = nc.Dataset(finputnc, mode='r')

    return nc_dataset, has_groups
Oops, something went wrong.