diff --git a/src/eva/eva_driver.py b/src/eva/eva_driver.py
index 5096e32..fce4cdb 100644
--- a/src/eva/eva_driver.py
+++ b/src/eva/eva_driver.py
@@ -22,6 +22,7 @@
 from eva.data.data_driver import data_driver
 from eva.time_series.time_series import add_empty_to_timeseries
 from eva.time_series.time_series import collapse_collection_to_time_series
+from eva.time_series.time_series_utils import create_empty_data, get_filename, check_file
 from eva.transforms.transform_driver import transform_driver
 from eva.plotting.batch.base.plot_tools.figure_driver import figure_driver
 from eva.data.data_collections import DataCollections
@@ -162,22 +163,10 @@ def read_transform_time_series(logger, timing, eva_dict, data_collections):
         if name == time_series_config['collection']:
             transform_dict['transforms'].append(transform)
 
-    # Enforce that first file exists
-    first_file = datasets_config[0]['filenames'][0]
-    if not os.path.isfile(first_file):
-        logger.abort('First file provided to timeseries must exist.')
-    elif os.stat(first_file).st_size == 0:
-        logger.abort('First file provided to timeseries must be nonzero.')
-
-    # Prepare empty data array for missing times
-    data_collections_tmp = DataCollections()
-    data_driver(datasets_config[0], data_collections_tmp, timing, logger)
-    temp_collection = time_series_config["collection"]
-    temp_group = datasets_config[0]['groups'][0]['name']
-    temp_var = datasets_config[0]['groups'][0]['variables'][0]
-    data_array = data_collections_tmp.get_variable_data_array(temp_collection,
-                                                              temp_group, temp_var)
-    empty_data_array = xr.full_like(data_array, np.nan)
+    # Ensure the first file exists and is nonzero; abort otherwise
+    empty_dataset_config = datasets_config[0]
+    filename = get_filename(empty_dataset_config, logger)
+    check_file(filename, logger)
 
     # Loop over datasets reading each one in turn, internally appending the data_collections
     for ind, dataset_config in enumerate(datasets_config):
@@ -185,18 +174,16 @@ def read_transform_time_series(logger, timing, eva_dict, data_collections):
         # Pull out information to check for missing date
         date = dates[ind]
 
-        # Check if file exists. If it doesn't, add empty and continue
-        filename = dataset_config['filenames'][0]
+        # Check if the file exists; if it does not, add an empty entry and continue
+        filename = get_filename(dataset_config, logger)
         if not os.path.isfile(filename):
-            add_empty_to_timeseries(logger, date, time_series_config,
-                                    datasets_config[0], empty_data_array,
-                                    data_collections)
+            add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+                                    empty_dataset_config, data_collections)
             continue
-        # Check if file exists but is empty, add empty and continue
+        # Check if the file exists but is size zero; add an empty entry and continue
         elif os.stat(filename).st_size == 0:
-            add_empty_to_timeseries(logger, date, time_series_config,
-                                    datasets_config[0], empty_data_array,
-                                    data_collections)
+            add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+                                    empty_dataset_config, data_collections)
             continue
 
         # Create a temporary collection for this time step
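A note on the two guards in the loop above: a missing file and a zero-byte file are handled identically (both add an empty entry), but the checks themselves are distinct. A self-contained illustration of that distinction, stdlib only, with throwaway paths:

```python
import os
import tempfile

# Create a zero-byte file to exercise the second guard
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    empty_path = tmp.name

missing_path = empty_path + '.does_not_exist'

for path in (missing_path, empty_path):
    if not os.path.isfile(path):
        print(f'{path}: missing -> add empty entry and continue')
    elif os.stat(path).st_size == 0:
        print(f'{path}: exists but zero bytes -> add empty entry and continue')
    else:
        print(f'{path}: usable')

os.remove(empty_path)
```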
diff --git a/src/eva/tests/config/testIodaObsSpaceAmsuaN19_TimeSeries.yaml b/src/eva/tests/config/testIodaObsSpaceAmsuaN19_TimeSeries.yaml
index 0d085cf..decc7f3 100644
--- a/src/eva/tests/config/testIodaObsSpaceAmsuaN19_TimeSeries.yaml
+++ b/src/eva/tests/config/testIodaObsSpaceAmsuaN19_TimeSeries.yaml
@@ -10,6 +10,15 @@ datasets:
       - name: ObsValue
         variables: &variables [brightnessTemperature]
       - name: hofx
+  # Empty
+  - name: experiment
+    type: IodaObsSpace
+    filenames:
+      - ${data_input_path}/ioda_obs_space.amsua_n19.hofx.2020-12-14T000000Z.nc4
+    channels: *channels
+    groups:
+      - name: ObsValue
+      - name: hofx
   - name: experiment
     type: IodaObsSpace
     filenames:
@@ -31,7 +40,7 @@ transforms:
 
 time_series:
 
   - begin_date: '2020-12-14T21:00:00'
-    final_date: '2020-12-15T03:00:00'
+    final_date: '2020-12-15T09:00:00'
     interval: 'PT6H'
     collection: experiment

diff --git a/src/eva/tests/config/testJediVariationalBiasCorrectionAmsuaN19.yaml b/src/eva/tests/config/testJediVariationalBiasCorrectionAmsuaN19.yaml
index c993a32..f57e286 100644
--- a/src/eva/tests/config/testJediVariationalBiasCorrectionAmsuaN19.yaml
+++ b/src/eva/tests/config/testJediVariationalBiasCorrectionAmsuaN19.yaml
@@ -11,10 +11,16 @@ datasets:
     bias_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-11T21:00:00Z.satbias
     lapse_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-11T21:00:00Z.tlapse
 
+  # Empty
+  - name: experiment
+    type: JediVariationalBiasCorrection
+    bias_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-12T03:00:00Z.satbias
+    lapse_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-12T03:00:00Z.tlapse
+
 time_series:
 
-  - begin_date: '2020-12-15T00:00:00'
-    final_date: '2020-12-15T06:00:00'
+  - begin_date: '2021-12-11T15:00:00'
+    final_date: '2021-12-12T03:00:00'
     interval: 'PT6H'
     collection: experiment
 
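For reviewers checking the date arithmetic in the first config: extending final_date from 03:00 to 09:00 adds one PT6H cycle, which is presumably why one extra (intentionally empty) dataset entry is needed, one entry per cycle time. A hand-rolled sketch of the window expansion, assuming inclusive endpoints; eva's own parsing may differ, and 'PT6H' (an ISO 8601 duration meaning six hours) is hardcoded here to avoid a dependency:

```python
from datetime import datetime, timedelta

begin = datetime.fromisoformat('2020-12-14T21:00:00')
final = datetime.fromisoformat('2020-12-15T09:00:00')
interval = timedelta(hours=6)  # 'PT6H'

# Enumerate cycle times from begin to final, inclusive
dates = []
current = begin
while current <= final:
    dates.append(current)
    current += interval

print(dates)  # 21:00, 03:00, 09:00 -> three cycles, one dataset entry each
```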
diff --git a/src/eva/time_series/time_series.py b/src/eva/time_series/time_series.py
index 077e4a0..fcc5225 100644
--- a/src/eva/time_series/time_series.py
+++ b/src/eva/time_series/time_series.py
@@ -13,6 +13,7 @@ import numpy as np
 import xarray as xr
 
 from eva.data.data_collections import DataCollections
+from eva.time_series.time_series_utils import create_empty_data
 
 
 # --------------------------------------------------------------------------------------------------
@@ -25,71 +26,13 @@
 # --------------------------------------------------------------------------------------------------
 
 
-def add_empty_to_timeseries(logger, date, time_series_config, dataset_config, data_array,
-                            data_collections):
+def add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+                            dataset_config, data_collections):
 
     '''
     Add empty collection to timeseries for missing date
     '''
 
-    collection_to_ts = dataset_config['name']
-    group_list = [dataset_config['groups'][0]['name']]
-    variable_list = dataset_config['groups'][0]['variables']
-
-    for idx in range(1, len(dataset_config['groups'])):
-        group_list.append(dataset_config['groups'][idx]['name'])
-
-    dataset_tmp = DataCollections()
-
-    # Add groups and variables to empty dataset.
-    for group in group_list:
-        for variable in variable_list:
-            # Create empty data array
-            dataset_tmp.add_variable_to_collection("Empty", group, variable, data_array)
-
-    # Extract xarray
-    dataset_tmp = dataset_tmp.get_data_collection("Empty")
-
-    # Optional: aggregation methods
-    aggregation_methods = time_series_config.get('aggregation_methods', [])
-
-    # If specifying aggregation methods it must be accompanied by a dimension
-    if aggregation_methods:
-        logger.assert_abort('dimension' in time_series_config, 'When specifying aggregation '
-                            'methods a dimension must also be specified.')
-        dimension = time_series_config['dimension']
-
-    dataset_aggregated = xr.Dataset()
-
-    # If there is no aggregation method specified, just add the dataset to the time series
-    if not aggregation_methods:
-        dataset_aggregated = xr.merge([dataset_aggregated, dataset_tmp])
-    else:
-        for aggregation_method in aggregation_methods:
-            # Assert that aggregation_method is in the aggregation methods
-            logger.assert_abort(aggregation_method in xr_aggregation_methods,
-                                f'Unknown aggregation method {aggregation_method}')
-
-            # Compute the aggregation_method - nan for empty
-            dataset_am = xr_aggregation_methods[aggregation_method](dataset_tmp, dim=dimension)
-
-            # Append each variable name in dataset_am with _aggregation_method
-            rename_dict = {var: f"{var}_{aggregation_method}" for var in dataset_am.data_vars}
-            dataset_am = dataset_am.rename(rename_dict)
-
-            # Merge all the results into the aggregated dataset
-            dataset_aggregated = xr.merge([dataset_aggregated, dataset_am])
-
-    # Get all dims of dataset_aggregated and create empty array with those dims
-    dims = {dim: dataset_aggregated.sizes[dim] for dim in dataset_aggregated.dims}
-    data_array_shape = tuple(dims[dim] for dim in dims)
-    dataset_aggregated['MetaData::Dates'] = xr.DataArray(np.full(data_array_shape, date),
-                                                         dims=dataset_aggregated.dims)
-
-    # Add the time index to the aggregated dataset
-    dataset_aggregated = dataset_aggregated.expand_dims('TimeIndex')
-    dataset_aggregated['TimeIndex'] = [0]
-
-    # Append the dataset with the aggregation
-    data_collections.create_or_add_to_collection(f'{collection_to_ts}_time_series',
-                                                 dataset_aggregated, 'TimeIndex')
+    empty_data_collection = create_empty_data(time_series_config, dataset_config, timing, logger)
+    collapse_collection_to_time_series(logger, ind, date, time_series_config, data_collections,
+                                       empty_data_collection)
 
 
 # --------------------------------------------------------------------------------------------------
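Both the removed block and the new create_empty_data helper lean on xr.full_like to build a NaN-filled template with the shape of a real cycle's data, so missing cycles aggregate to NaN rather than breaking the merge. A minimal demonstration with a stand-in dataset; the variable and dimension names here are illustrative, not eva's:

```python
import numpy as np
import xarray as xr

# Stand-in for one cycle's data; float dtype so the NaN fill is lossless
ds = xr.Dataset(
    {'ObsValue::brightnessTemperature': (('Location', 'Channel'),
                                         np.random.rand(4, 3))},
    coords={'Channel': [3, 8, 9]},
)

# Same dims, coords, and dtypes as ds, but every value is NaN
empty = xr.full_like(ds, np.nan)
print(empty['ObsValue::brightnessTemperature'].values)
```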
dataset_config["bias_file"], +} + + +def get_filename(dataset_config, logger): + """ Retrieve filename using given type """ + + dataset_type = dataset_config["type"] + logger.assert_abort(dataset_type in filename_retrieval, + f'Unknown dataset_type {dataset_type}') + filename = filename_retrieval[dataset_type](dataset_config) + return filename + + +def check_file(filename, logger): + """ Check if first file exists and is nonzero """ + + if not os.path.isfile(filename): + logger.abort('First file provided to timeseries must exist.') + elif os.stat(filename).st_size == 0: + logger.abort('First file provided to timeseries must be nonzero.') + + +def create_empty_data(timeseries_config, dataset_config, timing, logger): + """ Creating an empty data array to use for missing cycle times """ + dc_tmp = DataCollections() + collection = timeseries_config["collection"] + data_driver(dataset_config, dc_tmp, timing, logger) + dataset = dc_tmp.get_data_collection(collection) + empty_data = xr.full_like(dataset, np.nan) + dc = DataCollections() + dc.create_or_add_to_collection(collection, empty_data) + return dc