fix: fixes to solution to pass testing
asewnath committed Nov 15, 2024
1 parent 4409bb5 commit 8faddd2
Showing 5 changed files with 79 additions and 91 deletions.
37 changes: 12 additions & 25 deletions src/eva/eva_driver.py
@@ -22,6 +22,7 @@
from eva.data.data_driver import data_driver
from eva.time_series.time_series import add_empty_to_timeseries
from eva.time_series.time_series import collapse_collection_to_time_series
+ from eva.time_series.time_series_utils import create_empty_data, get_filename, check_file
from eva.transforms.transform_driver import transform_driver
from eva.plotting.batch.base.plot_tools.figure_driver import figure_driver
from eva.data.data_collections import DataCollections
@@ -162,41 +163,27 @@ def read_transform_time_series(logger, timing, eva_dict, data_collections):
if name == time_series_config['collection']:
transform_dict['transforms'].append(transform)

- # Enforce that first file exists
- first_file = datasets_config[0]['filenames'][0]
- if not os.path.isfile(first_file):
- logger.abort('First file provided to timeseries must exist.')
- elif os.stat(first_file).st_size == 0:
- logger.abort('First file provided to timeseries must be nonzero.')
-
- # Prepare empty data array for missing times
- data_collections_tmp = DataCollections()
- data_driver(datasets_config[0], data_collections_tmp, timing, logger)
- temp_collection = time_series_config["collection"]
- temp_group = datasets_config[0]['groups'][0]['name']
- temp_var = datasets_config[0]['groups'][0]['variables'][0]
- data_array = data_collections_tmp.get_variable_data_array(temp_collection,
- temp_group, temp_var)
- empty_data_array = xr.full_like(data_array, np.nan)
+ # Check if first file is empty. If it is, abort.
+ empty_dataset_config = datasets_config[0]
+ filename = get_filename(empty_dataset_config, logger)
+ check_file(filename, logger)

# Loop over datasets reading each one in turn, internally appending the data_collections
for ind, dataset_config in enumerate(datasets_config):

# Pull out information to check for missing date
date = dates[ind]

- # Check if file exists. If it doesn't, add empty and continue
- filename = dataset_config['filenames'][0]
+ # Check if file exists, if not add empty and continue
+ filename = get_filename(dataset_config, logger)
if not os.path.isfile(filename):
- add_empty_to_timeseries(logger, date, time_series_config,
- datasets_config[0], empty_data_array,
- data_collections)
+ add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+ empty_dataset_config, data_collections)
continue
- # Check if file exists but is empty, add empty and continue
+ # Check if file exists but is size zero, add empty and continue
elif os.stat(filename).st_size == 0:
- add_empty_to_timeseries(logger, date, time_series_config,
- datasets_config[0], empty_data_array,
- data_collections)
+ add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+ empty_dataset_config, data_collections)
continue

# Create a temporary collection for this time step
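For context, the driver loop above now funnels every cycle through the same two steps before reading: resolve the filename for the dataset type, then fall back to an empty placeholder when the file is absent or zero-size (eva shapes that NaN placeholder from the first dataset config). A minimal standalone sketch of the flow; the stand-ins below are simplified, and the real helpers take eva's logger, timing, and config objects:

import os

def filename_for(dataset_config):
    # Simplified stand-in for get_filename: dispatch on dataset type.
    if dataset_config['type'] == 'IodaObsSpace':
        return dataset_config['filenames'][0]
    return dataset_config['bias_file']

def read_time_series(datasets_config, dates, read_real, add_empty):
    # Mirrors check_file: the first file must exist and be non-empty.
    first = filename_for(datasets_config[0])
    if not os.path.isfile(first) or os.stat(first).st_size == 0:
        raise RuntimeError('First file provided to timeseries must exist and be nonzero.')

    for ind, dataset_config in enumerate(datasets_config):
        filename = filename_for(dataset_config)
        # Missing or zero-size files become NaN-filled entries so the
        # time axis stays aligned with the requested cycle dates.
        if not os.path.isfile(filename) or os.stat(filename).st_size == 0:
            add_empty(dates[ind], ind)
            continue
        read_real(dates[ind], ind, dataset_config)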
11 changes: 10 additions & 1 deletion src/eva/tests/config/testIodaObsSpaceAmsuaN19_TimeSeries.yaml
@@ -10,6 +10,15 @@ datasets:
- name: ObsValue
variables: &variables [brightnessTemperature]
- name: hofx
+ #Empty
+ - name: experiment
+ type: IodaObsSpace
+ filenames:
+ - ${data_input_path}/ioda_obs_space.amsua_n19.hofx.2020-12-14T000000Z.nc4
+ channels: *channels
+ groups:
+ - name: ObsValue
+ - name: hofx
- name: experiment
type: IodaObsSpace
filenames:
@@ -31,7 +40,7 @@
time_series:

- begin_date: '2020-12-14T21:00:00'
- final_date: '2020-12-15T03:00:00'
+ final_date: '2020-12-15T09:00:00'
interval: 'PT6H'

collection: experiment
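As a side note on the widened window above: begin_date '2020-12-14T21:00:00' with interval 'PT6H' (an ISO 8601 duration, six hours) and final_date '2020-12-15T09:00:00' implies three cycle times. A quick way to enumerate them, sketched with the standard library rather than eva's own date handling:

from datetime import datetime, timedelta

begin = datetime.fromisoformat('2020-12-14T21:00:00')
final = datetime.fromisoformat('2020-12-15T09:00:00')
step = timedelta(hours=6)  # PT6H

cycle = begin
while cycle <= final:
    print(cycle.isoformat())  # 2020-12-14T21:00:00, 2020-12-15T03:00:00, 2020-12-15T09:00:00
    cycle += step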
@@ -11,10 +11,16 @@ datasets:
bias_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-11T21:00:00Z.satbias
lapse_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-11T21:00:00Z.tlapse

+ # Empty
+ - name: experiment
+ type: JediVariationalBiasCorrection
+ bias_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-12T03:00:00Z.satbias
+ lapse_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-12T03:00:00Z.tlapse

time_series:

- begin_date: '2020-12-15T00:00:00'
- final_date: '2020-12-15T06:00:00'
+ begin_date: '2021-12-11T15:00:00'
+ final_date: '2021-12-12T03:00:00'
interval: 'PT6H'

collection: experiment
69 changes: 6 additions & 63 deletions src/eva/time_series/time_series.py
@@ -13,6 +13,7 @@
import numpy as np
import xarray as xr
from eva.data.data_collections import DataCollections
+ from eva.time_series.time_series_utils import create_empty_data

# --------------------------------------------------------------------------------------------------

@@ -25,71 +26,13 @@

# --------------------------------------------------------------------------------------------------

- def add_empty_to_timeseries(logger, date, time_series_config, dataset_config, data_array,
- data_collections):
+ def add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+ dataset_config, data_collections):

''' Add empty collection to timeseries for missing date '''
- collection_to_ts = dataset_config['name']
- group_list = [dataset_config['groups'][0]['name']]
- variable_list = dataset_config['groups'][0]['variables']
-
- for idx in range(1, len(dataset_config['groups'])):
- group_list.append(dataset_config['groups'][idx]['name'])
-
- dataset_tmp = DataCollections()
-
- # Add groups and variables to empty dataset.
- for group in group_list:
- for variable in variable_list:
- # Create empty data array
- dataset_tmp.add_variable_to_collection("Empty", group, variable, data_array)
-
- # Extract xarray
- dataset_tmp = dataset_tmp.get_data_collection("Empty")
-
- # Optional: aggregation methods
- aggregation_methods = time_series_config.get('aggregation_methods', [])
-
- # If specifying aggregation methods it must be accompanied by a dimension
- if aggregation_methods:
- logger.assert_abort('dimension' in time_series_config, 'When specifying aggregation '
- 'methods a dimension must also be specified.')
- dimension = time_series_config['dimension']
-
- dataset_aggregated = xr.Dataset()
-
- # If there is no aggregation method specified, just add the dataset to the time series
- if not aggregation_methods:
- dataset_aggregated = xr.merge([dataset_aggregated, dataset_tmp])
- else:
- for aggregation_method in aggregation_methods:
- # Assert that aggregation_method is in the aggregation methods
- logger.assert_abort(aggregation_method in xr_aggregation_methods,
- f'Unknown aggregation method {aggregation_method}')
-
- # Compute the aggregation_method - nan for empty
- dataset_am = xr_aggregation_methods[aggregation_method](dataset_tmp, dim=dimension)
-
- # Append each variable name in dataset_am with _aggregation_method
- rename_dict = {var: f"{var}_{aggregation_method}" for var in dataset_am.data_vars}
- dataset_am = dataset_am.rename(rename_dict)
-
- # Merge all the results into the aggregated dataset
- dataset_aggregated = xr.merge([dataset_aggregated, dataset_am])
-
- # Get all dims of dataset_aggregated and create empty array with those dims
- dims = {dim: dataset_aggregated.sizes[dim] for dim in dataset_aggregated.dims}
- data_array_shape = tuple(dims[dim] for dim in dims)
- dataset_aggregated['MetaData::Dates'] = xr.DataArray(np.full(data_array_shape, date),
- dims=dataset_aggregated.dims)
-
- # Add the time index to the aggregated dataset
- dataset_aggregated = dataset_aggregated.expand_dims('TimeIndex')
- dataset_aggregated['TimeIndex'] = [0]
-
- # Append the dataset with the aggregation
- data_collections.create_or_add_to_collection(f'{collection_to_ts}_time_series',
- dataset_aggregated, 'TimeIndex')
+ empty_data_collection = create_empty_data(time_series_config, dataset_config, timing, logger)
+ collapse_collection_to_time_series(logger, ind, date, time_series_config, data_collections,
+ empty_data_collection)


# --------------------------------------------------------------------------------------------------
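The block removed above had its own copy of the aggregate-and-rename logic for the empty case; it now defers to create_empty_data plus collapse_collection_to_time_series, so empty and real cycles share one code path. For reference, the core xarray idiom the removed code used (aggregate over a dimension, suffix the variable names, tag with a TimeIndex) in isolation; 'mean' and 'count' are illustrative stand-ins for the lookups the removed code made in xr_aggregation_methods:

import numpy as np
import xarray as xr

ds = xr.Dataset({'brightnessTemperature': ('nlocs', np.array([250.0, np.nan, 260.0]))})

aggregated = xr.Dataset()
for method in ['mean', 'count']:
    ds_m = getattr(ds, method)(dim='nlocs')  # e.g. ds.mean(dim='nlocs'); NaNs are skipped
    ds_m = ds_m.rename({var: f'{var}_{method}' for var in ds_m.data_vars})
    aggregated = xr.merge([aggregated, ds_m])

# One row per cycle: add a length-1 TimeIndex dimension before appending.
aggregated = aggregated.expand_dims('TimeIndex')
aggregated['TimeIndex'] = [0]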
43 changes: 43 additions & 0 deletions src/eva/time_series/time_series_utils.py
@@ -0,0 +1,43 @@
import os
import numpy as np
import xarray as xr
from abc import ABC, abstractmethod
from eva.data.data_driver import data_driver
from eva.data.data_collections import DataCollections


filename_retrieval = {
"IodaObsSpace": lambda dataset_config: dataset_config["filenames"][0],
"JediVariationalBiasCorrection": lambda dataset_config: dataset_config["bias_file"],
}


def get_filename(dataset_config, logger):
""" Retrieve filename using given type """

dataset_type = dataset_config["type"]
logger.assert_abort(dataset_type in filename_retrieval,
f'Unknown dataset_type {dataset_type}')
filename = filename_retrieval[dataset_type](dataset_config)
return filename


def check_file(filename, logger):
""" Check if first file exists and is nonzero """

if not os.path.isfile(filename):
logger.abort('First file provided to timeseries must exist.')
elif os.stat(filename).st_size == 0:
logger.abort('First file provided to timeseries must be nonzero.')


def create_empty_data(timeseries_config, dataset_config, timing, logger):
""" Creating an empty data array to use for missing cycle times """
dc_tmp = DataCollections()
collection = timeseries_config["collection"]
data_driver(dataset_config, dc_tmp, timing, logger)
dataset = dc_tmp.get_data_collection(collection)
empty_data = xr.full_like(dataset, np.nan)
dc = DataCollections()
dc.create_or_add_to_collection(collection, empty_data)
return dc
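Because filename_retrieval is a plain dict of accessors keyed by dataset type, supporting another type is a one-line registration and get_filename needs no changes. A hypothetical example (the GsiObsSpace type and its obs_file key are invented here for illustration):

# Hypothetical registration for a new dataset type; not part of this commit.
filename_retrieval['GsiObsSpace'] = lambda dataset_config: dataset_config['obs_file']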
