fix: fixes to solution to pass testing
asewnath committed Nov 15, 2024
1 parent 4409bb5 commit 8faddd2
Showing 5 changed files with 79 additions and 91 deletions.
37 changes: 12 additions & 25 deletions src/eva/eva_driver.py
@@ -22,6 +22,7 @@
from eva.data.data_driver import data_driver
from eva.time_series.time_series import add_empty_to_timeseries
from eva.time_series.time_series import collapse_collection_to_time_series
+ from eva.time_series.time_series_utils import create_empty_data, get_filename, check_file
from eva.transforms.transform_driver import transform_driver
from eva.plotting.batch.base.plot_tools.figure_driver import figure_driver
from eva.data.data_collections import DataCollections
@@ -162,41 +163,27 @@ def read_transform_time_series(logger, timing, eva_dict, data_collections):
if name == time_series_config['collection']:
transform_dict['transforms'].append(transform)

- # Enforce that first file exists
- first_file = datasets_config[0]['filenames'][0]
- if not os.path.isfile(first_file):
- logger.abort('First file provided to timeseries must exist.')
- elif os.stat(first_file).st_size == 0:
- logger.abort('First file provided to timeseries must be nonzero.')
-
- # Prepare empty data array for missing times
- data_collections_tmp = DataCollections()
- data_driver(datasets_config[0], data_collections_tmp, timing, logger)
- temp_collection = time_series_config["collection"]
- temp_group = datasets_config[0]['groups'][0]['name']
- temp_var = datasets_config[0]['groups'][0]['variables'][0]
- data_array = data_collections_tmp.get_variable_data_array(temp_collection,
- temp_group, temp_var)
- empty_data_array = xr.full_like(data_array, np.nan)
+ # Check if first file is empty. If it is, abort.
+ empty_dataset_config = datasets_config[0]
+ filename = get_filename(empty_dataset_config, logger)
+ check_file(filename, logger)

# Loop over datasets reading each one in turn, internally appending the data_collections
for ind, dataset_config in enumerate(datasets_config):

# Pull out information to check for missing date
date = dates[ind]

- # Check if file exists. If it doesn't, add empty and continue
- filename = dataset_config['filenames'][0]
+ # Check if file exists, if not add empty and continue
+ filename = get_filename(dataset_config, logger)
if not os.path.isfile(filename):
- add_empty_to_timeseries(logger, date, time_series_config,
- datasets_config[0], empty_data_array,
- data_collections)
+ add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+ empty_dataset_config, data_collections)
continue
- # Check if file exists but is empty, add empty and continue
+ # Check if file exists but is size zero, add empty and continue
elif os.stat(filename).st_size == 0:
- add_empty_to_timeseries(logger, date, time_series_config,
- datasets_config[0], empty_data_array,
- data_collections)
+ add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+ empty_dataset_config, data_collections)
continue

# Create a temporary collection for this time step
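For context, the driver loop above now funnels every cycle through the same two steps before reading: resolve the filename for the dataset type, then fall back to an empty placeholder when the file is absent or zero-size (eva shapes that NaN placeholder from the first dataset config). A minimal standalone sketch of the flow; the stand-ins below are simplified, and the real helpers take eva's logger, timing, and config objects:

import os

def filename_for(dataset_config):
    # Simplified stand-in for get_filename: dispatch on dataset type.
    if dataset_config['type'] == 'IodaObsSpace':
        return dataset_config['filenames'][0]
    return dataset_config['bias_file']

def read_time_series(datasets_config, dates, read_real, add_empty):
    # Mirrors check_file: the first file must exist and be non-empty.
    first = filename_for(datasets_config[0])
    if not os.path.isfile(first) or os.stat(first).st_size == 0:
        raise RuntimeError('First file provided to timeseries must exist and be nonzero.')

    for ind, dataset_config in enumerate(datasets_config):
        filename = filename_for(dataset_config)
        # Missing or zero-size files become NaN-filled entries so the
        # time axis stays aligned with the requested cycle dates.
        if not os.path.isfile(filename) or os.stat(filename).st_size == 0:
            add_empty(dates[ind], ind)
            continue
        read_real(dates[ind], ind, dataset_config)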
11 changes: 10 additions & 1 deletion src/eva/tests/config/testIodaObsSpaceAmsuaN19_TimeSeries.yaml
@@ -10,6 +10,15 @@ datasets:
- name: ObsValue
variables: &variables [brightnessTemperature]
- name: hofx
+ #Empty
+ - name: experiment
+ type: IodaObsSpace
+ filenames:
+ - ${data_input_path}/ioda_obs_space.amsua_n19.hofx.2020-12-14T000000Z.nc4
+ channels: *channels
+ groups:
+ - name: ObsValue
+ - name: hofx
- name: experiment
type: IodaObsSpace
filenames:
@@ -31,7 +40,7 @@
time_series:

- begin_date: '2020-12-14T21:00:00'
- final_date: '2020-12-15T03:00:00'
+ final_date: '2020-12-15T09:00:00'
interval: 'PT6H'

collection: experiment
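As a side note on the widened window above: begin_date '2020-12-14T21:00:00' with interval 'PT6H' (an ISO 8601 duration, six hours) and final_date '2020-12-15T09:00:00' implies three cycle times. A quick way to enumerate them, sketched with the standard library rather than eva's own date handling:

from datetime import datetime, timedelta

begin = datetime.fromisoformat('2020-12-14T21:00:00')
final = datetime.fromisoformat('2020-12-15T09:00:00')
step = timedelta(hours=6)  # PT6H

cycle = begin
while cycle <= final:
    print(cycle.isoformat())  # 2020-12-14T21:00:00, 2020-12-15T03:00:00, 2020-12-15T09:00:00
    cycle += step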
@@ -11,10 +11,16 @@ datasets:
bias_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-11T21:00:00Z.satbias
lapse_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-11T21:00:00Z.tlapse

+ # Empty
+ - name: experiment
+ type: JediVariationalBiasCorrection
+ bias_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-12T03:00:00Z.satbias
+ lapse_file: ${data_input_path}/gsi.x0048v2.bc.amsua_n19.2021-12-12T03:00:00Z.tlapse

time_series:

- begin_date: '2020-12-15T00:00:00'
- final_date: '2020-12-15T06:00:00'
+ begin_date: '2021-12-11T15:00:00'
+ final_date: '2021-12-12T03:00:00'
interval: 'PT6H'

collection: experiment
69 changes: 6 additions & 63 deletions src/eva/time_series/time_series.py
@@ -13,6 +13,7 @@
import numpy as np
import xarray as xr
from eva.data.data_collections import DataCollections
+ from eva.time_series.time_series_utils import create_empty_data

# --------------------------------------------------------------------------------------------------

@@ -25,71 +26,13 @@

# --------------------------------------------------------------------------------------------------

- def add_empty_to_timeseries(logger, date, time_series_config, dataset_config, data_array,
- data_collections):
+ def add_empty_to_timeseries(logger, date, ind, timing, time_series_config,
+ dataset_config, data_collections):

''' Add empty collection to timeseries for missing date '''
- collection_to_ts = dataset_config['name']
- group_list = [dataset_config['groups'][0]['name']]
- variable_list = dataset_config['groups'][0]['variables']
-
- for idx in range(1, len(dataset_config['groups'])):
- group_list.append(dataset_config['groups'][idx]['name'])
-
- dataset_tmp = DataCollections()
-
- # Add groups and variables to empty dataset.
- for group in group_list:
- for variable in variable_list:
- # Create empty data array
- dataset_tmp.add_variable_to_collection("Empty", group, variable, data_array)
-
- # Extract xarray
- dataset_tmp = dataset_tmp.get_data_collection("Empty")
-
- # Optional: aggregation methods
- aggregation_methods = time_series_config.get('aggregation_methods', [])
-
- # If specifying aggregation methods it must be accompanied by a dimension
- if aggregation_methods:
- logger.assert_abort('dimension' in time_series_config, 'When specifying aggregation '
- 'methods a dimension must also be specified.')
- dimension = time_series_config['dimension']
-
- dataset_aggregated = xr.Dataset()
-
- # If there is no aggregation method specified, just add the dataset to the time series
- if not aggregation_methods:
- dataset_aggregated = xr.merge([dataset_aggregated, dataset_tmp])
- else:
- for aggregation_method in aggregation_methods:
- # Assert that aggregation_method is in the aggregation methods
- logger.assert_abort(aggregation_method in xr_aggregation_methods,
- f'Unknown aggregation method {aggregation_method}')
-
- # Compute the aggregation_method - nan for empty
- dataset_am = xr_aggregation_methods[aggregation_method](dataset_tmp, dim=dimension)
-
- # Append each variable name in dataset_am with _aggregation_method
- rename_dict = {var: f"{var}_{aggregation_method}" for var in dataset_am.data_vars}
- dataset_am = dataset_am.rename(rename_dict)
-
- # Merge all the results into the aggregated dataset
- dataset_aggregated = xr.merge([dataset_aggregated, dataset_am])
-
- # Get all dims of dataset_aggregated and create empty array with those dims
- dims = {dim: dataset_aggregated.sizes[dim] for dim in dataset_aggregated.dims}
- data_array_shape = tuple(dims[dim] for dim in dims)
- dataset_aggregated['MetaData::Dates'] = xr.DataArray(np.full(data_array_shape, date),
- dims=dataset_aggregated.dims)
-
- # Add the time index to the aggregated dataset
- dataset_aggregated = dataset_aggregated.expand_dims('TimeIndex')
- dataset_aggregated['TimeIndex'] = [0]
-
- # Append the dataset with the aggregation
- data_collections.create_or_add_to_collection(f'{collection_to_ts}_time_series',
- dataset_aggregated, 'TimeIndex')
+ empty_data_collection = create_empty_data(time_series_config, dataset_config, timing, logger)
+ collapse_collection_to_time_series(logger, ind, date, time_series_config, data_collections,
+ empty_data_collection)


# --------------------------------------------------------------------------------------------------
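The block removed above had its own copy of the aggregate-and-rename logic for the empty case; it now defers to create_empty_data plus collapse_collection_to_time_series, so empty and real cycles share one code path. For reference, the core xarray idiom the removed code used (aggregate over a dimension, suffix the variable names, tag with a TimeIndex) in isolation; 'mean' and 'count' are illustrative stand-ins for the lookups the removed code made in xr_aggregation_methods:

import numpy as np
import xarray as xr

ds = xr.Dataset({'brightnessTemperature': ('nlocs', np.array([250.0, np.nan, 260.0]))})

aggregated = xr.Dataset()
for method in ['mean', 'count']:
    ds_m = getattr(ds, method)(dim='nlocs')  # e.g. ds.mean(dim='nlocs'); NaNs are skipped
    ds_m = ds_m.rename({var: f'{var}_{method}' for var in ds_m.data_vars})
    aggregated = xr.merge([aggregated, ds_m])

# One row per cycle: add a length-1 TimeIndex dimension before appending.
aggregated = aggregated.expand_dims('TimeIndex')
aggregated['TimeIndex'] = [0]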
43 changes: 43 additions & 0 deletions src/eva/time_series/time_series_utils.py
@@ -0,0 +1,43 @@
import os
import numpy as np
import xarray as xr
from abc import ABC, abstractmethod
from eva.data.data_driver import data_driver
from eva.data.data_collections import DataCollections


filename_retrieval = {
"IodaObsSpace": lambda dataset_config: dataset_config["filenames"][0],
"JediVariationalBiasCorrection": lambda dataset_config: dataset_config["bias_file"],
}


def get_filename(dataset_config, logger):
""" Retrieve filename using given type """

dataset_type = dataset_config["type"]
logger.assert_abort(dataset_type in filename_retrieval,
f'Unknown dataset_type {dataset_type}')
filename = filename_retrieval[dataset_type](dataset_config)
return filename


def check_file(filename, logger):
""" Check if first file exists and is nonzero """

if not os.path.isfile(filename):
logger.abort('First file provided to timeseries must exist.')
elif os.stat(filename).st_size == 0:
logger.abort('First file provided to timeseries must be nonzero.')


def create_empty_data(timeseries_config, dataset_config, timing, logger):
""" Creating an empty data array to use for missing cycle times """
dc_tmp = DataCollections()
collection = timeseries_config["collection"]
data_driver(dataset_config, dc_tmp, timing, logger)
dataset = dc_tmp.get_data_collection(collection)
empty_data = xr.full_like(dataset, np.nan)
dc = DataCollections()
dc.create_or_add_to_collection(collection, empty_data)
return dc
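Because filename_retrieval is a plain dict of accessors keyed by dataset type, supporting another type is a one-line registration and get_filename needs no changes. A hypothetical example (the GsiObsSpace type and its obs_file key are invented here for illustration):

# Hypothetical registration for a new dataset type; not part of this commit.
filename_retrieval['GsiObsSpace'] = lambda dataset_config: dataset_config['obs_file']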
