Merge pull request #32 from aradhakrishnanGFDL/mdtf-support

Mdtf support- Add schema file to config yaml
NOAA-GFDL · Aug 14, 2024 · cb007c1 · cb007c1
2 parents e7d568d + bc07968
commit cb007c1
Show file tree

Hide file tree

Showing 4 changed files with 194 additions and 2 deletions.
diff --git a/catalogbuilder/cats/mdtf_template.json b/catalogbuilder/cats/mdtf_template.json
@@ -0,0 +1,138 @@
+{
+  "esmcat_version": "0.0.1",
+  "attributes": [
+    {
+      "column_name": "activity_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "institution_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "source_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "experiment_id",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "frequency",
+      "vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json",
+      "required": true
+    },
+    {
+      "column_name": "realm",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "table_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "member_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "grid_label",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "variable_id",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "time_range",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "chunk_freq",
+      "required": false
+    },
+    {
+      "column_name":"platform",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name":"target",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+     "column_name": "cell_methods",
+      "vocabulary": "",
+      "required": "enhanced"
+    },
+    {
+      "column_name": "path",
+      "vocabulary": "",
+      "required": true
+    },
+    {
+      "column_name": "dimensions",
+      "vocabulary": "",
+      "required": "enhanced"
+    },
+    {
+      "column_name": "version_id",
+      "vocabulary": "",
+      "required": false
+    },
+    {
+      "column_name": "standard_name",
+      "vocabulary": "",
+      "required": "enhanced"
+    }
+  ],
+  "assets": {
+    "column_name": "path",
+    "format": "netcdf",
+    "format_column_name": null
+  },
+  "aggregation_control": {
+    "variable_column_name": "variable_id",
+    "groupby_attrs": [
+      "source_id",
+      "experiment_id",
+      "frequency",
+      "table_id",
+      "grid_label", 
+      "realm",
+      "member_id",
+      "chunk_freq"
+    ],
+    "aggregations": [
+      {
+        "type": "union",
+        "attribute_name": "variable_id",
+        "options": {}
+      },
+      {
+        "type": "join_existing",
+        "attribute_name": "time_range",
+        "options": {
+          "dim": "time",
+          "coords": "minimal",
+          "compat": "override"
+        }
+      }
+    ]
+  },
+  "id": "esm_catalog_ESM4",
+  "description": null,
+  "title": null,
+  "last_updated": "2023-05-07T16:35:52Z",
+  "catalog_file": "gfdl_autotest.csv"
+}
diff --git a/catalogbuilder/intakebuilder/configparser.py b/catalogbuilder/intakebuilder/configparser.py
@@ -30,4 +30,10 @@ def __init__(self, config):
             print("output_file_template :", self.output_file_template)
         except:
             raise KeyError("output_file_template does not exist in config")
+        try:
+            self.schema = configfile['schema']
+            print("schema:", self.schema)
+        except:
+            self.schema = None
+            pass
 
diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py
@@ -27,7 +27,7 @@
         sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")
 
 package_dir = os.path.dirname(os.path.abspath(__file__))
-template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
+#template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
 
 def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
          overwrite=False, append=False, slow = False):
@@ -42,7 +42,13 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
 
         input_path = configyaml.input_path
         output_path = configyaml.output_path
-
+
+    if config is None or not configyaml.schema:
+            print("We will use catalog builder catalogbuilder/cats/gfdl_template.json as your json schema")
+            template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
+    else:
+            template_path = configyaml.schema
+            print("Using schema from config file", template_path)
     if not os.path.exists(input_path):
         sys.exit("Input path does not exist. Adjust configuration.")
     if not os.path.exists(Path(output_path).parent.absolute()):

diff --git a/catalogbuilder/tests/config-mdtf.yaml b/catalogbuilder/tests/config-mdtf.yaml
@@ -0,0 +1,42 @@
+#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
+#for the fourth value.
+
+#catalog headers
+#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
+#with the ESM collection specification standards and the appropriate workflows.
+
+headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
+                  "frequency", "realm", "table_id",
+                  "member_id", "grid_label", "variable_id",
+                  "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]
+
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
+#for the fourth value.
+
+output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']
+
+output_file_template: ['realm','time_range','variable_id']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
+
+#######################################################
+
+schema: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf_template.json" #if your json schema is slightly different but vetted with MSD, you may use your json schema here
+input_path:  "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
+output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30_test" # ENTER THE BASE NAME OF THE CSV AND JSON, WITHOUT THE FILE EXTENSION, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.