Skip to content

Commit

Permalink
Merge pull request #32 from aradhakrishnanGFDL/mdtf-support
Browse files Browse the repository at this point in the history
Mdtf support- Add schema file to config yaml
  • Loading branch information
ceblanton authored Aug 14, 2024
2 parents e7d568d + bc07968 commit cb007c1
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 2 deletions.
138 changes: 138 additions & 0 deletions catalogbuilder/cats/mdtf_template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
{
"esmcat_version": "0.0.1",
"attributes": [
{
"column_name": "activity_id",
"vocabulary": "",
"required": false
},
{
"column_name": "institution_id",
"vocabulary": "",
"required": false
},
{
"column_name": "source_id",
"vocabulary": "",
"required": false
},
{
"column_name": "experiment_id",
"vocabulary": "",
"required": true
},
{
"column_name": "frequency",
"vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json",
"required": true
},
{
"column_name": "realm",
"vocabulary": "",
"required": true
},
{
"column_name": "table_id",
"vocabulary": "",
"required": false
},
{
"column_name": "member_id",
"vocabulary": "",
"required": false
},
{
"column_name": "grid_label",
"vocabulary": "",
"required": false
},
{
"column_name": "variable_id",
"vocabulary": "",
"required": true
},
{
"column_name": "time_range",
"vocabulary": "",
"required": true
},
{
"column_name": "chunk_freq",
"required": false
},
{
"column_name":"platform",
"vocabulary": "",
"required": false
},
{
"column_name":"target",
"vocabulary": "",
"required": false
},
{
"column_name": "cell_methods",
"vocabulary": "",
"required": "enhanced"
},
{
"column_name": "path",
"vocabulary": "",
"required": true
},
{
"column_name": "dimensions",
"vocabulary": "",
"required": "enhanced"
},
{
"column_name": "version_id",
"vocabulary": "",
"required": false
},
{
"column_name": "standard_name",
"vocabulary": "",
"required": "enhanced"
}
],
"assets": {
"column_name": "path",
"format": "netcdf",
"format_column_name": null
},
"aggregation_control": {
"variable_column_name": "variable_id",
"groupby_attrs": [
"source_id",
"experiment_id",
"frequency",
"table_id",
"grid_label",
"realm",
"member_id",
"chunk_freq"
],
"aggregations": [
{
"type": "union",
"attribute_name": "variable_id",
"options": {}
},
{
"type": "join_existing",
"attribute_name": "time_range",
"options": {
"dim": "time",
"coords": "minimal",
"compat": "override"
}
}
]
},
"id": "esm_catalog_ESM4",
"description": null,
"title": null,
"last_updated": "2023-05-07T16:35:52Z",
"catalog_file": "gfdl_autotest.csv"
}
6 changes: 6 additions & 0 deletions catalogbuilder/intakebuilder/configparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,10 @@ def __init__(self, config):
print("output_file_template :", self.output_file_template)
except:
raise KeyError("output_file_template does not exist in config")
try:
self.schema = configfile['schema']
print("schema:", self.schema)
except:
self.schema = None
pass

10 changes: 8 additions & 2 deletions catalogbuilder/scripts/gen_intake_gfdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? ")

package_dir = os.path.dirname(os.path.abspath(__file__))
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
#template_path = os.path.join(package_dir, '../cats/gfdl_template.json')

def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None,
overwrite=False, append=False, slow = False):
Expand All @@ -42,7 +42,13 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=

input_path = configyaml.input_path
output_path = configyaml.output_path


if config is None or not configyaml.schema:
print("We will use catalog builder catalogbuilder/cats/gfdl_template.json as your json schema")
template_path = os.path.join(package_dir, '../cats/gfdl_template.json')
else:
template_path = configyaml.schema
print("Using schema from config file", template_path)
if not os.path.exists(input_path):
sys.exit("Input path does not exist. Adjust configuration.")
if not os.path.exists(Path(output_path).parent.absolute()):
Expand Down
42 changes: 42 additions & 0 deletions catalogbuilder/tests/config-mdtf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

schema: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf_template.json" #if your json schema is slightly different but vetted with MSD, you may use your json schema here
input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30_test" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.

0 comments on commit cb007c1

Please sign in to comment.