diff --git a/catalogbuilder/cats/mdtf_template.json b/catalogbuilder/cats/mdtf_template.json new file mode 100644 index 0000000..09bac46 --- /dev/null +++ b/catalogbuilder/cats/mdtf_template.json @@ -0,0 +1,138 @@ +{ + "esmcat_version": "0.0.1", + "attributes": [ + { + "column_name": "activity_id", + "vocabulary": "", + "required": false + }, + { + "column_name": "institution_id", + "vocabulary": "", + "required": false + }, + { + "column_name": "source_id", + "vocabulary": "", + "required": false + }, + { + "column_name": "experiment_id", + "vocabulary": "", + "required": true + }, + { + "column_name": "frequency", + "vocabulary": "https://raw.githubusercontent.com/NOAA-GFDL/CMIP6_CVs/master/CMIP6_frequency.json", + "required": true + }, + { + "column_name": "realm", + "vocabulary": "", + "required": true + }, + { + "column_name": "table_id", + "vocabulary": "", + "required": false + }, + { + "column_name": "member_id", + "vocabulary": "", + "required": false + }, + { + "column_name": "grid_label", + "vocabulary": "", + "required": false + }, + { + "column_name": "variable_id", + "vocabulary": "", + "required": true + }, + { + "column_name": "time_range", + "vocabulary": "", + "required": true + }, + { + "column_name": "chunk_freq", + "required": false + }, + { + "column_name":"platform", + "vocabulary": "", + "required": false + }, + { + "column_name":"target", + "vocabulary": "", + "required": false + }, + { + "column_name": "cell_methods", + "vocabulary": "", + "required": "enhanced" + }, + { + "column_name": "path", + "vocabulary": "", + "required": true + }, + { + "column_name": "dimensions", + "vocabulary": "", + "required": "enhanced" + }, + { + "column_name": "version_id", + "vocabulary": "", + "required": false + }, + { + "column_name": "standard_name", + "vocabulary": "", + "required": "enhanced" + } + ], + "assets": { + "column_name": "path", + "format": "netcdf", + "format_column_name": null + }, + "aggregation_control": { + "variable_column_name": 
"variable_id", + "groupby_attrs": [ + "source_id", + "experiment_id", + "frequency", + "table_id", + "grid_label", + "realm", + "member_id", + "chunk_freq" + ], + "aggregations": [ + { + "type": "union", + "attribute_name": "variable_id", + "options": {} + }, + { + "type": "join_existing", + "attribute_name": "time_range", + "options": { + "dim": "time", + "coords": "minimal", + "compat": "override" + } + } + ] + }, + "id": "esm_catalog_ESM4", + "description": null, + "title": null, + "last_updated": "2023-05-07T16:35:52Z", + "catalog_file": "gfdl_autotest.csv" +} diff --git a/catalogbuilder/intakebuilder/configparser.py b/catalogbuilder/intakebuilder/configparser.py index e64bedc..b7c431a 100644 --- a/catalogbuilder/intakebuilder/configparser.py +++ b/catalogbuilder/intakebuilder/configparser.py @@ -30,4 +30,10 @@ def __init__(self, config): print("output_file_template :", self.output_file_template) except: raise KeyError("output_file_template does not exist in config") + try: + self.schema = configfile['schema'] + print("schema:", self.schema) + except: + self.schema = None + pass diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py index 84bb84d..16cdb53 100755 --- a/catalogbuilder/scripts/gen_intake_gfdl.py +++ b/catalogbuilder/scripts/gen_intake_gfdl.py @@ -27,7 +27,7 @@ sys.exit("The module 'intakebuilder' is still not installed. Do you have intakebuilder in your sys.path or have you activated the conda environment with the intakebuilder package in it? 
") package_dir = os.path.dirname(os.path.abspath(__file__)) -template_path = os.path.join(package_dir, '../cats/gfdl_template.json') +#template_path = os.path.join(package_dir, '../cats/gfdl_template.json') def create_catalog(input_path=None, output_path=None, config=None, filter_realm=None, filter_freq=None, filter_chunk=None, overwrite=False, append=False, slow = False): @@ -42,7 +42,13 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm= input_path = configyaml.input_path output_path = configyaml.output_path - + + if config is None or not configyaml.schema: + print("We will use catalog builder catalogbuilder/cats/gfdl_template.json as your json schema") + template_path = os.path.join(package_dir, '../cats/gfdl_template.json') + else: + template_path = configyaml.schema + print("Using schema from config file", template_path) if not os.path.exists(input_path): sys.exit("Input path does not exist. Adjust configuration.") if not os.path.exists(Path(output_path).parent.absolute()): diff --git a/catalogbuilder/tests/config-mdtf.yaml b/catalogbuilder/tests/config-mdtf.yaml new file mode 100644 index 0000000..5a9e0b4 --- /dev/null +++ b/catalogbuilder/tests/config-mdtf.yaml @@ -0,0 +1,42 @@ +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template +#for the fourth value. 
+ +#catalog headers +#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction +#with the ESM collection specification standards and the appropriate workflows. + +headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", + "frequency", "realm", "table_id", + "member_id", "grid_label", "variable_id", + "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] + +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply use NA in output_path_template +#for the fourth value. + +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq'] + +output_file_template: ['realm','time_range','variable_id'] + +#OUTPUT FILE INFO is currently passed as a command-line argument. +#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. 
+#csvfile = #jsonfile = #logfile = + +####################################################### + +schema: "/home/a1r/git/forkCatalogBuilder-/catalogbuilder/cats/mdtf_template.json" #if your json schema is slightly different but vetted with MSD, you may use your json schema here +input_path: "/archive/am5/am5/am5f7b10r0/c96L65_am5f7b10r0_amip/gfdl.ncrc5-deploy-prod-openmp/pp/" +output_path: "/home/a1r/github/noaa-gfdl/catalogs/c96L65_am5f7b10r0_amip30_test" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.