diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py index 34a5e55..75fdf04 100644 --- a/catalogbuilder/intakebuilder/getinfo.py +++ b/catalogbuilder/intakebuilder/getinfo.py @@ -74,7 +74,6 @@ def getInfoFromFilename(filename,dictInfo,logger): ncfilename = filename.split(".")[0].split("_") varname = ncfilename[0] dictInfo["variable"] = varname - miptable = ncfilename[1] dictInfo["mip_table"] = miptable modelname = ncfilename[2] dictInfo["model"] = modelname @@ -250,7 +249,11 @@ def getStandardName(list_variable_id,list_realm): dictCF = {} try: url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv" - df = pd.read_csv(url, sep=",", header=0,index_col=False) + url2 = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip6_vars.csv" + df1 = pd.read_csv(url, sep=",", header=0,index_col=False) + df2 = pd.read_csv(url2, sep=",", header=0,index_col=False) + #TODO Add try catch except for concat operation if concat fails for some reason + df = pd.concat([df1,df2]).drop_duplicates().reset_index(drop=True) except IOError: print("Unable to open file") sys.exit(1) diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py index cc1bdad..4071040 100644 --- a/catalogbuilder/intakebuilder/gfdlcrawler.py +++ b/catalogbuilder/intakebuilder/gfdlcrawler.py @@ -68,6 +68,11 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow): if not filename.endswith(".nc"): logger.debug("FILE does not end with .nc. Skipping", filepath) continue + #if our filename expectations are not met compared to the output_file_path_template in config, skip the loop. 
TODO revisit for statics + if ("static" not in filename): + if ((len(filename.split('.'))-1) != len(set_ftemplate)): + print("Skipping ",filename) + continue logger.info(dirpath+"/"+filename) dictInfo = {} dictInfo = getinfo.getProject(projectdir, dictInfo) diff --git a/configs/config-CM4.5v01_om5b06_piC_noBLING.yaml b/configs/config-CM4.5v01_om5b06_piC_noBLING.yaml new file mode 100644 index 0000000..aadf121 --- /dev/null +++ b/configs/config-CM4.5v01_om5b06_piC_noBLING.yaml @@ -0,0 +1,41 @@ +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template +#for the fourth value. + +#catalog headers +#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction +#with the ESM collection specification standards and the appropriate workflows. + +headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", + "frequency", "realm", "table_id", + "member_id", "grid_label", "variable_id", + "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] + +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. 
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template +#for the fourth value. + +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq'] + +output_file_template: ['realm','time_range','variable_id'] + +#OUTPUT FILE INFO is currently passed as command-line argument. +#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. +#csvfile = #jsonfile = #logfile = + +####################################################### + +input_path: "/archive/John.Krasting/fre/FMS2024.02_OM5_20240724/CM4.5v01_om5b06_piC_noBLING/gfdl.ncrc5-intel23-prod-openmp/pp/" +output_path: "/home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g. catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path) diff --git a/configs/config-CM4_piControl_C.yaml b/configs/config-CM4_piControl_C.yaml new file mode 100644 index 0000000..4f0e464 --- /dev/null +++ b/configs/config-CM4_piControl_C.yaml @@ -0,0 +1,42 @@ +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. 
E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template +#for the fourth value. + +#catalog headers +#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction +#with the ESM collection specification standards and the appropriate workflows. + +headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", + "frequency", "realm", "table_id", + "member_id", "grid_label", "variable_id", + "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] + +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template +#for the fourth value. + +#"/archive/oar.gfdl.cmip6/CM4/warsaw_201710_om4_v1.0.1/CM4_piControl_C/gfdl.ncrc4-intel16-prod-openmp/pp +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq'] + +output_file_template: ['realm','time_range','variable_id'] + +#OUTPUT FILE INFO is currently passed as command-line argument. 
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. +#csvfile = #jsonfile = #logfile = + +####################################################### + +input_path: "/archive/oar.gfdl.cmip6/CM4/warsaw_201710_om4_v1.0.1/CM4_piControl_C/gfdl.ncrc4-intel16-prod-openmp/pp" #"/uda/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/" +output_path: "/home/a1r/github/noaa-gfdl/catalogs/CM4_piControl_C" diff --git a/configs/config-ESM4.5v01_om5b04_piC.yaml b/configs/config-ESM4.5v01_om5b04_piC.yaml new file mode 100644 index 0000000..2f6cfba --- /dev/null +++ b/configs/config-ESM4.5v01_om5b04_piC.yaml @@ -0,0 +1,41 @@ +#what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template +#for the fourth value. + +#catalog headers +#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction +#with the ESM collection specification standards and the appropriate workflows. + +headerlist: ["activity_id", "institution_id", "source_id", "experiment_id", + "frequency", "realm", "table_id", + "member_id", "grid_label", "variable_id", + "time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"] + +#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply set NA in output_path_template +#for the fourth value. + +output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq'] + +output_file_template: ['realm','time_range','variable_id'] + +#OUTPUT FILE INFO is currently passed as command-line argument. +#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. +#csvfile = #jsonfile = #logfile = + +####################################################### + +input_path: "/archive/Eric.Stofferahn/CMIP7/ESM4/DEV/ESM4.5v01_om5b04_piC/gfdl.ncrc5-intel23-prod-openmp/pp/" +output_path: "/home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC" # ENTER NAME OF THE CSV AND JSON, THE SUFFIX ALONE. e.g. catalog (the builder then generates catalog.csv and catalog.json. This can also be an absolute path)