Skip to content

Commit

Permalink
Merge pull request #40 from aradhakrishnanGFDL/39-picontrol
Browse files Browse the repository at this point in the history
39 picontrol
  • Loading branch information
Ciheim authored Aug 22, 2024
2 parents f0b2cff + a6931a4 commit 1c41909
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 2 deletions.
7 changes: 5 additions & 2 deletions catalogbuilder/intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def getInfoFromFilename(filename,dictInfo,logger):
ncfilename = filename.split(".")[0].split("_")
varname = ncfilename[0]
dictInfo["variable"] = varname
miptable = ncfilename[1]
dictInfo["mip_table"] = miptable
modelname = ncfilename[2]
dictInfo["model"] = modelname
Expand Down Expand Up @@ -250,7 +249,11 @@ def getStandardName(list_variable_id,list_realm):
dictCF = {}
try:
url = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip5_vars.csv"
df = pd.read_csv(url, sep=",", header=0,index_col=False)
url2 = "https://raw.githubusercontent.com/NOAA-GFDL/MDTF-diagnostics/b5e7916c203f3ba0b53e9e40fb8dc78ecc2cf5c3/data/gfdl-cmor-tables/gfdl_to_cmip6_vars.csv"
df1 = pd.read_csv(url, sep=",", header=0,index_col=False)
df2 = pd.read_csv(url2, sep=",", header=0,index_col=False)
#TODO Wrap the concat operation in try/except in case the concat fails for some reason
df = pd.concat([df1,df2]).drop_duplicates().reset_index(drop=True)
except IOError:
print("Unable to open file")
sys.exit(1)
Expand Down
5 changes: 5 additions & 0 deletions catalogbuilder/intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
if not filename.endswith(".nc"):
logger.debug("FILE does not end with .nc. Skipping", filepath)
continue
#If the filename does not match the expectations set by output_file_template in the config, skip this file. TODO revisit for statics
if ("static" not in filename):
if ((len(filename.split('.'))-1) != len(set_ftemplate)):
print("Skipping ",filename)
continue
logger.info(dirpath+"/"+filename)
dictInfo = {}
dictInfo = getinfo.getProject(projectdir, dictInfo)
Expand Down
41 changes: 41 additions & 0 deletions configs/config-CM4.5v01_om5b06_piC_noBLING.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/John.Krasting/fre/FMS2024.02_OM5_20240724/CM4.5v01_om5b06_piC_noBLING/gfdl.ncrc5-intel23-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING" # ENTER THE BASE NAME OF THE CSV AND JSON, WITHOUT EXTENSION, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.
42 changes: 42 additions & 0 deletions configs/config-CM4_piControl_C.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

#"/archive/oar.gfdl.cmip6/CM4/warsaw_201710_om4_v1.0.1/CM4_piControl_C/gfdl.ncrc4-intel16-prod-openmp/pp
output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/oar.gfdl.cmip6/CM4/warsaw_201710_om4_v1.0.1/CM4_piControl_C/gfdl.ncrc4-intel16-prod-openmp/pp" #"/uda/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/CM4_piControl_C"
41 changes: 41 additions & 0 deletions configs/config-ESM4.5v01_om5b04_piC.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

#catalog headers
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

headerlist: ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "realm", "table_id",
"member_id", "grid_label", "variable_id",
"time_range", "chunk_freq","platform","dimensions","cell_methods","standard_name","path"]

#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0, which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

output_path_template: ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','realm','cell_methods','frequency','chunk_freq']

output_file_template: ['realm','time_range','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

#######################################################

input_path: "/archive/Eric.Stofferahn/CMIP7/ESM4/DEV/ESM4.5v01_om5b04_piC/gfdl.ncrc5-intel23-prod-openmp/pp/"
output_path: "/home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC" # ENTER THE BASE NAME OF THE CSV AND JSON, WITHOUT EXTENSION, e.g. catalog (the builder then generates catalog.csv and catalog.json). This can also be an absolute path.

0 comments on commit 1c41909

Please sign in to comment.