Skip to content

Commit

Permalink
Merge pull request aradhakrishnanGFDL#44 from aradhakrishnanGFDL/36-b…
Browse files Browse the repository at this point in the history
…uilderconfig

Adding builderconfig back, with the dependent code adjusted to use this.
  • Loading branch information
Ciheim authored Feb 8, 2024
2 parents 007bbdf + 86b3241 commit 8fe86a2
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 10 deletions.
36 changes: 28 additions & 8 deletions intakebuilder/builderconfig.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,40 @@
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

output_path_template = ['source_id','activity_id','experiment_id','platform','custom_pp','modeling_realm','custom_cell_methods','frequency','chunk_freq']

output_file_template = ['modeling_realm','temporal_subset','variable_id']

#catalog headers
#catalog headers
#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
#with the ESM collection specification standards and the appropriate workflows.

# Column names of the catalog/CSV file, in the order listed here.
# NOTE(review): "grid_label" is listed twice (after "member_id" and after "chunk_freq") -- confirm whether
# the duplicate column is intentional before relying on the column count or position.
headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
"frequency", "modeling_realm", "table_id",
"member_id", "grid_label", "variable_id",
"temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]

#OUTPUT FILE
#what kind of directory structure to expect?
#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
# the output_path_template is set as follows.
#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
#this is a valid value in headerlist as well.
#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply use NA in output_path_template
#for the fourth value.

output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']

output_file_template = ['modeling_realm','temporal_subset','variable_id']

#OUTPUT FILE INFO is currently passed as command-line argument.
#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
#csvfile = #jsonfile = #logfile =

csvfile = "/nbhome/a1r/intakebuilder_cats/intake_gfdl.csv"
logfile = "/tmp/intakegfdl.log"
#######################################################
######### ADDITIONAL SEARCH FILTERS ###########################

Expand Down
52 changes: 50 additions & 2 deletions intakebuilder/getinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,32 @@ def getInfoFromFilename(filename,dictInfo,logger):
logger.debug("Filename not compatible with this version of the builder:"+filename)
return dictInfo

#adding this back to trace back some old errors
def getInfoFromGFDLFilename(filename,dictInfo,logger):
    '''
    Extract catalog fields from a GFDL post-processed netCDF file name.

    The expected layout is <realm>.<temporal_subset>.<variable>.nc,
    e.g. atmos.200501-200912.t_ref.nc

    :param filename: name of the file to parse
    :param dictInfo: dictionary updated in place with "variable_id" and "temporal_subset"
    :param logger: logger used to report names that do not end in ".nc"
    :return: dictInfo (unchanged when filename does not end in ".nc")
    '''
    if not filename.endswith(".nc"):
        logger.debug("Filename not compatible with this version of the builder:"+filename)
        return dictInfo

    ncfilename = filename.split(".")
    # The variable name is the token immediately before the ".nc" extension.
    dictInfo["variable_id"] = ncfilename[-2]

    # A name ending in ".nc" always splits into at least two tokens, so
    # ncfilename[1] can never raise IndexError; relying on that exception made
    # the "null" fallback unreachable and stored "nc" or the variable name as
    # the temporal subset. Only a name with realm, time range, variable and
    # extension (four or more tokens) carries a temporal subset at index 1.
    if len(ncfilename) >= 4:
        tsubset = ncfilename[1]
    else:
        tsubset = "null" #For fx fields
    dictInfo["temporal_subset"] = tsubset
    return dictInfo

def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
'''
Returns info from project directory and the DRS path to the file
Expand All @@ -92,6 +118,27 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):

#Grab values based on their expected position in path
stemdir = dirpath.split("/")
# adding back older versions to ensure we get info from builderconfig
stemdir = dirpath.split("/")
nlen = len(builderconfig.output_path_template)
#lets go backwards and match given input directory to the template, add things to dictInfo
j = -1
cnt = 1
for i in range(nlen-1,0,-1):
try:
if(builderconfig.output_path_template[i] != "NA"):
dictInfo[builderconfig.output_path_template[i]] = stemdir[(j)]
except:
sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+builderconfig.output_path_template[i]+stemdir[j])
j = j - 1
cnt = cnt + 1
# We do not want to work with anything that's not time series
if (dictInfo["cell_methods"] != "ts"):
print("Skipping non-timeseries data")
return {}
return dictInfo
'''
if stemdir[len(stemdir)-3] == "ts":
dictInfo['experiment_id'] = stemdir[len(stemdir)-7]
dictInfo['frequency'] = stemdir[len(stemdir)-2]
Expand All @@ -106,6 +153,7 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
return dictInfo
'''

def getInfoFromDRS(dirpath,projectdir,dictInfo):
'''
Expand All @@ -114,8 +162,8 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo):
:param drsstructure:
:return:
'''
stemdir = getStem(dirpath, projectdir)
#stemdir = dirpath.split(projectdir)[1].split("/") # drsstructure is the root
#stemdir = getStem(dirpath, projectdir)
stemdir = dirpath.split(projectdir)[1].split("/") # drsstructure is the root
try:
institute = stemdir[2]
except:
Expand Down
1 change: 1 addition & 0 deletions intakebuilder/gfdlcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
continue
dictInfo["path"]=filepath
dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo)
#sys.exit()
list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
list_bad_chunklabel = ['DO_NOT_USE']
if "source_id" in dictInfo:
Expand Down

0 comments on commit 8fe86a2

Please sign in to comment.