From eac9884dbe823f617bd4a7f7ed212759129f36e1 Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Wed, 24 Jan 2024 17:02:36 -0500 Subject: [PATCH 1/3] Adding builderconfig back, with the dependent code adjusted to use this. --- intakebuilder/builderconfig.py | 5 ++-- intakebuilder/getinfo.py | 52 ++++++++++++++++++++++++++++++++-- intakebuilder/gfdlcrawler.py | 1 + 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py index 1b45449..a8564f6 100644 --- a/intakebuilder/builderconfig.py +++ b/intakebuilder/builderconfig.py @@ -1,6 +1,6 @@ #what kind of directory structure to expect? -output_path_template = ['source_id','activity_id','experiment_id','platform','custom_pp','modeling_realm','custom_cell_methods','frequency','chunk_freq'] +output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] output_file_template = ['modeling_realm','temporal_subset','variable_id'] @@ -13,7 +13,8 @@ #OUTPUT FILE -csvfile = "/nbhome/a1r/intakebuilder_cats/intake_gfdl.csv" +csvfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.csv" +jsonfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.json" logfile = "/tmp/intakegfdl.log" ####################################################### ######### ADDITIONAL SEARCH FILTERS ########################### diff --git a/intakebuilder/getinfo.py b/intakebuilder/getinfo.py index d1634c0..a757847 100644 --- a/intakebuilder/getinfo.py +++ b/intakebuilder/getinfo.py @@ -78,6 +78,32 @@ def getInfoFromFilename(filename,dictInfo,logger): logger.debug("Filename not compatible with this version of the builder:"+filename) return dictInfo +#adding this back to trace back some old errors +def getInfoFromGFDLFilename(filename,dictInfo,logger): + # 5 AR: get the following from the netCDF filename e.g. 
atmos.200501-200912.t_ref.nc + if(filename.endswith(".nc")): + ncfilename = filename.split(".") + varname = ncfilename[-2] + dictInfo["variable_id"] = varname + #miptable = "" #ncfilename[1] + #dictInfo["mip_table"] = miptable + #modelname = ncfilename[2] + #dictInfo["model"] = modelname + #expname = ncfilename[3] + #dictInfo["experiment_id"] = expname + #ens = ncfilename[4] + #dictInfo["ensemble_member"] = ens + #grid = ncfilename[5] + #dictInfo["grid_label"] = grid + try: + tsubset = ncfilename[1] + except IndexError: + tsubset = "null" #For fx fields + dictInfo["temporal_subset"] = tsubset + else: + logger.debug("Filename not compatible with this version of the builder:"+filename) + return dictInfo + def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo): ''' Returns info from project directory and the DRS path to the file @@ -92,6 +118,27 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo): #Grab values based on their expected position in path stemdir = dirpath.split("/") + # adding back older versions to ensure we get info from builderconfig + stemdir = dirpath.split("/") + nlen = len(builderconfig.output_path_template) + #lets go backwards and match given input directory to the template, add things to dictInfo + j = -1 + cnt = 1 + for i in range(nlen-1,0,-1): + try: + if(builderconfig.output_path_template[i] != "NA"): + dictInfo[builderconfig.output_path_template[i]] = stemdir[(j)] + except: + sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+builderconfig.output_path_template[i]+stemdir[j]) + j = j - 1 + cnt = cnt + 1 + # We do not want to work with anything + # that's not time series + if (dictInfo["cell_methods"] != "ts"): + print("Skipping non-timeseries data") + return {} + return dictInfo + ''' if stemdir[len(stemdir)-3] == "ts": dictInfo['experiment_id'] = stemdir[len(stemdir)-7] dictInfo['frequency'] = stemdir[len(stemdir)-2] @@ -106,6 +153,7 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo): return dictInfo + ''' def 
getInfoFromDRS(dirpath,projectdir,dictInfo): ''' @@ -114,8 +162,8 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo): :param drsstructure: :return: ''' - stemdir = getStem(dirpath, projectdir) - #stemdir = dirpath.split(projectdir)[1].split("/") # drsstructure is the root + #stemdir = getStem(dirpath, projectdir) + stemdir = dirpath.split(projectdir)[1].split("/") # drsstructure is the root try: institute = stemdir[2] except: diff --git a/intakebuilder/gfdlcrawler.py b/intakebuilder/gfdlcrawler.py index 6fc106d..49b0442 100644 --- a/intakebuilder/gfdlcrawler.py +++ b/intakebuilder/gfdlcrawler.py @@ -37,6 +37,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger): continue dictInfo["path"]=filepath dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo) + #sys.exit() list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"] list_bad_chunklabel = ['DO_NOT_USE'] if "source_id" in dictInfo: From e58b0fa02d51c90d1268a4c8f6ee90394bf60e58 Mon Sep 17 00:00:00 2001 From: Aparna Radhakrishnan Date: Wed, 24 Jan 2024 17:11:36 -0500 Subject: [PATCH 2/3] doc added to builderconfig.py --- intakebuilder/builderconfig.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py index a8564f6..8892f05 100644 --- a/intakebuilder/builderconfig.py +++ b/intakebuilder/builderconfig.py @@ -1,4 +1,12 @@ #what kind of directory structure to expect? +#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. 
The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply put NA in output_path_template +#for the fourth value. output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] From 86b3241228a4bd7e017137c9c838f792b0144265 Mon Sep 17 00:00:00 2001 From: Aparna Date: Wed, 7 Feb 2024 11:56:09 -0500 Subject: [PATCH 3/3] config adjusted to rearrange header info, remove csv etc configs, add more docs. --- intakebuilder/builderconfig.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py index 8892f05..c5b7a23 100644 --- a/intakebuilder/builderconfig.py +++ b/intakebuilder/builderconfig.py @@ -8,22 +8,33 @@ #The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply put NA in output_path_template #for the fourth value. -output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] - -output_file_template = ['modeling_realm','temporal_subset','variable_id'] - -#catalog headers +#catalog headers +#The headerlist is expected column names in your catalog/csv file. This is usually determined by the users in conjunction +#with the ESM collection specification standards and the appropriate workflows. headerlist = ["activity_id", "institution_id", "source_id", "experiment_id", "frequency", "modeling_realm", "table_id", "member_id", "grid_label", "variable_id", "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"] -#OUTPUT FILE +#what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp +# the output_path_template is set as follows. +#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we +#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example +#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure +#this is a valid value in headerlist as well. +#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply put NA in output_path_template +#for the fourth value. + +output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq'] + +output_file_template = ['modeling_realm','temporal_subset','variable_id'] + +#OUTPUT FILE INFO is currently passed as a command-line argument. +#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future. +#csvfile = #jsonfile = #logfile = -csvfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.csv" -jsonfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.json" -logfile = "/tmp/intakegfdl.log" ####################################################### ######### ADDITIONAL SEARCH FILTERS ###########################