Merge pull request aradhakrishnanGFDL#44 from aradhakrishnanGFDL/36-b…

…uilderconfig Adding builderconfig back, with the dependent code adjusted to use this.
Ciheim · Feb 8, 2024 · 8fe86a2 · 8fe86a2
2 parents 007bbdf + 86b3241
commit 8fe86a2
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 10 deletions.
diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py
@@ -1,20 +1,40 @@
 #what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We use NA for those values that do not match up with any of the expected headerlist (CSV columns); otherwise we
+#simply specify the associated header name in the appropriate place. E.g. the third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in output_path_template
+#for the fourth value.
 
-output_path_template = ['source_id','activity_id','experiment_id','platform','custom_pp','modeling_realm','custom_cell_methods','frequency','chunk_freq']
-
-output_file_template = ['modeling_realm','temporal_subset','variable_id']
-
-#catalog headers 
+#catalog headers
+#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
+#with the ESM collection specification standards and the appropriate workflows.
 
 headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
                   "frequency", "modeling_realm", "table_id",
                   "member_id", "grid_label", "variable_id",
                   "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
 
-#OUTPUT FILE  
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We use NA for those values that do not match up with any of the expected headerlist (CSV columns); otherwise we
+#simply specify the associated header name in the appropriate place. E.g. the third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in output_path_template
+#for the fourth value.
+
+output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
+
+output_file_template = ['modeling_realm','temporal_subset','variable_id']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
 
-csvfile = "/nbhome/a1r/intakebuilder_cats/intake_gfdl.csv" 
-logfile = "/tmp/intakegfdl.log"
 #######################################################
 ######### ADDITIONAL SEARCH FILTERS ###########################
 

diff --git a/intakebuilder/getinfo.py b/intakebuilder/getinfo.py
@@ -78,6 +78,32 @@ def getInfoFromFilename(filename,dictInfo,logger):
         logger.debug("Filename not compatible with this version of the builder:"+filename)
     return dictInfo
 
+#adding this back to trace back some old errors
+def getInfoFromGFDLFilename(filename,dictInfo,logger):
+    # 5 AR: get the following from the netCDF filename e.g. atmos.200501-200912.t_ref.nc
+    if(filename.endswith(".nc")):
+        ncfilename = filename.split(".")
+        varname = ncfilename[-2]
+        dictInfo["variable_id"] = varname
+        #miptable = "" #ncfilename[1]
+        #dictInfo["mip_table"] = miptable
+        #modelname = ncfilename[2]
+        #dictInfo["model"] = modelname
+        #expname = ncfilename[3]
+        #dictInfo["experiment_id"] = expname
+        #ens = ncfilename[4]
+        #dictInfo["ensemble_member"] = ens
+        #grid = ncfilename[5]
+        #dictInfo["grid_label"] = grid
+        try:
+           tsubset = ncfilename[1]
+        except IndexError:
+           tsubset = "null" #For fx fields
+        dictInfo["temporal_subset"] = tsubset
+    else:
+        logger.debug("Filename not compatible with this version of the builder:"+filename)
+    return dictInfo
+
 def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
     '''
     Returns info from project directory and the DRS path to the file
@@ -92,6 +118,27 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
 
 #Grab values based on their expected position in path 
     stemdir = dirpath.split("/")
+   # adding back older versions to ensure we get info from builderconfig
+    stemdir = dirpath.split("/")
+    nlen = len(builderconfig.output_path_template)
+    #lets go backwards and match given input directory to the template, add things to dictInfo
+    j = -1
+    cnt = 1
+    for i in range(nlen-1,0,-1):
+      try:
+          if(builderconfig.output_path_template[i] != "NA"):
+             dictInfo[builderconfig.output_path_template[i]] = stemdir[(j)]
+      except:
+          sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+builderconfig.output_path_template[i]+stemdir[j])
+      j = j - 1
+    cnt = cnt + 1
+    # We do not want to work with anything
+    # that's not time series
+    if (dictInfo["cell_methods"] != "ts"):
+       print("Skipping non-timeseries data")
+       return {}
+    return dictInfo
+    '''
     if stemdir[len(stemdir)-3] == "ts":
         dictInfo['experiment_id'] = stemdir[len(stemdir)-7]
         dictInfo['frequency'] = stemdir[len(stemdir)-2]
@@ -106,6 +153,7 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
 
 
     return dictInfo
+    '''
 
 def getInfoFromDRS(dirpath,projectdir,dictInfo):
     '''
@@ -114,8 +162,8 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo):
     :param drsstructure:
     :return:
     '''
-    stemdir = getStem(dirpath, projectdir)
-    #stemdir = dirpath.split(projectdir)[1].split("/")  # drsstructure is the root
+    #stemdir = getStem(dirpath, projectdir)
+    stemdir = dirpath.split(projectdir)[1].split("/")  # drsstructure is the root
     try:
         institute = stemdir[2]
     except:

diff --git a/intakebuilder/gfdlcrawler.py b/intakebuilder/gfdlcrawler.py
@@ -37,6 +37,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
                     continue
                dictInfo["path"]=filepath
                dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo)
+               #sys.exit()
                list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
                list_bad_chunklabel = ['DO_NOT_USE']
                if "source_id" in dictInfo: