From eac9884dbe823f617bd4a7f7ed212759129f36e1 Mon Sep 17 00:00:00 2001
From: Aparna Radhakrishnan <aparna.radhakrishnan@noaa.gov>
Date: Wed, 24 Jan 2024 17:02:36 -0500
Subject: [PATCH 1/3] Adding builderconfig back, with the dependent code
 adjusted to use this.

---
 intakebuilder/builderconfig.py |  5 ++--
 intakebuilder/getinfo.py       | 52 ++++++++++++++++++++++++++++++++--
 intakebuilder/gfdlcrawler.py   |  1 +
 3 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py
index 1b45449..a8564f6 100644
--- a/intakebuilder/builderconfig.py
+++ b/intakebuilder/builderconfig.py
@@ -1,6 +1,6 @@
 #what kind of directory structure to expect? 
 
-output_path_template = ['source_id','activity_id','experiment_id','platform','custom_pp','modeling_realm','custom_cell_methods','frequency','chunk_freq']
+output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
 
 output_file_template = ['modeling_realm','temporal_subset','variable_id']
 
@@ -13,7 +13,8 @@
 
 #OUTPUT FILE  
 
-csvfile = "/nbhome/a1r/intakebuilder_cats/intake_gfdl.csv" 
+csvfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.csv"
+jsonfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.json"
 logfile = "/tmp/intakegfdl.log"
 #######################################################
 ######### ADDITIONAL SEARCH FILTERS ###########################
diff --git a/intakebuilder/getinfo.py b/intakebuilder/getinfo.py
index d1634c0..a757847 100644
--- a/intakebuilder/getinfo.py
+++ b/intakebuilder/getinfo.py
@@ -78,6 +78,32 @@ def getInfoFromFilename(filename,dictInfo,logger):
         logger.debug("Filename not compatible with this version of the builder:"+filename)
     return dictInfo
 
+#adding this back to trace back some old errors
+def getInfoFromGFDLFilename(filename,dictInfo,logger):
+    # 5 AR: get the following from the netCDF filename e.g. atmos.200501-200912.t_ref.nc
+    if(filename.endswith(".nc")):
+        ncfilename = filename.split(".")
+        varname = ncfilename[-2]
+        dictInfo["variable_id"] = varname
+        #miptable = "" #ncfilename[1]
+        #dictInfo["mip_table"] = miptable
+        #modelname = ncfilename[2]
+        #dictInfo["model"] = modelname
+        #expname = ncfilename[3]
+        #dictInfo["experiment_id"] = expname
+        #ens = ncfilename[4]
+        #dictInfo["ensemble_member"] = ens
+        #grid = ncfilename[5]
+        #dictInfo["grid_label"] = grid
+        try:
+           tsubset = ncfilename[1]
+        except IndexError:
+           tsubset = "null" #For fx fields
+        dictInfo["temporal_subset"] = tsubset
+    else:
+        logger.debug("Filename not compatible with this version of the builder:"+filename)
+    return dictInfo
+
 def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
     '''
     Returns info from project directory and the DRS path to the file
@@ -92,6 +118,27 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
  
 #Grab values based on their expected position in path 
     stemdir = dirpath.split("/")
+   # adding back older versions to ensure we get info from builderconfig
+    stemdir = dirpath.split("/")
+    nlen = len(builderconfig.output_path_template)
+    #lets go backwards and match given input directory to the template, add things to dictInfo
+    j = -1
+    cnt = 1
+    for i in range(nlen-1,0,-1):
+      try:
+          if(builderconfig.output_path_template[i] != "NA"):
+             dictInfo[builderconfig.output_path_template[i]] = stemdir[(j)]
+      except:
+          sys.exit("oops in getInfoFromGFDLDRS"+str(i)+str(j)+builderconfig.output_path_template[i]+stemdir[j])
+      j = j - 1
+    cnt = cnt + 1
+    # We do not want to work with anything
+    # that's not time series
+    if (dictInfo["cell_methods"] != "ts"):
+       print("Skipping non-timeseries data")
+       return {}
+    return dictInfo
+    '''
     if stemdir[len(stemdir)-3] == "ts":
         dictInfo['experiment_id'] = stemdir[len(stemdir)-7]
         dictInfo['frequency'] = stemdir[len(stemdir)-2]
@@ -106,6 +153,7 @@ def getInfoFromGFDLDRS(dirpath,projectdir,dictInfo):
 
 
     return dictInfo
+    '''
 
 def getInfoFromDRS(dirpath,projectdir,dictInfo):
     '''
@@ -114,8 +162,8 @@ def getInfoFromDRS(dirpath,projectdir,dictInfo):
     :param drsstructure:
     :return:
     '''
-    stemdir = getStem(dirpath, projectdir)
-    #stemdir = dirpath.split(projectdir)[1].split("/")  # drsstructure is the root
+    #stemdir = getStem(dirpath, projectdir)
+    stemdir = dirpath.split(projectdir)[1].split("/")  # drsstructure is the root
     try:
         institute = stemdir[2]
     except:
diff --git a/intakebuilder/gfdlcrawler.py b/intakebuilder/gfdlcrawler.py
index 6fc106d..49b0442 100644
--- a/intakebuilder/gfdlcrawler.py
+++ b/intakebuilder/gfdlcrawler.py
@@ -37,6 +37,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger):
                     continue
                dictInfo["path"]=filepath
                dictInfo = getinfo.getInfoFromGFDLDRS(dirpath, projectdir, dictInfo)
+               #sys.exit()
                list_bad_modellabel = ["","piControl","land-hist","piClim-SO2","abrupt-4xCO2","hist-piAer","hist-piNTCF","piClim-ghg","piClim-OC","hist-GHG","piClim-BC","1pctCO2"]
                list_bad_chunklabel = ['DO_NOT_USE']
                if "source_id" in dictInfo: 

From e58b0fa02d51c90d1268a4c8f6ee90394bf60e58 Mon Sep 17 00:00:00 2001
From: Aparna Radhakrishnan <aparna.radhakrishnan@noaa.gov>
Date: Wed, 24 Jan 2024 17:11:36 -0500
Subject: [PATCH 2/3] doc added to builderconfig.py

---
 intakebuilder/builderconfig.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py
index a8564f6..8892f05 100644
--- a/intakebuilder/builderconfig.py
+++ b/intakebuilder/builderconfig.py
@@ -1,4 +1,12 @@
 #what kind of directory structure to expect? 
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We have NA in those values that do not match up with any of the expected headerlist (CSV columns), otherwise we
+#simply specify the associated header name in the appropriate place. E.g. The third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
+#for the fourth value.
 
 output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
 

From 86b3241228a4bd7e017137c9c838f792b0144265 Mon Sep 17 00:00:00 2001
From: Aparna <aparna.radhakrishnan@noaa.gov>
Date: Wed, 7 Feb 2024 11:56:09 -0500
Subject: [PATCH 3/3] config adjusted to rearrange header info, remove csv etc
 configs, add more docs.

---
 intakebuilder/builderconfig.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/intakebuilder/builderconfig.py b/intakebuilder/builderconfig.py
index 8892f05..c5b7a23 100644
--- a/intakebuilder/builderconfig.py
+++ b/intakebuilder/builderconfig.py
@@ -8,22 +8,33 @@
 #The fourth directory is am5f3b1r0 which does not map to an existing header value. So we simply NA in output_path_template
 #for the fourth value.
 
-output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
-
-output_file_template = ['modeling_realm','temporal_subset','variable_id']
-
-#catalog headers 
+#catalog headers
+#The headerlist is the list of expected column names in your catalog/csv file. This is usually determined by the users in conjunction
+#with the ESM collection specification standards and the appropriate workflows.
 
 headerlist = ["activity_id", "institution_id", "source_id", "experiment_id",
                   "frequency", "modeling_realm", "table_id",
                   "member_id", "grid_label", "variable_id",
                   "temporal_subset", "chunk_freq","grid_label","platform","dimensions","cell_methods","path"]
 
-#OUTPUT FILE  
+#what kind of directory structure to expect?
+#For a directory structure like /archive/am5/am5/am5f3b1r0/c96L65_am5f3b1r0_pdclim1850F/gfdl.ncrc5-deploy-prod-openmp/pp
+# the output_path_template is set as follows.
+#We put NA in those positions that do not match any of the expected headerlist values (CSV columns); otherwise we
+#simply specify the associated header name in the appropriate place. E.g. the third directory in the PP path example
+#above is the model (source_id), so the third list value in output_path_template is set to 'source_id'. We make sure
+#this is a valid value in headerlist as well.
+#The fourth directory is am5f3b1r0, which does not map to an existing header value, so we simply use NA in output_path_template
+#for the fourth value.
+
+output_path_template = ['NA','NA','source_id','NA','experiment_id','platform','custom_pp','modeling_realm','cell_methods','frequency','chunk_freq']
+
+output_file_template = ['modeling_realm','temporal_subset','variable_id']
+
+#OUTPUT FILE INFO is currently passed as command-line argument.
+#We will revisit adding a csvfile, jsonfile and logfile configuration to the builder configuration file in the future.
+#csvfile =  #jsonfile =  #logfile =
 
-csvfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.csv"
-jsonfile = "/nbhome/a1r/intakebuilder_cats/test_catalog.json"
-logfile = "/tmp/intakegfdl.log"
 #######################################################
 ######### ADDITIONAL SEARCH FILTERS ###########################