From 7037f6c72dc83c2153d6424ae5ebbd0d47bd9401 Mon Sep 17 00:00:00 2001
From: Alice Crawford
Date: Fri, 8 Dec 2023 10:39:10 -0500
Subject: [PATCH 1/9] hysplit.py added check_attrs to the ModelBin and
 combine_dataset to make sure that the level heights attribute is a list and
 not a numpy array. The numpy array attributes don't write/read well from a
 netcdf file.

---
 monetio/models/hysplit.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index f2199ac9..e5819cb7 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -37,6 +37,7 @@
 2023 12 Jan AMC get_thickness modified to calculate if the attribute specifying the vertical levels is bad
 2023 03 Mar AMC get_latlon modified. replace x>=180 with x>=180+lon_tolerance
 2023 03 Mar AMC get_latlongrid improved exception statements
+2023 08 Dec AMC add check_attributes to ModelBin and combine_dataset to make sure level height attribute is a list


 """
@@ -619,7 +620,7 @@ class and need to be converted to a python
                             # then merge with main dataframe.
                             # self.dset = xr.concat([self.dset, dset],'levels')
                             # self.dset = xr.merge([self.dset, dset],compat='override')
-                            self.dset = xr.merge([self.dset, dset])
+                            self.dset = xr.merge([self.dset, dset],join='outer')
                             # self.dset = xr.combine_by_coords([self.dset, dset])
                             # self.dset = xr.merge([self.dset, dset], compat='override')
                     iimax += 1
@@ -640,6 +641,7 @@ class and need to be converted to a python
         if not self.dset.any():
             return False
         if self.dset.variables:
+            self.atthash = check_attributes(self.atthash)
             self.dset.attrs = self.atthash
             # mgrid = self.makegrid(self.dset.coords["x"], self.dset.coords["y"])
             mgrid = get_latlongrid(self.gridhash, self.dset.coords["x"], self.dset.coords["y"])
@@ -805,8 +807,12 @@ def combine_dataset(
     # and use that for the new coordinates.
     newhxr = reset_latlon_coords(newhxr)

-    for key in keylist:
-        newhxr = newhxr.assign_attrs({key: hxr.attrs[key]})
+    # make sure all np.ndarray in the attributes are converted to lists.
+    # the np arrays don't save well to netcdf file for some reason.
+    attrs = check_attributes(hxr.attrs)
+    newhxr = newhxr.assign_attrs(attrs)
+    # for key in keylist:
+    #     newhxr = newhxr.assign_attrs({key: hxr.attrs[key]})
     if check_grid:
         rval = fix_grid_continuity(newhxr)
     else:
@@ -1115,6 +1121,7 @@ def add_species(dset, species=None):
         ppp += 1  # End of loop adding all species
     atthash = dset.attrs
     atthash["Species ID"] = sflist
+    atthash = check_attributes(atthash)
     total_par = total_par.assign_attrs(atthash)
     return total_par

From 6da541cb3eab46c35dacbe1b8be5cfbfc2dcd5bf Mon Sep 17 00:00:00 2001
From: Alice Crawford
Date: Wed, 3 Apr 2024 07:16:08 -0400
Subject: [PATCH 2/9] added hysplit_experimental.py for fix to combine_dataset

---
 monetio/models/hysplit_experimental.py | 1331 ++++++++++++++++++++++++
 1 file changed, 1331 insertions(+)
 create mode 100644 monetio/models/hysplit_experimental.py

diff --git a/monetio/models/hysplit_experimental.py b/monetio/models/hysplit_experimental.py
new file mode 100644
index 00000000..6014421c
--- /dev/null
+++ b/monetio/models/hysplit_experimental.py
@@ -0,0 +1,1331 @@
+"""
+HYSPLIT MODEL READER for binary concentration (cdump) output files
+
+This code developed at the NOAA Air Resources Laboratory.
+Alice Crawford +Allison Ring + +------------- +Functions: +------------- +open_dataset : +combine_dataset : +get_latlongrid : +getlatlon : +hysp_heights: determines ash top height from HYSPLIT +hysp_massload: determines total mass loading from HYSPLIT +calc_aml: determines ash mass loading for each altitude layer from HYSPLIT +hysp_thresh: calculates mask array for ash mass loading threshold from HYSPLIT +add_species(dset): adds concentrations due to different species. + + +-------- +Classes +-------- +ModelBin + +Change log + +2021 13 May AMC get_latlongrid needed to be updated to match makegrid method. +2022 14 Nov AMC initialized self.dset in __init__() in ModelBin class +2022 14 Nov AMC modified fix_grid_continuity to not fail if passed empty Dataset. +2022 02 Dec AMC modified get_latlongrid inputs. do not need to input dataset, just dictionary. +2022 02 Dec AMC replaced makegrid method with get_latlongrid function to reduce duplicate code. +2022 02 Dec AMC get_latlongrid function utilizes getlatlon to reduce duplicate code. +2022 02 Dec AMC replaced np.arange with np.linspace in getlatlon. np.arange is unstable when step is not an integer. +2023 12 Jan AMC modified reset_latlon_coords so will work with data-arrays that have no latitude longitude coordinate. +2023 12 Jan AMC get_thickness modified to calculate if the attribute specifying the vertical levels is bad +2023 03 Mar AMC get_latlon modified. replace x>=180 with x>=180+lon_tolerance +2023 03 Mar AMC get_latlongrid improved exception statements +2023 08 Dec AMC add check_attributes to ModelBin and combine_datatset to make sure level height attribute is a list + + +""" +import datetime +import sys + +import numpy as np +import pandas as pd +import xarray as xr + + +def open_dataset( + fname, + drange=None, + century=None, + verbose=False, + sample_time_stamp="start", + check_grid=True, +): + """Short summary. + + Parameters + ---------- + fname : string + Name of "cdump" file. Binary HYSPLIT concentration output file. + + drange : list of two datetime objects + cdump file contains concentration as function of time. The drange + specifies what times should be loaded from the file. A value of None + will result in all times being loaded. + + century : integer (1900 or 2000) + + verbose : boolean + If True will print out extra messages + + sample_time_stamp : str + if 'end' then time in xarray will be the end of sampling time period. + else time is start of sampling time period. + + check_grid : boolean + if True call fix_grid_continuity to check to see that + xindx and yindx values are sequential (e.g. not [1,2,3,4,5,7]). + If they are not, then add missing values to the xarray.. + + Returns + ------- + dset : xarray DataSet + + CHANGES for PYTHON 3 + For python 3 the numpy char4 are read in as a numpy.bytes_ class and need to + be converted to a python + string by using decode('UTF-8'). + """ + # open the dataset using xarray + binfile = ModelBin( + fname, + drange=drange, + century=century, + verbose=verbose, + readwrite="r", + sample_time_stamp=sample_time_stamp, + ) + dset = binfile.dset + if check_grid: + return fix_grid_continuity(dset) + else: + return dset + + +def check_drange(drange, pdate1, pdate2): + """ + drange : list of two datetimes + pdate1 : datetime + pdate2 : datetime + + Returns + savedata : boolean + + + returns True if drange is between pdate1 and pdate2 + """ + savedata = True + testf = True + # if pdate1 is within drange then save the data. + # AND if pdate2 is within drange then save the data. 
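+    # (hypothetical example: with drange=[datetime.datetime(2023,1,1,0),
+    # datetime.datetime(2023,1,2,0)], a sampling period is kept only if it
+    # starts within that window and also ends by drange[1].)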
+ # if drange[0] > pdate1 then stop looping to look for more data + # this block sets savedata to true if data within specified time + # range or time range not specified + if drange is None: + savedata = True + elif pdate1 >= drange[0] and pdate1 <= drange[1] and pdate2 <= drange[1]: + savedata = True + elif pdate1 > drange[1] or pdate2 > drange[1]: + testf = False + savedata = False + else: + savedata = False + # END block + # if verbose: + # print(savedata, 'DATES :', pdate1, pdate2) + return testf, savedata + + +class ModelBin: + """ + represents a binary cdump (concentration) output file from HYSPLIT + methods: + readfile - opens and reads contents of cdump file into an xarray + self.dset + """ + + def __init__( + self, + filename, + drange=None, + century=None, + verbose=True, + readwrite="r", + sample_time_stamp="start", + ): + """ + drange : list of two datetime objects. + The read method will store data from the cdump file for which the + sample start is greater thand drange[0] and less than drange[1] + for which the sample stop is less than drange[1]. + + sample_time_stamp : str + if 'end' - time in xarray will indicate end of sampling time. + else - time in xarray will indicate start of sampling time. + century : integer + verbose : boolean + read + + """ + self.drange = drange + self.filename = filename + self.century = century + self.verbose = verbose + # list of tuples (date1, date2) of averaging periods with zero + # concentrations + self.zeroconcdates = [] + # list of tuples of averaging periods with nonzero concentrtations] + self.nonzeroconcdates = [] + self.atthash = {} + self.atthash["Starting Latitudes"] = [] + self.atthash["Starting Longitudes"] = [] + self.atthash["Starting Heights"] = [] + self.atthash["Source Date"] = [] + self.sample_time_stamp = sample_time_stamp + self.gridhash = {} + # self.llcrnr_lon = None + # self.llcrnr_lat = None + # self.nlat = None + # self.nlon = None + # self.dlat = None + # self.dlon = None + self.levels = None + self.dset = xr.Dataset() + + if readwrite == "r": + if verbose: + print("reading " + filename) + self.dataflag = self.readfile(filename, drange, verbose=verbose, century=century) + + @staticmethod + def define_struct(): + """Each record in the fortran binary begins and ends with 4 bytes which + specify the length of the record. These bytes are called pad below. + They are not used here, but are thrown out. The following block defines + a numpy dtype object for each record in the binary file.""" + from numpy import dtype + + real4 = ">f" + int4 = ">i" + int2 = ">i2" + char4 = ">a4" + + rec1 = dtype( + [ + ("pad1", int4), + ("model_id", char4), # meteorological model id + ("met_year", int4), # meteorological model starting time + ("met_month", int4), + ("met_day", int4), + ("met_hr", int4), + ("met_fhr", int4), # forecast hour + ("start_loc", int4), # number of starting locations + ("conc_pack", int4), # concentration packing flag (0=no, 1=yes) + ("pad2", int4), + ] + ) + + # start_loc in rec1 tell how many rec there are. 
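+        # (rec2 is read once per starting location - see the readfile method,
+        # which reads it with count=nstartloc.)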
+ rec2 = dtype( + [ + ("pad1", int4), + ("r_year", int4), # release starting time + ("r_month", int4), + ("r_day", int4), + ("r_hr", int4), + ("s_lat", real4), # Release location + ("s_lon", real4), + ("s_ht", real4), + ("r_min", int4), # release startime time (minutes) + ("pad2", int4), + ] + ) + + rec3 = dtype( + [ + ("pad1", int4), + ("nlat", int4), + ("nlon", int4), + ("dlat", real4), + ("dlon", real4), + ("llcrnr_lat", real4), + ("llcrnr_lon", real4), + ("pad2", int4), + ] + ) + + rec4a = dtype( + [ + ("pad1", int4), + ("nlev", int4), # number of vertical levels in concentration grid + ] + ) + + rec4b = dtype([("levht", int4)]) # height of each level (meters above ground) + + rec5a = dtype( + [ + ("pad1", int4), + ("pad2", int4), + ("pollnum", int4), # number of different pollutants + ] + ) + + rec5b = dtype([("pname", char4)]) # identification string for each pollutant + + rec5c = dtype([("pad2", int4)]) + + rec6 = dtype( + [ + ("pad1", int4), + ("oyear", int4), # sample start time. + ("omonth", int4), + ("oday", int4), + ("ohr", int4), + ("omin", int4), + ("oforecast", int4), + ("pad3", int4), + ] + ) + + # rec7 has same form as rec6. #sample stop time. + + # record 8 is pollutant type identification string, output level. + + rec8a = dtype( + [ + ("pad1", int4), + ("poll", char4), # pollutant identification string + ("lev", int4), + ("ne", int4), # number of elements + ] + ) + + rec8b = dtype( + [ + ("indx", int2), # longitude index + ("jndx", int2), # latitude index + ("conc", real4), + ] + ) + + rec8c = dtype([("pad2", int4)]) + recs = ( + rec1, + rec2, + rec3, + rec4a, + rec4b, + rec5a, + rec5b, + rec5c, + rec6, + rec8a, + rec8b, + rec8c, + ) + return recs + + def parse_header(self, hdata1): + """ + hdata1 : dtype + Returns + nstartloc : int + number of starting locations in file. + """ + if len(hdata1["start_loc"]) != 1: + print("WARNING in ModelBin _readfile - number of starting locations " "incorrect") + print(hdata1["start_loc"]) + # in python 3 np.fromfile reads the record into a list even if it is + # just one number. + # so if the length of this record is greater than one something is + # wrong. + nstartloc = hdata1["start_loc"][0] + self.atthash["Meteorological Model ID"] = hdata1["model_id"][0].decode("UTF-8") + self.atthash["Number Start Locations"] = nstartloc + return nstartloc + + def parse_hdata2(self, hdata2, nstartloc, century): + # Loop through starting locations + for nnn in range(0, nstartloc): + # create list of starting latitudes, longitudes and heights. 
+ lat = hdata2["s_lat"][nnn] + lon = hdata2["s_lon"][nnn] + hgt = hdata2["s_ht"][nnn] + + self.atthash["Starting Latitudes"].append(lat) + self.atthash["Starting Longitudes"].append(lon) + self.atthash["Starting Heights"].append(hgt) + + # try to guess century if century not given + if century is None: + if hdata2["r_year"][0] < 50: + century = 2000 + else: + century = 1900 + print("WARNING: Guessing Century for HYSPLIT concentration file", century) + # add sourcedate which is datetime.datetime object + sourcedate = datetime.datetime( + century + hdata2["r_year"][nnn], + hdata2["r_month"][nnn], + hdata2["r_day"][nnn], + hdata2["r_hr"][nnn], + hdata2["r_min"][nnn], + ) + + self.atthash["Source Date"].append(sourcedate.strftime("%Y%m%d.%H%M%S")) + + return century + + def parse_hdata3(self, hdata3): + # Description of concentration grid + ahash = {} + ahash["Number Lat Points"] = hdata3["nlat"][0] + ahash["Number Lon Points"] = hdata3["nlon"][0] + ahash["Latitude Spacing"] = hdata3["dlat"][0] + ahash["Longitude Spacing"] = hdata3["dlon"][0] + ahash["llcrnr longitude"] = hdata3["llcrnr_lon"][0] + ahash["llcrnr latitude"] = hdata3["llcrnr_lat"][0] + # self.llcrnr_lon = hdata3["llcrnr_lon"][0] + # self.llcrnr_lat = hdata3["llcrnr_lat"][0] + # self.nlat = hdata3["nlat"][0] + # self.nlon = hdata3["nlon"][0] + # self.dlat = hdata3["dlat"][0] + # self.dlon = hdata3["dlon"][0] + return ahash + + def parse_hdata4(self, hdata4a, hdata4b): + self.levels = hdata4b["levht"] + self.atthash["Number of Levels"] = hdata4a["nlev"][0] + self.atthash["Level top heights (m)"] = hdata4b["levht"] + + def parse_hdata6and7(self, hdata6, hdata7, century): + # if no data read then break out of the while loop. + if not hdata6: + return False, None, None + pdate1 = datetime.datetime( + century + int(hdata6["oyear"][0]), + int(hdata6["omonth"][0]), + int(hdata6["oday"][0]), + int(hdata6["ohr"][0]), + int(hdata6["omin"][0]) + ) + pdate2 = datetime.datetime( + century + int(hdata7["oyear"][0]), + int(hdata7["omonth"][0]), + int(hdata7["oday"][0]), + int(hdata7["ohr"][0]), + int(hdata7["omin"][0]) + ) + dt = pdate2 - pdate1 + sample_dt = dt.days * 24 + dt.seconds / 3600.0 + # self.atthash["Sampling Time"] = pdate2 - pdate1 + self.atthash["sample time hours"] = sample_dt + if self.sample_time_stamp == "end": + self.atthash["time description"] = "End of sampling time period" + else: + self.atthash["time description"] = "start of sampling time period" + return True, pdate1, pdate2 + + @staticmethod + def parse_hdata8(hdata8a, hdata8b, pdate1): + """ + hdata8a : dtype + hdata8b : dtype + pdate1 : datetime + + Returns: + concframe : DataFrame + """ + lev_name = hdata8a["lev"][0] + col_name = hdata8a["poll"][0].decode("UTF-8") + edata = hdata8b.byteswap().newbyteorder() # otherwise get endian error. 
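+        # (the record dtypes above are big-endian (">"); byteswap().newbyteorder()
+        # converts them to native byte order so pandas accepts the records.)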
+ concframe = pd.DataFrame.from_records(edata) + concframe["levels"] = lev_name + concframe["t"] = pdate1 + + # rename jndx x + # rename indx y + names = concframe.columns.values + names = ["y" if x == "jndx" else x for x in names] + names = ["x" if x == "indx" else x for x in names] + names = ["z" if x == "levels" else x for x in names] + concframe.columns = names + concframe.set_index( + ["t", "z", "y", "x"], + inplace=True, + ) + concframe.rename(columns={"conc": col_name}, inplace=True) + # mgrid = np.meshgrid(lat, lon) + return concframe + + # def makegrid(self, xindx, yindx): + # """ + # xindx : list + # yindx : list + # """ + # attrs = {} + # attrs['llcrnr latitude'] = self.llcrnr_lat + # attrs['llcrnr longitude'] = self.llcrnr_lon + # attrs['Number Lat Points'] = self.nlat + # attrs['Number Lon Points'] = self.nlon + # attrs['Latitude Spacing'] = self.dlat + # attrs['Longitude Spacing'] = self.dlon + # mgrid = get_latlongrid(attrs,xindx,yindx) + # return mgrid + # checked HYSPLIT code. the grid points + # do represent center of the sampling area. + # slat = self.llcrnr_lat + # slon = self.llcrnr_lon + # lat = np.arange(slat, slat + self.nlat * self.dlat, self.dlat) + # lon = np.arange(slon, slon + self.nlon * self.dlon, self.dlon) + # hysplit always uses grid from -180 to 180 + # lon = np.array([x-360 if x>180 else x for x in lon]) + # fortran array indice start at 1. so xindx >=1. + # python array indice start at 0. + # lonlist = [lon[x - 1] for x in xindx] + # latlist = [lat[x - 1] for x in yindx] + # mgrid = np.meshgrid(lonlist, latlist) + # return mgrid + + def readfile(self, filename, drange, verbose, century): + """Data from the file is stored in an xarray, self.dset + returns False if all concentrations are zero else returns True. + INPUTS + filename - name of cdump file to open + drange - [date1, date2] - range of dates to load data for. if [] + then loads all data. + date1 and date2 should be datetime objects. + verbose - turns on print statements + century - if None will try to guess the century by looking + at the last two digits of the year. + For python 3 the numpy char4 are read in as a numpy.bytes_ + class and need to be converted to a python + string by using decode('UTF-8'). + + """ + # 8/16/2016 moved species=[] to before while loop. Added print + # statements when verbose. + # self.dset = xr.Dataset() + # dictionaries which will be turned into the dset attributes. + fid = open(filename, "rb") + + # each record in the fortran binary begins and ends with 4 bytes which + # specify the length of the record. + # These bytes are called pad1 and pad2 below. They are not used here, + # but are thrown out. + # The following block defines a numpy dtype object for each record in + # the binary file. + recs = self.define_struct() + rec1, rec2, rec3, rec4a = recs[0], recs[1], recs[2], recs[3] + rec4b, rec5a, rec5b, rec5c = recs[4], recs[5], recs[6], recs[7] + rec6, rec8a, rec8b, rec8c = recs[8], recs[9], recs[10], recs[11] + # rec7 = rec6 + # start_loc in rec1 tell how many rec there are. + tempzeroconcdates = [] + # Reads header data. This consists of records 1-5. 
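+        # (record counts: rec1 and rec3 occur once; rec2 repeats per starting
+        # location, rec4b per vertical level, and rec5b per pollutant.)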
+ hdata1 = np.fromfile(fid, dtype=rec1, count=1) + nstartloc = self.parse_header(hdata1) + + hdata2 = np.fromfile(fid, dtype=rec2, count=nstartloc) + century = self.parse_hdata2(hdata2, nstartloc, century) + + hdata3 = np.fromfile(fid, dtype=rec3, count=1) + self.gridhash = self.parse_hdata3(hdata3) + if self.verbose: + print("Grid specs", self.gridhash) + # read record 4 which gives information about vertical levels. + hdata4a = np.fromfile(fid, dtype=rec4a, count=1) + hdata4b = np.fromfile( + fid, dtype=rec4b, count=hdata4a["nlev"][0] + ) # reads levels, count is number of levels. + self.parse_hdata4(hdata4a, hdata4b) + + # read record 5 which gives information about pollutants / species. + hdata5a = np.fromfile(fid, dtype=rec5a, count=1) + np.fromfile(fid, dtype=rec5b, count=hdata5a["pollnum"][0]) + np.fromfile(fid, dtype=rec5c, count=1) + self.atthash["Number of Species"] = hdata5a["pollnum"][0] + self.atthash["Species ID"] = [] + + # Loop to reads records 6-8. Number of loops is equal to number of + # output times. + # Only save data for output times within drange. if drange=[] then + # save all. + # Loop to go through each sampling time + iimax = 0 # check to make sure don't go above max number of iterations + iii = 0 # checks to see if some nonzero data was saved in xarray + # Safety valve - will not allow more than 1000 loops to be executed. + imax = 1e8 + testf = True + tvals=[] + while testf: + hdata6 = np.fromfile(fid, dtype=rec6, count=1) + hdata7 = np.fromfile(fid, dtype=rec6, count=1) + check, pdate1, pdate2 = self.parse_hdata6and7(hdata6, hdata7, century) + if not check: + break + testf, savedata = check_drange(drange, pdate1, pdate2) + if verbose: + print("sample time", pdate1, " to ", pdate2) + # datelist = [] + inc_iii = False + # LOOP to go through each level + for _ in range(self.atthash["Number of Levels"]): + # LOOP to go through each pollutant + for _ in range(self.atthash["Number of Species"]): + # record 8a has the number of elements (ne). If number of + # elements greater than 0 than there are concentrations. + hdata8a = np.fromfile(fid, dtype=rec8a, count=1) + # self.atthash["Species ID"].append( + # hdata8a["poll"][0].decode("UTF-8") + # ) + # if number of elements is nonzero then + if hdata8a["ne"] >= 1: + self.atthash["Species ID"].append(hdata8a["poll"][0].decode("UTF-8")) + # get rec8 - indx and jndx + hdata8b = np.fromfile(fid, dtype=rec8b, count=hdata8a["ne"][0]) + # add sample start time to list of start times with + # non zero conc + self.nonzeroconcdates.append(pdate1) + else: + tempzeroconcdates.append( + pdate1 + ) # or add sample start time to list of start times + # with zero conc. + # This is just padding. + np.fromfile(fid, dtype=rec8c, count=1) + # if savedata is set and nonzero concentrations then save + # the data in a pandas dataframe + if savedata and hdata8a["ne"] >= 1: + self.nonzeroconcdates.append(pdate1) + inc_iii = True + if self.sample_time_stamp == "end": + concframe = self.parse_hdata8(hdata8a, hdata8b, pdate2) + else: + concframe = self.parse_hdata8(hdata8a, hdata8b, pdate1) + dset = xr.Dataset.from_dataframe(concframe) + # if verbose: + # print("Adding ", "Pollutant", pollutant, "Level", lev) + # if this is the first time through. create dataframe + # for first level and pollutant. + if not self.dset.any(): + self.dset = dset + else: # create dataframe for level and pollutant and + # then merge with main dataframe. 
+ # self.dset = xr.concat([self.dset, dset],'levels') + # self.dset = xr.merge([self.dset, dset],compat='override') + self.dset = xr.merge([self.dset, dset],join='outer') + # self.dset = xr.combine_by_coords([self.dset, dset]) + # self.dset = xr.merge([self.dset, dset], compat='override') + iimax += 1 + # END LOOP to go through each pollutant + # END LOOP to go through each level + # safety check - will stop sampling time while loop if goes over + # imax iterations. + if iimax > imax: + testf = False + print("greater than imax", testf, iimax, imax) + if inc_iii: + iii += 1 + + self.atthash.update(self.gridhash) + self.atthash["Species ID"] = list(set(self.atthash["Species ID"])) + self.atthash["Coordinate time description"] = "Beginning of sampling time" + # END OF Loop to go through each sampling time + if not self.dset.any(): + return False + if self.dset.variables: + self.atthash = check_attributes(self.atthash) + self.dset.attrs = self.atthash + # mgrid = self.makegrid(self.dset.coords["x"], self.dset.coords["y"]) + mgrid = get_latlongrid(self.gridhash, self.dset.coords["x"], self.dset.coords["y"]) + self.dset = self.dset.assign_coords(longitude=(("y", "x"), mgrid[0])) + self.dset = self.dset.assign_coords(latitude=(("y", "x"), mgrid[1])) + tvals = self.dset.t.values + self.dset = self.dset.assign_coords(time=("t",tvals)) + self.dset = self.dset.reset_coords() + self.dset = self.dset.set_coords(["time", "latitude", "longitude"]) + self.dset = reset_time_index(self.dset) + if iii == 0 and verbose: + print("Warning: ModelBin class _readfile method: no data in the date range found") + return False + return True + + +def combine_dataset( + blist, + drange=None, + species=None, + century=None, + verbose=False, + sample_time_stamp="start", + check_grid=True, +): + """ + Inputs : + blist : list of tuples + (filename, sourcetag, metdatatag) + + drange : list of two datetime objects. + d1 datetime object. first date to keep in DatArrayarray + d2 datetime object. last date to keep in DataArray + + sample_time_stamp : str + if 'end' then time in xarray will be the end of sampling time period. + else time is start of sampling time period. + + RETURNS + newhxr : an xarray data-array with 6 dimensions. + lat, lon, time, level, ensemble tag, source tag + + Note that if more than one species is present in the files, they are + added to get concentration from all species. If list of species is provided, + only those species will be added. + + Files need to have the same concentration grid defined. + """ + # iii = 0 + mlat_p = mlon_p = None + ylist = [] + dtlist = [] + splist = [] + xlist = [] + sourcelist = [] + enslist = [] + #for mmm, key in enumerate(blist): + # # fname = val[0] + for iii, bbb in enumerate(blist): + fname = bbb[0] + + if drange: + century = int(drange[0].year / 100) * 100 + hxr = open_dataset( + fname, + drange=drange, + century=century, + verbose=verbose, + sample_time_stamp=sample_time_stamp, + check_grid=False, + ) + else: # use all dates + hxr = open_dataset( + fname, + century=century, + verbose=verbose, + sample_time_stamp=sample_time_stamp, + check_grid=False, + ) + print('hxr') + print(hxr.time.values) + try: + mlat, mlon = getlatlon(hxr.attrs) + except Exception: + print("WARNING Cannot open ") + print(fname) + print(century) + print(hxr) + if iii > 0: + tt1 = np.array_equal(mlat, mlat_p) + tt2 = np.array_equal(mlon, mlon_p) + if not tt1 or not tt2: + print("WARNING: grids are not the same. 
cannot combine") + sys.exit() + mlat_p = mlat + mlon_p = mlon + + xrash = add_species(hxr, species=species) + enslist.append(bbb[1]) + sourcelist.append(bbb[2]) + dtlist.append(hxr.attrs["sample time hours"]) + splist.extend(xrash.attrs["Species ID"]) + tvals = xrash.time.values + tvals.sort() + if iii == 0: + stime = tvals[0] + xnew = reset_time_index(xrash,stime=stime) + # start time for indexing will always be earliest time in first file in list. + xrash = reset_time_index(xrash,stime=stime) + else: + xrash = reset_time_index(xrash,stime=stime) + ccc, xnew = xr.align(xrash, xnew, join="outer") + xlist.append(xrash) + ylist = [] + slist = [] + hlist = [] + for iii, temp in enumerate(xlist): + aaa, bbb = xr.align(temp, xnew, join="outer") + aaa = reset_time_coord(aaa) + aaa.expand_dims("ens") + aaa["ens"] = enslist[iii] + aaa.expand_dims("source") + aaa["source"] = sourcelist[iii] + hlist.append(aaa) + # concat along the 'ens' axis + #newhxr = xr.concat(hlist, "source") + sourcelist = list(set(sourcelist)) + enslist = list(set(enslist)) + if len(sourcelist)==1: + newhxr = xr.concat(hlist,dim='ens') + elif len(enslist)==1: + newhxr = xr.concat(hlist,dim='source') + else: + newhxr = xr.concat(hlist,dim='source') + + #ylist.append(new) + #slist.append(sourcelist[jjj]) + if dtlist: + dtlist = list(set(dtlist)) + dt = dtlist[0] + + #newhxr = xr.concat(ylist, "source") + #newhxr["source"] = slist + # newhxr['ens'] = metlist + + # newhxr is an xarray data-array with 6 dimensions. + # dt is the averaging time of the hysplit output. + newhxr = newhxr.assign_attrs({"sample time hours": dt}) + newhxr = newhxr.assign_attrs({"Species ID": list(set(splist))}) + newhxr.attrs.update(hxr.attrs) + + + # calculate the lat lon grid for the expanded dataset. + # and use that for the new coordinates. + newhxr = reset_latlon_coords(newhxr) + #newhxr = reset_time_coord(newhxr) + + # make sure all np.ndarrray in the attributes are converted to lists. + # the np arrays don't save well to netcdf file for some reason. + attrs = check_attributes(hxr.attrs) + newhxr = newhxr.assign_attrs(attrs) + keylist = ["coordinate start time",'coordinate time dt (hours)'] + for key in keylist: + if key in xnew.attrs.keys(): + newhxr = newhxr.assign_attrs({key:xnew.attrs[key]}) + + #for key in keylist: + # newhxr = newhxr.assign_attrs({key: hxr.attrs[key]}) + if check_grid: + rval = fix_grid_continuity(newhxr) + else: + rval = newhxr + return rval + + +# This function seems not useful. +# def get_even_latlongrid(dset, xlim, ylim): +# xindx = np.arange(xlim[0], xlim[1] + 1) +# yindx = np.arange(ylim[0], ylim[1] + 1) +# return get_latlongrid(dset, xindx, yindx) + +def reset_time_index(hxr,stime=None,dt=1): + """ + hxr : xarray DataSet as output from open_dataset or combine_dataset + """ + tvals = hxr.time.values + if stime is None: + stime = tvals[0] + ilist = get_time_index(tvals,stime,dt) + # create a time_index coordinate. 
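+    # (e.g. times [stime, stime+1h, stime+3h] with dt=1 become indices [0, 1, 3].)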
+ temp = hxr.assign_coords(t=('t',ilist)) + atthash = {'coordinate start time': stime} + atthash['coordinate time dt (hours)'] = dt + temp.attrs.update(atthash) + #temp = temp.drop_vars('time') + #temp = temp.assign_coords(time=('t',tvals)) + return temp + +def reset_time_coord(hxr): + atts = hxr.attrs + if 'coordinate start time' in atts.keys(): + stime = pd.to_datetime(atts['coordinate start time']) + if 'coordinate time dt (hours)' in atts.keys(): + dt = atts['coordinate time dt (hours)'] + index_values = hxr.t.values + tvals = get_time_values(index_values,stime,dt) + if 'time' in hxr.coords: + hxr.drop_vars('time') + hxr = hxr.assign_coords(time=("t",tvals)) + return hxr + +def reset_latlon_coords(hxr): + """ + hxr : xarray DataSet as output from open_dataset or combine_dataset + """ + mgrid = get_latlongrid(hxr.attrs, hxr.x.values, hxr.y.values) + if "latitude" in hxr.coords: + hxr = hxr.drop_vars("longitude") + if "longitude" in hxr.coords: + hxr = hxr.drop_vars("latitude") + hxr = hxr.assign_coords(latitude=(("y", "x"), mgrid[1])) + hxr = hxr.assign_coords(longitude=(("y", "x"), mgrid[0])) + return hxr + + +def fix_grid_continuity(dset): + # if dset is empty don't do anything + if not dset.any(): + return dset + + # if grid already continuous don't do anything. + if check_grid_continuity(dset): + return dset + xvv = dset.x.values + yvv = dset.y.values + + xlim = [xvv[0], xvv[-1]] + ylim = [yvv[0], yvv[-1]] + + xindx = np.arange(xlim[0], xlim[1] + 1) + yindx = np.arange(ylim[0], ylim[1] + 1) + + mgrid = get_latlongrid(dset.attrs, xindx, yindx) + # mgrid = get_even_latlongrid(dset, xlim, ylim) + conc = np.zeros_like(mgrid[0]) + dummy = xr.DataArray(conc, dims=["y", "x"]) + dummy = dummy.assign_coords(latitude=(("y", "x"), mgrid[1])) + dummy = dummy.assign_coords(longitude=(("y", "x"), mgrid[0])) + dummy = dummy.assign_coords(x=(("x"), xindx)) + dummy = dummy.assign_coords(y=(("y"), yindx)) + cdset, dummy2 = xr.align(dset, dummy, join="outer") + cdset = cdset.assign_coords(latitude=(("y", "x"), mgrid[1])) + cdset = cdset.assign_coords(longitude=(("y", "x"), mgrid[0])) + return cdset.fillna(0) + + +def check_grid_continuity(dset): + """ + checks to see if x and y coords are skipping over any grid points. + Since cdump files only store above 0 values, it is possible to have + a grid that is + y = [1,2,3,4,6,8] + if there are above zero values at 6 and 8 but not at 7. + This results in an xarray which has a grid that is not evenly spaced. + """ + xvv = dset.x.values + yvv = dset.y.values + tt1 = np.array([xvv[i] - xvv[i - 1] for i in np.arange(1, len(xvv))]) + tt2 = np.array([yvv[i] - yvv[i - 1] for i in np.arange(1, len(yvv))]) + if np.any(tt1 != 1): + return False + if np.any(tt2 != 1): + return False + return True + + +def get_time_values(index_values,stime,dt): + def apply(iii): + ddd = stime + datetime.timedelta(hours=float(iii)) + return ddd + return [apply(x) for x in index_values] + +def get_time_index(timevals,stime,dt): + """ + timevals : list of datetimes + stime : start time of time grid + dt : integer - time resolution in hours of time grid. + """ + def apply(ttt): + diff = pd.to_datetime(ttt)-stime + dh = diff.days*24 + diff.seconds/3600 + iii = dh/dt + return int(iii) + return [apply(x) for x in timevals] + + +def get_latlongrid(attrs, xindx, yindx): + """ + INPUTS + attrs : dictionary with grid specifications + xindx : list of integers > 0 + yindx : list of integers > 0 + RETURNS + mgrid : output of numpy meshgrid function. + Two 2d arrays of latitude, longitude. 
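+    mgrid[0] is the 2d longitude grid and mgrid[1] the 2d latitude grid
+    (the ordering returned by np.meshgrid(lonlist, latlist)).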
+ The grid points in cdump file + represent center of the sampling area. + + NOTES : + This may return a grid that is not evenly spaced. + For instance if yindx is something like [1,2,3,4,5,7] then + the grid will not have even spacing in latitude and will 'skip' a latitude point. + + HYSPLIT grid indexing starts at 1. + + """ + xindx = np.array(xindx) + yindx = np.array(yindx) + if np.any(xindx <= 0): + raise Exception("HYSPLIT grid error xindex <=0") + if np.any(yindx <= 0): + raise Exception("HYSPLIT grid error yindex <=0") + lat, lon = getlatlon(attrs) + success = True + try: + lonlist = [lon[x - 1] for x in xindx] + # latlist = [lat[x - 1] for x in yindx] + except Exception as eee: + print(f"Exception {eee}") + print("try increasing Number Number Lon Points") + print(attrs) + print(xindx) + success = False + try: + # lonlist = [lon[x - 1] for x in xindx] + latlist = [lat[x - 1] for x in yindx] + except Exception as eee: + print(f"Exception {eee}") + print("try increasing Number Number Lat Points") + print(attrs) + print(yindx) + success = False + + if not success: + return None + mgrid = np.meshgrid(lonlist, latlist) + return mgrid + + +# def get_index_fromgrid(dset): +# llcrnr_lat = dset.attrs["llcrnr latitude"] +# llcrnr_lon = dset.attrs["llcrnr longitude"] +# nlat = dset.attrs["Number Lat Points"] +# nlon = dset.attrs["Number Lon Points"] +# dlat = dset.attrs["Latitude Spacing"] +# dlon = dset.attrs["Longitude Spacing"] + + +def getlatlon(attrs): + """ + Returns 1d array of lats and lons based on Concentration Grid + Defined in the dset attribute. + attrs : dictionary with grid specifications + RETURNS + lat : 1D array of latitudes + lon : 1D array of longitudes + """ + lon_tolerance = 0.001 + llcrnr_lat = attrs["llcrnr latitude"] + llcrnr_lon = attrs["llcrnr longitude"] + nlat = attrs["Number Lat Points"] + nlon = attrs["Number Lon Points"] + dlat = attrs["Latitude Spacing"] + dlon = attrs["Longitude Spacing"] + + lastlon = llcrnr_lon + (nlon - 1) * dlon + lastlat = llcrnr_lat + (nlat - 1) * dlat + # = int((lastlon - llcrnr_lon) / dlon) + lat = np.linspace(llcrnr_lat, lastlat, num=int(nlat)) + lon = np.linspace(llcrnr_lon, lastlon, num=int(nlon)) + # + lon = np.array([x - 360 if x >= 180 + lon_tolerance else x for x in lon]) + return lat, lon + + +def hysp_massload(dset, threshold=0, mult=1, zvals=None): + """Calculate mass loading from HYSPLIT xarray + INPUTS + dset: xarray dataset output by open_dataset OR + xarray data array output by combine_dataset + threshold : float + mult : float + zvals : list of levels to calculate mass loading over. + Outputs: + totl_aml : xarray data array + total ash mass loading (summed over all layers), ash mass loading + Units in (unit mass / m^2) + """ + # first calculate mass loading in each level. + aml_alts = calc_aml(dset) + # Then choose which levels to use for total mass loading. 
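+    # (zvals are positional indices into the z dimension, e.g. zvals=[0, 1]
+    # keeps the two lowest levels.)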
+ if zvals: + aml_alts = aml_alts.isel(z=zvals) + if "z" not in aml_alts.dims: + aml_alts = aml_alts.expand_dims("z") + # + total_aml = aml_alts.sum(dim="z") + # Calculate conversion factors + # unitmass, mass63 = calc_MER(dset) + # Calculating the ash mass loading + total_aml2 = total_aml * mult + # Calculating total ash mass loading, accounting for the threshold + # Multiply binary threshold mask to data + total_aml_thresh = hysp_thresh(dset, threshold, mult=mult) + total_aml = total_aml2 * total_aml_thresh + return total_aml + + +def hysp_heights(dset, threshold, mult=1, height_mult=1 / 1000.0, mass_load=True, species=None): + """Calculate top-height from HYSPLIT xarray + Input: xarray dataset output by open_dataset OR + xarray data array output by combine_dataset + threshold : mass loading threshold (threshold = xx) + mult : convert from meters to other unit. default is 1/1000.0 to + convert to km. + Outputs: ash top heights, altitude levels""" + + # either get mass loading of each point + if mass_load: + aml_alts = calc_aml(dset) + # or get concentration at each point + else: + aml_alts = add_species(dset, species=species) + + # Create array of 0 and 1 (1 where data exists) + heights = aml_alts.where(aml_alts == 0.0, 1.0) + # Multiply each level by the altitude + height = _alt_multiply(heights) + height = height * height_mult # convert to km + # Determine top height: take max of heights array along z axis + top_hgt = height.max(dim="z") + # Apply ash mass loading threshold mask array + total_aml_thresh = hysp_thresh(dset, threshold, mult=mult) + top_height = top_hgt * total_aml_thresh + return top_height + + +# def calc_total_mass(dset): +# return -1 + + +def calc_aml(dset, species=None): + """Calculates the mass loading at each altitude for the dataset + Input: xarray dataset output by open_dataset OR + xarray data array output by combine_dataset + Output: total ash mass loading""" + # Totals values for all particles + if isinstance(dset, xr.core.dataset.Dataset): + total_par = add_species(dset, species=species) + else: + total_par = dset.copy() + # Multiplies the total particles by the altitude layer + # to create a mass loading for each altitude layer + aml_alts = _delta_multiply(total_par) + return aml_alts + + +def hysp_thresh(dset, threshold, mult=1): + """Calculates a threshold mask array based on the + ash mass loading from HYSPLIT xarray + Inputs: xarray, ash mass loading threshold (threshold = xx) + Outputs: ash mass loading threshold mask array + Returns 0 where values are below or equal to threshold. + Returns 1 where values are greater than threshold + + """ + # Calculate ash mass loading for xarray + aml_alts = calc_aml(dset) + total_aml = aml_alts.sum(dim="z") + # Calculate conversion factors + # unitmass, mass63 = calc_MER(dset) + # Calculating the ash mass loading + total_aml2 = total_aml * mult + # where puts value into places where condition is FALSE. + # place 0 in places where value is below or equal to threshold + total_aml_thresh = total_aml2.where(total_aml2 > threshold, 0.0) + # place 1 in places where value is greater than threshold + total_aml_thresh = total_aml_thresh.where(total_aml_thresh <= threshold, 1.0) + return total_aml_thresh + + +def add_species(dset, species=None): + """ + species : list of Species ID's. + if none then all ids in the "species ID" attribute will be used. + Calculate sum of particles. 
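+    Returns a DataArray summing the selected species variables, with the
+    "Species ID" attribute reset to the species that were actually added.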
+ """ + sflist = [] + splist = dset.attrs["Species ID"] + if not species: + species = dset.attrs["Species ID"] + else: + for val in species: + if val not in splist: + warn = "WARNING: hysplit.add_species function" + warn += ": species not found" + str(val) + "\n" + warn += " valid species ids are " + str.join(", ", splist) + print(warn) + sss = 0 + tmp = [] + # Looping through all species in dataset + while sss < len(splist): + if splist[sss] in species: + tmp.append(dset[splist[sss]].fillna(0)) + sflist.append(splist[sss]) + sss += 1 # End of loop through species + + total_par = tmp[0] + ppp = 1 + # Adding all species together + while ppp < len(tmp): + total_par = total_par + tmp[ppp] + ppp += 1 # End of loop adding all species + atthash = dset.attrs + atthash["Species ID"] = sflist + atthash = check_attributes(atthash) + total_par = total_par.assign_attrs(atthash) + return total_par + + +def calculate_thickness(cdump): + alts = cdump.z.values + thash = {} + aaa = 0 + for avalue in alts: + thash[avalue] = avalue - aaa + aaa = avalue + print(f"WARNING: thickness calculated from z values please verify {thash}") + return thash + + +def get_thickness(cdump): + """ + Input: + cdump : xarray DataArray with 'Level top heights (m)' as an attribute. + Returns: + thash : dictionary + key is the name of the z coordinate and value is the thickness of that layer in meters. + """ + cstr = "Level top heights (m)" + + calculate = False + if cstr not in cdump.attrs.keys(): + calculate = True + # check that the values in the attribute correspond to values in levels. + # python reads in this attribute as a numpy array + # but when writing the numpy array to netcdf file, it doesn't write correctly. + # sometimes cdump file is written with incorrect values in this attribute. + elif cstr in cdump.attrs.keys(): + alts = cdump.z.values + zvals = cdump.attrs[cstr] + # cra = [] + for aaa in alts: + # if a level is not found in the attribute array then use the calculate method. + if aaa not in zvals: + calculate = True + + if calculate: + print(f"warning: {cstr} attribute needed to calculate level thicknesses") + print("warning: alternative calculation from z dimension values") + thash = calculate_thickness(cdump) + else: + levs = cdump.attrs[cstr] + thash = {} + aaa = 0 + for level in levs: + thash[level] = level - aaa + aaa = level + return thash + + +def _delta_multiply(pars): + """ + # Calculate the delta altitude for each layer and + # multiplies concentration by layer thickness to return mass load. + # requires that the 'Level top heights (m)' is an attribute of pars. + + # pars: xarray data array + concentration with z coordinate. + # OUTPUT + # newpar : xarray data array + mass loading. + """ + thash = get_thickness(pars) + for iii, zzz in enumerate(pars.z.values): + delta = thash[zzz] + mml = pars.isel(z=iii) * delta + if iii == 0: + newpar = mml + else: + newpar = xr.concat([newpar, mml], "z") + if "z" not in newpar.dims: + newpar = newpar.expand_dims("z") + return newpar + + +def _delta_multiply_old(pars): + """ + # This method was faulty because layers with no concentrations were + # omitted. e.g. if layers were at 0,1000,2000,3000,4000,5000 but there were + # no mass below 20000 then would only see layers 3000,4000,5000 and thickness + # of 3000 layer would be calculated as 3000 instead of 1000. + # Calculate the delta altitude for each layer and + # multiplies concentration by layer thickness to return mass load. + + # pars: xarray data array + concentration with z coordinate. 
+ # OUTPUT + # newpar : xarray data array + mass loading. + """ + xxx = 1 + alts = pars.coords["z"] + delta = [] + delta.append(alts[0]) + while xxx < (len(alts)): + delta.append(alts[xxx] - alts[xxx - 1]) + xxx += 1 + # Multiply each level by the delta altitude + yyy = 0 + while yyy < len(delta): + # modify so not dependent on placement of 'z' coordinate. + # pars[:, yyy, :, :] = pars[:, yyy, :, :] * delta[yyy] + mml = pars.isel(z=yyy) * delta[yyy] + if yyy == 0: + newpar = mml + else: + newpar = xr.concat([newpar, mml], "z") + if "z" not in newpar.dims: + newpar = newpar.expand_dims("z") + yyy += 1 # End of loop calculating heights + return newpar + + +def _alt_multiply(pars): + """ + # For calculating the top height + # Multiply "1s" in the input array by the altitude + """ + alts = pars.coords["z"] + yyy = 0 + while yyy < len(alts): + # modify so not dependent on placement of 'z' coordinate. + # pars[:, y, :, :] = pars[:, y, :, :] * alts[y] + mml = pars.isel(z=yyy) * alts[yyy] + if yyy == 0: + newpar = mml + else: + newpar = xr.concat([newpar, mml], "z") + yyy += 1 # End of loop calculating heights + if "z" not in newpar.dims: + newpar = newpar.expand_dims("z") + return newpar + + +def check_attributes(atthash): + # when writing to netcdf file, attributes which are numpy arrays do not write properly. + # need to change them to lists. + for key in atthash.keys(): + val = atthash[key] + if isinstance(val, np.ndarray): + newval = list(val) + atthash[key] = newval + return atthash + + +def write_with_compression(cxra, fname): + atthash = check_attributes(cxra.attrs) + cxra = cxra.assign_attrs(atthash) + cxra2 = cxra.to_dataset(name="POL") + ehash = {"zlib": True, "complevel": 9} + vlist = [x for x in cxra2.data_vars] + vhash = {} + for vvv in vlist: + vhash[vvv] = ehash + cxra2.to_netcdf(fname, encoding=vhash) From e726c65bba71042c73f388774f2b7f342ee64167 Mon Sep 17 00:00:00 2001 From: Alice Crawford Date: Thu, 4 Apr 2024 12:17:21 -0400 Subject: [PATCH 3/9] hysplit.py bug fixes to combine_dataset --- monetio/models/hysplit.py | 122 ++++++++++++++++++++++++++++++++++---- 1 file changed, 110 insertions(+), 12 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index e5819cb7..8352e764 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -38,8 +38,9 @@ 2023 03 Mar AMC get_latlon modified. replace x>=180 with x>=180+lon_tolerance 2023 03 Mar AMC get_latlongrid improved exception statements 2023 08 Dec AMC add check_attributes to ModelBin and combine_datatset to make sure level height attribute is a list - - +2024 04 Mar AMC fixed bug in combine_dataset. align was not working properly with time coordinate in datetime64 when + datasets did not cover the same time periods. Add time2index and index2time functions to help. +2024 04 Mar AMC fixed bug in combine_dataset. Did not combine properly when length of ensemble and source lists > 1. """ import datetime import sys @@ -688,7 +689,13 @@ def combine_dataset( Files need to have the same concentration grid defined. """ - # iii = 0 + # 2024 04 March. when the input datasets did not have identical time coordinates, the align method of + # xarray was not working properly. Changing the time coordinate to an integer first + # fixes the problem. + # Another issue is that the combination only worked when either the source or the ensemble dimension + # had length of 1. Did not work properly with multiple sources and multiple ensembles. 
+ # to fix this changed how enslist and sourcelist were defined and utilized. + mlat_p = mlon_p = None ylist = [] dtlist = [] @@ -699,6 +706,7 @@ def combine_dataset( # key is a tag indicating the source # value is tuple (filename, metdata) aaa = sorted(blist, key=lambda x: x[1]) + nnn = len(blist) blist = {} for val in aaa: if val[1] in blist.keys(): @@ -709,12 +717,12 @@ def combine_dataset( # mgrid = [] # first loop go through to get expanded dataset. xlist = [] - sourcelist = [] - enslist = [] + sourcelist = np.empty(shape=(nnn,nnn),dtype=object) + enslist = np.empty(shape=(nnn,nnn),dtype=object) for iii, key in enumerate(blist): # fname = val[0] xsublist = [] - for fname in blist[key]: + for jjj, fname in enumerate(blist[key]): if drange: century = int(drange[0].year / 100) * 100 hxr = open_dataset( @@ -752,8 +760,16 @@ def combine_dataset( # lon = hxr.longitude.isel(y=0).values # lat = hxr.latitude.isel(x=0).values xrash = add_species(hxr, species=species) + if iii==0 and jjj==0: + tvals = xrash.time.values + tvals.sort() + stime = tvals[0] + # make the time index integers + xrash = time2index(xrash,stime,dt=1) + xrash = xrash.drop_vars('time_values') xsublist.append(xrash) - enslist.append(fname[1]) + enslist[iii][jjj] = fname[1] + sourcelist[iii][jjj] = key dtlist.append(hxr.attrs["sample time hours"]) splist.extend(xrash.attrs["Species ID"]) if iii == 0: @@ -762,7 +778,6 @@ def combine_dataset( aaa, xnew = xr.align(xrash, xnew, join="outer") xnew = xnew.fillna(0) # iii += 1 - sourcelist.append(key) xlist.append(xsublist) # if verbose: # print("aligned --------------------------------------") @@ -770,23 +785,25 @@ def combine_dataset( # now go through and expand each one to the size of xnew. # iii = 0 # jjj = 0 + #import sys + #sys.exit() ylist = [] slist = [] - for jjj, sublist in enumerate(xlist): + for iii, sublist in enumerate(xlist): hlist = [] - for iii, temp in enumerate(sublist): + for jjj, temp in enumerate(sublist): # expand to same region as xnew aaa, bbb = xr.align(temp, xnew, join="outer") aaa = aaa.fillna(0) bbb = bbb.fillna(0) aaa.expand_dims("ens") - aaa["ens"] = enslist[iii] + aaa["ens"] = enslist[iii][jjj] # iii += 1 hlist.append(aaa) # concat along the 'ens' axis new = xr.concat(hlist, "ens") ylist.append(new) - slist.append(sourcelist[jjj]) + slist.append(sourcelist[iii][jjj]) # jjj += 1 if dtlist: dtlist = list(set(dtlist)) @@ -817,6 +834,7 @@ def combine_dataset( rval = fix_grid_continuity(newhxr) else: rval = newhxr + rval = index2time(rval) return rval @@ -825,6 +843,86 @@ def combine_dataset( # xindx = np.arange(xlim[0], xlim[1] + 1) # yindx = np.arange(ylim[0], ylim[1] + 1) # return get_latlongrid(dset, xindx, yindx) +def get_time_index(timevals,stime,dt): + """ + timevals : list of datetimes + stime : start time of time grid + dt : integer - time resolution in hours of time grid. 
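+
+    e.g. with stime=datetime.datetime(2024,1,1,0) and dt=1, a time value of
+    datetime.datetime(2024,1,1,5) maps to index 5.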
+ """ + def apply(ttt): + diff = pd.to_datetime(ttt)-stime + dh = diff.days*24 + diff.seconds/3600 + iii = dh/dt + return int(iii) + return [apply(x) for x in timevals] + +def get_time_values(index_values,stime,dt): + def apply(iii): + ddd = stime + datetime.timedelta(hours=float(iii)) + return ddd + return [apply(x) for x in index_values] + +def time2index(hxr,stime=None,dt=1): + """ + hxr : xarray DataSet as output from open_dataset or combine_dataset + make the time + + """ + stime_str = 'coordinate start time' + dt_str = 'coordinate time dt (hours)' + tvals = hxr.time.values + if stime is None: + stime = tvals[0] + ilist = get_time_index(tvals,stime,dt) + # create a time_index coordinate. + hxr = hxr.drop_vars('time') + hxr = hxr.assign_coords(time=('time',ilist)) + hxr = hxr.assign_coords(time_values=('time',tvals)) + atthash = {stime_str: stime} + atthash[dt_str] = dt + hxr.attrs.update(atthash) + #temp = temp.drop_vars('time') + #temp = temp.assign_coords(time=('t',tvals)) + return hxr + +def index2time(hxr): + """ + hxr : DataSet or DataArray with time coordinate in integer values. + should either have a time_values coordinate or attribute defining + coordinate start time and coordinate time delta. + Reverses changes made in time2index + """ + if 'time_values' in hxr.coords: + tvals = hxr.time_values.values + hxr = hxr.drop_vars('time') + hxr = hxr.drop_vars('time_values') + hxr = hxr.assign_coords(time=('time',tvals)) + else: + stime_str = 'coordinate start time' + dt_str = 'coordinate time dt (hours)' + if stime_str in hxr.attrs.keys(): + stime = pd.to_datetime(hxr.attrs[stime_str]) + if dt_str in hxr.attrs.keys(): + dt = hxr.attrs[dt_str] + ilist = hxr.time.values + tvals = get_time_values(ilist,stime,dt) + hxr = hxr.drop_vars('time') + hxr = hxr.assign_coords(time=('time',tvals)) + return hxr + + +def reset_time_coord(hxr): + atts = hxr.attrs + if 'coordinate start time' in atts.keys(): + stime = pd.to_datetime(atts['coordinate start time']) + if 'coordinate time dt (hours)' in atts.keys(): + dt = atts['coordinate time dt (hours)'] + index_values = hxr.t.values + tvals = get_time_values(index_values,stime,dt) + if 'time' in hxr.coords: + hxr.drop_vars('time') + hxr = hxr.assign_coords(time=("t",tvals)) + return hxr def reset_latlon_coords(hxr): From f2a36dd372bb437f942e4ba2317ee949c90ae136 Mon Sep 17 00:00:00 2001 From: Alice Crawford Date: Thu, 4 Apr 2024 17:52:28 -0400 Subject: [PATCH 4/9] hysplit.py further improvements to combine_dataset function. Refactored to use a helper class. update also allows the cdump files to be skipped over if they are empty. --- monetio/models/hysplit.py | 367 +++++++++++++++++++------------------- 1 file changed, 188 insertions(+), 179 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index 8352e764..b766ae5e 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -38,9 +38,9 @@ 2023 03 Mar AMC get_latlon modified. replace x>=180 with x>=180+lon_tolerance 2023 03 Mar AMC get_latlongrid improved exception statements 2023 08 Dec AMC add check_attributes to ModelBin and combine_datatset to make sure level height attribute is a list -2024 04 Mar AMC fixed bug in combine_dataset. align was not working properly with time coordinate in datetime64 when - datasets did not cover the same time periods. Add time2index and index2time functions to help. -2024 04 Mar AMC fixed bug in combine_dataset. Did not combine properly when length of ensemble and source lists > 1. +2024 01 Apr AMC added logging. 
for combine_dataset add continue to exception so it won't fail. +2024 04 Mar AMC bug fixes to combine_dataset + """ import datetime import sys @@ -48,6 +48,9 @@ import numpy as np import pandas as pd import xarray as xr +import logging + +logger = logging.getLogger(__name__) def open_dataset( @@ -102,11 +105,14 @@ def open_dataset( readwrite="r", sample_time_stamp=sample_time_stamp, ) - dset = binfile.dset - if check_grid: - return fix_grid_continuity(dset) + if binfile.dataflag: + dset = binfile.dset + if check_grid: + return fix_grid_continuity(dset) + else: + return dset else: - return dset + return xr.Dataset() def check_drange(drange, pdate1, pdate2): @@ -201,7 +207,7 @@ def __init__( if readwrite == "r": if verbose: - print("reading " + filename) + logger.info("reading " + filename) self.dataflag = self.readfile(filename, drange, verbose=verbose, century=century) @staticmethod @@ -341,12 +347,14 @@ def parse_header(self, hdata1): number of starting locations in file. """ if len(hdata1["start_loc"]) != 1: - print("WARNING in ModelBin _readfile - number of starting locations " "incorrect") - print(hdata1["start_loc"]) + logger.warning("WARNING in ModelBin {} _readfile - number of starting locations incorrect".format(self.filename)) + logger.warning(str(hdata1["start_loc"])) + return None # in python 3 np.fromfile reads the record into a list even if it is # just one number. # so if the length of this record is greater than one something is # wrong. + # if it is empty or 0 then the cdump file is probably empty as well. nstartloc = hdata1["start_loc"][0] self.atthash["Meteorological Model ID"] = hdata1["model_id"][0].decode("UTF-8") self.atthash["Number Start Locations"] = nstartloc @@ -370,7 +378,7 @@ def parse_hdata2(self, hdata2, nstartloc, century): century = 2000 else: century = 1900 - print("WARNING: Guessing Century for HYSPLIT concentration file", century) + logger.info("WARNING: Guessing Century for HYSPLIT concentration file {}".format(century)) # add sourcedate which is datetime.datetime object sourcedate = datetime.datetime( century + hdata2["r_year"][nnn], @@ -415,14 +423,14 @@ def parse_hdata6and7(self, hdata6, hdata7, century): int(hdata6["omonth"][0]), int(hdata6["oday"][0]), int(hdata6["ohr"][0]), - int(hdata6["omin"][0]), + int(hdata6["omin"][0]) ) pdate2 = datetime.datetime( century + int(hdata7["oyear"][0]), int(hdata7["omonth"][0]), int(hdata7["oday"][0]), int(hdata7["ohr"][0]), - int(hdata7["omin"][0]), + int(hdata7["omin"][0]) ) dt = pdate2 - pdate1 sample_dt = dt.days * 24 + dt.seconds / 3600.0 @@ -533,7 +541,7 @@ class and need to be converted to a python # Reads header data. This consists of records 1-5. hdata1 = np.fromfile(fid, dtype=rec1, count=1) nstartloc = self.parse_header(hdata1) - + if nstartloc is None: return False hdata2 = np.fromfile(fid, dtype=rec2, count=nstartloc) century = self.parse_hdata2(hdata2, nstartloc, century) @@ -631,7 +639,7 @@ class and need to be converted to a python # imax iterations. if iimax > imax: testf = False - print("greater than imax", testf, iimax, imax) + logger.warning("greater than imax {},{},{}".format(testf, iimax, imax)) if inc_iii: iii += 1 @@ -656,6 +664,99 @@ class and need to be converted to a python return False return True +class CombineObject: + """ + Helper class for combine_dataset function. 
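+
+    Wraps one (filename, source, ens) tuple from blist: opens the cdump file
+    into self.hxr, and the process method sums the species and converts the
+    time coordinate to integer indices so the datasets can be aligned.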
+ """ + + + def __init__(self,blist,drange=None,century=None,sample_time_stamp='start'): + self.fname = blist[0] + self.source = blist[1] + self.ens = blist[2] + self.hxr = self.open(self.fname,drange,century,sample_time_stamp) + self._attrs = {} + if not self.empty: self.attrs = self.hxr.attrs + self.xrash = xr.DataArray() # created in process method. + + def grid_equal(self,other): + # other: another CombineObject object. + # checks to see if grid is equal + mlat, mlon = self.grid_definition + mlat2, mlon2 = other.grid_definition + if not np.array_equal(mlat,mlat2): + return False + if not np.array_equal(mlon,mlon2): + return False + return True + + def __lt__(self,other): + if self.start_time < other.start_time: + return True + if self.source < other.source: + return True + if self.ens < other.ens: + return True + return False + + @property + def empty(self): + if self.hxr.coords: + return False + else: + return True + + @property + def attrs(self): + return self._attrs + + @attrs.setter + def attrs(self,atthash): + if isinstance(atthash,dict): + self._attrs.update(atthash) + + + @property + def start_time(self): + tvals = self.hxr.time.values + tvals.sort() + return tvals[0] + + @property + def grid_definition(self): + return getlatlon(self.hxr.attrs) + + def process(self,stime=None,dt=None,species=None): + """ + add species, change time coordinate to an integer for alignment. + """ + xrash = add_species(self.hxr,species=species) + xrash = time2index(xrash,stime,dt) + xrash = xrash.drop_vars('time_values') + self.xrash = xrash + self.attrs = self.xrash.attrs + + @staticmethod + def open(fname,drange,century,sample_time_stamp='start',verbose=False): + if drange: + century = int(drange[0].year / 100) * 100 + hxr = open_dataset( + fname, + drange=drange, + century=century, + verbose=verbose, + sample_time_stamp=sample_time_stamp, + check_grid=False, + ) + else: # use all dates + hxr = open_dataset( + fname, + century=century, + verbose=verbose, + sample_time_stamp=sample_time_stamp, + check_grid=False, + ) + return hxr def combine_dataset( blist, @@ -688,6 +789,8 @@ def combine_dataset( only those species will be added. Files need to have the same concentration grid defined. + If files have no concentrations then they will be skipped. + """ # 2024 04 March. when the input datasets did not have identical time coordinates, the align method of # xarray was not working properly. Changing the time coordinate to an integer first @@ -696,153 +799,72 @@ def combine_dataset( # had length of 1. Did not work properly with multiple sources and multiple ensembles. # to fix this changed how enslist and sourcelist were defined and utilized. - mlat_p = mlon_p = None - ylist = [] - dtlist = [] - splist = [] - sourcelist = [] - # turn the input list int a dictionary - # blist : dictionary. - # key is a tag indicating the source - # value is tuple (filename, metdata) - aaa = sorted(blist, key=lambda x: x[1]) - nnn = len(blist) - blist = {} - for val in aaa: - if val[1] in blist.keys(): - blist[val[1]].append((val[0], val[2])) + + # create list of datasets to be combined and their properties. + # removes any cdumps that are empty. + xlist = [] # list of CombineObject objects + for bbb in blist: + cobject = CombineObject(bbb,drange,century,sample_time_stamp) + print('empty?', cobject.empty, cobject.hxr.coords) + if not cobject.empty: + xlist.append(cobject) else: - blist[val[1]] = [(val[0], val[2])] - - # mgrid = [] - # first loop go through to get expanded dataset. 
- xlist = [] - sourcelist = np.empty(shape=(nnn,nnn),dtype=object) - enslist = np.empty(shape=(nnn,nnn),dtype=object) - for iii, key in enumerate(blist): - # fname = val[0] - xsublist = [] - for jjj, fname in enumerate(blist[key]): - if drange: - century = int(drange[0].year / 100) * 100 - hxr = open_dataset( - fname[0], - drange=drange, - century=century, - verbose=verbose, - sample_time_stamp=sample_time_stamp, - check_grid=False, - ) - else: # use all dates - hxr = open_dataset( - fname[0], - century=century, - verbose=verbose, - sample_time_stamp=sample_time_stamp, - check_grid=False, - ) - try: - mlat, mlon = getlatlon(hxr.attrs) - except Exception: - print("WARNING Cannot open ") - print(fname[0]) - print(century) - print(hxr) - if iii > 0: - tt1 = np.array_equal(mlat, mlat_p) - tt2 = np.array_equal(mlon, mlon_p) - if not tt1 or not tt2: - print("WARNING: grids are not the same. cannot combine") - sys.exit() - mlat_p = mlat - mlon_p = mlon - - # lon = hxr.longitude.isel(y=0).values - # lat = hxr.latitude.isel(x=0).values - xrash = add_species(hxr, species=species) - if iii==0 and jjj==0: - tvals = xrash.time.values - tvals.sort() - stime = tvals[0] - # make the time index integers - xrash = time2index(xrash,stime,dt=1) - xrash = xrash.drop_vars('time_values') - xsublist.append(xrash) - enslist[iii][jjj] = fname[1] - sourcelist[iii][jjj] = key - dtlist.append(hxr.attrs["sample time hours"]) - splist.extend(xrash.attrs["Species ID"]) - if iii == 0: - xnew = xrash.copy() - else: - aaa, xnew = xr.align(xrash, xnew, join="outer") - xnew = xnew.fillna(0) - # iii += 1 - xlist.append(xsublist) - # if verbose: - # print("aligned --------------------------------------") - # xnew is now encompasses the area of all the data-arrays - # now go through and expand each one to the size of xnew. - # iii = 0 - # jjj = 0 - #import sys - #sys.exit() - ylist = [] + logger.warning('could not open {}'.format(bbb[0])) + + print(xlist[1:]) + # check that grids are equal by comparing each grid to the one before. + for iii, xobj in enumerate(xlist[1:]): + if not xobj.grid_equal(xlist[iii]): + logger.warning("WARNING: grids are not the same. cannot combine") + sys.exit() + + xlist.sort() + # use earliest time + svals = [x.start_time for x in xlist] + svals.sort() + print(svals) + stime = svals[0] + # process the data-arrays to be combined. + # change time coordinate to index, sum species. + [x.process(stime,dt=1,species=species) for x in xlist] + + # align to get biggest grid + xbig = xlist[0].xrash.copy() + for xobj in xlist[1:]: + aaa, xbig = xr.align(xobj.xrash,xbig,join='outer') + + #First group and concatenate along ensemble dimension. 
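+    # Every member is first aligned to xbig (the union grid built above)
+    # with NaNs filled by 0, so that all arrays share identical x/y/time
+    # coordinates before xr.concat stacks them on 'ens' and then 'source'.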
+ sourcelist = list(set([x.source for x in xlist])) + outlist = [] slist = [] - for iii, sublist in enumerate(xlist): - hlist = [] - for jjj, temp in enumerate(sublist): - # expand to same region as xnew - aaa, bbb = xr.align(temp, xnew, join="outer") + for source in sourcelist: + # get all objects with that source + elist = [x for x in xlist if x.source==source] + inlist = [] + for eee in elist: + aaa, junk = xr.align(eee.xrash,xbig,join='outer') aaa = aaa.fillna(0) - bbb = bbb.fillna(0) - aaa.expand_dims("ens") - aaa["ens"] = enslist[iii][jjj] - # iii += 1 - hlist.append(aaa) - # concat along the 'ens' axis - new = xr.concat(hlist, "ens") - ylist.append(new) - slist.append(sourcelist[iii][jjj]) - # jjj += 1 - if dtlist: - dtlist = list(set(dtlist)) - dt = dtlist[0] - - newhxr = xr.concat(ylist, "source") - newhxr["source"] = slist - # newhxr['ens'] = metlist - - # newhxr is an xarray data-array with 6 dimensions. - # dt is the averaging time of the hysplit output. - newhxr = newhxr.assign_attrs({"sample time hours": dt}) - newhxr = newhxr.assign_attrs({"Species ID": list(set(splist))}) - newhxr.attrs.update(hxr.attrs) - keylist = ["time description"] - - # calculate the lat lon grid for the expanded dataset. - # and use that for the new coordinates. - newhxr = reset_latlon_coords(newhxr) - - # make sure all np.ndarrray in the attributes are converted to lists. - # the np arrays don't save well to netcdf file for some reason. - attrs = check_attributes(hxr.attrs) + aaa.expand_dims('ens') + aaa['ens'] = eee.ens + inlist.append(aaa) + # concat on ensemble dimension + inner = xr.concat(inlist,'ens') + outlist.append(inner) + # concat on source dimension + newhxr = xr.concat(outlist,'source') + newhxr['source'] = sourcelist + + atthash = xlist[0].hxr.attrs + attrs = check_attributes(atthash) newhxr = newhxr.assign_attrs(attrs) - #for key in keylist: - # newhxr = newhxr.assign_attrs({key: hxr.attrs[key]}) + newhxr = reset_latlon_coords(newhxr) + newhxr = index2time(newhxr) if check_grid: - rval = fix_grid_continuity(newhxr) + rval = fix_grid_continuity(newhxr) else: - rval = newhxr - rval = index2time(rval) + rval = newhxr return rval - -# This function seems not useful. 
-# def get_even_latlongrid(dset, xlim, ylim): -# xindx = np.arange(xlim[0], xlim[1] + 1) -# yindx = np.arange(ylim[0], ylim[1] + 1) -# return get_latlongrid(dset, xindx, yindx) def get_time_index(timevals,stime,dt): """ timevals : list of datetimes @@ -911,19 +933,6 @@ def index2time(hxr): return hxr -def reset_time_coord(hxr): - atts = hxr.attrs - if 'coordinate start time' in atts.keys(): - stime = pd.to_datetime(atts['coordinate start time']) - if 'coordinate time dt (hours)' in atts.keys(): - dt = atts['coordinate time dt (hours)'] - index_values = hxr.t.values - tvals = get_time_values(index_values,stime,dt) - if 'time' in hxr.coords: - hxr.drop_vars('time') - hxr = hxr.assign_coords(time=("t",tvals)) - return hxr - def reset_latlon_coords(hxr): """ @@ -1022,19 +1031,19 @@ def get_latlongrid(attrs, xindx, yindx): lonlist = [lon[x - 1] for x in xindx] # latlist = [lat[x - 1] for x in yindx] except Exception as eee: - print(f"Exception {eee}") - print("try increasing Number Number Lon Points") - print(attrs) - print(xindx) + logger.warning(f"Exception {eee}") + logger.warning("try increasing Number Number Lon Points") + #(attrs) + #print(xindx) success = False try: # lonlist = [lon[x - 1] for x in xindx] latlist = [lat[x - 1] for x in yindx] except Exception as eee: - print(f"Exception {eee}") - print("try increasing Number Number Lat Points") - print(attrs) - print(yindx) + logger.warning(f"Exception {eee}") + logger.warning("try increasing Number Number Lat Points") + #print(attrs) + #print(yindx) success = False if not success: @@ -1201,7 +1210,7 @@ def add_species(dset, species=None): warn = "WARNING: hysplit.add_species function" warn += ": species not found" + str(val) + "\n" warn += " valid species ids are " + str.join(", ", splist) - print(warn) + logger.warning(warn) sss = 0 tmp = [] # Looping through all species in dataset @@ -1231,7 +1240,7 @@ def calculate_thickness(cdump): for avalue in alts: thash[avalue] = avalue - aaa aaa = avalue - print(f"WARNING: thickness calculated from z values please verify {thash}") + logger.warning(f"WARNING: thickness calculated from z values please verify {thash}") return thash @@ -1262,8 +1271,8 @@ def get_thickness(cdump): calculate = True if calculate: - print(f"warning: {cstr} attribute needed to calculate level thicknesses") - print("warning: alternative calculation from z dimension values") + logger.warning(f"warning: {cstr} attribute needed to calculate level thicknesses") + logger.warning("warning: alternative calculation from z dimension values") thash = calculate_thickness(cdump) else: levs = cdump.attrs[cstr] From 9f58e9b393849aba92e66194d66c67fd6f0bd4ee Mon Sep 17 00:00:00 2001 From: Alice Crawford Date: Wed, 17 Apr 2024 15:51:46 -0400 Subject: [PATCH 5/9] removed hysplit_experimental.py and removed some print statements from hysplit.py --- monetio/models/hysplit.py | 6 +- monetio/models/hysplit_experimental.py | 1331 ------------------------ 2 files changed, 2 insertions(+), 1335 deletions(-) delete mode 100644 monetio/models/hysplit_experimental.py diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index b766ae5e..9c0aad70 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -670,7 +670,7 @@ class CombineObject: """ - def __init__(self,blist,drange=None,century=None,sample_time_stamp='start'): + def __init__(self,blist:tuple,drange=None,century=None,sample_time_stamp='start'): self.fname = blist[0] self.source = blist[1] self.ens = blist[2] @@ -805,13 +805,11 @@ def combine_dataset( xlist = [] 
# list of CombineObject objects for bbb in blist: cobject = CombineObject(bbb,drange,century,sample_time_stamp) - print('empty?', cobject.empty, cobject.hxr.coords) if not cobject.empty: xlist.append(cobject) else: logger.warning('could not open {}'.format(bbb[0])) - print(xlist[1:]) # check that grids are equal by comparing each grid to the one before. for iii, xobj in enumerate(xlist[1:]): if not xobj.grid_equal(xlist[iii]): @@ -822,7 +820,6 @@ def combine_dataset( # use earliest time svals = [x.start_time for x in xlist] svals.sort() - print(svals) stime = svals[0] # process the data-arrays to be combined. # change time coordinate to index, sum species. @@ -858,6 +855,7 @@ def combine_dataset( attrs = check_attributes(atthash) newhxr = newhxr.assign_attrs(attrs) newhxr = reset_latlon_coords(newhxr) + # change time coordinate back to datetime newhxr = index2time(newhxr) if check_grid: rval = fix_grid_continuity(newhxr) diff --git a/monetio/models/hysplit_experimental.py b/monetio/models/hysplit_experimental.py deleted file mode 100644 index 6014421c..00000000 --- a/monetio/models/hysplit_experimental.py +++ /dev/null @@ -1,1331 +0,0 @@ -""" -HYPSLIT MODEL READER for binary concentration (cdump) output files - -This code developed at the NOAA Air Resources Laboratory. -Alice Crawford -Allison Ring - -------------- -Functions: -------------- -open_dataset : -combine_dataset : -get_latlongrid : -getlatlon : -hysp_heights: determines ash top height from HYSPLIT -hysp_massload: determines total mass loading from HYSPLIT -calc_aml: determines ash mass loading for each altitude layer from HYSPLIT -hysp_thresh: calculates mask array for ash mass loading threshold from HYSPLIT -add_species(dset): adds concentrations due to different species. - - --------- -Classes --------- -ModelBin - -Change log - -2021 13 May AMC get_latlongrid needed to be updated to match makegrid method. -2022 14 Nov AMC initialized self.dset in __init__() in ModelBin class -2022 14 Nov AMC modified fix_grid_continuity to not fail if passed empty Dataset. -2022 02 Dec AMC modified get_latlongrid inputs. do not need to input dataset, just dictionary. -2022 02 Dec AMC replaced makegrid method with get_latlongrid function to reduce duplicate code. -2022 02 Dec AMC get_latlongrid function utilizes getlatlon to reduce duplicate code. -2022 02 Dec AMC replaced np.arange with np.linspace in getlatlon. np.arange is unstable when step is not an integer. -2023 12 Jan AMC modified reset_latlon_coords so will work with data-arrays that have no latitude longitude coordinate. -2023 12 Jan AMC get_thickness modified to calculate if the attribute specifying the vertical levels is bad -2023 03 Mar AMC get_latlon modified. replace x>=180 with x>=180+lon_tolerance -2023 03 Mar AMC get_latlongrid improved exception statements -2023 08 Dec AMC add check_attributes to ModelBin and combine_datatset to make sure level height attribute is a list - - -""" -import datetime -import sys - -import numpy as np -import pandas as pd -import xarray as xr - - -def open_dataset( - fname, - drange=None, - century=None, - verbose=False, - sample_time_stamp="start", - check_grid=True, -): - """Short summary. - - Parameters - ---------- - fname : string - Name of "cdump" file. Binary HYSPLIT concentration output file. - - drange : list of two datetime objects - cdump file contains concentration as function of time. The drange - specifies what times should be loaded from the file. A value of None - will result in all times being loaded. 
- - century : integer (1900 or 2000) - - verbose : boolean - If True will print out extra messages - - sample_time_stamp : str - if 'end' then time in xarray will be the end of sampling time period. - else time is start of sampling time period. - - check_grid : boolean - if True call fix_grid_continuity to check to see that - xindx and yindx values are sequential (e.g. not [1,2,3,4,5,7]). - If they are not, then add missing values to the xarray.. - - Returns - ------- - dset : xarray DataSet - - CHANGES for PYTHON 3 - For python 3 the numpy char4 are read in as a numpy.bytes_ class and need to - be converted to a python - string by using decode('UTF-8'). - """ - # open the dataset using xarray - binfile = ModelBin( - fname, - drange=drange, - century=century, - verbose=verbose, - readwrite="r", - sample_time_stamp=sample_time_stamp, - ) - dset = binfile.dset - if check_grid: - return fix_grid_continuity(dset) - else: - return dset - - -def check_drange(drange, pdate1, pdate2): - """ - drange : list of two datetimes - pdate1 : datetime - pdate2 : datetime - - Returns - savedata : boolean - - - returns True if drange is between pdate1 and pdate2 - """ - savedata = True - testf = True - # if pdate1 is within drange then save the data. - # AND if pdate2 is within drange then save the data. - # if drange[0] > pdate1 then stop looping to look for more data - # this block sets savedata to true if data within specified time - # range or time range not specified - if drange is None: - savedata = True - elif pdate1 >= drange[0] and pdate1 <= drange[1] and pdate2 <= drange[1]: - savedata = True - elif pdate1 > drange[1] or pdate2 > drange[1]: - testf = False - savedata = False - else: - savedata = False - # END block - # if verbose: - # print(savedata, 'DATES :', pdate1, pdate2) - return testf, savedata - - -class ModelBin: - """ - represents a binary cdump (concentration) output file from HYSPLIT - methods: - readfile - opens and reads contents of cdump file into an xarray - self.dset - """ - - def __init__( - self, - filename, - drange=None, - century=None, - verbose=True, - readwrite="r", - sample_time_stamp="start", - ): - """ - drange : list of two datetime objects. - The read method will store data from the cdump file for which the - sample start is greater thand drange[0] and less than drange[1] - for which the sample stop is less than drange[1]. - - sample_time_stamp : str - if 'end' - time in xarray will indicate end of sampling time. - else - time in xarray will indicate start of sampling time. 
- century : integer - verbose : boolean - read - - """ - self.drange = drange - self.filename = filename - self.century = century - self.verbose = verbose - # list of tuples (date1, date2) of averaging periods with zero - # concentrations - self.zeroconcdates = [] - # list of tuples of averaging periods with nonzero concentrtations] - self.nonzeroconcdates = [] - self.atthash = {} - self.atthash["Starting Latitudes"] = [] - self.atthash["Starting Longitudes"] = [] - self.atthash["Starting Heights"] = [] - self.atthash["Source Date"] = [] - self.sample_time_stamp = sample_time_stamp - self.gridhash = {} - # self.llcrnr_lon = None - # self.llcrnr_lat = None - # self.nlat = None - # self.nlon = None - # self.dlat = None - # self.dlon = None - self.levels = None - self.dset = xr.Dataset() - - if readwrite == "r": - if verbose: - print("reading " + filename) - self.dataflag = self.readfile(filename, drange, verbose=verbose, century=century) - - @staticmethod - def define_struct(): - """Each record in the fortran binary begins and ends with 4 bytes which - specify the length of the record. These bytes are called pad below. - They are not used here, but are thrown out. The following block defines - a numpy dtype object for each record in the binary file.""" - from numpy import dtype - - real4 = ">f" - int4 = ">i" - int2 = ">i2" - char4 = ">a4" - - rec1 = dtype( - [ - ("pad1", int4), - ("model_id", char4), # meteorological model id - ("met_year", int4), # meteorological model starting time - ("met_month", int4), - ("met_day", int4), - ("met_hr", int4), - ("met_fhr", int4), # forecast hour - ("start_loc", int4), # number of starting locations - ("conc_pack", int4), # concentration packing flag (0=no, 1=yes) - ("pad2", int4), - ] - ) - - # start_loc in rec1 tell how many rec there are. - rec2 = dtype( - [ - ("pad1", int4), - ("r_year", int4), # release starting time - ("r_month", int4), - ("r_day", int4), - ("r_hr", int4), - ("s_lat", real4), # Release location - ("s_lon", real4), - ("s_ht", real4), - ("r_min", int4), # release startime time (minutes) - ("pad2", int4), - ] - ) - - rec3 = dtype( - [ - ("pad1", int4), - ("nlat", int4), - ("nlon", int4), - ("dlat", real4), - ("dlon", real4), - ("llcrnr_lat", real4), - ("llcrnr_lon", real4), - ("pad2", int4), - ] - ) - - rec4a = dtype( - [ - ("pad1", int4), - ("nlev", int4), # number of vertical levels in concentration grid - ] - ) - - rec4b = dtype([("levht", int4)]) # height of each level (meters above ground) - - rec5a = dtype( - [ - ("pad1", int4), - ("pad2", int4), - ("pollnum", int4), # number of different pollutants - ] - ) - - rec5b = dtype([("pname", char4)]) # identification string for each pollutant - - rec5c = dtype([("pad2", int4)]) - - rec6 = dtype( - [ - ("pad1", int4), - ("oyear", int4), # sample start time. - ("omonth", int4), - ("oday", int4), - ("ohr", int4), - ("omin", int4), - ("oforecast", int4), - ("pad3", int4), - ] - ) - - # rec7 has same form as rec6. #sample stop time. - - # record 8 is pollutant type identification string, output level. 
- - rec8a = dtype( - [ - ("pad1", int4), - ("poll", char4), # pollutant identification string - ("lev", int4), - ("ne", int4), # number of elements - ] - ) - - rec8b = dtype( - [ - ("indx", int2), # longitude index - ("jndx", int2), # latitude index - ("conc", real4), - ] - ) - - rec8c = dtype([("pad2", int4)]) - recs = ( - rec1, - rec2, - rec3, - rec4a, - rec4b, - rec5a, - rec5b, - rec5c, - rec6, - rec8a, - rec8b, - rec8c, - ) - return recs - - def parse_header(self, hdata1): - """ - hdata1 : dtype - Returns - nstartloc : int - number of starting locations in file. - """ - if len(hdata1["start_loc"]) != 1: - print("WARNING in ModelBin _readfile - number of starting locations " "incorrect") - print(hdata1["start_loc"]) - # in python 3 np.fromfile reads the record into a list even if it is - # just one number. - # so if the length of this record is greater than one something is - # wrong. - nstartloc = hdata1["start_loc"][0] - self.atthash["Meteorological Model ID"] = hdata1["model_id"][0].decode("UTF-8") - self.atthash["Number Start Locations"] = nstartloc - return nstartloc - - def parse_hdata2(self, hdata2, nstartloc, century): - # Loop through starting locations - for nnn in range(0, nstartloc): - # create list of starting latitudes, longitudes and heights. - lat = hdata2["s_lat"][nnn] - lon = hdata2["s_lon"][nnn] - hgt = hdata2["s_ht"][nnn] - - self.atthash["Starting Latitudes"].append(lat) - self.atthash["Starting Longitudes"].append(lon) - self.atthash["Starting Heights"].append(hgt) - - # try to guess century if century not given - if century is None: - if hdata2["r_year"][0] < 50: - century = 2000 - else: - century = 1900 - print("WARNING: Guessing Century for HYSPLIT concentration file", century) - # add sourcedate which is datetime.datetime object - sourcedate = datetime.datetime( - century + hdata2["r_year"][nnn], - hdata2["r_month"][nnn], - hdata2["r_day"][nnn], - hdata2["r_hr"][nnn], - hdata2["r_min"][nnn], - ) - - self.atthash["Source Date"].append(sourcedate.strftime("%Y%m%d.%H%M%S")) - - return century - - def parse_hdata3(self, hdata3): - # Description of concentration grid - ahash = {} - ahash["Number Lat Points"] = hdata3["nlat"][0] - ahash["Number Lon Points"] = hdata3["nlon"][0] - ahash["Latitude Spacing"] = hdata3["dlat"][0] - ahash["Longitude Spacing"] = hdata3["dlon"][0] - ahash["llcrnr longitude"] = hdata3["llcrnr_lon"][0] - ahash["llcrnr latitude"] = hdata3["llcrnr_lat"][0] - # self.llcrnr_lon = hdata3["llcrnr_lon"][0] - # self.llcrnr_lat = hdata3["llcrnr_lat"][0] - # self.nlat = hdata3["nlat"][0] - # self.nlon = hdata3["nlon"][0] - # self.dlat = hdata3["dlat"][0] - # self.dlon = hdata3["dlon"][0] - return ahash - - def parse_hdata4(self, hdata4a, hdata4b): - self.levels = hdata4b["levht"] - self.atthash["Number of Levels"] = hdata4a["nlev"][0] - self.atthash["Level top heights (m)"] = hdata4b["levht"] - - def parse_hdata6and7(self, hdata6, hdata7, century): - # if no data read then break out of the while loop. 
- if not hdata6: - return False, None, None - pdate1 = datetime.datetime( - century + int(hdata6["oyear"][0]), - int(hdata6["omonth"][0]), - int(hdata6["oday"][0]), - int(hdata6["ohr"][0]), - int(hdata6["omin"][0]) - ) - pdate2 = datetime.datetime( - century + int(hdata7["oyear"][0]), - int(hdata7["omonth"][0]), - int(hdata7["oday"][0]), - int(hdata7["ohr"][0]), - int(hdata7["omin"][0]) - ) - dt = pdate2 - pdate1 - sample_dt = dt.days * 24 + dt.seconds / 3600.0 - # self.atthash["Sampling Time"] = pdate2 - pdate1 - self.atthash["sample time hours"] = sample_dt - if self.sample_time_stamp == "end": - self.atthash["time description"] = "End of sampling time period" - else: - self.atthash["time description"] = "start of sampling time period" - return True, pdate1, pdate2 - - @staticmethod - def parse_hdata8(hdata8a, hdata8b, pdate1): - """ - hdata8a : dtype - hdata8b : dtype - pdate1 : datetime - - Returns: - concframe : DataFrame - """ - lev_name = hdata8a["lev"][0] - col_name = hdata8a["poll"][0].decode("UTF-8") - edata = hdata8b.byteswap().newbyteorder() # otherwise get endian error. - concframe = pd.DataFrame.from_records(edata) - concframe["levels"] = lev_name - concframe["t"] = pdate1 - - # rename jndx x - # rename indx y - names = concframe.columns.values - names = ["y" if x == "jndx" else x for x in names] - names = ["x" if x == "indx" else x for x in names] - names = ["z" if x == "levels" else x for x in names] - concframe.columns = names - concframe.set_index( - ["t", "z", "y", "x"], - inplace=True, - ) - concframe.rename(columns={"conc": col_name}, inplace=True) - # mgrid = np.meshgrid(lat, lon) - return concframe - - # def makegrid(self, xindx, yindx): - # """ - # xindx : list - # yindx : list - # """ - # attrs = {} - # attrs['llcrnr latitude'] = self.llcrnr_lat - # attrs['llcrnr longitude'] = self.llcrnr_lon - # attrs['Number Lat Points'] = self.nlat - # attrs['Number Lon Points'] = self.nlon - # attrs['Latitude Spacing'] = self.dlat - # attrs['Longitude Spacing'] = self.dlon - # mgrid = get_latlongrid(attrs,xindx,yindx) - # return mgrid - # checked HYSPLIT code. the grid points - # do represent center of the sampling area. - # slat = self.llcrnr_lat - # slon = self.llcrnr_lon - # lat = np.arange(slat, slat + self.nlat * self.dlat, self.dlat) - # lon = np.arange(slon, slon + self.nlon * self.dlon, self.dlon) - # hysplit always uses grid from -180 to 180 - # lon = np.array([x-360 if x>180 else x for x in lon]) - # fortran array indice start at 1. so xindx >=1. - # python array indice start at 0. - # lonlist = [lon[x - 1] for x in xindx] - # latlist = [lat[x - 1] for x in yindx] - # mgrid = np.meshgrid(lonlist, latlist) - # return mgrid - - def readfile(self, filename, drange, verbose, century): - """Data from the file is stored in an xarray, self.dset - returns False if all concentrations are zero else returns True. - INPUTS - filename - name of cdump file to open - drange - [date1, date2] - range of dates to load data for. if [] - then loads all data. - date1 and date2 should be datetime objects. - verbose - turns on print statements - century - if None will try to guess the century by looking - at the last two digits of the year. - For python 3 the numpy char4 are read in as a numpy.bytes_ - class and need to be converted to a python - string by using decode('UTF-8'). - - """ - # 8/16/2016 moved species=[] to before while loop. Added print - # statements when verbose. - # self.dset = xr.Dataset() - # dictionaries which will be turned into the dset attributes. 
- fid = open(filename, "rb") - - # each record in the fortran binary begins and ends with 4 bytes which - # specify the length of the record. - # These bytes are called pad1 and pad2 below. They are not used here, - # but are thrown out. - # The following block defines a numpy dtype object for each record in - # the binary file. - recs = self.define_struct() - rec1, rec2, rec3, rec4a = recs[0], recs[1], recs[2], recs[3] - rec4b, rec5a, rec5b, rec5c = recs[4], recs[5], recs[6], recs[7] - rec6, rec8a, rec8b, rec8c = recs[8], recs[9], recs[10], recs[11] - # rec7 = rec6 - # start_loc in rec1 tell how many rec there are. - tempzeroconcdates = [] - # Reads header data. This consists of records 1-5. - hdata1 = np.fromfile(fid, dtype=rec1, count=1) - nstartloc = self.parse_header(hdata1) - - hdata2 = np.fromfile(fid, dtype=rec2, count=nstartloc) - century = self.parse_hdata2(hdata2, nstartloc, century) - - hdata3 = np.fromfile(fid, dtype=rec3, count=1) - self.gridhash = self.parse_hdata3(hdata3) - if self.verbose: - print("Grid specs", self.gridhash) - # read record 4 which gives information about vertical levels. - hdata4a = np.fromfile(fid, dtype=rec4a, count=1) - hdata4b = np.fromfile( - fid, dtype=rec4b, count=hdata4a["nlev"][0] - ) # reads levels, count is number of levels. - self.parse_hdata4(hdata4a, hdata4b) - - # read record 5 which gives information about pollutants / species. - hdata5a = np.fromfile(fid, dtype=rec5a, count=1) - np.fromfile(fid, dtype=rec5b, count=hdata5a["pollnum"][0]) - np.fromfile(fid, dtype=rec5c, count=1) - self.atthash["Number of Species"] = hdata5a["pollnum"][0] - self.atthash["Species ID"] = [] - - # Loop to reads records 6-8. Number of loops is equal to number of - # output times. - # Only save data for output times within drange. if drange=[] then - # save all. - # Loop to go through each sampling time - iimax = 0 # check to make sure don't go above max number of iterations - iii = 0 # checks to see if some nonzero data was saved in xarray - # Safety valve - will not allow more than 1000 loops to be executed. - imax = 1e8 - testf = True - tvals=[] - while testf: - hdata6 = np.fromfile(fid, dtype=rec6, count=1) - hdata7 = np.fromfile(fid, dtype=rec6, count=1) - check, pdate1, pdate2 = self.parse_hdata6and7(hdata6, hdata7, century) - if not check: - break - testf, savedata = check_drange(drange, pdate1, pdate2) - if verbose: - print("sample time", pdate1, " to ", pdate2) - # datelist = [] - inc_iii = False - # LOOP to go through each level - for _ in range(self.atthash["Number of Levels"]): - # LOOP to go through each pollutant - for _ in range(self.atthash["Number of Species"]): - # record 8a has the number of elements (ne). If number of - # elements greater than 0 than there are concentrations. - hdata8a = np.fromfile(fid, dtype=rec8a, count=1) - # self.atthash["Species ID"].append( - # hdata8a["poll"][0].decode("UTF-8") - # ) - # if number of elements is nonzero then - if hdata8a["ne"] >= 1: - self.atthash["Species ID"].append(hdata8a["poll"][0].decode("UTF-8")) - # get rec8 - indx and jndx - hdata8b = np.fromfile(fid, dtype=rec8b, count=hdata8a["ne"][0]) - # add sample start time to list of start times with - # non zero conc - self.nonzeroconcdates.append(pdate1) - else: - tempzeroconcdates.append( - pdate1 - ) # or add sample start time to list of start times - # with zero conc. - # This is just padding. 
- np.fromfile(fid, dtype=rec8c, count=1) - # if savedata is set and nonzero concentrations then save - # the data in a pandas dataframe - if savedata and hdata8a["ne"] >= 1: - self.nonzeroconcdates.append(pdate1) - inc_iii = True - if self.sample_time_stamp == "end": - concframe = self.parse_hdata8(hdata8a, hdata8b, pdate2) - else: - concframe = self.parse_hdata8(hdata8a, hdata8b, pdate1) - dset = xr.Dataset.from_dataframe(concframe) - # if verbose: - # print("Adding ", "Pollutant", pollutant, "Level", lev) - # if this is the first time through. create dataframe - # for first level and pollutant. - if not self.dset.any(): - self.dset = dset - else: # create dataframe for level and pollutant and - # then merge with main dataframe. - # self.dset = xr.concat([self.dset, dset],'levels') - # self.dset = xr.merge([self.dset, dset],compat='override') - self.dset = xr.merge([self.dset, dset],join='outer') - # self.dset = xr.combine_by_coords([self.dset, dset]) - # self.dset = xr.merge([self.dset, dset], compat='override') - iimax += 1 - # END LOOP to go through each pollutant - # END LOOP to go through each level - # safety check - will stop sampling time while loop if goes over - # imax iterations. - if iimax > imax: - testf = False - print("greater than imax", testf, iimax, imax) - if inc_iii: - iii += 1 - - self.atthash.update(self.gridhash) - self.atthash["Species ID"] = list(set(self.atthash["Species ID"])) - self.atthash["Coordinate time description"] = "Beginning of sampling time" - # END OF Loop to go through each sampling time - if not self.dset.any(): - return False - if self.dset.variables: - self.atthash = check_attributes(self.atthash) - self.dset.attrs = self.atthash - # mgrid = self.makegrid(self.dset.coords["x"], self.dset.coords["y"]) - mgrid = get_latlongrid(self.gridhash, self.dset.coords["x"], self.dset.coords["y"]) - self.dset = self.dset.assign_coords(longitude=(("y", "x"), mgrid[0])) - self.dset = self.dset.assign_coords(latitude=(("y", "x"), mgrid[1])) - tvals = self.dset.t.values - self.dset = self.dset.assign_coords(time=("t",tvals)) - self.dset = self.dset.reset_coords() - self.dset = self.dset.set_coords(["time", "latitude", "longitude"]) - self.dset = reset_time_index(self.dset) - if iii == 0 and verbose: - print("Warning: ModelBin class _readfile method: no data in the date range found") - return False - return True - - -def combine_dataset( - blist, - drange=None, - species=None, - century=None, - verbose=False, - sample_time_stamp="start", - check_grid=True, -): - """ - Inputs : - blist : list of tuples - (filename, sourcetag, metdatatag) - - drange : list of two datetime objects. - d1 datetime object. first date to keep in DatArrayarray - d2 datetime object. last date to keep in DataArray - - sample_time_stamp : str - if 'end' then time in xarray will be the end of sampling time period. - else time is start of sampling time period. - - RETURNS - newhxr : an xarray data-array with 6 dimensions. - lat, lon, time, level, ensemble tag, source tag - - Note that if more than one species is present in the files, they are - added to get concentration from all species. If list of species is provided, - only those species will be added. - - Files need to have the same concentration grid defined. 
- """ - # iii = 0 - mlat_p = mlon_p = None - ylist = [] - dtlist = [] - splist = [] - xlist = [] - sourcelist = [] - enslist = [] - #for mmm, key in enumerate(blist): - # # fname = val[0] - for iii, bbb in enumerate(blist): - fname = bbb[0] - - if drange: - century = int(drange[0].year / 100) * 100 - hxr = open_dataset( - fname, - drange=drange, - century=century, - verbose=verbose, - sample_time_stamp=sample_time_stamp, - check_grid=False, - ) - else: # use all dates - hxr = open_dataset( - fname, - century=century, - verbose=verbose, - sample_time_stamp=sample_time_stamp, - check_grid=False, - ) - print('hxr') - print(hxr.time.values) - try: - mlat, mlon = getlatlon(hxr.attrs) - except Exception: - print("WARNING Cannot open ") - print(fname) - print(century) - print(hxr) - if iii > 0: - tt1 = np.array_equal(mlat, mlat_p) - tt2 = np.array_equal(mlon, mlon_p) - if not tt1 or not tt2: - print("WARNING: grids are not the same. cannot combine") - sys.exit() - mlat_p = mlat - mlon_p = mlon - - xrash = add_species(hxr, species=species) - enslist.append(bbb[1]) - sourcelist.append(bbb[2]) - dtlist.append(hxr.attrs["sample time hours"]) - splist.extend(xrash.attrs["Species ID"]) - tvals = xrash.time.values - tvals.sort() - if iii == 0: - stime = tvals[0] - xnew = reset_time_index(xrash,stime=stime) - # start time for indexing will always be earliest time in first file in list. - xrash = reset_time_index(xrash,stime=stime) - else: - xrash = reset_time_index(xrash,stime=stime) - ccc, xnew = xr.align(xrash, xnew, join="outer") - xlist.append(xrash) - ylist = [] - slist = [] - hlist = [] - for iii, temp in enumerate(xlist): - aaa, bbb = xr.align(temp, xnew, join="outer") - aaa = reset_time_coord(aaa) - aaa.expand_dims("ens") - aaa["ens"] = enslist[iii] - aaa.expand_dims("source") - aaa["source"] = sourcelist[iii] - hlist.append(aaa) - # concat along the 'ens' axis - #newhxr = xr.concat(hlist, "source") - sourcelist = list(set(sourcelist)) - enslist = list(set(enslist)) - if len(sourcelist)==1: - newhxr = xr.concat(hlist,dim='ens') - elif len(enslist)==1: - newhxr = xr.concat(hlist,dim='source') - else: - newhxr = xr.concat(hlist,dim='source') - - #ylist.append(new) - #slist.append(sourcelist[jjj]) - if dtlist: - dtlist = list(set(dtlist)) - dt = dtlist[0] - - #newhxr = xr.concat(ylist, "source") - #newhxr["source"] = slist - # newhxr['ens'] = metlist - - # newhxr is an xarray data-array with 6 dimensions. - # dt is the averaging time of the hysplit output. - newhxr = newhxr.assign_attrs({"sample time hours": dt}) - newhxr = newhxr.assign_attrs({"Species ID": list(set(splist))}) - newhxr.attrs.update(hxr.attrs) - - - # calculate the lat lon grid for the expanded dataset. - # and use that for the new coordinates. - newhxr = reset_latlon_coords(newhxr) - #newhxr = reset_time_coord(newhxr) - - # make sure all np.ndarrray in the attributes are converted to lists. - # the np arrays don't save well to netcdf file for some reason. - attrs = check_attributes(hxr.attrs) - newhxr = newhxr.assign_attrs(attrs) - keylist = ["coordinate start time",'coordinate time dt (hours)'] - for key in keylist: - if key in xnew.attrs.keys(): - newhxr = newhxr.assign_attrs({key:xnew.attrs[key]}) - - #for key in keylist: - # newhxr = newhxr.assign_attrs({key: hxr.attrs[key]}) - if check_grid: - rval = fix_grid_continuity(newhxr) - else: - rval = newhxr - return rval - - -# This function seems not useful. 
-# def get_even_latlongrid(dset, xlim, ylim): -# xindx = np.arange(xlim[0], xlim[1] + 1) -# yindx = np.arange(ylim[0], ylim[1] + 1) -# return get_latlongrid(dset, xindx, yindx) - -def reset_time_index(hxr,stime=None,dt=1): - """ - hxr : xarray DataSet as output from open_dataset or combine_dataset - """ - tvals = hxr.time.values - if stime is None: - stime = tvals[0] - ilist = get_time_index(tvals,stime,dt) - # create a time_index coordinate. - temp = hxr.assign_coords(t=('t',ilist)) - atthash = {'coordinate start time': stime} - atthash['coordinate time dt (hours)'] = dt - temp.attrs.update(atthash) - #temp = temp.drop_vars('time') - #temp = temp.assign_coords(time=('t',tvals)) - return temp - -def reset_time_coord(hxr): - atts = hxr.attrs - if 'coordinate start time' in atts.keys(): - stime = pd.to_datetime(atts['coordinate start time']) - if 'coordinate time dt (hours)' in atts.keys(): - dt = atts['coordinate time dt (hours)'] - index_values = hxr.t.values - tvals = get_time_values(index_values,stime,dt) - if 'time' in hxr.coords: - hxr.drop_vars('time') - hxr = hxr.assign_coords(time=("t",tvals)) - return hxr - -def reset_latlon_coords(hxr): - """ - hxr : xarray DataSet as output from open_dataset or combine_dataset - """ - mgrid = get_latlongrid(hxr.attrs, hxr.x.values, hxr.y.values) - if "latitude" in hxr.coords: - hxr = hxr.drop_vars("longitude") - if "longitude" in hxr.coords: - hxr = hxr.drop_vars("latitude") - hxr = hxr.assign_coords(latitude=(("y", "x"), mgrid[1])) - hxr = hxr.assign_coords(longitude=(("y", "x"), mgrid[0])) - return hxr - - -def fix_grid_continuity(dset): - # if dset is empty don't do anything - if not dset.any(): - return dset - - # if grid already continuous don't do anything. - if check_grid_continuity(dset): - return dset - xvv = dset.x.values - yvv = dset.y.values - - xlim = [xvv[0], xvv[-1]] - ylim = [yvv[0], yvv[-1]] - - xindx = np.arange(xlim[0], xlim[1] + 1) - yindx = np.arange(ylim[0], ylim[1] + 1) - - mgrid = get_latlongrid(dset.attrs, xindx, yindx) - # mgrid = get_even_latlongrid(dset, xlim, ylim) - conc = np.zeros_like(mgrid[0]) - dummy = xr.DataArray(conc, dims=["y", "x"]) - dummy = dummy.assign_coords(latitude=(("y", "x"), mgrid[1])) - dummy = dummy.assign_coords(longitude=(("y", "x"), mgrid[0])) - dummy = dummy.assign_coords(x=(("x"), xindx)) - dummy = dummy.assign_coords(y=(("y"), yindx)) - cdset, dummy2 = xr.align(dset, dummy, join="outer") - cdset = cdset.assign_coords(latitude=(("y", "x"), mgrid[1])) - cdset = cdset.assign_coords(longitude=(("y", "x"), mgrid[0])) - return cdset.fillna(0) - - -def check_grid_continuity(dset): - """ - checks to see if x and y coords are skipping over any grid points. - Since cdump files only store above 0 values, it is possible to have - a grid that is - y = [1,2,3,4,6,8] - if there are above zero values at 6 and 8 but not at 7. - This results in an xarray which has a grid that is not evenly spaced. 
- """ - xvv = dset.x.values - yvv = dset.y.values - tt1 = np.array([xvv[i] - xvv[i - 1] for i in np.arange(1, len(xvv))]) - tt2 = np.array([yvv[i] - yvv[i - 1] for i in np.arange(1, len(yvv))]) - if np.any(tt1 != 1): - return False - if np.any(tt2 != 1): - return False - return True - - -def get_time_values(index_values,stime,dt): - def apply(iii): - ddd = stime + datetime.timedelta(hours=float(iii)) - return ddd - return [apply(x) for x in index_values] - -def get_time_index(timevals,stime,dt): - """ - timevals : list of datetimes - stime : start time of time grid - dt : integer - time resolution in hours of time grid. - """ - def apply(ttt): - diff = pd.to_datetime(ttt)-stime - dh = diff.days*24 + diff.seconds/3600 - iii = dh/dt - return int(iii) - return [apply(x) for x in timevals] - - -def get_latlongrid(attrs, xindx, yindx): - """ - INPUTS - attrs : dictionary with grid specifications - xindx : list of integers > 0 - yindx : list of integers > 0 - RETURNS - mgrid : output of numpy meshgrid function. - Two 2d arrays of latitude, longitude. - The grid points in cdump file - represent center of the sampling area. - - NOTES : - This may return a grid that is not evenly spaced. - For instance if yindx is something like [1,2,3,4,5,7] then - the grid will not have even spacing in latitude and will 'skip' a latitude point. - - HYSPLIT grid indexing starts at 1. - - """ - xindx = np.array(xindx) - yindx = np.array(yindx) - if np.any(xindx <= 0): - raise Exception("HYSPLIT grid error xindex <=0") - if np.any(yindx <= 0): - raise Exception("HYSPLIT grid error yindex <=0") - lat, lon = getlatlon(attrs) - success = True - try: - lonlist = [lon[x - 1] for x in xindx] - # latlist = [lat[x - 1] for x in yindx] - except Exception as eee: - print(f"Exception {eee}") - print("try increasing Number Number Lon Points") - print(attrs) - print(xindx) - success = False - try: - # lonlist = [lon[x - 1] for x in xindx] - latlist = [lat[x - 1] for x in yindx] - except Exception as eee: - print(f"Exception {eee}") - print("try increasing Number Number Lat Points") - print(attrs) - print(yindx) - success = False - - if not success: - return None - mgrid = np.meshgrid(lonlist, latlist) - return mgrid - - -# def get_index_fromgrid(dset): -# llcrnr_lat = dset.attrs["llcrnr latitude"] -# llcrnr_lon = dset.attrs["llcrnr longitude"] -# nlat = dset.attrs["Number Lat Points"] -# nlon = dset.attrs["Number Lon Points"] -# dlat = dset.attrs["Latitude Spacing"] -# dlon = dset.attrs["Longitude Spacing"] - - -def getlatlon(attrs): - """ - Returns 1d array of lats and lons based on Concentration Grid - Defined in the dset attribute. 
- attrs : dictionary with grid specifications - RETURNS - lat : 1D array of latitudes - lon : 1D array of longitudes - """ - lon_tolerance = 0.001 - llcrnr_lat = attrs["llcrnr latitude"] - llcrnr_lon = attrs["llcrnr longitude"] - nlat = attrs["Number Lat Points"] - nlon = attrs["Number Lon Points"] - dlat = attrs["Latitude Spacing"] - dlon = attrs["Longitude Spacing"] - - lastlon = llcrnr_lon + (nlon - 1) * dlon - lastlat = llcrnr_lat + (nlat - 1) * dlat - # = int((lastlon - llcrnr_lon) / dlon) - lat = np.linspace(llcrnr_lat, lastlat, num=int(nlat)) - lon = np.linspace(llcrnr_lon, lastlon, num=int(nlon)) - # - lon = np.array([x - 360 if x >= 180 + lon_tolerance else x for x in lon]) - return lat, lon - - -def hysp_massload(dset, threshold=0, mult=1, zvals=None): - """Calculate mass loading from HYSPLIT xarray - INPUTS - dset: xarray dataset output by open_dataset OR - xarray data array output by combine_dataset - threshold : float - mult : float - zvals : list of levels to calculate mass loading over. - Outputs: - totl_aml : xarray data array - total ash mass loading (summed over all layers), ash mass loading - Units in (unit mass / m^2) - """ - # first calculate mass loading in each level. - aml_alts = calc_aml(dset) - # Then choose which levels to use for total mass loading. - if zvals: - aml_alts = aml_alts.isel(z=zvals) - if "z" not in aml_alts.dims: - aml_alts = aml_alts.expand_dims("z") - # - total_aml = aml_alts.sum(dim="z") - # Calculate conversion factors - # unitmass, mass63 = calc_MER(dset) - # Calculating the ash mass loading - total_aml2 = total_aml * mult - # Calculating total ash mass loading, accounting for the threshold - # Multiply binary threshold mask to data - total_aml_thresh = hysp_thresh(dset, threshold, mult=mult) - total_aml = total_aml2 * total_aml_thresh - return total_aml - - -def hysp_heights(dset, threshold, mult=1, height_mult=1 / 1000.0, mass_load=True, species=None): - """Calculate top-height from HYSPLIT xarray - Input: xarray dataset output by open_dataset OR - xarray data array output by combine_dataset - threshold : mass loading threshold (threshold = xx) - mult : convert from meters to other unit. default is 1/1000.0 to - convert to km. 
- Outputs: ash top heights, altitude levels""" - - # either get mass loading of each point - if mass_load: - aml_alts = calc_aml(dset) - # or get concentration at each point - else: - aml_alts = add_species(dset, species=species) - - # Create array of 0 and 1 (1 where data exists) - heights = aml_alts.where(aml_alts == 0.0, 1.0) - # Multiply each level by the altitude - height = _alt_multiply(heights) - height = height * height_mult # convert to km - # Determine top height: take max of heights array along z axis - top_hgt = height.max(dim="z") - # Apply ash mass loading threshold mask array - total_aml_thresh = hysp_thresh(dset, threshold, mult=mult) - top_height = top_hgt * total_aml_thresh - return top_height - - -# def calc_total_mass(dset): -# return -1 - - -def calc_aml(dset, species=None): - """Calculates the mass loading at each altitude for the dataset - Input: xarray dataset output by open_dataset OR - xarray data array output by combine_dataset - Output: total ash mass loading""" - # Totals values for all particles - if isinstance(dset, xr.core.dataset.Dataset): - total_par = add_species(dset, species=species) - else: - total_par = dset.copy() - # Multiplies the total particles by the altitude layer - # to create a mass loading for each altitude layer - aml_alts = _delta_multiply(total_par) - return aml_alts - - -def hysp_thresh(dset, threshold, mult=1): - """Calculates a threshold mask array based on the - ash mass loading from HYSPLIT xarray - Inputs: xarray, ash mass loading threshold (threshold = xx) - Outputs: ash mass loading threshold mask array - Returns 0 where values are below or equal to threshold. - Returns 1 where values are greater than threshold - - """ - # Calculate ash mass loading for xarray - aml_alts = calc_aml(dset) - total_aml = aml_alts.sum(dim="z") - # Calculate conversion factors - # unitmass, mass63 = calc_MER(dset) - # Calculating the ash mass loading - total_aml2 = total_aml * mult - # where puts value into places where condition is FALSE. - # place 0 in places where value is below or equal to threshold - total_aml_thresh = total_aml2.where(total_aml2 > threshold, 0.0) - # place 1 in places where value is greater than threshold - total_aml_thresh = total_aml_thresh.where(total_aml_thresh <= threshold, 1.0) - return total_aml_thresh - - -def add_species(dset, species=None): - """ - species : list of Species ID's. - if none then all ids in the "species ID" attribute will be used. - Calculate sum of particles. 
- """ - sflist = [] - splist = dset.attrs["Species ID"] - if not species: - species = dset.attrs["Species ID"] - else: - for val in species: - if val not in splist: - warn = "WARNING: hysplit.add_species function" - warn += ": species not found" + str(val) + "\n" - warn += " valid species ids are " + str.join(", ", splist) - print(warn) - sss = 0 - tmp = [] - # Looping through all species in dataset - while sss < len(splist): - if splist[sss] in species: - tmp.append(dset[splist[sss]].fillna(0)) - sflist.append(splist[sss]) - sss += 1 # End of loop through species - - total_par = tmp[0] - ppp = 1 - # Adding all species together - while ppp < len(tmp): - total_par = total_par + tmp[ppp] - ppp += 1 # End of loop adding all species - atthash = dset.attrs - atthash["Species ID"] = sflist - atthash = check_attributes(atthash) - total_par = total_par.assign_attrs(atthash) - return total_par - - -def calculate_thickness(cdump): - alts = cdump.z.values - thash = {} - aaa = 0 - for avalue in alts: - thash[avalue] = avalue - aaa - aaa = avalue - print(f"WARNING: thickness calculated from z values please verify {thash}") - return thash - - -def get_thickness(cdump): - """ - Input: - cdump : xarray DataArray with 'Level top heights (m)' as an attribute. - Returns: - thash : dictionary - key is the name of the z coordinate and value is the thickness of that layer in meters. - """ - cstr = "Level top heights (m)" - - calculate = False - if cstr not in cdump.attrs.keys(): - calculate = True - # check that the values in the attribute correspond to values in levels. - # python reads in this attribute as a numpy array - # but when writing the numpy array to netcdf file, it doesn't write correctly. - # sometimes cdump file is written with incorrect values in this attribute. - elif cstr in cdump.attrs.keys(): - alts = cdump.z.values - zvals = cdump.attrs[cstr] - # cra = [] - for aaa in alts: - # if a level is not found in the attribute array then use the calculate method. - if aaa not in zvals: - calculate = True - - if calculate: - print(f"warning: {cstr} attribute needed to calculate level thicknesses") - print("warning: alternative calculation from z dimension values") - thash = calculate_thickness(cdump) - else: - levs = cdump.attrs[cstr] - thash = {} - aaa = 0 - for level in levs: - thash[level] = level - aaa - aaa = level - return thash - - -def _delta_multiply(pars): - """ - # Calculate the delta altitude for each layer and - # multiplies concentration by layer thickness to return mass load. - # requires that the 'Level top heights (m)' is an attribute of pars. - - # pars: xarray data array - concentration with z coordinate. - # OUTPUT - # newpar : xarray data array - mass loading. - """ - thash = get_thickness(pars) - for iii, zzz in enumerate(pars.z.values): - delta = thash[zzz] - mml = pars.isel(z=iii) * delta - if iii == 0: - newpar = mml - else: - newpar = xr.concat([newpar, mml], "z") - if "z" not in newpar.dims: - newpar = newpar.expand_dims("z") - return newpar - - -def _delta_multiply_old(pars): - """ - # This method was faulty because layers with no concentrations were - # omitted. e.g. if layers were at 0,1000,2000,3000,4000,5000 but there were - # no mass below 20000 then would only see layers 3000,4000,5000 and thickness - # of 3000 layer would be calculated as 3000 instead of 1000. - # Calculate the delta altitude for each layer and - # multiplies concentration by layer thickness to return mass load. - - # pars: xarray data array - concentration with z coordinate. 
- # OUTPUT - # newpar : xarray data array - mass loading. - """ - xxx = 1 - alts = pars.coords["z"] - delta = [] - delta.append(alts[0]) - while xxx < (len(alts)): - delta.append(alts[xxx] - alts[xxx - 1]) - xxx += 1 - # Multiply each level by the delta altitude - yyy = 0 - while yyy < len(delta): - # modify so not dependent on placement of 'z' coordinate. - # pars[:, yyy, :, :] = pars[:, yyy, :, :] * delta[yyy] - mml = pars.isel(z=yyy) * delta[yyy] - if yyy == 0: - newpar = mml - else: - newpar = xr.concat([newpar, mml], "z") - if "z" not in newpar.dims: - newpar = newpar.expand_dims("z") - yyy += 1 # End of loop calculating heights - return newpar - - -def _alt_multiply(pars): - """ - # For calculating the top height - # Multiply "1s" in the input array by the altitude - """ - alts = pars.coords["z"] - yyy = 0 - while yyy < len(alts): - # modify so not dependent on placement of 'z' coordinate. - # pars[:, y, :, :] = pars[:, y, :, :] * alts[y] - mml = pars.isel(z=yyy) * alts[yyy] - if yyy == 0: - newpar = mml - else: - newpar = xr.concat([newpar, mml], "z") - yyy += 1 # End of loop calculating heights - if "z" not in newpar.dims: - newpar = newpar.expand_dims("z") - return newpar - - -def check_attributes(atthash): - # when writing to netcdf file, attributes which are numpy arrays do not write properly. - # need to change them to lists. - for key in atthash.keys(): - val = atthash[key] - if isinstance(val, np.ndarray): - newval = list(val) - atthash[key] = newval - return atthash - - -def write_with_compression(cxra, fname): - atthash = check_attributes(cxra.attrs) - cxra = cxra.assign_attrs(atthash) - cxra2 = cxra.to_dataset(name="POL") - ehash = {"zlib": True, "complevel": 9} - vlist = [x for x in cxra2.data_vars] - vhash = {} - for vvv in vlist: - vhash[vvv] = ehash - cxra2.to_netcdf(fname, encoding=vhash) From fa6e15bef1113c072b95c45af41d68c9b7bb1fd7 Mon Sep 17 00:00:00 2001 From: Alice Crawford Date: Wed, 17 Apr 2024 15:57:12 -0400 Subject: [PATCH 6/9] formatting updates to hysplit.py --- monetio/models/hysplit.py | 203 ++++++++++++++++++++------------------ 1 file changed, 105 insertions(+), 98 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index 9c0aad70..ba4ed7f7 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -347,9 +347,13 @@ def parse_header(self, hdata1): number of starting locations in file. """ if len(hdata1["start_loc"]) != 1: - logger.warning("WARNING in ModelBin {} _readfile - number of starting locations incorrect".format(self.filename)) + logger.warning( + "WARNING in ModelBin {} _readfile - number of starting locations incorrect".format( + self.filename + ) + ) logger.warning(str(hdata1["start_loc"])) - return None + return None # in python 3 np.fromfile reads the record into a list even if it is # just one number. 
# so if the length of this record is greater than one something is @@ -378,7 +382,9 @@ def parse_hdata2(self, hdata2, nstartloc, century): century = 2000 else: century = 1900 - logger.info("WARNING: Guessing Century for HYSPLIT concentration file {}".format(century)) + logger.info( + "WARNING: Guessing Century for HYSPLIT concentration file {}".format(century) + ) # add sourcedate which is datetime.datetime object sourcedate = datetime.datetime( century + hdata2["r_year"][nnn], @@ -423,14 +429,14 @@ def parse_hdata6and7(self, hdata6, hdata7, century): int(hdata6["omonth"][0]), int(hdata6["oday"][0]), int(hdata6["ohr"][0]), - int(hdata6["omin"][0]) + int(hdata6["omin"][0]), ) pdate2 = datetime.datetime( century + int(hdata7["oyear"][0]), int(hdata7["omonth"][0]), int(hdata7["oday"][0]), int(hdata7["ohr"][0]), - int(hdata7["omin"][0]) + int(hdata7["omin"][0]), ) dt = pdate2 - pdate1 sample_dt = dt.days * 24 + dt.seconds / 3600.0 @@ -541,7 +547,8 @@ class and need to be converted to a python # Reads header data. This consists of records 1-5. hdata1 = np.fromfile(fid, dtype=rec1, count=1) nstartloc = self.parse_header(hdata1) - if nstartloc is None: return False + if nstartloc is None: + return False hdata2 = np.fromfile(fid, dtype=rec2, count=nstartloc) century = self.parse_hdata2(hdata2, nstartloc, century) @@ -629,7 +636,7 @@ class and need to be converted to a python # then merge with main dataframe. # self.dset = xr.concat([self.dset, dset],'levels') # self.dset = xr.merge([self.dset, dset],compat='override') - self.dset = xr.merge([self.dset, dset],join='outer') + self.dset = xr.merge([self.dset, dset], join="outer") # self.dset = xr.combine_by_coords([self.dset, dset]) # self.dset = xr.merge([self.dset, dset], compat='override') iimax += 1 @@ -664,57 +671,57 @@ class and need to be converted to a python return False return True + class CombineObject: """ - Helper class for combine_dataset function. + Helper class for combine_dataset function. """ - - def __init__(self,blist:tuple,drange=None,century=None,sample_time_stamp='start'): + def __init__(self, blist: tuple, drange=None, century=None, sample_time_stamp="start"): self.fname = blist[0] self.source = blist[1] self.ens = blist[2] - self.hxr = self.open(self.fname,drange,century,sample_time_stamp) + self.hxr = self.open(self.fname, drange, century, sample_time_stamp) self._attrs = {} - if not self.empty: self.attrs = self.hxr.attrs - self.xrash = xr.DataArray() # created in process method. + if not self.empty: + self.attrs = self.hxr.attrs + self.xrash = xr.DataArray() # created in process method. - def grid_equal(self,other): + def grid_equal(self, other): # other: another CombineObject object. 
# checks to see if grid is equal mlat, mlon = self.grid_definition mlat2, mlon2 = other.grid_definition - if not np.array_equal(mlat,mlat2): - return False - if not np.array_equal(mlon,mlon2): - return False + if not np.array_equal(mlat, mlat2): + return False + if not np.array_equal(mlon, mlon2): + return False return True - def __lt__(self,other): - if self.start_time < other.start_time: - return True + def __lt__(self, other): + if self.start_time < other.start_time: + return True if self.source < other.source: - return True + return True if self.ens < other.ens: - return True + return True return False @property def empty(self): if self.hxr.coords: - return False + return False else: - return True + return True - @property + @property def attrs(self): return self._attrs @attrs.setter - def attrs(self,atthash): - if isinstance(atthash,dict): - self._attrs.update(atthash) - + def attrs(self, atthash): + if isinstance(atthash, dict): + self._attrs.update(atthash) @property def start_time(self): @@ -724,20 +731,20 @@ def start_time(self): @property def grid_definition(self): - return getlatlon(self.hxr.attrs) + return getlatlon(self.hxr.attrs) - def process(self,stime=None,dt=None,species=None): + def process(self, stime=None, dt=None, species=None): """ add species, change time coordinate to an integer for alignment. """ - xrash = add_species(self.hxr,species=species) - xrash = time2index(xrash,stime,dt) - xrash = xrash.drop_vars('time_values') + xrash = add_species(self.hxr, species=species) + xrash = time2index(xrash, stime, dt) + xrash = xrash.drop_vars("time_values") self.xrash = xrash self.attrs = self.xrash.attrs - + @staticmethod - def open(fname,drange,century,sample_time_stamp='start',verbose=False): + def open(fname, drange, century, sample_time_stamp="start", verbose=False): if drange: century = int(drange[0].year / 100) * 100 hxr = open_dataset( @@ -758,6 +765,7 @@ def open(fname,drange,century,sample_time_stamp='start',verbose=False): ) return hxr + def combine_dataset( blist, drange=None, @@ -794,62 +802,61 @@ def combine_dataset( """ # 2024 04 March. when the input datasets did not have identical time coordinates, the align method of # xarray was not working properly. Changing the time coordinate to an integer first - # fixes the problem. + # fixes the problem. # Another issue is that the combination only worked when either the source or the ensemble dimension # had length of 1. Did not work properly with multiple sources and multiple ensembles. # to fix this changed how enslist and sourcelist were defined and utilized. - # create list of datasets to be combined and their properties. # removes any cdumps that are empty. - xlist = [] # list of CombineObject objects + xlist = [] # list of CombineObject objects for bbb in blist: - cobject = CombineObject(bbb,drange,century,sample_time_stamp) + cobject = CombineObject(bbb, drange, century, sample_time_stamp) if not cobject.empty: - xlist.append(cobject) + xlist.append(cobject) else: - logger.warning('could not open {}'.format(bbb[0])) + logger.warning("could not open {}".format(bbb[0])) # check that grids are equal by comparing each grid to the one before. for iii, xobj in enumerate(xlist[1:]): - if not xobj.grid_equal(xlist[iii]): - logger.warning("WARNING: grids are not the same. cannot combine") - sys.exit() + if not xobj.grid_equal(xlist[iii]): + logger.warning("WARNING: grids are not the same. 
cannot combine") + sys.exit() xlist.sort() - # use earliest time + # use earliest time svals = [x.start_time for x in xlist] svals.sort() stime = svals[0] # process the data-arrays to be combined. # change time coordinate to index, sum species. - [x.process(stime,dt=1,species=species) for x in xlist] + [x.process(stime, dt=1, species=species) for x in xlist] # align to get biggest grid xbig = xlist[0].xrash.copy() for xobj in xlist[1:]: - aaa, xbig = xr.align(xobj.xrash,xbig,join='outer') - - #First group and concatenate along ensemble dimension. + aaa, xbig = xr.align(xobj.xrash, xbig, join="outer") + + # First group and concatenate along ensemble dimension. sourcelist = list(set([x.source for x in xlist])) outlist = [] slist = [] for source in sourcelist: # get all objects with that source - elist = [x for x in xlist if x.source==source] + elist = [x for x in xlist if x.source == source] inlist = [] for eee in elist: - aaa, junk = xr.align(eee.xrash,xbig,join='outer') + aaa, junk = xr.align(eee.xrash, xbig, join="outer") aaa = aaa.fillna(0) - aaa.expand_dims('ens') - aaa['ens'] = eee.ens + aaa.expand_dims("ens") + aaa["ens"] = eee.ens inlist.append(aaa) # concat on ensemble dimension - inner = xr.concat(inlist,'ens') + inner = xr.concat(inlist, "ens") outlist.append(inner) - # concat on source dimension - newhxr = xr.concat(outlist,'source') - newhxr['source'] = sourcelist + # concat on source dimension + newhxr = xr.concat(outlist, "source") + newhxr["source"] = sourcelist atthash = xlist[0].hxr.attrs attrs = check_attributes(atthash) @@ -858,53 +865,60 @@ def combine_dataset( # change time coordinate back to datetime newhxr = index2time(newhxr) if check_grid: - rval = fix_grid_continuity(newhxr) + rval = fix_grid_continuity(newhxr) else: - rval = newhxr + rval = newhxr return rval -def get_time_index(timevals,stime,dt): + +def get_time_index(timevals, stime, dt): """ timevals : list of datetimes stime : start time of time grid dt : integer - time resolution in hours of time grid. """ + def apply(ttt): - diff = pd.to_datetime(ttt)-stime - dh = diff.days*24 + diff.seconds/3600 - iii = dh/dt + diff = pd.to_datetime(ttt) - stime + dh = diff.days * 24 + diff.seconds / 3600 + iii = dh / dt return int(iii) - return [apply(x) for x in timevals] -def get_time_values(index_values,stime,dt): + return [apply(x) for x in timevals] + + +def get_time_values(index_values, stime, dt): def apply(iii): ddd = stime + datetime.timedelta(hours=float(iii)) return ddd - return [apply(x) for x in index_values] -def time2index(hxr,stime=None,dt=1): + return [apply(x) for x in index_values] + + +def time2index(hxr, stime=None, dt=1): """ hxr : xarray DataSet as output from open_dataset or combine_dataset make the time """ - stime_str = 'coordinate start time' - dt_str = 'coordinate time dt (hours)' + stime_str = "coordinate start time" + dt_str = "coordinate time dt (hours)" tvals = hxr.time.values if stime is None: - stime = tvals[0] - ilist = get_time_index(tvals,stime,dt) + stime = tvals[0] + ilist = get_time_index(tvals, stime, dt) # create a time_index coordinate. 
-    hxr = hxr.drop_vars('time')
-    hxr = hxr.assign_coords(time=('time',ilist))
-    hxr = hxr.assign_coords(time_values=('time',tvals))
+    hxr = hxr.drop_vars("time")
+    hxr = hxr.assign_coords(time=("time", ilist))
+    hxr = hxr.assign_coords(time_values=("time", tvals))
     atthash = {stime_str: stime}
     atthash[dt_str] = dt
     hxr.attrs.update(atthash)
-    #temp = temp.drop_vars('time')
-    #temp = temp.assign_coords(time=('t',tvals))
+    # temp = temp.drop_vars('time')
+    # temp = temp.assign_coords(time=('t',tvals))
     return hxr
 
+
 def index2time(hxr):
     """
     hxr : DataSet or DataArray with time coordinate in integer values.
@@ -912,24 +926,23 @@ def index2time(hxr):
     coordinate start time and coordinate time delta.
     Reverses changes made in time2index
     """
-    if 'time_values' in hxr.coords:
-        tvals = hxr.time_values.values
-        hxr = hxr.drop_vars('time')
-        hxr = hxr.drop_vars('time_values')
-        hxr = hxr.assign_coords(time=('time',tvals))
+    if "time_values" in hxr.coords:
+        tvals = hxr.time_values.values
+        hxr = hxr.drop_vars("time")
+        hxr = hxr.drop_vars("time_values")
+        hxr = hxr.assign_coords(time=("time", tvals))
     else:
-        stime_str = 'coordinate start time'
-        dt_str = 'coordinate time dt (hours)'
+        stime_str = "coordinate start time"
+        dt_str = "coordinate time dt (hours)"
         if stime_str in hxr.attrs.keys():
-            stime = pd.to_datetime(hxr.attrs[stime_str])
+            stime = pd.to_datetime(hxr.attrs[stime_str])
         if dt_str in hxr.attrs.keys():
-            dt = hxr.attrs[dt_str]
+            dt = hxr.attrs[dt_str]
         ilist = hxr.time.values
-        tvals = get_time_values(ilist,stime,dt)
-        hxr = hxr.drop_vars('time')
-        hxr = hxr.assign_coords(time=('time',tvals))
-    return hxr
-
+        tvals = get_time_values(ilist, stime, dt)
+        hxr = hxr.drop_vars("time")
+        hxr = hxr.assign_coords(time=("time", tvals))
+    return hxr
 
 
 def reset_latlon_coords(hxr):
@@ -1027,21 +1040,15 @@ def get_latlongrid(attrs, xindx, yindx):
     success = True
     try:
         lonlist = [lon[x - 1] for x in xindx]
-        # latlist = [lat[x - 1] for x in yindx]
     except Exception as eee:
         logger.warning(f"Exception {eee}")
         logger.warning("try increasing Number Number Lon Points")
-        #(attrs)
-        #print(xindx)
         success = False
     try:
-        # lonlist = [lon[x - 1] for x in xindx]
         latlist = [lat[x - 1] for x in yindx]
     except Exception as eee:
         logger.warning(f"Exception {eee}")
         logger.warning("try increasing Number Number Lat Points")
-        #print(attrs)
-        #print(yindx)
         success = False
 
     if not success:

From 73b2839a440e76364388ccf203c61a4abe1e5bee Mon Sep 17 00:00:00 2001
From: Alice Crawford
Date: Wed, 17 Apr 2024 16:01:49 -0400
Subject: [PATCH 7/9] hysplit.py changed some logger statements to use f-strings rather than str.format()
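
The pattern, shown with one before/after pair taken from the hunks below:

    # before
    logger.info("reading " + filename)
    # after
    logger.info(f"reading {filename}")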
--- monetio/models/hysplit.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index ba4ed7f7..8106324a 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -207,7 +207,7 @@ def __init__( if readwrite == "r": if verbose: - logger.info("reading " + filename) + logger.info(f"reading {filename}") self.dataflag = self.readfile(filename, drange, verbose=verbose, century=century) @staticmethod @@ -348,9 +348,7 @@ def parse_header(self, hdata1): """ if len(hdata1["start_loc"]) != 1: logger.warning( - "WARNING in ModelBin {} _readfile - number of starting locations incorrect".format( - self.filename - ) + f"WARNING in ModelBin {self.filename} _readfile - number of starting locations incorrect" ) logger.warning(str(hdata1["start_loc"])) return None @@ -382,9 +380,7 @@ def parse_hdata2(self, hdata2, nstartloc, century): century = 2000 else: century = 1900 - logger.info( - "WARNING: Guessing Century for HYSPLIT concentration file {}".format(century) - ) + logger.info(f"WARNING: Guessing Century for HYSPLIT concentration file {century}") # add sourcedate which is datetime.datetime object sourcedate = datetime.datetime( century + hdata2["r_year"][nnn], @@ -646,7 +642,7 @@ class and need to be converted to a python # imax iterations. if iimax > imax: testf = False - logger.warning("greater than imax {},{},{}".format(testf, iimax, imax)) + logger.warning(f"greater than imax {testf},{iimax},{imax}") if inc_iii: iii += 1 @@ -815,12 +811,12 @@ def combine_dataset( if not cobject.empty: xlist.append(cobject) else: - logger.warning("could not open {}".format(bbb[0])) + logger.warning(f"could not open {bbb[0]}") # check that grids are equal by comparing each grid to the one before. for iii, xobj in enumerate(xlist[1:]): if not xobj.grid_equal(xlist[iii]): - logger.warning("WARNING: grids are not the same. cannot combine") + logger.warning(f"WARNING: grids are not the same. cannot combine") sys.exit() xlist.sort() From a90d1a842c6bf62dec736eb0bb014875ea780834 Mon Sep 17 00:00:00 2001 From: Alice Crawford Date: Wed, 17 Apr 2024 16:10:43 -0400 Subject: [PATCH 8/9] two more linting changes to hysplit.py --- monetio/models/hysplit.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index 8106324a..14b0041b 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -39,16 +39,16 @@ 2023 03 Mar AMC get_latlongrid improved exception statements 2023 08 Dec AMC add check_attributes to ModelBin and combine_datatset to make sure level height attribute is a list 2024 01 Apr AMC added logging. for combine_dataset add continue to exception so it won't fail. -2024 04 Mar AMC bug fixes to combine_dataset +2024 04 Mar AMC bug fixes to combine_dataset """ import datetime +import logging import sys import numpy as np import pandas as pd import xarray as xr -import logging logger = logging.getLogger(__name__) @@ -834,7 +834,8 @@ def combine_dataset( aaa, xbig = xr.align(xobj.xrash, xbig, join="outer") # First group and concatenate along ensemble dimension. 
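     # (the set comprehension below builds the unique source list directly,
     #  avoiding the intermediate list that set([...]) would create)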
-    sourcelist = list(set([x.source for x in xlist]))
+    # sourcelist = list(set([x.source for x in xlist]))
+    sourcelist = list({x.source for x in xlist})
     outlist = []
     slist = []
     for source in sourcelist:

From 73e61e6a3347191c1669b5b3edb61f8202c2061c Mon Sep 17 00:00:00 2001
From: Alice Crawford
Date: Wed, 17 Apr 2024 16:13:07 -0400
Subject: [PATCH 9/9] fixed flake8 violations in hysplit.py

---
 monetio/models/hysplit.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index 14b0041b..c76c5c8f 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -816,7 +816,7 @@ def combine_dataset(
     # check that grids are equal by comparing each grid to the one before.
     for iii, xobj in enumerate(xlist[1:]):
         if not xobj.grid_equal(xlist[iii]):
-            logger.warning(f"WARNING: grids are not the same. cannot combine")
+            logger.warning("WARNING: grids are not the same. cannot combine")
             sys.exit()
 
     xlist.sort()
@@ -837,7 +837,6 @@ def combine_dataset(
     # sourcelist = list(set([x.source for x in xlist]))
     sourcelist = list({x.source for x in xlist})
     outlist = []
-    slist = []
     for source in sourcelist:
         # get all objects with that source
         elist = [x for x in xlist if x.source == source]
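
A minimal usage sketch of the repaired combine_dataset (the cdump file names
and source/ensemble tags below are hypothetical; the (filename, source, ens)
tuple layout follows CombineObject.__init__ above):

    from monetio.models.hysplit import combine_dataset

    # each entry: (cdump filename, source tag, ensemble tag)
    blist = [
        ("cdump.A.e1", "sourceA", "ens1"),
        ("cdump.A.e2", "sourceA", "ens2"),
        ("cdump.B.e1", "sourceB", "ens1"),
    ]
    # all files must be on the same grid; the result is a DataArray with
    # 'source' and 'ens' dimensions and a datetime 'time' coordinate.
    newhxr = combine_dataset(blist)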