maawoo · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024 · Nov 12, 2024 · Nov 12, 2024
Showing with 180 additions and 84 deletions.

+3 −3 CITATION.cff

+1 −1 LICENSE

+19 −12 README.md

+15 −15 gedixr/ancillary.py

+61 −30 gedixr/gedi.py

+80 −22 gedixr/xr.py

+1 −1 pyproject.toml
diff --git a/CITATION.cff b/CITATION.cff
@@ -12,10 +12,10 @@ authors:
     orcid: 'https://orcid.org/0000-0002-5231-7208'
 identifiers:
   - type: url
-    value: 'https://github.com/maawoo/gedixr/tree/v0.3.1'
-    description: The URL of version 0.3.1 of the software.
+    value: 'https://github.com/maawoo/gedixr/tree/v0.4.0'
+    description: The URL of version 0.4.0 of the software.
 repository-code: 'https://github.com/maawoo/gedixr'
 license: MIT
 commit: f6742d2
-version: 0.3.1
+version: 0.4.0
 date-released: '2024-08-23'
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2023 Marco Wolsza
+Copyright (c) 2023-2024 Marco Wolsza
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

diff --git a/README.md b/README.md
@@ -23,9 +23,9 @@ pip install git+https://github.com/maawoo/gedixr.git
 See the [Tags](https://github.com/maawoo/gedixr/tags) section of the repository
 for available versions to install:
 ```bash
-conda env create --file https://raw.githubusercontent.com/maawoo/gedixr/v0.3.0/environment.yml
+conda env create --file https://raw.githubusercontent.com/maawoo/gedixr/v0.4.0/environment.yml
 conda activate gedixr_env
-pip install git+https://github.com/maawoo/gedixr.git@v0.3.0
+pip install git+https://github.com/maawoo/gedixr.git@v0.4.0
 ```
 
 ## Usage
@@ -57,7 +57,7 @@ extents, you can then merge both GeoDataFrames:
 ```python
 from gedixr.xr import merge_gdf
 
-gdf = merge_gdf(gdf_l2a=gdf_l2a, gdf_l2b=gdf_l2b)
+gdf = merge_gdf(l2a=gdf_l2a, l2b=gdf_l2b)
 ```
 
 If you want to rasterize the GeoDataFrame and use the data as an `xarray.Dataset`:
@@ -76,9 +76,9 @@ gdf = load_to_gdf(l2a="path/to/extracted_l2a.parquet")
 
 ### Custom subsetting
 If your GEDI data is not subsetted (i.e., each file covering an entire orbit), 
-you can provide a vector file (e.g. GeoJSON, GPKG, etc.) to extract metrics for 
-your area of interest. You can also provide a list of vector files to extract 
-for multiple areas at the same time:
+you can provide a vector file (e.g. GeoJSON, GeoPackage, etc.) to extract 
+metrics for your area of interest. You can also provide a list of vector files 
+to extract for multiple areas at the same time:
 ```python
 from gedixr.gedi import extract_data
 
@@ -118,7 +118,7 @@ via the `variables` parameter:
 - `rh98`: Relative height metrics at 98% interval
 
 **L2B**:
-- `rh100`: Height above ground of the received waveform signal start (rh101 from L2A)
+- `rh100`: Height above ground of the received waveform signal start (`rh101` from L2A)
 - `tcc`: Total canopy cover
 - `fhd`: Foliage Height Diversity
 - `pai`: Total Plant Area Index
@@ -130,13 +130,20 @@ product: [L2A](https://lpdaac.usgs.gov/products/gedi02_av002/) and [L2B](https:/
 The extraction process will automatically apply quality filtering based on the 
 `quality_flag`, `degrade_flag` and `sensitivity` variables using the following
 default values:
-- `quality_flag` not equal 0 
-- `degrade_flag` < 1
-- `sensitivity` > 0.9
+- `quality_flag` == 1 
+- `degrade_flag` == 0
+- `num_detectedmodes` > 0
+- abs(`ele_lowestmode` - `digital_elevation_model`) < 100
+
+Please note that `quality_flag` already includes filtering to a `sensitivity` 
+range of 0.9 - 1.0. 
+
+If you want to apply a different quality filtering strategy, you can disable the
+default filtering by setting `apply_quality_filter=False` and apply your own filtering
+after the extraction process.
 
 ## Notes
-<sup>1</sup>See [#1](https://github.com/maawoo/gedixr/issues/1) for a related 
-issue regarding the download of GEDI data.
+<sup>1</sup>See [#1](https://github.com/maawoo/gedixr/issues/1) for a related issue regarding the download of GEDI data.
 
 <sup>2</sup>The products need to be unzipped first which can seriously increase 
 the amount of disk space needed (~90 MB compressed -> ~3 GB uncompressed... per 

diff --git a/gedixr/ancillary.py b/gedixr/ancillary.py
@@ -105,8 +105,7 @@ def close_logging(log_handler: logging.Logger) -> None:
             log_handler.removeHandler(handler)
 
 
-def prepare_roi(vec: Path | list[Path]
-                ) -> dict[str, dict[str, Polygon | None]]:
+def prepare_vec(vec: Path | list[Path]) -> dict[str, dict[str, Polygon | None]]:
     """
     Prepares a vector file or list of vector files for spatial subsetting by
     extracting the geometry of each vector file and storing it in a dictionary.
@@ -125,24 +124,25 @@ def prepare_roi(vec: Path | list[Path]
         {'<Vector Basename>': {'geo': Polygon,
                                'gdf': None}}
     """
+    out = {}
     if not isinstance(vec, list):
         vec = [vec]
-
-    out = {}
-    for v in vec:
-        roi = gp.GeoDataFrame.from_file(str(v))
-        if not roi.crs == 'EPSG:4326':
-            roi = roi.to_crs('EPSG:4326')
-        if len(roi) > 1:
-            print("WARNING: Multi-feature polygon detected. Only the first "
-                  "feature will be used to subset the GEDI data!")
-        v_basename = Path(v).name.split('.')[0]
-        out[v_basename] = {'geo': roi.geometry[0], 'gdf': None}
-
+    for fp in vec:
+        key_base = Path(fp).name.split('.')[0]
+        gdf = gp.GeoDataFrame.from_file(str(fp))
+        if not gdf.crs == 'EPSG:4326':
+            gdf = gdf.to_crs('EPSG:4326')
+        if len(gdf) > 1:
+            for i, row in gdf.iterrows():
+                key = f"{key_base}_{i}"
+                out[key] = {'geo': row.geometry, 'gdf': None}
+        else:
+            out[key_base] = {'geo': gdf.iloc[0].geometry, 'gdf': None}
     return out
 
 
-def to_pathlib(x: str | list[str]) -> Path | list[Path]:
+def to_pathlib(x: str | list[str] | list[Path]
+               ) -> Path | list[Path]:
     """
     Convert string(s) to Path object(s).
     

diff --git a/gedixr/gedi.py b/gedixr/gedi.py
@@ -34,25 +34,34 @@
 _DEFAULT_BASE = {'L2A': [('shot', 'shot_number'),
                          ('latitude', 'lat_lowestmode'),
                          ('longitude', 'lon_lowestmode'),
+                         ('elev', 'elev_lowestmode'),
+                         ('elev_dem_tdx', 'digital_elevation_model'),
                          ('degrade_flag', 'degrade_flag'),
                          ('quality_flag', 'quality_flag'),
-                         ('sensitivity', 'sensitivity')],
+                         ('sensitivity', 'sensitivity'),
+                         ('num_detectedmodes', 'num_detectedmodes')],
                  'L2B': [('shot', 'shot_number'),
                          ('latitude', 'geolocation/lat_lowestmode'),
                          ('longitude', 'geolocation/lon_lowestmode'),
+                         ('elev', 'geolocation/elev_lowestmode'),
+                         ('elev_dem_tdx', 'geolocation/digital_elevation_model'),
                          ('degrade_flag', 'geolocation/degrade_flag'),
                          ('quality_flag', 'l2b_quality_flag'),
-                         ('sensitivity', 'sensitivity')]
+                         ('sensitivity', 'sensitivity'),
+                         ('num_detectedmodes', 'num_detectedmodes')]
                  }
 
+N_ERRORS = 0
+
 
 def extract_data(directory: str | Path,
                  gedi_product: str,
                  temp_unpack_zip: bool = False,
                  variables: Optional[list[tuple[str, str]]] = None,
                  beams: Optional[str| list[str]] = None,
                  filter_month: Optional[tuple[int, int]] = None,
-                 subset_vector: Optional[str | Path | list[str | Path]] = None
+                 subset_vector: Optional[str | Path | list[str | Path]] = None,
+                 apply_quality_filter: bool = True
                  ) -> (GeoDataFrame | dict[str, dict[str, GeoDataFrame | Polygon]]):
     """
     Extracts data from GEDI L2A or L2B files in HDF5 format using the following
@@ -61,7 +70,7 @@ def extract_data(directory: str | Path,
     (1) Search a root directory recursively for GEDI L2A or L2B HDF5 files
     (2) OPTIONAL: Filter files by month of acquisition
     (3) Extract data from each file for specified beams and variables into a Dataframe
-    (4) Filter out shots of poor quality
+    (4) OPTIONAL: Filter out shots of poor quality
     (5) Convert Dataframe to GeoDataFrame including geometry column
     (6) OPTIONAL: Subset shots spatially using intersection via provided vector
         file or list of vector files
@@ -101,6 +110,11 @@ def extract_data(directory: str | Path,
         Note that the basename of each vector file will be used in the output
         names, so it is recommended to give those files reasonable names
         beforehand!
+    apply_quality_filter: bool, optional
+        Apply a basic quality filter to the GEDI data? Default is True. This basic
+        filtering strategy will filter out shots with quality_flag != 1,
+        degrade_flag != 0, num_detectedmodes > 1, and difference between detected
+        elevation and DEM elevation < 100 m.
     
     Returns
     -------
@@ -116,7 +130,6 @@ def extract_data(directory: str | Path,
     subset_vector = anc.to_pathlib(x=subset_vector) if \
         (subset_vector is not None) else None
     log_handler, now = anc.set_logging(directory, gedi_product)
-    n_err = 0
     out_dict = None
     if gedi_product == 'L2A':
         variables = DEFAULT_VARIABLES['L2A'] if variables is None else variables
@@ -135,7 +148,7 @@ def extract_data(directory: str | Path,
     if filter_month is None:
         filter_month = (1, 12)
     if subset_vector is not None:
-        out_dict = anc.prepare_roi(vec=subset_vector)
+        out_dict = anc.prepare_vec(vec=subset_vector)
     layers = _DEFAULT_BASE[gedi_product] + variables
 
     tmp_dirs = None
@@ -170,15 +183,15 @@ def extract_data(directory: str | Path,
                 # (3) Extract data for specified beams and variables
                 df = pd.DataFrame(_from_file(gedi=gedi,
                                              gedi_fp=fp,
+                                             gedi_product=gedi_product,
                                              beams=beams,
                                              layers=layers,
                                              acq_time=date,
                                              log_handler=log_handler))
 
                 # (4) Filter by quality flags
-                df = filter_quality(df=df,
-                                    log_handler=log_handler,
-                                    gedi_path=fp)
+                if apply_quality_filter:
+                    df = filter_quality(df=df, log_handler=log_handler, gedi_path=fp)
 
                 # (5) Convert to GeoDataFrame, set 'Shot Number' as index and convert
                 # acquisition time to datetime
@@ -209,33 +222,41 @@ def extract_data(directory: str | Path,
             except Exception as msg:
                 anc.log(handler=log_handler, mode='exception', file=fp.name,
                         msg=str(msg))
-                n_err += 1
+                _error_counter()
 
         # (7) & (8)
+        flt = "flt0"
+        if apply_quality_filter:
+            flt = "flt1"
         out_dir = directory / 'extracted'
         out_dir.mkdir(exist_ok=True)
         if subset_vector is not None:
             for vec_base, _dict in out_dict.items():
                 if _dict['gdf'] is not None:
-                    out_name = f'{now}__{gedi_product}__subset_{vec_base}.parquet'
+                    out_name = f'{now}__{gedi_product}_{flt}__subset_{vec_base}.parquet'
                     _dict['gdf'].to_parquet(out_dir / out_name)
             return out_dict
         else:
             out = pd.concat(gdf_list_no_spatial_subset)
-            out_name = f'{now}__{gedi_product}.parquet'
+            out_name = f'{now}__{gedi_product}_{flt}.parquet'
             out.to_parquet(out_dir / out_name)
             return out
     except Exception as msg:
         anc.log(handler=log_handler, mode='exception', msg=str(msg))
-        n_err += 1
+        _error_counter()
     finally:
         _cleanup_tmp_dirs(tmp_dirs)
         anc.close_logging(log_handler=log_handler)
-        if n_err > 0:
-            print(f"WARNING: {n_err} errors occurred during the extraction "
+        if N_ERRORS > 0:
+            print(f"WARNING: {N_ERRORS} errors occurred during the extraction "
                   f"process. Please check the log file!")
 
 
+def _error_counter():
+    global N_ERRORS
+    N_ERRORS += 1
+
+
 def _date_from_gedi_file(gedi_path: Path) -> datetime:
     """Extract date string from GEDI filename and convert to datetime object."""
     date_str = re.search('[AB]_[0-9]{13}', gedi_path.name).group()
@@ -271,6 +292,7 @@ def _filepaths_from_zips(directory: Path,
 
 def _from_file(gedi: h5py.File,
                gedi_fp: Path,
+               gedi_product: str,
                beams: list[str],
                layers: list[tuple[str, str]],
                acq_time: datetime,
@@ -285,6 +307,8 @@ def _from_file(gedi: h5py.File,
         A loaded GEDI HDF5 file.
     gedi_fp: Path
         Path to the current GEDI HDF5 file.
+    gedi_product: str
+        GEDI product type. Either 'L2A' or 'L2B'.
     beams: list of str
         List of GEDI beams to extract values from.
     layers: list of tuple of str
@@ -302,24 +326,30 @@ def _from_file(gedi: h5py.File,
     """
     out = {}
     for beam in beams:
+        if beam not in list(gedi.keys()):
+            anc.log(handler=log_handler, mode='info', file=gedi_fp.name,
+                    msg=f"{beam} not found in file")
+            continue
         try:
             for k, v in layers:
-                if v.startswith('rh') and v != 'rh100':
+                if v.startswith('rh') and gedi_product == 'L2A':
                     if k not in out:
                         out[k] = []
-                    out[k].extend([round(h[int(v[2:])] * 100) for h in
-                                   gedi[f'{beam}/rh'][()]])
+                    idx = int(v[2:])
+                    out[k].extend([round(h_bin[idx] * 100) for h_bin in
+                                   gedi[f"{beam}/rh"][()]])
                 elif v == 'shot_number':
                     if k not in out:
                         out[k] = []
-                    out[k].extend([str(h) for h in gedi[f'{beam}/{v}'][()]])
+                    out[k].extend([f"{_id:0>18}" for _id in gedi[f"{beam}/{v}"][()]])
                 else:
                     if k not in out:
                         out[k] = []
-                    out[k].extend(gedi[f'{beam}/{v}'][()])
-        except Exception as e:
-            msg = f"{beam} doesn't exist and was skipped."
-            anc.log(handler=log_handler, mode='info', file=gedi_fp.name, msg=msg)
+                    out[k].extend(gedi[f"{beam}/{v}"][()])
+        except Exception as msg:
+            anc.log(handler=log_handler, mode='exception',
+                    file=f"{gedi_fp.name} ({beam})", msg=str(msg))
+            _error_counter()
     out['acq_time'] = [(str(acq_time)) for _ in range(len(out['shot']))]
     return out
 
@@ -330,9 +360,7 @@ def filter_quality(df: DataFrame,
                    ) -> DataFrame:
     """
     Filters a given pandas.Dataframe containing GEDI data using its quality
-    flags. The values used here have been adopted from the official GEDI L2A/L2B
-    tutorials:
-    https://git.earthdata.nasa.gov/projects/LPDUR/repos/gedi-v2-tutorials/browse
+    flags.
     
     Parameters
     ----------
@@ -349,11 +377,14 @@ def filter_quality(df: DataFrame,
         The quality-filtered dataframe.
     """
     len_before = len(df)
-    cond = ((df['quality_flag'].ne(0)) &
-            (df['degrade_flag'] < 1) &
-            (df['sensitivity'] > 0.9))
+    cond = (
+            (df['quality_flag'].eq(1)) &  # already includes 'sensitivity' > 0.9
+            (df['degrade_flag'].eq(0)) &
+            (df['num_detectedmodes'].ge(1)) &
+            (abs(df['elev'] - df['elev_dem_tdx']) < 100)
+    )
     df = df.where(cond).dropna()
-    df = df.drop(columns=['quality_flag', 'degrade_flag', 'sensitivity'])
+    df = df.drop(columns=['quality_flag', 'degrade_flag'])
     len_after = len_before - len(df)
     filt_perc = round((len_after / len_before) * 100, 2)
     msg = f"{str(len_after).zfill(5)}/{str(len_before).zfill(5)} " \