From 213b4a0b2fafb98ccebb692446397a2d1a1d1959 Mon Sep 17 00:00:00 2001
From: Scott Staniewicz <scott.j.staniewicz@jpl.nasa.gov>
Date: Mon, 29 Jul 2024 10:24:57 -0400
Subject: [PATCH] Add `normalize_burst_id`, fix parsing to allow COMPASS
 filenames (#54)

* add `normalize_burst_id` for consistent parsing

* use normalize in burst_db, group regexes in `constants`

* add ability to parse COMPASS outputs with `parse_filename` and `get_dataset_name`

* fix test expected
---
 src/opera_utils/_cslc.py          | 54 ++++++++++++++++++++-----------
 src/opera_utils/burst_frame_db.py |  3 +-
 src/opera_utils/bursts.py         | 10 ++++--
 src/opera_utils/constants.py      | 33 ++++++++-----------
 src/opera_utils/geometry.py       | 13 +++++++-
 src/opera_utils/stitching.py      | 10 ++++--
 tests/test_bursts.py              | 10 ++++++
 tests/test_cslc.py                | 12 +++++++
 8 files changed, 99 insertions(+), 46 deletions(-)
diff --git a/src/opera_utils/_cslc.py b/src/opera_utils/_cslc.py
index d33d644..d5c72f4 100644
--- a/src/opera_utils/_cslc.py
+++ b/src/opera_utils/_cslc.py
@@ -16,7 +16,8 @@
 from shapely import geometry, ops, wkt
 
 from ._types import Filename
-from .constants import OPERA_IDENTIFICATION
+from .bursts import normalize_burst_id
+from .constants import COMPASS_FILE_REGEX, CSLC_S1_FILE_REGEX, OPERA_IDENTIFICATION
 
 __all__ = [
     "CslcParseError",
@@ -34,19 +35,6 @@
 logger = logging.getLogger(__name__)
 
 
-CSLC_S1_FILE_REGEX = (
-    r"(?P<project>OPERA)_"
-    r"(?P<level>L2)_"
-    r"(?P<product_type>CSLC-S1)_"
-    r"(?P<burst_id>T\d{3}-\d+-IW\d)_"
-    r"(?P<start_datetime>\d{8}T\d{6}Z)_"
-    r"(?P<end_datetime>\d{8}T\d{6}Z)_"
-    r"(?P<sensor>S1[AB])_"
-    r"(?P<polarization>VV|HH)_"
-    r"v(?P<product_version>\d+\.\d+)"
-)
-
-
 class CslcParseError(ValueError):
     """Error raised for non-matching filename."""
 
@@ -75,6 +63,10 @@ def parse_filename(h5_filename: Filename) -> dict[str, str | datetime]:
         - polarization: str
         - product_version: str
 
+    Or, if the filename is a COMPASS-generated file,
+        - burst_id: str (lowercase with underscores)
+        - start_datetime: datetime (but no hour/minute/second info)
+
     Raises
     ------
     CslcParseError
@@ -82,13 +74,28 @@ def parse_filename(h5_filename: Filename) -> dict[str, str | datetime]:
 
     """
     name = Path(h5_filename).name
-    match = re.match(CSLC_S1_FILE_REGEX, name)
-    if match is None:
+    match: re.Match | None = None
+
+    if match := re.match(CSLC_S1_FILE_REGEX, name):
+        return _parse_cslc_product(match)
+    elif match := re.match(COMPASS_FILE_REGEX, name):
+        return _parse_compass(match)
+    else:
         raise CslcParseError(f"Unable to parse {h5_filename}")
 
+
+def _parse_compass(match: re.Match):
+    result = match.groupdict()
+    result["start_datetime"] = datetime.strptime(
+        result["start_datetime"], "%Y%m%d"
+    ).replace(tzinfo=timezone.utc)
+    return result
+
+
+def _parse_cslc_product(match: re.Match):
     result = match.groupdict()
     # Normalize to lowercase / underscore
-    result["burst_id"] = result["burst_id"].lower().replace("-", "_")
+    result["burst_id"] = normalize_burst_id(result["burst_id"])
     fmt = "%Y%m%dT%H%M%SZ"
     result["start_datetime"] = datetime.strptime(result["start_datetime"], fmt).replace(
         tzinfo=timezone.utc
@@ -118,8 +125,17 @@ def get_dataset_name(h5_filename: Filename) -> str:
         If the filename cannot be parsed.
 
     """
-    pol = parse_filename(h5_filename)["polarization"]
-    return f"/data/{pol}"
+    name = Path(h5_filename).name
+    parsed = parse_filename(name)
+    if "polarization" in parsed:
+        return f"/data/{parsed['polarization']}"
+    else:
+        # For compass, no polarization is given, so we have to check the file
+        with h5py.File(h5_filename) as hf:
+            if "VV" in hf["/data"]:
+                return "/data/VV"
+            else:
+                return "/data/HH"
 
 
 def get_zero_doppler_time(filename: Filename, type_: str = "start") -> datetime:
diff --git a/src/opera_utils/burst_frame_db.py b/src/opera_utils/burst_frame_db.py
index a44bb45..61c78a1 100644
--- a/src/opera_utils/burst_frame_db.py
+++ b/src/opera_utils/burst_frame_db.py
@@ -7,6 +7,7 @@
 
 from . import datasets
 from ._types import Bbox, PathOrStr
+from .bursts import normalize_burst_id
 
 
 def read_zipped_json(filename: PathOrStr):
@@ -189,7 +190,7 @@ def get_burst_to_frame_mapping(
     if json_file is None:
         json_file = datasets.fetch_burst_to_frame_mapping_file()
     js = read_zipped_json(json_file)
-    return js["data"][burst_id.lower().replace("-", "_")]
+    return js["data"][normalize_burst_id(burst_id)]
 
 
 def get_frame_ids_for_burst(
diff --git a/src/opera_utils/bursts.py b/src/opera_utils/bursts.py
index bc0363f..dc762e5 100644
--- a/src/opera_utils/bursts.py
+++ b/src/opera_utils/bursts.py
@@ -12,6 +12,7 @@
 logger = logging.getLogger(__name__)
 
 __all__ = [
+    "normalize_burst_id",
     "get_burst_id",
     "group_by_burst",
     "sort_by_burst_id",
@@ -19,6 +20,11 @@
 ]
 
 
+def normalize_burst_id(burst_id_str: str) -> str:
+    """Normalize the OPERA S1 burst id to lowercase/underscores."""
+    return burst_id_str.lower().replace("-", "_")
+
+
 def get_burst_id(
     filename: Filename, burst_id_fmt: Union[str, Pattern[str]] = OPERA_BURST_RE
 ) -> str:
@@ -44,9 +50,7 @@ def get_burst_id(
     """
     if not (m := re.search(burst_id_fmt, str(filename))):
         raise ValueError(f"Could not parse burst id from {filename}")
-    burst_str = m.group()
-    # Normalize
-    return burst_str.lower().replace("-", "_")
+    return normalize_burst_id(m.group())
 
 
 @overload
diff --git a/src/opera_utils/constants.py b/src/opera_utils/constants.py
index dae186a..66ead17 100644
--- a/src/opera_utils/constants.py
+++ b/src/opera_utils/constants.py
@@ -2,12 +2,6 @@
 
 import re
 
-__all__ = [
-    "OPERA_DATASET_NAME",
-    "OPERA_IDENTIFICATION",
-    "OPERA_BURST_RE",
-]
-
 # Specific to OPERA CSLC products:
 OPERA_DATASET_NAME = "/data/VV"
 OPERA_IDENTIFICATION = "/identification"
@@ -22,18 +16,17 @@
     r"[tT](?P<track>\d{3})[-_](?P<burst_id>\d{6})[-_](?P<subswath>iw[1-3])",
     re.IGNORECASE,
 )
-
-DEFAULT_TIFF_OPTIONS = (
-    "COMPRESS=DEFLATE",
-    "ZLEVEL=4",
-    "TILED=YES",
-    "BLOCKXSIZE=128",
-    "BLOCKYSIZE=128",
-)
-EXTRA_COMPRESSED_TIFF_OPTIONS = (
-    *DEFAULT_TIFF_OPTIONS,
-    # Note: we're dropping mantissa bits before we do not
-    # need prevision for LOS rasters (or incidence)
-    "NBITS=16",
-    "PREDICTOR=2",
+CSLC_S1_FILE_REGEX = (
+    r"(?P<project>OPERA)_"
+    r"(?P<level>L2)_"
+    r"(?P<product_type>CSLC-S1)_"
+    r"(?P<burst_id>T\d{3}-\d+-IW\d)_"
+    r"(?P<start_datetime>\d{8}T\d{6}Z)_"
+    r"(?P<end_datetime>\d{8}T\d{6}Z)_"
+    r"(?P<sensor>S1[AB])_"
+    r"(?P<polarization>VV|HH)_"
+    r"v(?P<product_version>\d+\.\d+)"
 )
+# https://github.com/opera-adt/COMPASS/blob/16a3c1da2a5db69b9e2007d798a1110d3a6c5f9f/src/compass/utils/runconfig.py#L316-L318
+# {burst_id_str}_{date_str}.h5 (dot escaped so only a literal "." matches)
+COMPASS_FILE_REGEX = r"(?P<burst_id>t\d{3}_\d+_iw\d)_(?P<start_datetime>\d{8})\.h5"
diff --git a/src/opera_utils/geometry.py b/src/opera_utils/geometry.py
index ada0c8a..3267ab0 100644
--- a/src/opera_utils/geometry.py
+++ b/src/opera_utils/geometry.py
@@ -8,11 +8,22 @@
 from opera_utils import get_burst_ids_for_frame, stitching
 from opera_utils._types import PathOrStr
 from opera_utils._utils import format_nc_filename, scratch_directory
-from opera_utils.constants import EXTRA_COMPRESSED_TIFF_OPTIONS
 from opera_utils.download import download_cslc_static_layers
 
 logger = logging.getLogger(__name__)
 
+EXTRA_COMPRESSED_TIFF_OPTIONS = (
+    "COMPRESS=DEFLATE",
+    "ZLEVEL=4",
+    "TILED=YES",
+    "BLOCKXSIZE=128",
+    "BLOCKYSIZE=128",
+    # Note: we're dropping mantissa bits because we do not
+    # need precision for LOS rasters (or incidence)
+    "NBITS=16",
+    "PREDICTOR=2",
+)
+
 
 class Layer(Enum):
     """Names of available datasets in CSLC static layers HDF5 files."""
diff --git a/src/opera_utils/stitching.py b/src/opera_utils/stitching.py
index 5962535..e7071fd 100644
--- a/src/opera_utils/stitching.py
+++ b/src/opera_utils/stitching.py
@@ -19,10 +19,16 @@
 from opera_utils._types import Bbox, PathOrStr
 from opera_utils._utils import _get_path_from_gdal_str, numpy_to_gdal_type
 
-from .constants import DEFAULT_TIFF_OPTIONS
-
 logger = logging.getLogger(__name__)
 
+DEFAULT_TIFF_OPTIONS = (
+    "COMPRESS=DEFLATE",
+    "ZLEVEL=4",
+    "TILED=YES",
+    "BLOCKXSIZE=128",
+    "BLOCKYSIZE=128",
+)
+
 
 def merge_images(
     file_list: Sequence[PathOrStr],
diff --git a/tests/test_bursts.py b/tests/test_bursts.py
index 4d31027..5c6ab5a 100644
--- a/tests/test_bursts.py
+++ b/tests/test_bursts.py
@@ -9,10 +9,20 @@
     filter_by_burst_id,
     get_burst_id,
     group_by_burst,
+    normalize_burst_id,
 )
 from opera_utils._helpers import flatten
 
 
+def test_normalize():
+    expected = "t087_165495_iw3"
+    assert expected == normalize_burst_id("T087-165495-IW3")
+    assert expected == normalize_burst_id("T087_165495_IW3")
+    assert expected == normalize_burst_id("t087_165495_IW3")
+    assert expected == normalize_burst_id("t087_165495_iw3")
+    assert expected == normalize_burst_id("t087-165495-iw3")
+
+
 def test_get_burst_id():
     assert (
         get_burst_id("t087_185678_iw2/20180210/t087_185678_iw2_20180210.h5")
diff --git a/tests/test_cslc.py b/tests/test_cslc.py
index 6b54a52..662ef78 100644
--- a/tests/test_cslc.py
+++ b/tests/test_cslc.py
@@ -74,6 +74,18 @@ def test_file_regex():
     assert result == expected
 
 
+def test_compass_regex():
+    filename = "t042_123456_iw2_20240102.h5"
+    result = parse_filename(filename)
+    expected = {
+        "burst_id": "t042_123456_iw2",
+        "start_datetime": datetime.datetime(
+            2024, 1, 2, 0, 0, 0, tzinfo=datetime.timezone.utc
+        ),
+    }
+    assert result == expected
+
+
 def test_get_radar_wavelength():
     wvl = get_radar_wavelength(TEST_FILE)
     assert wvl == 0.05546576