From 213b4a0b2fafb98ccebb692446397a2d1a1d1959 Mon Sep 17 00:00:00 2001 From: Scott Staniewicz Date: Mon, 29 Jul 2024 10:24:57 -0400 Subject: [PATCH] Add `normalize_burst_id`, fix parsing to allow COMPASS filenames (#54) * add `normalize_burst_id` for consistent parsing * use normalize in burst_db, group regexes in `constants` * add ability to parse COMPASS outputs with `parse_filename` and `get_dataset_name` * fix test expected --- src/opera_utils/_cslc.py | 54 ++++++++++++++++++++----------- src/opera_utils/burst_frame_db.py | 3 +- src/opera_utils/bursts.py | 10 ++++-- src/opera_utils/constants.py | 33 ++++++++----------- src/opera_utils/geometry.py | 13 +++++++- src/opera_utils/stitching.py | 10 ++++-- tests/test_bursts.py | 10 ++++++ tests/test_cslc.py | 12 +++++++ 8 files changed, 99 insertions(+), 46 deletions(-) diff --git a/src/opera_utils/_cslc.py b/src/opera_utils/_cslc.py index d33d644..d5c72f4 100644 --- a/src/opera_utils/_cslc.py +++ b/src/opera_utils/_cslc.py @@ -16,7 +16,8 @@ from shapely import geometry, ops, wkt from ._types import Filename -from .constants import OPERA_IDENTIFICATION +from .bursts import normalize_burst_id +from .constants import COMPASS_FILE_REGEX, CSLC_S1_FILE_REGEX, OPERA_IDENTIFICATION __all__ = [ "CslcParseError", @@ -34,19 +35,6 @@ logger = logging.getLogger(__name__) -CSLC_S1_FILE_REGEX = ( - r"(?POPERA)_" - r"(?PL2)_" - r"(?PCSLC-S1)_" - r"(?PT\d{3}-\d+-IW\d)_" - r"(?P\d{8}T\d{6}Z)_" - r"(?P\d{8}T\d{6}Z)_" - r"(?PS1[AB])_" - r"(?PVV|HH)_" - r"v(?P\d+\.\d+)" -) - - class CslcParseError(ValueError): """Error raised for non-matching filename.""" @@ -75,6 +63,10 @@ def parse_filename(h5_filename: Filename) -> dict[str, str | datetime]: - polarization: str - product_version: str + Or, if the filename is a COMPASS-generated file, + - burst_id: str (lowercase with underscores) + - start_datetime: datetime (but no hour/minute/second info) + Raises ------ CslcParseError @@ -82,13 +74,28 @@ def parse_filename(h5_filename: Filename) -> dict[str, str | datetime]: """ name = Path(h5_filename).name - match = re.match(CSLC_S1_FILE_REGEX, name) - if match is None: + match: re.Match | None = None + + if match := re.match(CSLC_S1_FILE_REGEX, name): + return _parse_cslc_product(match) + elif match := re.match(COMPASS_FILE_REGEX, name): + return _parse_compass(match) + else: raise CslcParseError(f"Unable to parse {h5_filename}") + +def _parse_compass(match: re.Match): + result = match.groupdict() + result["start_datetime"] = datetime.strptime( + result["start_datetime"], "%Y%m%d" + ).replace(tzinfo=timezone.utc) + return result + + +def _parse_cslc_product(match: re.Match): result = match.groupdict() # Normalize to lowercase / underscore - result["burst_id"] = result["burst_id"].lower().replace("-", "_") + result["burst_id"] = normalize_burst_id(result["burst_id"]) fmt = "%Y%m%dT%H%M%SZ" result["start_datetime"] = datetime.strptime(result["start_datetime"], fmt).replace( tzinfo=timezone.utc @@ -118,8 +125,17 @@ def get_dataset_name(h5_filename: Filename) -> str: If the filename cannot be parsed. """ - pol = parse_filename(h5_filename)["polarization"] - return f"/data/{pol}" + name = Path(h5_filename).name + parsed = parse_filename(name) + if "polarization" in parsed: + return f"/data/{parsed['polarization']}" + else: + # For compass, no polarization is given, so we have to check the file + with h5py.File(h5_filename) as hf: + if "VV" in hf["/data"]: + return "/data/VV" + else: + return "/data/HH" def get_zero_doppler_time(filename: Filename, type_: str = "start") -> datetime: diff --git a/src/opera_utils/burst_frame_db.py b/src/opera_utils/burst_frame_db.py index a44bb45..61c78a1 100644 --- a/src/opera_utils/burst_frame_db.py +++ b/src/opera_utils/burst_frame_db.py @@ -7,6 +7,7 @@ from . import datasets from ._types import Bbox, PathOrStr +from .bursts import normalize_burst_id def read_zipped_json(filename: PathOrStr): @@ -189,7 +190,7 @@ def get_burst_to_frame_mapping( if json_file is None: json_file = datasets.fetch_burst_to_frame_mapping_file() js = read_zipped_json(json_file) - return js["data"][burst_id.lower().replace("-", "_")] + return js["data"][normalize_burst_id(burst_id)] def get_frame_ids_for_burst( diff --git a/src/opera_utils/bursts.py b/src/opera_utils/bursts.py index bc0363f..dc762e5 100644 --- a/src/opera_utils/bursts.py +++ b/src/opera_utils/bursts.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) __all__ = [ + "normalize_burst_id", "get_burst_id", "group_by_burst", "sort_by_burst_id", @@ -19,6 +20,11 @@ ] +def normalize_burst_id(burst_id_str: str) -> str: + """Normalize the OPERA S1 burst id to lowercase/underscores.""" + return burst_id_str.lower().replace("-", "_") + + def get_burst_id( filename: Filename, burst_id_fmt: Union[str, Pattern[str]] = OPERA_BURST_RE ) -> str: @@ -44,9 +50,7 @@ def get_burst_id( """ if not (m := re.search(burst_id_fmt, str(filename))): raise ValueError(f"Could not parse burst id from {filename}") - burst_str = m.group() - # Normalize - return burst_str.lower().replace("-", "_") + return normalize_burst_id(m.group()) @overload diff --git a/src/opera_utils/constants.py b/src/opera_utils/constants.py index dae186a..66ead17 100644 --- a/src/opera_utils/constants.py +++ b/src/opera_utils/constants.py @@ -2,12 +2,6 @@ import re -__all__ = [ - "OPERA_DATASET_NAME", - "OPERA_IDENTIFICATION", - "OPERA_BURST_RE", -] - # Specific to OPERA CSLC products: OPERA_DATASET_NAME = "/data/VV" OPERA_IDENTIFICATION = "/identification" @@ -22,18 +16,17 @@ r"[tT](?P\d{3})[-_](?P\d{6})[-_](?Piw[1-3])", re.IGNORECASE, ) - -DEFAULT_TIFF_OPTIONS = ( - "COMPRESS=DEFLATE", - "ZLEVEL=4", - "TILED=YES", - "BLOCKXSIZE=128", - "BLOCKYSIZE=128", -) -EXTRA_COMPRESSED_TIFF_OPTIONS = ( - *DEFAULT_TIFF_OPTIONS, - # Note: we're dropping mantissa bits before we do not - # need prevision for LOS rasters (or incidence) - "NBITS=16", - "PREDICTOR=2", +CSLC_S1_FILE_REGEX = ( + r"(?POPERA)_" + r"(?PL2)_" + r"(?PCSLC-S1)_" + r"(?PT\d{3}-\d+-IW\d)_" + r"(?P\d{8}T\d{6}Z)_" + r"(?P\d{8}T\d{6}Z)_" + r"(?PS1[AB])_" + r"(?PVV|HH)_" + r"v(?P\d+\.\d+)" ) +# https://github.com/opera-adt/COMPASS/blob/16a3c1da2a5db69b9e2007d798a1110d3a6c5f9f/src/compass/utils/runconfig.py#L316-L318 +# {burst_id_str}_{date_str} +COMPASS_FILE_REGEX = r"(?Pt\d{3}_\d+_iw\d)_(?P\d{8}).h5" diff --git a/src/opera_utils/geometry.py b/src/opera_utils/geometry.py index ada0c8a..3267ab0 100644 --- a/src/opera_utils/geometry.py +++ b/src/opera_utils/geometry.py @@ -8,11 +8,22 @@ from opera_utils import get_burst_ids_for_frame, stitching from opera_utils._types import PathOrStr from opera_utils._utils import format_nc_filename, scratch_directory -from opera_utils.constants import EXTRA_COMPRESSED_TIFF_OPTIONS from opera_utils.download import download_cslc_static_layers logger = logging.getLogger(__name__) +EXTRA_COMPRESSED_TIFF_OPTIONS = ( + "COMPRESS=DEFLATE", + "ZLEVEL=4", + "TILED=YES", + "BLOCKXSIZE=128", + "BLOCKYSIZE=128", + # Note: we're dropping mantissa bits before we do not + # need prevision for LOS rasters (or incidence) + "NBITS=16", + "PREDICTOR=2", +) + class Layer(Enum): """Names of available datasets in CSLC static layers HDF5 files.""" diff --git a/src/opera_utils/stitching.py b/src/opera_utils/stitching.py index 5962535..e7071fd 100644 --- a/src/opera_utils/stitching.py +++ b/src/opera_utils/stitching.py @@ -19,10 +19,16 @@ from opera_utils._types import Bbox, PathOrStr from opera_utils._utils import _get_path_from_gdal_str, numpy_to_gdal_type -from .constants import DEFAULT_TIFF_OPTIONS - logger = logging.getLogger(__name__) +DEFAULT_TIFF_OPTIONS = ( + "COMPRESS=DEFLATE", + "ZLEVEL=4", + "TILED=YES", + "BLOCKXSIZE=128", + "BLOCKYSIZE=128", +) + def merge_images( file_list: Sequence[PathOrStr], diff --git a/tests/test_bursts.py b/tests/test_bursts.py index 4d31027..5c6ab5a 100644 --- a/tests/test_bursts.py +++ b/tests/test_bursts.py @@ -9,10 +9,20 @@ filter_by_burst_id, get_burst_id, group_by_burst, + normalize_burst_id, ) from opera_utils._helpers import flatten +def test_normalize(): + expected = "t087_165495_iw3" + assert expected == normalize_burst_id("T087-165495-IW3") + assert expected == normalize_burst_id("T087_165495_IW3") + assert expected == normalize_burst_id("t087_165495_IW3") + assert expected == normalize_burst_id("t087_165495_iw3") + assert expected == normalize_burst_id("t087-165495-iw3") + + def test_get_burst_id(): assert ( get_burst_id("t087_185678_iw2/20180210/t087_185678_iw2_20180210.h5") diff --git a/tests/test_cslc.py b/tests/test_cslc.py index 6b54a52..662ef78 100644 --- a/tests/test_cslc.py +++ b/tests/test_cslc.py @@ -74,6 +74,18 @@ def test_file_regex(): assert result == expected +def test_compass_regex(): + filename = "t042_123456_iw2_20240102.h5" + result = parse_filename(filename) + expected = { + "burst_id": "t042_123456_iw2", + "start_datetime": datetime.datetime( + 2024, 1, 2, 0, 0, 0, tzinfo=datetime.timezone.utc + ), + } + assert result == expected + + def test_get_radar_wavelength(): wvl = get_radar_wavelength(TEST_FILE) assert wvl == 0.05546576