Skip to content

Commit

Permalink
Add normalize_burst_id, fix parsing to allow COMPASS filenames (#54)
Browse files Browse the repository at this point in the history
* add `normalize_burst_id` for consistent parsing

* use normalize in burst_db, group regexes in `constants`

* add ability to parse COMPASS outputs with `parse_filename` and `get_dataset_name`

* fix test expected
  • Loading branch information
scottstanie authored Jul 29, 2024
1 parent 9b0c3c9 commit 213b4a0
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 46 deletions.
54 changes: 35 additions & 19 deletions src/opera_utils/_cslc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
from shapely import geometry, ops, wkt

from ._types import Filename
from .constants import OPERA_IDENTIFICATION
from .bursts import normalize_burst_id
from .constants import COMPASS_FILE_REGEX, CSLC_S1_FILE_REGEX, OPERA_IDENTIFICATION

__all__ = [
"CslcParseError",
Expand All @@ -34,19 +35,6 @@
logger = logging.getLogger(__name__)


CSLC_S1_FILE_REGEX = (
r"(?P<project>OPERA)_"
r"(?P<level>L2)_"
r"(?P<product_type>CSLC-S1)_"
r"(?P<burst_id>T\d{3}-\d+-IW\d)_"
r"(?P<start_datetime>\d{8}T\d{6}Z)_"
r"(?P<end_datetime>\d{8}T\d{6}Z)_"
r"(?P<sensor>S1[AB])_"
r"(?P<polarization>VV|HH)_"
r"v(?P<product_version>\d+\.\d+)"
)


class CslcParseError(ValueError):
"""Error raised for non-matching filename."""

Expand Down Expand Up @@ -75,20 +63,39 @@ def parse_filename(h5_filename: Filename) -> dict[str, str | datetime]:
- polarization: str
- product_version: str
Or, if the filename is a COMPASS-generated file,
- burst_id: str (lowercase with underscores)
- start_datetime: datetime (but no hour/minute/second info)
Raises
------
CslcParseError
If the filename does not match the expected pattern.
"""
name = Path(h5_filename).name
match = re.match(CSLC_S1_FILE_REGEX, name)
if match is None:
match: re.Match | None = None

if match := re.match(CSLC_S1_FILE_REGEX, name):
return _parse_cslc_product(match)
elif match := re.match(COMPASS_FILE_REGEX, name):
return _parse_compass(match)
else:
raise CslcParseError(f"Unable to parse {h5_filename}")


def _parse_compass(match: re.Match):
result = match.groupdict()
result["start_datetime"] = datetime.strptime(
result["start_datetime"], "%Y%m%d"
).replace(tzinfo=timezone.utc)
return result


def _parse_cslc_product(match: re.Match):
result = match.groupdict()
# Normalize to lowercase / underscore
result["burst_id"] = result["burst_id"].lower().replace("-", "_")
result["burst_id"] = normalize_burst_id(result["burst_id"])
fmt = "%Y%m%dT%H%M%SZ"
result["start_datetime"] = datetime.strptime(result["start_datetime"], fmt).replace(
tzinfo=timezone.utc
Expand Down Expand Up @@ -118,8 +125,17 @@ def get_dataset_name(h5_filename: Filename) -> str:
If the filename cannot be parsed.
"""
pol = parse_filename(h5_filename)["polarization"]
return f"/data/{pol}"
name = Path(h5_filename).name
parsed = parse_filename(name)
if "polarization" in parsed:
return f"/data/{parsed['polarization']}"
else:
# For compass, no polarization is given, so we have to check the file
with h5py.File(h5_filename) as hf:
if "VV" in hf["/data"]:
return "/data/VV"
else:
return "/data/HH"


def get_zero_doppler_time(filename: Filename, type_: str = "start") -> datetime:
Expand Down
3 changes: 2 additions & 1 deletion src/opera_utils/burst_frame_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from . import datasets
from ._types import Bbox, PathOrStr
from .bursts import normalize_burst_id


def read_zipped_json(filename: PathOrStr):
Expand Down Expand Up @@ -189,7 +190,7 @@ def get_burst_to_frame_mapping(
if json_file is None:
json_file = datasets.fetch_burst_to_frame_mapping_file()
js = read_zipped_json(json_file)
return js["data"][burst_id.lower().replace("-", "_")]
return js["data"][normalize_burst_id(burst_id)]


def get_frame_ids_for_burst(
Expand Down
10 changes: 7 additions & 3 deletions src/opera_utils/bursts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,19 @@
logger = logging.getLogger(__name__)

__all__ = [
"normalize_burst_id",
"get_burst_id",
"group_by_burst",
"sort_by_burst_id",
"filter_by_burst_id",
]


def normalize_burst_id(burst_id_str: str) -> str:
    """Return the burst id in lowercase, underscore-separated form.

    Every hyphen becomes an underscore and all letters are lowercased, so
    "T087-165495-IW3" and "t087_165495_iw3" both map to "t087_165495_iw3".
    """
    with_underscores = burst_id_str.replace("-", "_")
    return with_underscores.lower()


def get_burst_id(
filename: Filename, burst_id_fmt: Union[str, Pattern[str]] = OPERA_BURST_RE
) -> str:
Expand All @@ -44,9 +50,7 @@ def get_burst_id(
"""
if not (m := re.search(burst_id_fmt, str(filename))):
raise ValueError(f"Could not parse burst id from {filename}")
burst_str = m.group()
# Normalize
return burst_str.lower().replace("-", "_")
return normalize_burst_id(m.group())


@overload
Expand Down
33 changes: 13 additions & 20 deletions src/opera_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@

import re

__all__ = [
"OPERA_DATASET_NAME",
"OPERA_IDENTIFICATION",
"OPERA_BURST_RE",
]

# Specific to OPERA CSLC products:
OPERA_DATASET_NAME = "/data/VV"
OPERA_IDENTIFICATION = "/identification"
Expand All @@ -22,18 +16,17 @@
r"[tT](?P<track>\d{3})[-_](?P<burst_id>\d{6})[-_](?P<subswath>iw[1-3])",
re.IGNORECASE,
)

DEFAULT_TIFF_OPTIONS = (
"COMPRESS=DEFLATE",
"ZLEVEL=4",
"TILED=YES",
"BLOCKXSIZE=128",
"BLOCKYSIZE=128",
)
EXTRA_COMPRESSED_TIFF_OPTIONS = (
*DEFAULT_TIFF_OPTIONS,
# Note: we're dropping mantissa bits before we do not
# need prevision for LOS rasters (or incidence)
"NBITS=16",
"PREDICTOR=2",
# Pattern for OPERA L2 CSLC-S1 product filenames, built from named groups in
# filename order: project, level, product_type, burst_id, start_datetime,
# end_datetime, sensor, polarization, product_version.
# Example of a name this matches:
#   OPERA_L2_CSLC-S1_T087-165495-IW3_20190906T232711Z_20230101T100506Z_S1A_VV_v1.0
# The pattern is not end-anchored, so trailing text (such as a file
# extension) is ignored when it is used with `re.match`.
CSLC_S1_FILE_REGEX = (
r"(?P<project>OPERA)_"
r"(?P<level>L2)_"
r"(?P<product_type>CSLC-S1)_"
r"(?P<burst_id>T\d{3}-\d+-IW\d)_"
r"(?P<start_datetime>\d{8}T\d{6}Z)_"
r"(?P<end_datetime>\d{8}T\d{6}Z)_"
r"(?P<sensor>S1[AB])_"
r"(?P<polarization>VV|HH)_"
r"v(?P<product_version>\d+\.\d+)"
)
# https://github.com/opera-adt/COMPASS/blob/16a3c1da2a5db69b9e2007d798a1110d3a6c5f9f/src/compass/utils/runconfig.py#L316-L318
# {burst_id_str}_{date_str}, followed by the ".h5" extension,
# e.g. "t042_123456_iw2_20240102.h5".
# The "." before the extension is escaped so that only a literal dot matches;
# left unescaped it would accept any character in that position.
COMPASS_FILE_REGEX = r"(?P<burst_id>t\d{3}_\d+_iw\d)_(?P<start_datetime>\d{8})\.h5"
13 changes: 12 additions & 1 deletion src/opera_utils/geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,22 @@
from opera_utils import get_burst_ids_for_frame, stitching
from opera_utils._types import PathOrStr
from opera_utils._utils import format_nc_filename, scratch_directory
from opera_utils.constants import EXTRA_COMPRESSED_TIFF_OPTIONS
from opera_utils.download import download_cslc_static_layers

logger = logging.getLogger(__name__)

# Option strings for more aggressively compressed TIFF outputs: DEFLATE
# compression at level 4 with 128x128 tiling, plus reduced bit depth and a
# predictor. (Presumably passed to GDAL as GTiff creation options; confirm
# at the call sites.)
EXTRA_COMPRESSED_TIFF_OPTIONS = (
"COMPRESS=DEFLATE",
"ZLEVEL=4",
"TILED=YES",
"BLOCKXSIZE=128",
"BLOCKYSIZE=128",
# Note: we're dropping mantissa bits because we do not
# need precision for LOS rasters (or incidence)
"NBITS=16",
"PREDICTOR=2",
)


class Layer(Enum):
"""Names of available datasets in CSLC static layers HDF5 files."""
Expand Down
10 changes: 8 additions & 2 deletions src/opera_utils/stitching.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,16 @@
from opera_utils._types import Bbox, PathOrStr
from opera_utils._utils import _get_path_from_gdal_str, numpy_to_gdal_type

from .constants import DEFAULT_TIFF_OPTIONS

logger = logging.getLogger(__name__)

# Baseline option strings for TIFF outputs: DEFLATE compression at level 4
# with 128x128 tiling. (Presumably passed to GDAL as GTiff creation options;
# confirm at the call sites.)
DEFAULT_TIFF_OPTIONS = (
"COMPRESS=DEFLATE",
"ZLEVEL=4",
"TILED=YES",
"BLOCKXSIZE=128",
"BLOCKYSIZE=128",
)


def merge_images(
file_list: Sequence[PathOrStr],
Expand Down
10 changes: 10 additions & 0 deletions tests/test_bursts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,20 @@
filter_by_burst_id,
get_burst_id,
group_by_burst,
normalize_burst_id,
)
from opera_utils._helpers import flatten


def test_normalize():
    """Every case/separator spelling of a burst id normalizes to one string."""
    spellings = (
        "T087-165495-IW3",
        "T087_165495_IW3",
        "t087_165495_IW3",
        "t087_165495_iw3",
        "t087-165495-iw3",
    )
    for burst_id in spellings:
        assert normalize_burst_id(burst_id) == "t087_165495_iw3"


def test_get_burst_id():
assert (
get_burst_id("t087_185678_iw2/20180210/t087_185678_iw2_20180210.h5")
Expand Down
12 changes: 12 additions & 0 deletions tests/test_cslc.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,18 @@ def test_file_regex():
assert result == expected


def test_compass_regex():
    """A COMPASS-style filename parses to its burst id and a UTC start date."""
    utc_midnight = datetime.datetime(2024, 1, 2, tzinfo=datetime.timezone.utc)
    parsed = parse_filename("t042_123456_iw2_20240102.h5")
    assert parsed == {
        "burst_id": "t042_123456_iw2",
        "start_datetime": utc_midnight,
    }


def test_get_radar_wavelength():
wvl = get_radar_wavelength(TEST_FILE)
assert wvl == 0.05546576
Expand Down

0 comments on commit 213b4a0

Please sign in to comment.