Skip to content

Commit

Permalink
Proposal: better handling of partial timestamps (#134)
Browse files Browse the repository at this point in the history
* Proposal: better handling of partial timestamps

* Parse format string directly

* Add unit tests

* Windows is broken

* Windows is still broken

* Fix mypy

* Simplify logic

* Fix bug for month 12, add details to docstring
  • Loading branch information
adamjstewart authored Sep 16, 2021
1 parent 73ba391 commit 8c49b2a
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 4 deletions.
7 changes: 7 additions & 0 deletions tests/datasets/test_cdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import os
import shutil
from datetime import datetime
from pathlib import Path
from typing import Generator

Expand Down Expand Up @@ -55,6 +56,12 @@ def test_add(self, dataset: CDL) -> None:
ds = dataset + dataset
assert isinstance(ds, ZipDataset)

def test_full_year(self, dataset: CDL) -> None:
    """Query the index at a single mid-2021 instant and expect a hit.

    The query box reuses the dataset's own spatial bounds and collapses the
    time range to 2021-06-01 (local time). ``next`` raises ``StopIteration``
    if the index returns nothing, which fails the test.
    """
    bounds = dataset.bounds
    instant = datetime(2021, 6, 1).timestamp()
    query = BoundingBox(
        bounds.minx, bounds.maxx, bounds.miny, bounds.maxy, instant, instant
    )
    hits = dataset.index.intersection(query)
    next(hits)

def test_already_downloaded(self, dataset: CDL) -> None:
CDL(root=dataset.root, download=True)

Expand Down
59 changes: 59 additions & 0 deletions tests/datasets/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

import builtins
import glob
import math
import os
import pickle
import shutil
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Generator, Tuple

Expand All @@ -19,6 +21,7 @@
from torchgeo.datasets.utils import (
BoundingBox,
collate_dict,
disambiguate_timestamp,
download_and_extract_archive,
download_radiant_mlhub,
extract_archive,
Expand Down Expand Up @@ -206,6 +209,62 @@ def test_invalid_t(self) -> None:
BoundingBox(0, 1, 2, 3, 5, 4)


@pytest.mark.parametrize(
    "date_string,format,min_datetime,max_datetime",
    [
        # No temporal information at all: widest possible range
        ("", "", 0, sys.maxsize),
        (
            "2021",
            "%Y",
            datetime(2021, 1, 1, 0, 0, 0, 0).timestamp(),
            datetime(2021, 12, 31, 23, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09",
            "%Y-%m",
            datetime(2021, 9, 1, 0, 0, 0, 0).timestamp(),
            datetime(2021, 9, 30, 23, 59, 59, 999999).timestamp(),
        ),
        # December exercises the roll-over into the following year
        (
            "2021-12",
            "%Y-%m",
            datetime(2021, 12, 1, 0, 0, 0, 0).timestamp(),
            datetime(2021, 12, 31, 23, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13",
            "%Y-%m-%d",
            datetime(2021, 9, 13, 0, 0, 0, 0).timestamp(),
            datetime(2021, 9, 13, 23, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13 17",
            "%Y-%m-%d %H",
            datetime(2021, 9, 13, 17, 0, 0, 0).timestamp(),
            datetime(2021, 9, 13, 17, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13 17:21",
            "%Y-%m-%d %H:%M",
            datetime(2021, 9, 13, 17, 21, 0, 0).timestamp(),
            datetime(2021, 9, 13, 17, 21, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13 17:21:53",
            "%Y-%m-%d %H:%M:%S",
            datetime(2021, 9, 13, 17, 21, 53, 0).timestamp(),
            datetime(2021, 9, 13, 17, 21, 53, 999999).timestamp(),
        ),
        # Full microsecond precision: the range collapses to a single instant
        (
            "2021-09-13 17:21:53:000123",
            "%Y-%m-%d %H:%M:%S:%f",
            datetime(2021, 9, 13, 17, 21, 53, 123).timestamp(),
            datetime(2021, 9, 13, 17, 21, 53, 123).timestamp(),
        ),
    ],
)
def test_disambiguate_timestamp(
    date_string: str, format: str, min_datetime: float, max_datetime: float
) -> None:
    """Check the (mint, maxt) range produced for each level of date precision.

    Args:
        date_string: partial timestamp to parse
        format: ``strptime`` format describing ``date_string``
        min_datetime: expected inclusive lower bound (POSIX seconds)
        max_datetime: expected inclusive upper bound (POSIX seconds)
    """
    mint, maxt = disambiguate_timestamp(date_string, format)
    # The default rel_tol of 1e-9 allows more than a second of error on
    # epoch values around 1.6e9, which would hide off-by-one-microsecond
    # bugs at the range boundaries. Compare with an absolute tolerance of
    # half a microsecond instead.
    assert math.isclose(mint, min_datetime, rel_tol=0.0, abs_tol=5e-7)
    assert math.isclose(maxt, max_datetime, rel_tol=0.0, abs_tol=5e-7)


def test_collate_dict() -> None:
samples = [
{
Expand Down
6 changes: 2 additions & 4 deletions torchgeo/datasets/geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import os
import re
import sys
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast

import fiona
Expand All @@ -27,7 +26,7 @@
from torch import Tensor
from torch.utils.data import Dataset

from .utils import BoundingBox
from .utils import BoundingBox, disambiguate_timestamp

# https://github.com/pytorch/pytorch/issues/60979
# https://github.com/pytorch/pytorch/pull/61045
Expand Down Expand Up @@ -230,8 +229,7 @@ def __init__(
maxt: float = sys.maxsize
if "date" in match.groupdict():
date = match.group("date")
time = datetime.strptime(date, self.date_format)
mint = maxt = time.timestamp()
mint, maxt = disambiguate_timestamp(date, self.date_format)

coords = (minx, maxx, miny, maxy, mint, maxt)
self.index.insert(i, coords, filepath)
Expand Down
59 changes: 59 additions & 0 deletions torchgeo/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
import gzip
import lzma
import os
import sys
import tarfile
import zipfile
from datetime import datetime, timedelta
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import torch
Expand All @@ -22,6 +24,7 @@
"download_and_extract_archive",
"extract_archive",
"BoundingBox",
"disambiguate_timestamp",
"working_dir",
"collate_dict",
)
Expand Down Expand Up @@ -255,6 +258,62 @@ def intersects(self, other: "BoundingBox") -> bool:
)


def disambiguate_timestamp(date_str: str, format: str) -> Tuple[float, float]:
    """Disambiguate partial timestamps.

    TorchGeo stores the timestamp of each file in a spatiotemporal R-tree. If the full
    timestamp isn't known, a file could represent a range of time. For example, in the
    CDL dataset, each mask spans an entire year. This method returns the maximum
    possible range of timestamps that ``date_str`` could belong to. It does this by
    parsing ``format`` to determine the level of precision of ``date_str``.

    Both ends of the returned range are inclusive: ``mint`` is the first instant of
    the period and ``maxt`` is its last representable microsecond. When ``format``
    carries no year information at all, ``(0, sys.maxsize)`` is returned.

    Args:
        date_str: string representing date and time of a data point
        format: format codes accepted by :meth:`datetime.datetime.strptime`

    Returns:
        (mint, maxt) tuple for indexing

    Raises:
        ValueError: if ``date_str`` cannot be parsed according to ``format``
    """
    mint = datetime.strptime(date_str, format)

    # ``%%`` is an escaped literal percent sign, not a directive. Strip it before
    # scanning so that e.g. ``%%Y`` is not mistaken for a year directive.
    directives = format.replace("%%", "")

    def has_any(codes: str) -> bool:
        """Return True if any of the given format codes appears in ``format``."""
        return any(f"%{code}" in directives for code in codes)

    # TODO: May have issues with time zones, UTC vs. local time, and DST
    # TODO: This is really tedious, is there a better way to do this?

    if not has_any("yYcxG"):
        # No temporal info
        return 0, sys.maxsize
    elif not has_any("bBmjUWcxV"):
        # Year resolution
        maxt = datetime(mint.year + 1, 1, 1)
    elif not has_any("aAwdjcxV"):
        # Month resolution
        if mint.month == 12:
            # December rolls over into January of the following year
            maxt = datetime(mint.year + 1, 1, 1)
        else:
            maxt = datetime(mint.year, mint.month + 1, 1)
    elif not has_any("HIcX"):
        # Day resolution
        maxt = mint + timedelta(days=1)
    elif not has_any("McX"):
        # Hour resolution
        maxt = mint + timedelta(hours=1)
    elif not has_any("ScX"):
        # Minute resolution
        maxt = mint + timedelta(minutes=1)
    elif not has_any("f"):
        # Second resolution
        maxt = mint + timedelta(seconds=1)
    else:
        # Microsecond resolution
        maxt = mint + timedelta(microseconds=1)

    # ``maxt`` currently points at the start of the *next* period; step back one
    # microsecond to make the upper bound inclusive. ``mint`` already is the first
    # instant of the period and must not be shifted.
    maxt -= timedelta(microseconds=1)

    return mint.timestamp(), maxt.timestamp()


@contextlib.contextmanager
def working_dir(dirname: str, create: bool = False) -> Iterator[None]:
"""Context manager for changing directories.
Expand Down

0 comments on commit 8c49b2a

Please sign in to comment.