Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proposal: better handling of partial timestamps #134

Merged
merged 8 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions tests/datasets/test_cdl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import os
import shutil
from datetime import datetime
from pathlib import Path
from typing import Generator

Expand Down Expand Up @@ -55,6 +56,12 @@ def test_add(self, dataset: CDL) -> None:
ds = dataset + dataset
assert isinstance(ds, ZipDataset)

def test_full_year(self, dataset: CDL) -> None:
    """Query the dataset index with a single mid-year instant."""
    bounds = dataset.bounds
    timestamp = datetime(2021, 6, 1).timestamp()
    query = BoundingBox(
        bounds.minx, bounds.maxx, bounds.miny, bounds.maxy, timestamp, timestamp
    )
    # next() makes the test fail if the intersection yields nothing.
    next(dataset.index.intersection(query))

def test_already_downloaded(self, dataset: CDL) -> None:
CDL(root=dataset.root, download=True)

Expand Down
59 changes: 59 additions & 0 deletions tests/datasets/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@

import builtins
import glob
import math
import os
import pickle
import shutil
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Generator, Tuple

Expand All @@ -19,6 +21,7 @@
from torchgeo.datasets.utils import (
BoundingBox,
collate_dict,
disambiguate_timestamp,
download_and_extract_archive,
download_radiant_mlhub,
extract_archive,
Expand Down Expand Up @@ -206,6 +209,62 @@ def test_invalid_t(self) -> None:
BoundingBox(0, 1, 2, 3, 5, 4)


@pytest.mark.parametrize(
    "date_string,format,min_datetime,max_datetime",
    [
        ("", "", 0, sys.maxsize),
        (
            "2021",
            "%Y",
            datetime(2021, 1, 1).timestamp(),
            datetime(2021, 12, 31, 23, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09",
            "%Y-%m",
            datetime(2021, 9, 1).timestamp(),
            datetime(2021, 9, 30, 23, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13",
            "%Y-%m-%d",
            datetime(2021, 9, 13).timestamp(),
            datetime(2021, 9, 13, 23, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13 17",
            "%Y-%m-%d %H",
            datetime(2021, 9, 13, 17).timestamp(),
            datetime(2021, 9, 13, 17, 59, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13 17:21",
            "%Y-%m-%d %H:%M",
            datetime(2021, 9, 13, 17, 21).timestamp(),
            datetime(2021, 9, 13, 17, 21, 59, 999999).timestamp(),
        ),
        (
            "2021-09-13 17:21:53",
            "%Y-%m-%d %H:%M:%S",
            datetime(2021, 9, 13, 17, 21, 53).timestamp(),
            datetime(2021, 9, 13, 17, 21, 53, 999999).timestamp(),
        ),
        (
            "2021-09-13 17:21:53:000123",
            "%Y-%m-%d %H:%M:%S:%f",
            datetime(2021, 9, 13, 17, 21, 53, 123).timestamp(),
            datetime(2021, 9, 13, 17, 21, 53, 123).timestamp(),
        ),
    ],
)
def test_disambiguate_timestamp(
    date_string: str, format: str, min_datetime: float, max_datetime: float
) -> None:
    """Compare the returned (mint, maxt) pair against the expected bounds.

    Each case pairs a date string and its format with the expected lower and
    upper timestamps; the comparison uses :func:`math.isclose`.
    """
    lower, upper = disambiguate_timestamp(date_string, format)
    assert math.isclose(lower, min_datetime)
    assert math.isclose(upper, max_datetime)


def test_collate_dict() -> None:
samples = [
{
Expand Down
6 changes: 2 additions & 4 deletions torchgeo/datasets/geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import os
import re
import sys
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, cast

import fiona
Expand All @@ -27,7 +26,7 @@
from torch import Tensor
from torch.utils.data import Dataset

from .utils import BoundingBox
from .utils import BoundingBox, disambiguate_timestamp

# https://github.com/pytorch/pytorch/issues/60979
# https://github.com/pytorch/pytorch/pull/61045
Expand Down Expand Up @@ -230,8 +229,7 @@ def __init__(
maxt: float = sys.maxsize
if "date" in match.groupdict():
date = match.group("date")
time = datetime.strptime(date, self.date_format)
mint = maxt = time.timestamp()
mint, maxt = disambiguate_timestamp(date, self.date_format)

coords = (minx, maxx, miny, maxy, mint, maxt)
self.index.insert(i, coords, filepath)
Expand Down
47 changes: 47 additions & 0 deletions torchgeo/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@
import gzip
import lzma
import os
import sys
import tarfile
import zipfile
from datetime import datetime, timedelta
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import torch
Expand All @@ -22,6 +24,7 @@
"download_and_extract_archive",
"extract_archive",
"BoundingBox",
"disambiguate_timestamp",
"working_dir",
"collate_dict",
)
Expand Down Expand Up @@ -255,6 +258,50 @@ def intersects(self, other: "BoundingBox") -> bool:
)


def disambiguate_timestamp(date_str: str, format: str) -> Tuple[float, float]:
    """Disambiguate partial timestamps.

    A date string parsed with a coarse format (for example only a year) stands
    for a whole interval rather than a single instant. This function parses
    ``date_str`` and returns the first and last instant of that interval, where
    the interval length is chosen from the finest-grained directive found in
    ``format``.

    Args:
        date_str: string representing date and time of a data point
        format: format codes accepted by :meth:`datetime.datetime.strptime`

    Returns:
        (mint, maxt) tuple for indexing

    Raises:
        ValueError: if ``date_str`` does not match ``format``
    """
    mint = datetime.strptime(date_str, format)

    # Drop escaped percent signs first so that a literal ``%%`` followed by a
    # letter (e.g. ``%%m``) is not mistaken for a real directive.
    directives = format.replace("%%", "")

    def has_any(codes: str) -> bool:
        """Return True if any of the given directive letters is in the format."""
        return any(f"%{code}" in directives for code in codes)

    one_tick = timedelta(microseconds=1)

    if not has_any("yYcxG"):
        # No temporal info
        return 0, sys.maxsize
    elif not has_any("bBmjUWcxV"):
        # Year resolution
        maxt = datetime(mint.year + 1, 1, 1) - one_tick
    elif not has_any("aAwdjcxV"):
        # Month resolution
        # December has no "month + 1"; roll over to January of the next year.
        if mint.month == 12:
            next_month = datetime(mint.year + 1, 1, 1)
        else:
            next_month = datetime(mint.year, mint.month + 1, 1)
        maxt = next_month - one_tick
    elif not has_any("HIcX"):
        # Day resolution
        maxt = mint + timedelta(days=1) - one_tick
    elif not has_any("McX"):
        # Hour resolution
        maxt = mint + timedelta(hours=1) - one_tick
    elif not has_any("ScX"):
        # Minute resolution
        maxt = mint + timedelta(minutes=1) - one_tick
    elif not has_any("f"):
        # Second resolution
        maxt = mint + timedelta(seconds=1) - one_tick
    else:
        # Microsecond resolution
        maxt = mint

    return mint.timestamp(), maxt.timestamp()


@contextlib.contextmanager
def working_dir(dirname: str, create: bool = False) -> Iterator[None]:
"""Context manager for changing directories.
Expand Down