diff --git a/ludwig/features/date_feature.py b/ludwig/features/date_feature.py index a462e2ffeb2..68cd644c573 100644 --- a/ludwig/features/date_feature.py +++ b/ludwig/features/date_feature.py @@ -19,13 +19,12 @@ import numpy as np import torch -from dateutil.parser import parse from ludwig.constants import COLUMN, DATE, PROC_COLUMN from ludwig.features.base_feature import BaseFeatureMixin, InputFeature from ludwig.schema.features.date_feature import DateInputFeatureConfig from ludwig.types import FeatureConfigDict, FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict -from ludwig.utils.date_utils import create_vector_from_datetime_obj +from ludwig.utils.date_utils import create_vector_from_datetime_obj, parse_datetime from ludwig.utils.types import DataFrame, TorchscriptPreprocessingInput logger = logging.getLogger(__name__) @@ -63,17 +62,20 @@ def get_feature_meta( return {"preprocessing": preprocessing_parameters} @staticmethod - def date_to_list(date_str, datetime_format, preprocessing_parameters): + def date_to_list(date_value, datetime_format, preprocessing_parameters): try: - if isinstance(date_str, datetime): - datetime_obj = date_str - elif datetime_format is not None: - datetime_obj = datetime.strptime(date_str, datetime_format) + if isinstance(date_value, datetime): + datetime_obj = date_value + elif isinstance(date_value, str) and datetime_format is not None: + try: + datetime_obj = datetime.strptime(date_value, datetime_format) + except ValueError: + datetime_obj = parse_datetime(date_value) else: - datetime_obj = parse(date_str) + datetime_obj = parse_datetime(date_value) except Exception as e: logger.error( - f"Error parsing date: '{date_str}' with error '{e}' " + f"Error parsing date: '{date_value}' with error '{e}' " "Please provide a datetime format that parses it " "in the preprocessing section of the date feature " "in the config. 
" @@ -83,7 +85,7 @@ def date_to_list(date_str, datetime_format, preprocessing_parameters): ) fill_value = preprocessing_parameters["fill_value"] if fill_value != "": - datetime_obj = parse(fill_value) + datetime_obj = parse_datetime(fill_value) else: datetime_obj = datetime.now() diff --git a/ludwig/utils/date_utils.py b/ludwig/utils/date_utils.py index be046676fdc..0405304f3c1 100644 --- a/ludwig/utils/date_utils.py +++ b/ludwig/utils/date_utils.py @@ -13,10 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -from datetime import date +import time +from datetime import date, datetime +from typing import Union + +import numpy as np +from dateutil.parser import parse, ParserError from ludwig.api_annotations import DeveloperAPI +SCALE_S = np.floor(np.log10(time.time())) + @DeveloperAPI def create_vector_from_datetime_obj(datetime_obj): @@ -36,3 +43,54 @@ def create_vector_from_datetime_obj(datetime_obj): datetime_obj.second, second_of_day, ] + + +@DeveloperAPI +def parse_datetime(timestamp: Union[float, int, str]) -> datetime: + """Parse a datetime from a string or a numeric timestamp. + + Args: + timestamp: A datetime string or numeric timestamp. + + Returns: + A datetime representation of `timestamp`. + """ + try: + dt = parse(timestamp) + except (OverflowError, ParserError, TypeError): + dt = convert_number_to_datetime(timestamp) + + return dt + + +@DeveloperAPI +def convert_number_to_datetime(timestamp: Union[float, int, str]) -> datetime: + """Convert a numeric timestamp to a datetime object. + + `datetime` objects can be created from POSIX timestamps like those returned by `time.time()`. + + Args: + timestamp: A numeric timestamp. + + Returns: + A datetime representation of `timestamp`. + + Raises: + ValueError: Raised if `timestamp` is not a number or not a valid datetime. 
+ """ + try: + timestamp = float(timestamp) + except TypeError: + raise ValueError(f"Provided value {timestamp} is not a valid numeric timestamp") + + # Determine the unit of the timestamp + ts_scale = np.floor(np.log10(timestamp)) + + # `datetime.datetime.fromtimestamp` expects a timestamp in seconds. Rescale the timestamp if it is not in seconds. + if SCALE_S < ts_scale: + delta = ts_scale - SCALE_S + timestamp = timestamp / np.power(10, delta) + + # Convert the timestamp to a datetime object. If it is not a valid timestamp, `ValueError` is raised. + dt = datetime.utcfromtimestamp(timestamp) + return dt diff --git a/tests/integration_tests/test_date_feature.py b/tests/integration_tests/test_date_feature.py new file mode 100644 index 00000000000..ea04edff36c --- /dev/null +++ b/tests/integration_tests/test_date_feature.py @@ -0,0 +1,104 @@ +import datetime +import time + +import pandas as pd +import pytest +from dateutil.parser import parse + +from ludwig.api import LudwigModel +from ludwig.constants import ( + BACKEND, + BINARY, + DATE, + EPOCHS, + FILL_WITH_CONST, + INPUT_FEATURES, + MISSING_VALUE_STRATEGY, + NAME, + OUTPUT_FEATURES, + PREPROCESSING, + RAY, + TRAINER, + TYPE, +) +from ludwig.utils.date_utils import create_vector_from_datetime_obj + +ray = pytest.importorskip("ray") + +pytestmark = [ + pytest.mark.distributed, +] + + +@pytest.fixture(scope="module") +def string_date_df() -> "pd.DataFrame": + df = pd.DataFrame.from_dict( + { + "date_feature": [str(datetime.datetime.now()) for i in range(100)], + "binary_feature": [i % 2 for i in range(100)], + } + ) + return df + + +@pytest.fixture(scope="module") +def int_date_df() -> "pd.DataFrame": + df = pd.DataFrame.from_dict( + { + "date_feature": [time.time_ns() for i in range(100)], + "binary_feature": [i % 2 for i in range(100)], + } + ) + return df + + +@pytest.fixture(scope="module") +def float_date_df() -> "pd.DataFrame": + df = pd.DataFrame.from_dict( + { + "date_feature": [time.time() for i in 
range(100)], + "binary_feature": [i % 2 for i in range(100)], + } + ) + return df + + +@pytest.mark.parametrize( + "date_df", + [ + pytest.param("string_date_df", id="string_date"), + pytest.param("int_date_df", id="int_date"), + pytest.param("float_date_df", id="float_date"), + ], +) +def test_date_feature_formats(date_df, request, ray_cluster_2cpu): + df = request.getfixturevalue(date_df) + + config = { + INPUT_FEATURES: [ + { + NAME: "date_feature", + TYPE: DATE, + PREPROCESSING: {MISSING_VALUE_STRATEGY: FILL_WITH_CONST, "fill_value": "1970-01-01 00:00:00"}, + } + ], + OUTPUT_FEATURES: [{NAME: "binary_feature", TYPE: BINARY}], + TRAINER: {EPOCHS: 2}, + BACKEND: {TYPE: RAY, "processor": {TYPE: "dask"}}, + } + + fill_value = create_vector_from_datetime_obj(parse("1970-01-01 00:00:00")) + + model = LudwigModel(config) + preprocessed = model.preprocess(df) + + # Because parsing errors are suppressed, we want to ensure that the data was preprocessed correctly. Sample data is + # drawn from the current time, so the recorded years should not match the fill value's year. 
+ for date in preprocessed.training_set.to_df().compute().iloc[:, 0].values: + assert date[0] != fill_value[0] + + for date in preprocessed.validation_set.to_df().compute().iloc[:, 0].values: + assert date[0] != fill_value[0] + + for date in preprocessed.test_set.to_df().compute().iloc[:, 0].values: + assert date[0] != fill_value[0] diff --git a/tests/ludwig/features/test_date_feature.py b/tests/ludwig/features/test_date_feature.py index 4946cafd72e..2c7515a8f70 100644 --- a/tests/ludwig/features/test_date_feature.py +++ b/tests/ludwig/features/test_date_feature.py @@ -1,15 +1,18 @@ from copy import deepcopy from datetime import datetime +from typing import Any, List import pytest import torch +from dateutil.parser import parse -from ludwig.constants import ENCODER_OUTPUT +from ludwig.constants import ENCODER_OUTPUT, FILL_WITH_CONST, MISSING_VALUE_STRATEGY from ludwig.features import date_feature from ludwig.features.date_feature import DateInputFeature from ludwig.schema.features.date_feature import DateInputFeatureConfig from ludwig.schema.utils import load_config_with_kwargs from ludwig.types import FeatureConfigDict +from ludwig.utils.date_utils import create_vector_from_datetime_obj from ludwig.utils.misc_utils import merge_dict from ludwig.utils.torch_utils import get_torch_device @@ -59,6 +62,70 @@ def test_date_to_list(date_str, datetime_format, expected_list): ) +@pytest.fixture(scope="module") +def reference_date_list() -> List[int]: + return create_vector_from_datetime_obj(datetime.utcfromtimestamp(1691600953.443032)) + + +@pytest.fixture(scope="module") +def fill_value() -> str: + return "1970-01-01 00:00:00" + + +@pytest.fixture(scope="module") +def fill_value_list(fill_value: str) -> List[int]: + return create_vector_from_datetime_obj(parse(fill_value)) + + +@pytest.mark.parametrize( + "timestamp,datetime_format,expected_list", + [ + pytest.param(1691600953.443032, None, "reference_date_list", id="float-s"), + pytest.param(1691600953443.032, None, 
"reference_date_list", id="float-ms"), + pytest.param(1691600953, None, "reference_date_list", id="int-s"), + pytest.param(1691600953443, None, "reference_date_list", id="int-ms"), + pytest.param(1691600953.443032, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="float-s-fmt"), + pytest.param(1691600953443.032, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="float-ms-fmt"), + pytest.param(1691600953, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="int-s-fmt"), + pytest.param(1691600953443, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="int-ms-fmt"), + pytest.param("1691600953.443032", None, "reference_date_list", id="string[float]-s"), + pytest.param("1691600953443.0032", None, "reference_date_list", id="string[float]-ms"), + pytest.param("1691600953", None, "reference_date_list", id="string[int]-s"), + pytest.param("1691600953443", None, "reference_date_list", id="string[int]-ms"), + pytest.param("1691600953.443032", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[float]-s-fmt"), + pytest.param("1691600953443.0032", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[float]-ms-fmt"), + pytest.param("1691600953", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[int]-s-fmt"), + pytest.param("1691600953443", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[int]-ms-fmt"), + pytest.param("foo", None, "fill_value_list", id="string error"), + pytest.param([1691600953.443032], None, "fill_value_list", id="list error"), + pytest.param(None, None, "fill_value_list", id="NoneType error"), + ], +) +def test_date_to_list_numeric(timestamp: Any, datetime_format: str, expected_list: List[int], fill_value: str, request): + """Test that numeric datetime formats are converted correctly. + + Currently, we support int, float, and string representations of POSIX timestamps in seconds and milliseconds. Valid + timestamps should be converted to datetime lists by `ludwig.utils.date_utils.create_vector_from_datetime_obj`. 
+ If a string format is provided, it should be ignored. + + Args: + timestamp: Input to be converted to a date vector + datetime_format: Optional format string, should be ignored under the hood with these timestamps. + expected_list: The expected output of `DateFeatureMixin.date_to_list` + fill_value: Date to be used as fallback + request: pytest request fixture + """ + expected_result = request.getfixturevalue(expected_list) + + # The default fill value is `datetime.now`, for testing we override this to be a constant. + preprocessing_parameters = {MISSING_VALUE_STRATEGY: FILL_WITH_CONST, "fill_value": fill_value} + + # No exception should ever be raised from `date_to_list` due to a parsing error. The expected behavior is to fall + # back to the fill value. + dt = date_feature.DateInputFeature.date_to_list(timestamp, datetime_format, preprocessing_parameters) + assert dt == expected_result + + def test_date_to_list__DatetimeObjectFromParsedJSON(): preprocessing_parameters = None datetime_obj = datetime.fromisoformat("2022-06-25") diff --git a/tests/ludwig/utils/test_date_utils.py b/tests/ludwig/utils/test_date_utils.py new file mode 100644 index 00000000000..9f1599ffb13 --- /dev/null +++ b/tests/ludwig/utils/test_date_utils.py @@ -0,0 +1,49 @@ +import datetime +from contextlib import nullcontext as does_not_raise +from typing import Any, ContextManager + +import pytest + +from ludwig.utils.date_utils import convert_number_to_datetime + + +@pytest.fixture(scope="module") +def reference_datetime() -> datetime.datetime: + return datetime.datetime.utcfromtimestamp(1691600953.443032) + + +@pytest.mark.parametrize( + "timestamp,raises", + [ + pytest.param(1691600953.443032, does_not_raise(), id="float-s"), + pytest.param(1691600953443.032, does_not_raise(), id="float-ms"), + pytest.param(1691600953, does_not_raise(), id="int-s"), + pytest.param(1691600953443, does_not_raise(), id="int-ms"), + pytest.param("1691600953.443032", does_not_raise(), id="string[float]-s"), + 
pytest.param("1691600953443.0032", does_not_raise(), id="string[float]-ms"), + pytest.param("1691600953", does_not_raise(), id="string[int]-s"), + pytest.param("1691600953443", does_not_raise(), id="string[int]-ms"), + pytest.param("foo", pytest.raises(ValueError), id="string error"), + pytest.param([1691600953.443032], pytest.raises(ValueError), id="list error"), + pytest.param(datetime.datetime(2023, 8, 9, 13, 9, 13), pytest.raises(ValueError), id="datetime error"), + pytest.param(None, pytest.raises(ValueError), id="NoneType error"), + ], +) +def test_convert_number_to_datetime(reference_datetime: datetime.datetime, timestamp: Any, raises: ContextManager): + """Ensure that numeric timestamps are correctly converted to datetime objects. + + Args: + reference_datetime: A datetime object with the expected date/time + timestamp: The timestamp to convert in s or ms + raises: context manager to check for expected exceptions + """ + with raises: + dt = convert_number_to_datetime(timestamp) + + # Check that the returned datetime is accurate to the scale of seconds. + assert dt.year == reference_datetime.year + assert dt.month == reference_datetime.month + assert dt.day == reference_datetime.day + assert dt.hour == reference_datetime.hour + assert dt.minute == reference_datetime.minute + assert dt.second == reference_datetime.second