diff --git a/docs/conf.py b/docs/conf.py index 737360c..2eb3468 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,7 +70,7 @@ # The short X.Y version. version = "0.6" # The full version, including alpha/beta/rc tags. -release = "0.6.0-dev.27+pr.93" +release = "0.6.0-dev.28+pr.94" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/releasehistory.rst b/docs/releasehistory.rst index 9a159e2..cd2f14d 100644 --- a/docs/releasehistory.rst +++ b/docs/releasehistory.rst @@ -104,6 +104,7 @@ Version 0.6.0-dev - Improved time index check in time-dependent models (0.6.0-dev.24+pr.90, 0.6.0-dev.25+pr.91) - Changed the output type of `adtk.data.split_train_test` from a 2-tuple of lists to a list of 2-tuples (0.6.0-dev.26+pr.92) - Removed `adtk.data.resample` because its functionality is highly overlapped with pandas resampler module (0.6.0-dev.27+pr.93) +- Made `adtk.data.expand_events` accept events as pandas Series/DataFrame (0.6.0-dev.28+pr.94) Version 0.5.5 (Feb 24, 2020) =================================== diff --git a/setup.cfg b/setup.cfg index fbddf33..fef8a2d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = adtk -version = 0.6.0-dev.27+pr.93 +version = 0.6.0-dev.28+pr.94 author = Arundo Analytics, Inc. maintainer = Tailai Wen maintainer_email = tailai.wen@arundo.com diff --git a/src/adtk/__init__.py b/src/adtk/__init__.py index 32448ab..0f8ea88 100644 --- a/src/adtk/__init__.py +++ b/src/adtk/__init__.py @@ -20,4 +20,4 @@ """ -__version__ = "0.6.0-dev.27+pr.93" +__version__ = "0.6.0-dev.28+pr.94" diff --git a/src/adtk/data/_data.py b/src/adtk/data/_data.py index 1c11455..f447ee6 100644 --- a/src/adtk/data/_data.py +++ b/src/adtk/data/_data.py @@ -1,8 +1,6 @@ """Module is for data (time series and anomaly list) processing. 
""" -from functools import reduce -from math import gcd from typing import Dict, List, Optional, Tuple, Union, overload import numpy as np @@ -228,8 +226,8 @@ def to_events( For example, DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has daily frequency. If freq_as_period=True, each time point in the index - represents that day. Otherwsie, each time point represents the - instantaneous time instance of 00:00:00 on that day. + represents that day (24 hours). Otherwsie, each time point represents + the instantaneous time instance of 00:00:00 on that day. Default: True. @@ -313,6 +311,8 @@ def to_events( for start, end in zip(starts[:, 0], ends[:, 0]) ] else: + if labels.columns.duplicated().any(): + raise ValueError("Input DataFrame must have unique column names.") return { col: to_events(labels[col], freq_as_period, merge_consecutive) for col in labels.columns @@ -373,10 +373,10 @@ def to_labels( '2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has daily frequency. If freq_as_period=True, each time piont represents that day, and that day will be marked positive if an event in the event - list overlaps with the period of that day. Otherwsie, each time point - represents the instantaneous time instance of 00:00:00 on that day, and - that time point will be marked positive if an event in the event list - covers it. + list overlaps with the period of that day (24 hours). Otherwsie, each + time point represents the instantaneous time instance of 00:00:00 on + that day, and that time point will be marked positive if an event in + the event list covers it. Default: True. @@ -446,6 +446,7 @@ def expand_events( lists: List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], left_expand: Union[pd.Timedelta, str, int] = 0, right_expand: Union[pd.Timedelta, str, int] = 0, + freq_as_period: bool = True, ) -> List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]: ... 
@@ -457,35 +458,64 @@ def expand_events( ], left_expand: Union[pd.Timedelta, str, int] = 0, right_expand: Union[pd.Timedelta, str, int] = 0, + freq_as_period: bool = True, ) -> Dict[str, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]]: ... +@overload def expand_events( - lists: Union[ + lists: pd.Series, + left_expand: Union[pd.Timedelta, str, int] = 0, + right_expand: Union[pd.Timedelta, str, int] = 0, + freq_as_period: bool = True, +) -> pd.Series: + ... + + +@overload +def expand_events( # type:ignore + lists: pd.DataFrame, + left_expand: Union[pd.Timedelta, str, int] = 0, + right_expand: Union[pd.Timedelta, str, int] = 0, + freq_as_period: bool = True, +) -> pd.DataFrame: + ... + + +def expand_events( # type:ignore + events: Union[ List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], Dict[ str, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] ], + pd.Series, + pd.DataFrame, ], left_expand: Union[pd.Timedelta, str, int] = 0, right_expand: Union[pd.Timedelta, str, int] = 0, + freq_as_period: bool = True, ) -> Union[ List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]], Dict[str, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]], + pd.Series, + pd.DataFrame, ]: - """Expand time windows in an event list. + """Expand duration of events. Parameters ---------- - lists: list or dict - A list of events, or a dict of lists of events. + events: list, dict, pandas Series, or pandas DataFrame + Events to be expanded. - If list, a list of events where an event is a pandas Timestamp if it is instantaneous or a 2-tuple of pandas Timestamps if it is a closed time interval. - If dict, each key-value pair represents an independent list of events. + - If pandas Series, it is binary where 1 represents events cover this + time point. + - If pandas DataFrame, each column is treated as an independent Series. left_expand: pandas Timedelta, str, or int, optional Time range to expand backward. 
@@ -505,9 +535,22 @@ def expand_events( Default: 0. + freq_as_period: bool, optional + Whether to regard time index with regular frequency (i.e. attribute + `freq` of time index is not None) as time intervals. Only used when + input events is pandas Series or DataFrame. + + For example, DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', + '2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has + daily frequency. If freq_as_period=True, each time point in the index + represents that day (24 hours). Otherwise, each time point represents + the instantaneous time instance of 00:00:00 on that day. + + Default: True. + Returns ------- - list or dict + list, dict, pandas Series, or pandas DataFrame Expanded events. """ @@ -517,24 +560,59 @@ def expand_events( if not isinstance(right_expand, pd.Timedelta): right_expand = pd.Timedelta(right_expand) - if isinstance(lists, list): - expanded = ( + if isinstance(events, pd.Series): + labels = validate_series(events) # type: pd.Series + lists = to_events( + labels, freq_as_period=freq_as_period + ) # type:List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] + expanded_lists = expand_events( # type:ignore + events=lists, left_expand=left_expand, right_expand=right_expand + ) # type:List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] + expanded_labels = to_labels( + lists=expanded_lists, + time_index=labels.index, + freq_as_period=freq_as_period, + ) # type: pd.Series + expanded_labels.loc[ + (expanded_labels == False) & (labels.isna()) + ] = float("nan") + expanded_labels = expanded_labels.rename(labels.name) + expanded_labels.index = labels.index + return expanded_labels + elif isinstance(events, pd.DataFrame): + expanded_df = pd.concat( + [ + expand_events( + s, + left_expand=left_expand, + right_expand=right_expand, + freq_as_period=freq_as_period, + ) + for _, s in events.iteritems() + ], + axis=1, + ) # type: pd.DataFrame + return expanded_df + elif isinstance(events, list): + expanded_list 
= ( [] ) # type: List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]] - for ano in lists: + for ano in events: if isinstance(ano, tuple): - expanded.append((ano[0] - left_expand, ano[1] + right_expand)) + expanded_list.append( + (ano[0] - left_expand, ano[1] + right_expand) + ) else: - expanded.append((ano - left_expand, ano + right_expand)) - expanded = validate_events(expanded) - return expanded - elif isinstance(lists, dict): + expanded_list.append((ano - left_expand, ano + right_expand)) + expanded_list = validate_events(expanded_list) + return expanded_list + elif isinstance(events, dict): return { key: expand_events(value, left_expand, right_expand) - for key, value in lists.items() + for key, value in events.items() } else: - raise TypeError("Arugment `lists` must be a list or a dict of lists.") + raise TypeError("Arugment `events` must be a list or a dict of lists.") def split_train_test( diff --git a/src/adtk/visualization/_visualization.py b/src/adtk/visualization/_visualization.py index b155348..92c0cce 100644 --- a/src/adtk/visualization/_visualization.py +++ b/src/adtk/visualization/_visualization.py @@ -167,12 +167,16 @@ def plot( Default: 4. freq_as_period: bool, optional - Whether to treat time stamps following regular frequency as time - spans. E.g. time index [2010-01-01, 2010-02-01, 2010-03-01, 2010-04-01, - 2010-05-01] follows monthly frequency, and each time stamp represents - that month if freq_as_period is True. Otherwsie, each time stamp - represents the time point 00:00:00 on the first day of that month. This - is only used when anomaly given as binary series. + Whether to regard time index with regular frequency (i.e. attribute + `freq` of time index is not None) as time intervals. Only used when + anomaly is given as binary series. + + For example, DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', + '2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has + daily frequency. 
If freq_as_period=True, each time point in the index + represents that day (24 hours). Otherwise, each time point represents + the instantaneous time instance of 00:00:00 on that day. + Default: True. axes: matplotlib Axes object, or array of Axes objects, optional diff --git a/tests/test_expand_events.py b/tests/test_expand_events.py index efeeab5..11cdcc8 100644 --- a/tests/test_expand_events.py +++ b/tests/test_expand_events.py @@ -2,7 +2,7 @@ from adtk.data import expand_events -events = [ +event_list = [ pd.Timestamp("2017-1-1 20:04:00"), (pd.Timestamp("2017-1-1 20:00:00"), pd.Timestamp("2017-1-1 20:05:59")), (pd.Timestamp("2017-1-1 20:03:00"), pd.Timestamp("2017-1-1 20:08:59")), @@ -12,10 +12,85 @@ pd.Timestamp("2017-1-1 21:03:00"), ] +nan = float("nan") +event_labels = pd.Series( + [0, 0, 1, 1, nan, 0, 1, 0, nan, 0, 0, 1], + index=pd.date_range(start="2017-1-1", periods=12, freq="D"), +) + + +def test_expand_event_series_freq(): + expanded_events = expand_events( + event_labels, + left_expand="1hour", + right_expand="1hour", + freq_as_period=True, + ) + true_expanded_events = pd.Series( + [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], + index=pd.date_range(start="2017-1-1", periods=12, freq="D"), + ) + pd.testing.assert_series_equal( + true_expanded_events, expanded_events, check_dtype=False + ) + + +def test_expand_event_series_no_freq(): + expanded_events = expand_events( + event_labels, + left_expand="1hour", + right_expand="1hour", + freq_as_period=False, + ) + pd.testing.assert_series_equal( + event_labels, expanded_events, check_dtype=False + ) + + +def test_expand_event_df_freq(): + expanded_events = expand_events( + pd.concat( + [event_labels.rename("A"), event_labels.rename("B")], axis=1 + ), + left_expand="1hour", + right_expand="1hour", + freq_as_period=True, + ) + true_expanded_events = pd.Series( + [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], + index=pd.date_range(start="2017-1-1", periods=12, freq="D"), + ) + true_expanded_events = pd.concat( + 
[true_expanded_events.rename("A"), true_expanded_events.rename("B")], + axis=1, + ) + pd.testing.assert_frame_equal( + true_expanded_events, expanded_events, check_dtype=False + ) + + +def test_expand_event_df_no_freq(): + expanded_events = expand_events( + pd.concat( + [event_labels.rename("A"), event_labels.rename("B")], axis=1 + ), + left_expand="1hour", + right_expand="1hour", + freq_as_period=False, + ) + + pd.testing.assert_frame_equal( + pd.concat( + [event_labels.rename("A"), event_labels.rename("B")], axis=1 + ), + expanded_events, + check_dtype=False, + ) + def test_expand_event_list(): expanded_events = expand_events( - events, left_expand="1min", right_expand="3min" + event_list, left_expand="1min", right_expand="3min" ) assert expanded_events == [ (pd.Timestamp("2017-1-1 19:59:00"), pd.Timestamp("2017-1-1 20:11:59")), @@ -26,7 +101,9 @@ def test_expand_event_list(): def test_expand_event_dict(): expanded_events = expand_events( - {"A": events, "B": events}, left_expand="1min", right_expand="3min" + {"A": event_list, "B": event_list}, + left_expand="1min", + right_expand="3min", ) assert expanded_events == { "A": [