Skip to content

Commit

Permalink
expand_events accepts binary series (#94)
Browse files Browse the repository at this point in the history
* updated docstring

* expand_events can accept Series and DF now

* fixed a bug on nan

* added unit test

* fixed a bug on df

* added unit test for df

* fixed typing

* update version number

* updated changelog
  • Loading branch information
tailaiw authored Mar 10, 2020
1 parent 33395fa commit e9dd499
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 35 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
# The short X.Y version.
version = "0.6"
# The full version, including alpha/beta/rc tags.
release = "0.6.0-dev.27+pr.93"
release = "0.6.0-dev.28+pr.94"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
1 change: 1 addition & 0 deletions docs/releasehistory.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ Version 0.6.0-dev
- Improved time index check in time-dependent models (0.6.0-dev.24+pr.90, 0.6.0-dev.25+pr.91)
- Changed the output type of `adtk.data.split_train_test` from a 2-tuple of lists to a list of 2-tuples (0.6.0-dev.26+pr.92)
- Removed `adtk.data.resample` because its functionality is highly overlapped with pandas resampler module (0.6.0-dev.27+pr.93)
- Made `adtk.data.expand_events` accept events as pandas Series/DataFrame (0.6.0-dev.28+pr.94)

Version 0.5.5 (Feb 24, 2020)
===================================
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = adtk
version = 0.6.0-dev.27+pr.93
version = 0.6.0-dev.28+pr.94
author = Arundo Analytics, Inc.
maintainer = Tailai Wen
maintainer_email = [email protected]
Expand Down
2 changes: 1 addition & 1 deletion src/adtk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
"""

__version__ = "0.6.0-dev.27+pr.93"
__version__ = "0.6.0-dev.28+pr.94"
124 changes: 101 additions & 23 deletions src/adtk/data/_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
"""Module is for data (time series and anomaly list) processing.
"""

from functools import reduce
from math import gcd
from typing import Dict, List, Optional, Tuple, Union, overload

import numpy as np
Expand Down Expand Up @@ -228,8 +226,8 @@ def to_events(
For example, DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03',
'2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has
daily frequency. If freq_as_period=True, each time point in the index
represents that day. Otherwise, each time point represents the
instantaneous time instance of 00:00:00 on that day.
represents that day (24 hours). Otherwise, each time point represents
the instantaneous time instance of 00:00:00 on that day.
Default: True.
Expand Down Expand Up @@ -313,6 +311,8 @@ def to_events(
for start, end in zip(starts[:, 0], ends[:, 0])
]
else:
if labels.columns.duplicated().any():
raise ValueError("Input DataFrame must have unique column names.")
return {
col: to_events(labels[col], freq_as_period, merge_consecutive)
for col in labels.columns
Expand Down Expand Up @@ -373,10 +373,10 @@ def to_labels(
'2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has
daily frequency. If freq_as_period=True, each time point represents
that day, and that day will be marked positive if an event in the event
list overlaps with the period of that day. Otherwise, each time point
represents the instantaneous time instance of 00:00:00 on that day, and
that time point will be marked positive if an event in the event list
covers it.
list overlaps with the period of that day (24 hours). Otherwise, each
time point represents the instantaneous time instance of 00:00:00 on
that day, and that time point will be marked positive if an event in
the event list covers it.
Default: True.
Expand Down Expand Up @@ -446,6 +446,7 @@ def expand_events(
lists: List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]],
left_expand: Union[pd.Timedelta, str, int] = 0,
right_expand: Union[pd.Timedelta, str, int] = 0,
freq_as_period: bool = True,
) -> List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]:
...

Expand All @@ -457,35 +458,64 @@ def expand_events(
],
left_expand: Union[pd.Timedelta, str, int] = 0,
right_expand: Union[pd.Timedelta, str, int] = 0,
freq_as_period: bool = True,
) -> Dict[str, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]]:
...


@overload
def expand_events(
lists: Union[
lists: pd.Series,
left_expand: Union[pd.Timedelta, str, int] = 0,
right_expand: Union[pd.Timedelta, str, int] = 0,
freq_as_period: bool = True,
) -> pd.Series:
...


@overload
def expand_events( # type:ignore
lists: pd.DataFrame,
left_expand: Union[pd.Timedelta, str, int] = 0,
right_expand: Union[pd.Timedelta, str, int] = 0,
freq_as_period: bool = True,
) -> pd.DataFrame:
...


def expand_events( # type:ignore
events: Union[
List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]],
Dict[
str, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]
],
pd.Series,
pd.DataFrame,
],
left_expand: Union[pd.Timedelta, str, int] = 0,
right_expand: Union[pd.Timedelta, str, int] = 0,
freq_as_period: bool = True,
) -> Union[
List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]],
Dict[str, List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]],
pd.Series,
pd.DataFrame,
]:
"""Expand time windows in an event list.
"""Expand duration of events.
Parameters
----------
lists: list or dict
A list of events, or a dict of lists of events.
events: list, dict, pandas Series, or pandas DataFrame
Events to be expanded.
- If list, a list of events where an event is a pandas Timestamp if it
is instantaneous or a 2-tuple of pandas Timestamps if it is a closed
time interval.
- If dict, each key-value pair represents an independent list of
events.
- If pandas Series, it is a binary series where 1 indicates that an
event covers that time point.
- If pandas DataFrame, each column is treated as an independent Series.
left_expand: pandas Timedelta, str, or int, optional
Time range to expand backward.
Expand All @@ -505,9 +535,22 @@ def expand_events(
Default: 0.
freq_as_period: bool, optional
Whether to regard time index with regular frequency (i.e. attribute
`freq` of time index is not None) as time intervals. Only used when
input events is pandas Series or DataFrame.
For example, DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03',
'2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has
daily frequency. If freq_as_period=True, each time point in the index
represents that day (24 hours). Otherwise, each time point represents
the instantaneous time instance of 00:00:00 on that day.
Default: True.
Returns
-------
list or dict
list, dict, pandas Series, or pandas DataFrame
Expanded events.
"""
Expand All @@ -517,24 +560,59 @@ def expand_events(
if not isinstance(right_expand, pd.Timedelta):
right_expand = pd.Timedelta(right_expand)

if isinstance(lists, list):
expanded = (
if isinstance(events, pd.Series):
labels = validate_series(events) # type: pd.Series
lists = to_events(
labels, freq_as_period=freq_as_period
) # type:List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]
expanded_lists = expand_events( # type:ignore
events=lists, left_expand=left_expand, right_expand=right_expand
) # type:List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]
expanded_labels = to_labels(
lists=expanded_lists,
time_index=labels.index,
freq_as_period=freq_as_period,
) # type: pd.Series
expanded_labels.loc[
(expanded_labels == False) & (labels.isna())
] = float("nan")
expanded_labels = expanded_labels.rename(labels.name)
expanded_labels.index = labels.index
return expanded_labels
elif isinstance(events, pd.DataFrame):
expanded_df = pd.concat(
[
expand_events(
s,
left_expand=left_expand,
right_expand=right_expand,
freq_as_period=freq_as_period,
)
for _, s in events.iteritems()
],
axis=1,
) # type: pd.DataFrame
return expanded_df
elif isinstance(events, list):
expanded_list = (
[]
) # type: List[Union[Tuple[pd.Timestamp, pd.Timestamp], pd.Timestamp]]
for ano in lists:
for ano in events:
if isinstance(ano, tuple):
expanded.append((ano[0] - left_expand, ano[1] + right_expand))
expanded_list.append(
(ano[0] - left_expand, ano[1] + right_expand)
)
else:
expanded.append((ano - left_expand, ano + right_expand))
expanded = validate_events(expanded)
return expanded
elif isinstance(lists, dict):
expanded_list.append((ano - left_expand, ano + right_expand))
expanded_list = validate_events(expanded_list)
return expanded_list
elif isinstance(events, dict):
return {
key: expand_events(value, left_expand, right_expand)
for key, value in lists.items()
for key, value in events.items()
}
else:
raise TypeError("Arugment `lists` must be a list or a dict of lists.")
raise TypeError("Arugment `events` must be a list or a dict of lists.")


def split_train_test(
Expand Down
16 changes: 10 additions & 6 deletions src/adtk/visualization/_visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,16 @@ def plot(
Default: 4.
freq_as_period: bool, optional
Whether to treat time stamps following regular frequency as time
spans. E.g. time index [2010-01-01, 2010-02-01, 2010-03-01, 2010-04-01,
2010-05-01] follows monthly frequency, and each time stamp represents
that month if freq_as_period is True. Otherwise, each time stamp
represents the time point 00:00:00 on the first day of that month. This
is only used when anomaly given as binary series.
Whether to regard time index with regular frequency (i.e. attribute
`freq` of time index is not None) as time intervals. Only used when
anomaly is given as binary series.
For example, DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03',
'2017-01-04', '2017-01-05'], dtype='datetime64[ns]', freq='D') has
daily frequency. If freq_as_period=True, each time point in the index
represents that day (24 hours). Otherwise, each time point represents
the instantaneous time instance of 00:00:00 on that day.
Default: True.
axes: matplotlib Axes object, or array of Axes objects, optional
Expand Down
83 changes: 80 additions & 3 deletions tests/test_expand_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from adtk.data import expand_events

events = [
event_list = [
pd.Timestamp("2017-1-1 20:04:00"),
(pd.Timestamp("2017-1-1 20:00:00"), pd.Timestamp("2017-1-1 20:05:59")),
(pd.Timestamp("2017-1-1 20:03:00"), pd.Timestamp("2017-1-1 20:08:59")),
Expand All @@ -12,10 +12,85 @@
pd.Timestamp("2017-1-1 21:03:00"),
]

nan = float("nan")
event_labels = pd.Series(
[0, 0, 1, 1, nan, 0, 1, 0, nan, 0, 0, 1],
index=pd.date_range(start="2017-1-1", periods=12, freq="D"),
)


def test_expand_event_series_freq():
    """Check `expand_events` on a binary Series with `freq_as_period=True`.

    The module-level `event_labels` series is expanded by one hour on each
    side, and the result is compared against the expected labels below
    (dtype differences are ignored).
    """
    actual = expand_events(
        event_labels,
        left_expand="1hour",
        right_expand="1hour",
        freq_as_period=True,
    )
    daily_index = pd.date_range(start="2017-1-1", periods=12, freq="D")
    expected = pd.Series(
        [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1], index=daily_index
    )
    pd.testing.assert_series_equal(expected, actual, check_dtype=False)


def test_expand_event_series_no_freq():
    """Check `expand_events` on a binary Series with `freq_as_period=False`.

    The test asserts that a one-hour expansion on each side returns a series
    equal to the input `event_labels` (dtype differences are ignored).
    """
    expansion = {
        "left_expand": "1hour",
        "right_expand": "1hour",
        "freq_as_period": False,
    }
    actual = expand_events(event_labels, **expansion)
    pd.testing.assert_series_equal(event_labels, actual, check_dtype=False)


def test_expand_event_df_freq():
    """Check `expand_events` on a DataFrame with `freq_as_period=True`.

    The input has two columns, "A" and "B", both copies of `event_labels`.
    Each column of the result must equal the expected labels below (dtype
    differences are ignored).
    """
    column_names = ("A", "B")

    labels_df = pd.concat(
        [event_labels.rename(name) for name in column_names], axis=1
    )
    actual = expand_events(
        labels_df,
        left_expand="1hour",
        right_expand="1hour",
        freq_as_period=True,
    )

    expected_column = pd.Series(
        [0, 1, 1, 1, 1, 1, 1, 1, nan, 0, 1, 1],
        index=pd.date_range(start="2017-1-1", periods=12, freq="D"),
    )
    expected = pd.concat(
        [expected_column.rename(name) for name in column_names], axis=1
    )

    pd.testing.assert_frame_equal(expected, actual, check_dtype=False)


def test_expand_event_df_no_freq():
    """Check `expand_events` on a DataFrame with `freq_as_period=False`.

    The test asserts that a one-hour expansion on each side returns a frame
    equal to the two-column input (dtype differences are ignored).
    """

    def build_labels_df():
        # Build a fresh frame on every call so the expected value never
        # shares an object with the argument handed to `expand_events`.
        return pd.concat(
            [event_labels.rename("A"), event_labels.rename("B")], axis=1
        )

    actual = expand_events(
        build_labels_df(),
        left_expand="1hour",
        right_expand="1hour",
        freq_as_period=False,
    )
    expected = build_labels_df()
    pd.testing.assert_frame_equal(expected, actual, check_dtype=False)


def test_expand_event_list():
expanded_events = expand_events(
events, left_expand="1min", right_expand="3min"
event_list, left_expand="1min", right_expand="3min"
)
assert expanded_events == [
(pd.Timestamp("2017-1-1 19:59:00"), pd.Timestamp("2017-1-1 20:11:59")),
Expand All @@ -26,7 +101,9 @@ def test_expand_event_list():

def test_expand_event_dict():
expanded_events = expand_events(
{"A": events, "B": events}, left_expand="1min", right_expand="3min"
{"A": event_list, "B": event_list},
left_expand="1min",
right_expand="3min",
)
assert expanded_events == {
"A": [
Expand Down

0 comments on commit e9dd499

Please sign in to comment.