From 5166ae456d589558fc273da185b646d5cb49fb78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Braulio=20R=C3=ADos?= Date: Thu, 5 Oct 2023 15:21:24 -0300 Subject: [PATCH 1/4] Added open in colab link to recipes --- docs/src/recipes/aggregate_duplicated.ipynb | 2 ++ docs/src/recipes/aggregate_index.ipynb | 2 ++ docs/src/recipes/aggregate_interval.ipynb | 2 ++ docs/src/recipes/split_fraction.ipynb | 2 ++ docs/src/recipes/split_timestamp.ipynb | 2 ++ 5 files changed, 10 insertions(+) diff --git a/docs/src/recipes/aggregate_duplicated.ipynb b/docs/src/recipes/aggregate_duplicated.ipynb index 5b826ac60..9a899dfc4 100644 --- a/docs/src/recipes/aggregate_duplicated.ipynb +++ b/docs/src/recipes/aggregate_duplicated.ipynb @@ -7,6 +7,8 @@ "source": [ "# Unify events with identical timestamps\n", "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/temporian/blob/last-release/docs/src/recipes/aggregate_duplicated.ipynb)\n", + "\n", "This recipe shows how to avoid having duplicated timestamps in an `EventSet`. 
Events with identical timestamps are aggregated with a moving window operation (e.g: sum, average, max, min), preserving the original timestamp values (which may be non-uniform).\n", "\n", "\n", diff --git a/docs/src/recipes/aggregate_index.ipynb b/docs/src/recipes/aggregate_index.ipynb index 71e810c64..347b5f3b7 100644 --- a/docs/src/recipes/aggregate_index.ipynb +++ b/docs/src/recipes/aggregate_index.ipynb @@ -7,6 +7,8 @@ "source": [ "# Aggregate events from different indexes\n", "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/temporian/blob/last-release/docs/src/recipes/aggregate_index.ipynb)\n", + "\n", "This recipe applies when you have events indexed by one or more features, and you want to drop some index levels and unify the events with the same timestamps.\n", "\n", "In this example, we aggregate daily sales by store and product, into daily revenue for each individual store (i.e., the total sales for each day)." diff --git a/docs/src/recipes/aggregate_interval.ipynb b/docs/src/recipes/aggregate_interval.ipynb index 01ef94859..f63ede092 100644 --- a/docs/src/recipes/aggregate_interval.ipynb +++ b/docs/src/recipes/aggregate_interval.ipynb @@ -7,6 +7,8 @@ "source": [ "# Aggregate events at a fixed interval\n", "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/temporian/blob/last-release/docs/src/recipes/aggregate_interval.ipynb)\n", + "\n", "This recipe aggregates possibly non-uniformly sampled events into fixed-length intervals (e.g., seconds, hours, days, or weeks). In other words, it converts the event features into time series.\n", "\n", "For example, suppose we have the sales log from a store, where each sold item is represented by an event. Let's assume each sale event has a date-time, the sale price and the unit cost of the product. 
We want to calculate total daily sales, with one single event at `00:00` each day." diff --git a/docs/src/recipes/split_fraction.ipynb b/docs/src/recipes/split_fraction.ipynb index ce43720d9..1260cde1b 100644 --- a/docs/src/recipes/split_fraction.ipynb +++ b/docs/src/recipes/split_fraction.ipynb @@ -7,6 +7,8 @@ "source": [ "# Split data by fraction\n", "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/temporian/blob/last-release/docs/src/recipes/split_fraction.ipynb)\n", + "\n", "This recipe can be used to split an `EventSet` in two or more subsets, each with a specified fraction of the total number of data points.\n", "\n", "For example, to train a machine learning forecasting model, the data usually needs to be split into train, validation and test `EventSets`. In this case we'll use `60%` of the data for training, `20%` for validation, and `20%` for test." diff --git a/docs/src/recipes/split_timestamp.ipynb b/docs/src/recipes/split_timestamp.ipynb index 5160d36b0..bb1b520eb 100644 --- a/docs/src/recipes/split_timestamp.ipynb +++ b/docs/src/recipes/split_timestamp.ipynb @@ -7,6 +7,8 @@ "source": [ "# Split data at a given timestamp\n", "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/temporian/blob/last-release/docs/src/recipes/split_timestamp.ipynb)\n", + "\n", "This recipe can be used to split an `EventSet` in two or more subsets at fixed timestamps.\n", "\n", "This exact same procedure applies to multi-index data or the default single empty index.\n", From e33fc8b8a42bcb369b15f9ea7c3689cf79a4bca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Braulio=20R=C3=ADos?= Date: Thu, 5 Oct 2023 16:17:09 -0300 Subject: [PATCH 2/4] Moved unary ops to evset.isnan() and evset.notnan() --- temporian/core/event_set_ops.py | 84 +++++++++++++++++++++++++++++++ temporian/core/operators/unary.py | 82 
------------------------------ 2 files changed, 84 insertions(+), 82 deletions(-) diff --git a/temporian/core/event_set_ops.py b/temporian/core/event_set_ops.py index 46a9f5783..572032719 100644 --- a/temporian/core/event_set_ops.py +++ b/temporian/core/event_set_ops.py @@ -1254,6 +1254,47 @@ def filter( return filter(self, condition=condition) + def isnan( + self: EventSetOrNode, + ) -> EventSetOrNode: + """Returns boolean features, `True` in the NaN elements of the + [`EventSet`][temporian.EventSet]. + + Note that for `int` and `bool` this will always be `False` since those types + don't support NaNs. It only makes actual sense to use on `float` (or + `tp.float32`) features. + + See also `evset.notnan()`. + + Example: + ```python + >>> a = tp.event_set( + ... timestamps=[1, 2, 3], + ... features={"M":[np.nan, 5., np.nan], "N": [-1, 0, 5]}, + ... ) + >>> b = a.isnan() + >>> b + indexes: ... + 'M': [ True False True] + 'N': [False False False] + ... + + >>> # Count nans + >>> b["M"].cast(int).cumsum() + indexes: ... + timestamps: [1. 2. 3.] + 'M': [1 1 2] + ... + + ``` + + Returns: + EventSet with boolean features. + """ + from temporian.core.operators.unary import isnan + + return isnan(self) + def join( self: EventSetOrNode, other: EventSetOrNode, @@ -1749,6 +1790,49 @@ def moving_sum( return moving_sum(self, window_length=window_length, sampling=sampling) + def notnan( + self: EventSetOrNode, + ) -> EventSetOrNode: + """Returns boolean features, `False` in the NaN elements of an + [`EventSet`][temporian.EventSet]. + + Equivalent to `~evset.isnan(...)`. + + Note that for `int` and `bool` this will always be `True` since those types + don't support NaNs. It only makes actual sense to use on `float` (or + `tp.float32`) features. + + See also `evset.isnan()`. + + Example: + ```python + >>> a = tp.event_set( + ... timestamps=[1, 2, 3], + ... features={"M":[np.nan, 5., np.nan], "N": [-1, 0, 5]}, + ... ) + >>> b = a.notnan() + >>> b + indexes: ... 
+ 'M': [False True False] + 'N': [ True True True] + ... + + >>> # Filter only rows where "M" is not nan + >>> a.filter(b["M"]) + indexes: ... + 'M': [5.] + 'N': [0] + ... + + ``` + + Returns: + EventSet with boolean features. + """ + from temporian.core.operators.unary import notnan + + return notnan(self) + def prefix( self: EventSetOrNode, prefix: str, diff --git a/temporian/core/operators/unary.py b/temporian/core/operators/unary.py index 57b6d9e11..26b2d3bf4 100644 --- a/temporian/core/operators/unary.py +++ b/temporian/core/operators/unary.py @@ -242,43 +242,6 @@ def invert( def isnan( input: EventSetOrNode, ) -> EventSetOrNode: - """Returns boolean features, `True` in the NaN elements of an - [`EventSet`][temporian.EventSet]. - - Note that for `int` and `bool` this will always be `False` since those types - don't support NaNs. It only makes actual sense to use on `float` (or - `tp.float32`) features. - - See also [`tp.notnan()`][temporian.notnan]. - - Example: - ```python - >>> a = tp.event_set( - ... timestamps=[1, 2, 3], - ... features={"M":[np.nan, 5., np.nan], "N": [-1, 0, 5]}, - ... ) - >>> b = tp.isnan(a) - >>> b - indexes: ... - 'M': [ True False True] - 'N': [False False False] - ... - - >>> # Count nans - >>> b["M"].cast(int).cumsum() - indexes: ... - timestamps: [1. 2. 3.] - 'M': [1 1 2] - ... - - ``` - - Args: - input: EventSet to check for NaNs. - - Returns: - EventSet with boolean features. - """ assert isinstance(input, EventSetNode) return IsNanOperator( @@ -290,51 +253,6 @@ def isnan( def notnan( input: EventSetOrNode, ) -> EventSetOrNode: - """Returns boolean features, `False` in the NaN elements of an - [`EventSet`][temporian.EventSet]. - - Equivalent to `tp.invert(tp.isnan(...))`. - - Note that for `int` and `bool` this will always be `True` since those types - don't support NaNs. It only makes actual sense to use on `float` (or - `tp.float32`) features. - - See also [`tp.isnan()`][temporian.isnan]. 
- - Example: - ```python - >>> a = tp.event_set( - ... timestamps=[1, 2, 3], - ... features={"M":[np.nan, 5., np.nan], "N": [-1, 0, 5]}, - ... ) - >>> b = tp.isnan(a) - >>> b - indexes: ... - 'M': [ True False True] - 'N': [False False False] - ... - - >>> # Filter only not nan rows - >>> not_nans = ~b["M"] - >>> not_nans - indexes: ... - 'M': [False True False] - ... - - >>> a.filter(not_nans) - indexes: ... - 'M': [5.] - 'N': [0] - ... - - ``` - - Args: - input: EventSet to check for NaNs. - - Returns: - EventSet with boolean features. - """ assert isinstance(input, EventSetNode) return NotNanOperator( From 93e49f95bbc14bc483e6fc55b0d66131b954a778 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Braulio=20R=C3=ADos?= Date: Thu, 5 Oct 2023 16:22:15 -0300 Subject: [PATCH 3/4] Moved unary ops to evset.log() and evset.abs() --- temporian/core/event_set_ops.py | 54 +++++++++++++++++++++++++++++++ temporian/core/operators/unary.py | 50 ---------------------------- 2 files changed, 54 insertions(+), 50 deletions(-) diff --git a/temporian/core/event_set_ops.py b/temporian/core/event_set_ops.py index 572032719..606377eee 100644 --- a/temporian/core/event_set_ops.py +++ b/temporian/core/event_set_ops.py @@ -381,6 +381,33 @@ def __xor__(self, other: Any): # OPERATORS # ############# + def abs( + self: EventSetOrNode, + ) -> EventSetOrNode: + """Gets the absolute value of an [`EventSet`][temporian.EventSet]'s + features. + + Example: + ```python + >>> a = tp.event_set( + ... timestamps=[1, 2, 3], + ... features={"M":[np.nan, -1., 2.], "N": [-1, -3, 5]}, + ... ) + >>> a.abs() + indexes: ... + 'M': [nan 1. 2.] + 'N': [1 3 5] + ... + + ``` + + Returns: + EventSet with positive valued features.
+ """ + from temporian.core.operators.unary import abs + + return abs(self) + def add_index( self: EventSetOrNode, indexes: Union[str, List[str]] ) -> EventSetOrNode: @@ -1457,6 +1484,33 @@ def leak(self: EventSetOrNode, duration: Duration) -> EventSetOrNode: return leak(self, duration=duration) + def log(self: EventSetOrNode) -> EventSetOrNode: + """Calculates the natural logarithm of an [`EventSet`][temporian.EventSet]'s + features. + + Can only be used on floating point features. + + Example: + ```python + >>> a = tp.event_set( + ... timestamps=[1, 2, 3, 4, 5], + ... features={"M": [np.e, 1., 2., 10., -1.]}, + ... ) + >>> a.log() + indexes: ... + timestamps: [1. 2. 3. 4. 5.] + 'M': [1. 0. 0.6931 2.3026 nan] + ... + + ``` + + Returns: + EventSet with logarithm of input features. + """ + from temporian.core.operators.unary import log + + return log(self) + def moving_count( self: EventSetOrNode, window_length: WindowLength, diff --git a/temporian/core/operators/unary.py b/temporian/core/operators/unary.py index 26b2d3bf4..2a8b968fa 100644 --- a/temporian/core/operators/unary.py +++ b/temporian/core/operators/unary.py @@ -264,30 +264,6 @@ def notnan( def abs( input: EventSetOrNode, ) -> EventSetOrNode: - """Gets the absolute value of an [`EventSet`][temporian.EventSet]'s - features. - - Example: - ```python - >>> a = tp.event_set( - ... timestamps=[1, 2, 3], - ... features={"M":[np.nan, -1., 2.], "N": [-1, -3, 5]}, - ... ) - >>> b = tp.abs(a) - >>> b - indexes: ... - 'M': [nan 1. 2.] - 'N': [1 3 5] - ... - - ``` - - Args: - input: EventSetOr calculate absolute value. - - Returns: - EventSetOr with positive valued features. - """ assert isinstance(input, EventSetNode) return AbsOperator( @@ -299,32 +275,6 @@ def abs( def log( input: EventSetOrNode, ) -> EventSetOrNode: - """Calculates the natural logarithm of an [`EventSet`][temporian.EventSet]'s - features. - - Can only be used on floating point features. - - Example: - ```python - >>> a = tp.event_set( - ...
timestamps=[1, 2, 3, 4, 5], - ... features={"M": [np.e, 1., 2., 10., -1.]}, - ... ) - >>> b = tp.log(a) - >>> b - indexes: ... - timestamps: [1. 2. 3. 4. 5.] - 'M': [1. 0. 0.6931 2.3026 nan] - ... - - ``` - - Args: - input: EventSetOr to calculate natural logarithm. - - Returns: - EventSetOr with logarithm of input features. - """ assert isinstance(input, EventSetNode) return LogOperator( From 6407efe652a483d7a6d1e4b9f9055d50c6c728aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Braulio=20R=C3=ADos?= Date: Thu, 5 Oct 2023 16:32:11 -0300 Subject: [PATCH 4/4] Recipe with tick_calendar() & since_last(steps=2) --- docs/src/recipes/aggregate_calendar.ipynb | 182 ++++++++++++++++++++++ docs/src/recipes/index.md | 1 + 2 files changed, 183 insertions(+) create mode 100644 docs/src/recipes/aggregate_calendar.ipynb diff --git a/docs/src/recipes/aggregate_calendar.ipynb b/docs/src/recipes/aggregate_calendar.ipynb new file mode 100644 index 000000000..8ed5fb961 --- /dev/null +++ b/docs/src/recipes/aggregate_calendar.ipynb @@ -0,0 +1,182 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c74f7111-1b6c-4454-9770-3f67eeadaca6", + "metadata": {}, + "source": [ + "# Aggregate events by calendar features (month/year)\n", + "\n", + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/google/temporian/blob/last-release/docs/src/recipes/aggregate_calendar.ipynb)\n", + "\n", + "In this recipe we'll learn how to aggregate events based on calendar features (e.g: monthly, yearly).\n", + "\n", + "For example, suppose we want to calculate total monthly sales, having one event per month that accumulates all sales of the past month.\n", + "\n", + "Here we'll use a more general use case: for **every month, show the sales of the past 2 months**. This covers the previous case as well, only by changing a parameter's value (`steps=1`)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "de274e0e-ab5a-46a0-b4b9-f1026b43076c", + "metadata": {}, + "source": [ + "## Example data\n", + "\n", + "Let's create some sale events with datetime samplings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10a56d43-d011-4e72-aed4-8d460d58c337", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import temporian as tp\n", + "\n", + "sales_data = pd.DataFrame(\n", + " data=[\n", + " # sale timestamp, price, cost\n", + " [\"2020-01-01 13:04\", 3.0, 1.0], # January\n", + " [\"2020-01-15 15:24\", 7.0, 3.0],\n", + " [\"2020-02-01 13:45\", 3.0, 1.0], # February\n", + " [\"2020-02-20 16:10\", 7.0, 3.0],\n", + " [\"2020-03-10 10:00\", 10.0, 5.0], # March\n", + " [\"2020-03-28 10:10\", 4.0, 2.0],\n", + " [\"2020-04-15 19:35\", 3.0, 1.0], # April\n", + " [\"2020-05-25 18:30\", 18.0, 2.0], # May\n", + " ],\n", + " columns=[\n", + " \"timestamp\",\n", + " \"unit_price\",\n", + " \"unit_cost\",\n", + " ],\n", + ")\n", + "\n", + "sales_evset = tp.from_pandas(sales_data)\n", + "sales_evset.plot()" + ] + }, + { + "cell_type": "markdown", + "id": "c8f6189a-ca80-4869-b8a6-35356ce42c02", + "metadata": {}, + "source": [ + "## Solution\n", + "We want to calculate every month, the accumulated sales from the last 2 months. So this is what we can do:\n", + "1. Create a tick on the first day of every month.\n", + "1. Use a `moving_sum` with variable window length, at each tick covering the duration since the last 2 months.\n", + "\n", + "### 1. Create a tick every month" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e930b36-10f6-487f-8466-278a1a08956b", + "metadata": {}, + "outputs": [], + "source": [ + "# Period to cover. 
Includes the first day of the month after the last event.\n", + "time_span = tp.event_set(timestamps=[\"2020-01-01 00:00:00\", \"2020-06-01 00:00:00\"])\n", + "\n", + "# Tick first day of every month (equivalent: set mday=1)\n", + "monthly_ticks = time_span.tick_calendar(month='*')\n", + "\n", + "monthly_ticks.plot()" + ] + }, + { + "cell_type": "markdown", + "id": "aad79b2d-3fea-4e11-8d68-1e5d78f3b655", + "metadata": {}, + "source": [ + "### 2. Moving sum with variable window length\n", + "\n", + "The `window_length` argument can be an `EventSet` with one single feature, which specifies the duration (in seconds) of the window at each timestamp.\n", + "\n", + "Using the `since_last()` operator, we get exactly that: an `EventSet` with the duration (in seconds) since the last previous event, or since the number of events indicated by the `steps` parameter. For example, using `steps=1` (default) would accumulate events by month, and using `steps=6` means a rolling sum over the previous 6 months.\n", + "\n", + "We want the last 2 months calculated every month, so tick every month and use `since_last(steps=2)`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d92dff9-b4b6-42d1-8848-b37ff768ef7e", + "metadata": {}, + "outputs": [], + "source": [ + "# Same sampling as monthly_ticks, create single feature with the duration of the last 2 months (in seconds)\n", + "monthly_window_lengths = monthly_ticks.since_last(steps=2)\n", + "\n", + "# Remove 01/01 and 01/02 (not enough previous data)\n", + "monthly_window_lengths = monthly_window_lengths.filter(monthly_window_lengths.notnan())\n", + "\n", + "# Use since_last() feature as variable window length\n", + "moving_sum = sales_evset.moving_sum(window_length=monthly_window_lengths)\n", + "\n", + "moving_sum" + ] + }, + { + "cell_type": "markdown", + "id": "706f50a9-72ab-4673-aa38-8c3dacdcf49d", + "metadata": {}, + "source": [ + "## (Optional) Rename and plot\n", + "\n", + "Finally, we can rename features to match their actual meaning after aggregation.\n", + "\n", + "In this case we also calculate and plot the monthly profit." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "664550bb-12ac-43c2-8216-4308843178b5", + "metadata": {}, + "outputs": [], + "source": [ + "# Rename aggregated features\n", + "monthly_sales = moving_sum.rename({\"unit_price\": \"monthly_revenue\", \"unit_cost\": \"monthly_cost\"})\n", + "\n", + "# Profit = revenue - cost\n", + "monthly_profit = (monthly_sales[\"monthly_revenue\"] - monthly_sales[\"monthly_cost\"]).rename(\"monthly_profit\")\n", + "\n", + "monthly_profit.plot(style='line', interactive=True, width_px=600)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3a9c6e6-5cba-4950-900f-1878e87a98be", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name":
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/src/recipes/index.md b/docs/src/recipes/index.md index 7103f9329..f06d89fbf 100644 --- a/docs/src/recipes/index.md +++ b/docs/src/recipes/index.md @@ -7,6 +7,7 @@ in typical use cases. | Recipe | | ----------------------------------------------------------------------------------------------------- | +| [Aggregate events by calendar features (month/year)](aggregate_calendar.ipynb) | | [Aggregate events by fixed-length intervals (turn events into time-series)](aggregate_interval.ipynb) | | [Aggregate events from different indexes](aggregate_index.ipynb) | | [Unify events with identical timestamps](aggregate_duplicated.ipynb) |