Optimized the logic applying univariate model to DataFrame (#67)

* optimized applying univariate model to DF * updated version number and changelogs * minor optimization * Fixed a bug that model trained with Series cannot be applied to DataFrame due to name matching error * modified docstrings * updated version number * updated changelog
arundo · Feb 18, 2020 · 90b9145 · 90b9145
1 parent 9e9b86b
commit 90b9145
Show file tree

Hide file tree

Showing 11 changed files with 164 additions and 167 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -66,7 +66,7 @@
 # The short X.Y version.
 version = "0.5"
 # The full version, including alpha/beta/rc tags.
-release = "0.5.3"
+release = "0.5.4"
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/releasehistory.rst b/docs/releasehistory.rst
@@ -2,6 +2,13 @@
 Release History
 ***************
 
+Version 0.5.4 (Feb 18, 2020)
+===================================
+- Optimized the workflow of how a univariate model is applied to pandas DataFrame
+    - Added more informative error messages
+    - Fixed some bugs resulting in model-column matching error due to inconsistency between output Series names and DataFrame columns
+    - Clarified the workflow in the documentation
+
 Version 0.5.3 (Feb 12, 2020)
 ===================================
 - Quick hotfix to avoid errors caused by statsmodels v0.11 by requiring statsmodels dependency <0.11

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = adtk
-version = 0.5.3
+version = 0.5.4
 author = Arundo Analytics, Inc.
 maintainer = Tailai Wen
 maintainer_email = [email protected]

diff --git a/src/adtk/__init__.py b/src/adtk/__init__.py
@@ -20,4 +20,4 @@
 
 """
 
-__version__ = "0.5.3"
+__version__ = "0.5.4"
diff --git a/src/adtk/_base.py b/src/adtk/_base.py
@@ -11,7 +11,9 @@ class _Model(ABC):
     def __init__(self, **kwargs):
         for key, value in kwargs.items():
             setattr(self, key, value)
-            self._fitted = False
+        self._fitted = (
+            0
+        )  # 0 for not fitted, 1 for fitted, 2 for univariate model fitted by DF
 
     @abstractmethod
     def _fit(self, ts):
@@ -91,37 +93,78 @@ def _fit(self, ts):
             s = ts.copy()
             self._fit_core(s)
             self._models = None
+            self._fitted = 1
         elif isinstance(ts, pd.DataFrame):
             df = ts.copy()
+            if df.columns.duplicated().any():
+                raise ValueError(
+                    "Input DataFrame must have unique column names."
+                )
             if self._need_fit:
                 self._update_models(df.columns)
                 # fit model for each column
                 for col in df.columns:
                     self._models[col].fit(df[col])
+                self._fitted = 2
+            else:
+                pass
         else:
             raise TypeError("Input must be a pandas Series or DataFrame.")
-        self._fitted = True
 
     def _predict(self, ts):
-        if self._need_fit and (not self._fitted):
+        if self._need_fit and (self._fitted == 0):
             raise RuntimeError("The model must be trained first.")
         if isinstance(ts, pd.Series):
+            if self._need_fit and (
+                self._fitted == 2
+            ):  # fitted by DF, to be applied to Series
+                raise RuntimeError(
+                    "The model was trained by a pandas DataFrame object, "
+                    "it can only be applied to a pandas DataFrame object."
+                )
             s = ts.copy()
             predicted = self._predict_core(s)
             # if a Series-to-Series operation, make sure Series name keeps
             if isinstance(predicted, pd.Series):
                 predicted.name = ts.name
         elif isinstance(ts, pd.DataFrame):
             df = ts.copy()
-            # if the model doesn't neef fit, initialize or reset a model for
-            # each column
-            if not self._need_fit:
-                self._update_models(df.columns)
-            # predict for each column
-            predicted = pd.concat(
-                [self._models[col]._predict(df[col]) for col in df.columns],
-                axis=1,
-            )
+            if df.columns.duplicated().any():
+                raise ValueError(
+                    "Input DataFrame must have unique column names."
+                )
+            if (not self._need_fit) or (self._fitted == 1):
+                # apply the model to each column
+                predicted = []
+                for col in df.columns:
+                    predicted_this_col = self._predict(df[col])
+                    if isinstance(predicted_this_col, pd.DataFrame):
+                        predicted_this_col = predicted_this_col.rename(
+                            columns={
+                                col1: "{}_{}".format(col, col1)
+                                for col1 in predicted_this_col.columns
+                            }
+                        )
+                    predicted.append(predicted_this_col)
+                predicted = pd.concat(predicted, axis=1)
+            else:
+                # predict for each column
+                if not (set(self._models.keys()) >= set(df.columns)):
+                    raise ValueError(
+                        "The model was trained by a pandas DataFrame with "
+                        "columns {}, but the input DataFrame contains columns "
+                        "{} which are unknown to the model.".format(
+                            list(set(self._models.keys())),
+                            list(set(df.columns) - set(self._models.keys())),
+                        )
+                    )
+                predicted = pd.concat(
+                    [
+                        self._models[col]._predict(df[col])
+                        for col in df.columns
+                    ],
+                    axis=1,
+                )
         else:
             raise TypeError("Input must be a pandas Series or DataFrame.")
         # make sure index freq is the same (because pandas has a bug that some
@@ -153,16 +196,24 @@ def fit_predict(self, ts):
 class _ModelHD(_Model):
     def _fit(self, df):
         if isinstance(df, pd.DataFrame):
+            if df.columns.duplicated().any():
+                raise ValueError(
+                    "Input DataFrame must have unique column names."
+                )
             df_copy = df.copy()
             self._fit_core(df_copy)
         else:
             raise TypeError("Input must be a pandas DataFrame.")
-        self._fitted = True
+        self._fitted = 1
 
     def _predict(self, df):
-        if self._need_fit and (not self._fitted):
+        if self._need_fit and (self._fitted == 0):
             raise RuntimeError("The model must be trained first.")
         if isinstance(df, pd.DataFrame):
+            if df.columns.duplicated().any():
+                raise ValueError(
+                    "Input DataFrame must have unique column names."
+                )
             df_copy = df.copy()
             predicted = self._predict_core(df_copy)
         else:

diff --git a/src/adtk/_detector_base.py b/src/adtk/_detector_base.py
@@ -25,9 +25,14 @@ def detect(self, ts, return_list=False):
         Parameters
         ----------
         ts: pandas.Series or pandas.DataFrame
-            Time series to detect anomalies from.
-            If a DataFrame with k columns, k univariate detectors will be
-            applied to them independently.
+            Time series to detect anomalies from. If a DataFrame with k
+            columns, it is treated as k independent univariate time series.
+
+            - If the detector was trained with a Series, the detector will be
+              applied to each univariate series independently;
+            - If the detector was trained with a DataFrame, i.e. the detector
+              is essentially k detectors, those detectors will be applied to
+              each univariate series respectively.
 
         return_list: bool, optional
             Whether to return a list of anomalous time stamps, or a binary
@@ -66,8 +71,9 @@ def fit_detect(self, ts, return_list=False):
         ----------
         ts: pandas.Series or pandas.DataFrame
             Time series to be used for training and be detected for anomalies.
-            If a DataFrame with k columns, k univariate detectors will be
-            trained and applied to them independently.
+            If a DataFrame with k columns, it is treated as k independent
+            univariate time series, and k univariate detectors will be trained
+            and applied to each series independently.
 
         return_list: bool, optional
             Whether to return a list of anomalous time stamps, or a binary
@@ -109,8 +115,9 @@ def score(self, ts, anomaly_true, scoring="recall", **kwargs):
         ----------
         ts: pandas Series or pandas.DataFrame
             Time series to detect anomalies from.
-            If a DataFrame with k columns, k univariate detectors will be
-            applied to them independently.
+            If a DataFrame with k columns, it is treated as k independent
+            univariate time series, and the detector(s) will be applied to
+            each series independently.
 
         anomaly_true: pandas.Series, pandas.DataFrame, list, or dict
             True anomalies.

diff --git a/src/adtk/_transformer_base.py b/src/adtk/_transformer_base.py
@@ -21,9 +21,14 @@ def transform(self, ts):
         Parameters
         ----------
         ts: pandas.Series or pandas.DataFrame
-            Time series to be transformed.
-            If a DataFrame with k columns, k univariate transformers will be
-            applied to them independently.
+            Time series to be transformed. If a DataFrame with k columns, it is
+            treated as k independent univariate time series.
+
+            - If the transformer was trained with a Series, the transformer
+              will be applied to each univariate series independently;
+            - If the transformer was trained with a DataFrame, i.e. the
+              transformer is essentially k transformers, those transformers
+              will be applied to each univariate series respectively.
 
         Returns
         -------
@@ -41,8 +46,9 @@ def fit_transform(self, ts):
         ----------
         ts: pandas.Series or pandas.DataFrame
             Time series to be used for training and be transformed.
-            If a DataFrame with k columns, k univariate transformers will be
-            applied to them independently.
+            If a DataFrame with k columns, it is treated as k independent
+            univariate time series, and k univariate transformers will be
+            trained and applied to each series independently.
 
         Returns
         -------

diff --git a/src/adtk/detector/detector_1d.py b/src/adtk/detector/detector_1d.py
@@ -39,13 +39,6 @@
 class CustomizedDetector1D(_Detector1D):
     """Detector derived from a user-given function and parameters.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     detect_func: function
@@ -133,13 +126,6 @@ class ThresholdAD(_Detector1D):
     This detector compares time series values with user-given thresholds, and
     identifies time points as anomalous when values are beyond the thresholds.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     low: float, optional
@@ -178,13 +164,6 @@ class QuantileAD(_Detector1D):
     of historical data, and identifies time points as anomalous when values
     are beyond the thresholds.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     low: float, optional
@@ -239,13 +218,6 @@ class InterQuartileRangeAD(_Detector1D):
     historical data, and identifies time points as anomalous when differences
     are beyond the inter-quartile range times a user-given factor c.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     c: float, or 2-tuple (float, float), optional
@@ -317,13 +289,6 @@ class GeneralizedESDTestAD(_Detector1D):
     follow an approximately normal distribution. Please only use this detector
     when this assumption holds.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     [1] Rosner, Bernard (May 1983), Percentage Points for a Generalized ESD
     Many-Outlier Procedure,Technometrics, 25(2), pp. 165-172.
 
@@ -412,13 +377,6 @@ class PersistAD(_Detector1D):
     This detector is internally implemented as a `Pipenet` object. Advanced
     users may learn more details by checking attribute `pipe_`.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     window: int, optional
@@ -575,13 +533,6 @@ class LevelShiftAD(_Detector1D):
     This detector is internally implemented as a `Pipenet` object. Advanced
     users may learn more details by checking attribute `pipe_`.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     window: int, optional
@@ -723,13 +674,6 @@ class VolatilityShiftAD(_Detector1D):
     This detector is internally implemented as a `Pipenet` object. Advanced
     users may learn more details by checking attribute `pipe_`.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     window: int, optional
@@ -886,13 +830,6 @@ class AutoregressionAD(_Detector1D):
     This detector is internally implemented as a `Pipenet` object. Advanced
     users may learn more details by checking attribute `pipe_`.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     n_steps: int, optional
@@ -1042,13 +979,6 @@ class SeasonalAD(_Detector1D):
     This detector is internally implemented as a `Pipenet` object. Advanced
     users may learn more details by checking attribute `pipe_`.
 
-    This is an univariate detector. When it is applied to a multivariate time
-    series (i.e. pandas DataFrame), it will be applied to every series
-    independently. All parameters can be defined as a dict object where key-
-    value pairs are series names (i.e. column names of DataFrame) and the
-    model parameter for that series. If not, then the same parameter will be
-    applied to all series.
-
     Parameters
     ----------
     freq: int, optional
@@ -1084,12 +1014,7 @@ class SeasonalAD(_Detector1D):
 
     """
 
-    _default_params = {
-        "freq": None,
-        "side": "both",
-        "c": 3.0,
-        "trend": False,
-    }
+    _default_params = {"freq": None, "side": "both", "c": 3.0, "trend": False}
 
     def __init__(
         self,