From 119e293f4e33852a5b63bfe80b23a6b2b31675b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Fri, 10 Sep 2021 17:27:13 +0200 Subject: [PATCH 001/117] add inverste_transform method --- .../dim_reduction/feature_extraction/_fpca.py | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index f551fc0e0..0f9cf28ab 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -151,6 +151,8 @@ def _fit_basis( else X.basis.n_basis ) n_samples = X.n_samples + # necessary in inverse_transform + self.n_samples_fitted_ = X.n_samples # check that the number of components is smaller than the sample size if self.n_components > X.n_samples: @@ -218,6 +220,9 @@ def _fit_basis( lower=True, ) + # this matrix is needed to compute inverse_transform + self._l_inv_j_t = l_inv_j_t + # the final matrix, C(L-1Jt)t for svd or (L-1Jt)-1CtC(L-1Jt)t for PCA final_matrix = ( X.coefficients @ np.transpose(l_inv_j_t) / np.sqrt(n_samples) @@ -326,6 +331,9 @@ def _fit_grid( # get the number of samples and the number of points of descretization n_samples, n_points_discretization = fd_data.shape + # necessary for inverse_transform + self.n_samples_fitted_ = n_samples + # if centering is True then subtract the mean function to each function # in FDataBasis X = self._center_if_necessary(X) @@ -480,3 +488,57 @@ def fit_transform( """ return self.fit(X, y).transform(X, y) + + def inverse_transform( + self, + pc_score: np.ndarray, + ) -> FData: + """ + Compute the reconstruction of samples given their ``n_components`` first principal components score i.e. a projection coefficient onto the fitted functional principal components. + In other words, it maps a coefficient vector, from the fitted functional principal components space, back to the input functional space. + Typically, ``pc_score`` might be an array returned by ``transform`` or ``fit_transform`` method. + + Args: + pc_score: ndarray of shape (n_samples, n_components). The principal components scores from which to perform the inverse transformation. + + Returns: + A FData object in the functional input space. + + """ + # check if the instance is fitted. 
+ + # input format check: + if isinstance(pc_score, np.ndarray): + if pc_score.ndim == 1: + pc_score = pc_score[np.newaxis, :] + + if pc_score.shape[1] != self.n_components: + raise AttributeError("pc_score must be a numpy array with n_samples rows and n_components columns.") + else: + raise AttributeError("pc_score is not a numpy array.") + + # inverse_transform is slightly different wether .fit is applied to FDataGrid or FDataBasis + if isinstance(self.components_, FDataGrid): + # reconstruct the discretized functions + x_hat = (pc_score @ self.components_.data_matrix[:,:,0]) \ + @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_fitted_)) + x_hat += self.mean_.data_matrix.reshape((1,self.mean_.grid_points[0].shape[0])) + + # format as FDataGrid according to fitted data format + return FDataGrid(data_matrix=x_hat, grid_points=self.mean_.grid_points[0], + argument_names=self.mean_.argument_names) + elif isinstance(self.components_, FDataBasis): + # reconstruct the basis coefficients + x_hat = (pc_score @ self.components_.coefficients) \ + @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_fitted_)) + x_hat += self.mean_.coefficients.reshape((1,self.mean_.coefficients.shape[1])) + # format as FDataBasis according to fitted data format + return FDataBasis(basis=self.mean_.basis, coefficients = x_hat, + argument_names=self.mean_.argument_names) + + + + + + + From dae1585931336851114ba87eebd7e29b38c3f4ef Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 12 Sep 2021 23:37:03 +0200 Subject: [PATCH 002/117] Add Zenodo metadata. --- .zenodo.json | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .zenodo.json diff --git a/.zenodo.json b/.zenodo.json new file mode 100644 index 000000000..da2b22bce --- /dev/null +++ b/.zenodo.json @@ -0,0 +1,47 @@ +{ + 'creators': [ + { + 'affiliation': 'Universidad Autónoma de Madrid', + 'name': 'Carlos Ramos Carreño', + 'orcid': '0000-0003-2566-7058' + }, + { + 'affiliation': 'Universidad Autónoma de Madrid', + 'name': 'Alberto Suárez', + 'orcid': '0000-0003-4534-0909' + }, + { + 'affiliation': 'Universidad Autónoma de Madrid', + 'name': 'José Luis Torrecilla', + 'orcid': '0000-0003-3719-5190' + }, + { + 'name': 'Miguel Carbajo Berrocal' + }, + { + 'name': 'Pablo Marcos Manchón' + }, + { + 'name': 'Pablo Pérez Manso' + }, + { + 'name': 'Amanda Hernando Bernabé' + }, + { + 'name': 'David García Fernández' + }, + { + 'name': 'Yujian Hong' + }, + { + 'name': 'Pedro Martín Rodríguez-Ponga Eyriès' + }, + { + 'name': 'Álvaro Sánchez Romero' + }, + { + 'name': 'Elena Petrunina' + } + ], + 'license': 'BSD 3-Clause License', +} \ No newline at end of file From d77b353c8e18a93542c26fa8ad7a711704bd2729 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 13 Sep 2021 00:01:46 +0200 Subject: [PATCH 003/117] Upgrade Zenodo names. 
--- .zenodo.json | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.zenodo.json b/.zenodo.json index da2b22bce..95f1fd8e4 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -2,45 +2,45 @@ 'creators': [ { 'affiliation': 'Universidad Autónoma de Madrid', - 'name': 'Carlos Ramos Carreño', + 'name': 'Ramos-Carreño, Carlos', 'orcid': '0000-0003-2566-7058' }, { 'affiliation': 'Universidad Autónoma de Madrid', - 'name': 'Alberto Suárez', + 'name': 'Suárez, Alberto', 'orcid': '0000-0003-4534-0909' }, { 'affiliation': 'Universidad Autónoma de Madrid', - 'name': 'José Luis Torrecilla', + 'name': 'Torrecilla, José Luis', 'orcid': '0000-0003-3719-5190' }, { - 'name': 'Miguel Carbajo Berrocal' + 'name': 'Carbajo Berrocal, Miguel' }, { - 'name': 'Pablo Marcos Manchón' + 'name': 'Marcos Manchón, Pablo' }, { - 'name': 'Pablo Pérez Manso' + 'name': 'Pérez Manso, Pablo' }, { - 'name': 'Amanda Hernando Bernabé' + 'name': 'Hernando Bernabé, Amanda' }, { - 'name': 'David García Fernández' + 'name': 'García Fernández, David' }, { - 'name': 'Yujian Hong' + 'name': 'Hong, Yujian' }, { - 'name': 'Pedro Martín Rodríguez-Ponga Eyriès' + 'name': 'Rodríguez-Ponga Eyriès, Pedro Martín' }, { - 'name': 'Álvaro Sánchez Romero' + 'name': 'Sánchez Romero, Álvaro' }, { - 'name': 'Elena Petrunina' + 'name': 'Petrunina, Elena' } ], 'license': 'BSD 3-Clause License', From 3caf6f423dce0600ad8f0ef02a961784999cb34b Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 13 Sep 2021 00:34:14 +0200 Subject: [PATCH 004/117] Fix error in recent multimethod version. --- skfda/misc/_math.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index 252b4f79e..2e1cc2c9e 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -10,7 +10,6 @@ import multimethod import numpy as np - import scipy.integrate from .._utils import _same_domain, nquad_vec @@ -216,6 +215,7 @@ def inner_product( *, _matrix: bool = False, _domain_range: Optional[DomainRange] = None, + **kwargs, ) -> np.ndarray: r"""Return the usual (:math:`L_2`) inner product. From c965f71d41caf62423c4ae418995b549a720fd3a Mon Sep 17 00:00:00 2001 From: VNMabus Date: Thu, 16 Sep 2021 23:38:47 +0200 Subject: [PATCH 005/117] Small fixes. --- skfda/datasets/_real_datasets.py | 21 +++++++---- .../visualization/representation.py | 37 +++++++++---------- skfda/misc/_math.py | 2 +- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 51aaf2c8a..933fa68d9 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -3,12 +3,13 @@ import numpy as np import pandas as pd -import rdata from numpy import ndarray from pandas import DataFrame, Series from sklearn.utils import Bunch from typing_extensions import Literal +import rdata + from .. 
import FDataGrid @@ -130,7 +131,7 @@ def fetch_cran( ) -def _ucr_to_fdatagrid(data: np.ndarray) -> FDataGrid: +def _ucr_to_fdatagrid(name: str, data: np.ndarray) -> FDataGrid: if data.dtype == np.object_: data = np.array(data.tolist()) @@ -142,7 +143,7 @@ def _ucr_to_fdatagrid(data: np.ndarray) -> FDataGrid: grid_points = range(data.shape[1]) - return FDataGrid(data, grid_points=grid_points) + return FDataGrid(data, grid_points=grid_points, dataset_name=name) def fetch_ucr(name: str, **kwargs: Any) -> Bunch: @@ -173,12 +174,18 @@ def fetch_ucr(name: str, **kwargs: Any) -> Bunch: dataset = repositories.ucr.fetch(name, **kwargs) - dataset['data'] = _ucr_to_fdatagrid(dataset['data']) + dataset['data'] = _ucr_to_fdatagrid( + name=dataset['name'], + data=dataset['data'], + ) dataset.pop('feature_names') data_test = dataset.get('data_test', None) if data_test is not None: - dataset['data_test'] = _ucr_to_fdatagrid(data_test) + dataset['data_test'] = _ucr_to_fdatagrid( + name=dataset['name'], + data=data_test, + ) return dataset @@ -436,7 +443,7 @@ def fetch_growth( target_name = "sex" target_categories = ["male", "female"] frame = None - + if as_frame: sex = pd.Categorical.from_codes(sex, categories=target_categories) frame = pd.DataFrame({ @@ -448,7 +455,7 @@ def fetch_growth( if return_X_y: return curves, sex - + return Bunch( data=curves, target=sex, diff --git a/skfda/exploratory/visualization/representation.py b/skfda/exploratory/visualization/representation.py index bde4576e9..edd3e5a62 100644 --- a/skfda/exploratory/visualization/representation.py +++ b/skfda/exploratory/visualization/representation.py @@ -7,7 +7,7 @@ like depth measures. """ -from typing import Any, Mapping, Optional, Sequence, Tuple, TypeVar, Union +from typing import Any, Dict, Optional, Sequence, Tuple, TypeVar, Union import matplotlib.cm import matplotlib.patches @@ -19,7 +19,7 @@ from typing_extensions import Protocol from ... 
import FDataGrid -from ..._utils import _to_domain_range, constants +from ..._utils import _to_domain_range, _to_grid_points, constants from ...representation._functional_data import FData from ...representation._typing import DomainRangeLike, GridPointsLike from ._baseplot import BasePlot @@ -46,7 +46,7 @@ def _get_color_info( group_names: Optional[Indexable[K, str]] = None, group_colors: Optional[Indexable[K, ColorLike]] = None, legend: bool = False, - kwargs: Optional[Mapping[str, Any]] = None, + kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[ Optional[ColorLike], Optional[Sequence[matplotlib.patches.Patch]], @@ -98,11 +98,11 @@ def _get_color_info( # otherwise if 'color' in kwargs: - sample_colors = fdata.n_samples * [kwargs.get("color")] + sample_colors = len(fdata) * [kwargs.get("color")] kwargs.pop('color') elif 'c' in kwargs: - sample_colors = fdata.n_samples * [kwargs.get("c")] + sample_colors = len(fdata) * [kwargs.get("c")] kwargs.pop('c') else: @@ -209,8 +209,7 @@ def __init__( legend: bool = False, **kwargs: Any, ) -> None: - BasePlot.__init__( - self, + super().__init__( chart, fig=fig, axes=axes, @@ -242,7 +241,7 @@ def __init__( for grad_color in self.gradient_criteria ] - self.gradient_list: Sequence[float] = ( + self.gradient_list: Optional[Sequence[float]] = ( [ aux / (self.max_grad - self.min_grad) for aux in aux_list @@ -280,9 +279,9 @@ def __init__( else: colormap = matplotlib.cm.get_cmap(self.colormap) - sample_colors = [None] * self.fdata.n_samples - for m in range(self.fdata.n_samples): - sample_colors[m] = colormap(self.gradient_list[m]) + sample_colors = [ + colormap(g for g in self.gradient_list), + ] self.sample_colors = sample_colors self.patches = patches @@ -310,13 +309,15 @@ def _plot( dtype=Artist, ) - color_dict: Mapping[str, Optional[ColorLike]] = {} + color_dict: Dict[str, Optional[ColorLike]] = {} if self.fdata.dim_domain == 1: if self.n_points is None: self.n_points = constants.N_POINTS_UNIDIMENSIONAL_PLOT_MESH + assert isinstance(self.n_points, int) + # Evaluates the object in a linspace eval_points = np.linspace(*self.domain_range[0], self.n_points) mat = self.fdata(eval_points) @@ -436,8 +437,7 @@ def __init__( legend: bool = False, **kwargs: Any, ) -> None: - BasePlot.__init__( - self, + super().__init__( chart, fig=fig, axes=axes, @@ -445,14 +445,13 @@ def __init__( n_cols=n_cols, ) self.fdata = fdata - self.grid_points = grid_points - self.evaluated_points = None - if self.grid_points is None: + if grid_points is None: # This can only be done for FDataGrid self.grid_points = self.fdata.grid_points self.evaluated_points = self.fdata.data_matrix else: + self.grid_points = _to_grid_points(grid_points) self.evaluated_points = self.fdata( self.grid_points, grid=True, ) @@ -507,7 +506,7 @@ def _plot( dtype=Artist, ) - color_dict: Mapping[str, Optional[ColorLike]] = {} + color_dict: Dict[str, Optional[ColorLike]] = {} if self.fdata.dim_domain == 1: @@ -550,7 +549,7 @@ def _plot( def set_color_dict( sample_colors: Any, ind: int, - color_dict: Mapping[str, Optional[ColorLike]], + color_dict: Dict[str, Optional[ColorLike]], ) -> None: """ Auxiliary method used to update color_dict. diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index 2e1cc2c9e..71637aca5 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -215,7 +215,7 @@ def inner_product( *, _matrix: bool = False, _domain_range: Optional[DomainRange] = None, - **kwargs, + **kwargs: Any, ) -> np.ndarray: r"""Return the usual (:math:`L_2`) inner product. 
From dbbee64e36f7e9a9ccb5d283fb0a87e428004043 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Tue, 21 Sep 2021 19:03:29 +0200 Subject: [PATCH 006/117] Fix tutorial basis example. --- tutorial/plot_basis_representation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorial/plot_basis_representation.py b/tutorial/plot_basis_representation.py index ac85e8adf..e22763b52 100644 --- a/tutorial/plot_basis_representation.py +++ b/tutorial/plot_basis_representation.py @@ -174,7 +174,7 @@ X_basis = X.to_basis(basis) ax = axes.ravel()[n_basis - 1] - fig = X_basis.plot(ax=ax) + fig = X_basis.plot(axes=ax) ax.set_title(f"{n_basis} basis functions") fig.tight_layout() From 1a6fc2c01e39871e09fd2ec6d0b14d378d6b069f Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 26 Sep 2021 20:03:12 +0200 Subject: [PATCH 007/117] Rename outlier detectors. --- docs/modules/exploratory/outliers.rst | 6 +++--- skfda/exploratory/outliers/__init__.py | 4 ++-- skfda/exploratory/outliers/{_iqr.py => _boxplot.py} | 6 +++--- .../outliers/_directional_outlyingness.py | 4 ++-- .../visualization/_magnitude_shape_plot.py | 6 +++--- tests/test_outliers.py | 13 ++++++++----- 6 files changed, 21 insertions(+), 18 deletions(-) rename skfda/exploratory/outliers/{_iqr.py => _boxplot.py} (94%) diff --git a/docs/modules/exploratory/outliers.rst b/docs/modules/exploratory/outliers.rst index 0adba3291..fb79a7be6 100644 --- a/docs/modules/exploratory/outliers.rst +++ b/docs/modules/exploratory/outliers.rst @@ -10,8 +10,8 @@ identify the outliers. Each of the outlier detection methods in scikit-fda has the same API as the outlier detection methods of `scikit-learn `_. -Interquartile Range Outlier Detector ------------------------------------- +Boxplot Outlier Detector +------------------------ One of the most common ways of outlier detection is given by the functional data boxplot. An observation is marked as an outlier if it has points :math:`1.5 \cdot IQR` times outside the region containing the deepest 50% of the curves @@ -20,7 +20,7 @@ as an outlier if it has points :math:`1.5 \cdot IQR` times outside the region co .. autosummary:: :toctree: autosummary - skfda.exploratory.outliers.IQROutlierDetector + skfda.exploratory.outliers.BoxplotOutlierDetector DirectionalOutlierDetector diff --git a/skfda/exploratory/outliers/__init__.py b/skfda/exploratory/outliers/__init__.py index 760c34b32..d33e35798 100644 --- a/skfda/exploratory/outliers/__init__.py +++ b/skfda/exploratory/outliers/__init__.py @@ -1,7 +1,7 @@ +from ._boxplot import BoxplotOutlierDetector from ._directional_outlyingness import ( - DirectionalOutlierDetector, + MSPlotOutlierDetector, directional_outlyingness_stats, ) -from ._iqr import IQROutlierDetector from ._outliergram import OutliergramOutlierDetector from .neighbors_outlier import LocalOutlierFactor diff --git a/skfda/exploratory/outliers/_iqr.py b/skfda/exploratory/outliers/_boxplot.py similarity index 94% rename from skfda/exploratory/outliers/_iqr.py rename to skfda/exploratory/outliers/_boxplot.py index 5f13518a1..e7262a55a 100644 --- a/skfda/exploratory/outliers/_iqr.py +++ b/skfda/exploratory/outliers/_boxplot.py @@ -10,7 +10,7 @@ from . import _envelopes -class IQROutlierDetector( +class BoxplotOutlierDetector( BaseEstimator, # type: ignore OutlierMixin, # type: ignore ): @@ -35,7 +35,7 @@ class IQROutlierDetector( ... 
[-0.5, -0.5, -0.5, -1, -1, -1]] >>> grid_points = [0, 2, 4, 6, 8, 10] >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> out_detector = IQROutlierDetector() + >>> out_detector = BoxplotOutlierDetector() >>> out_detector.fit_predict(fd) array([-1, 1, 1, -1]) @@ -50,7 +50,7 @@ def __init__( self.depth_method = depth_method self.factor = factor - def fit(self, X: FDataGrid, y: None = None) -> IQROutlierDetector: + def fit(self, X: FDataGrid, y: None = None) -> BoxplotOutlierDetector: depth_method = ( self.depth_method diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index 3276f9d4b..86313af68 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -246,7 +246,7 @@ def directional_outlyingness_stats( ) -class DirectionalOutlierDetector( +class MSPlotOutlierDetector( BaseEstimator, # type: ignore OutlierMixin, # type: ignore ): @@ -325,7 +325,7 @@ class DirectionalOutlierDetector( ... [-0.5, -0.5, -0.5, -1, -1, -1]] >>> grid_points = [0, 2, 4, 6, 8, 10] >>> fd = skfda.FDataGrid(data_matrix, grid_points) - >>> out_detector = DirectionalOutlierDetector() + >>> out_detector = MSPlotOutlierDetector() >>> out_detector.fit_predict(fd) array([1, 1, 1, 1]) diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index 54ce674e8..e9e45f21b 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -20,7 +20,7 @@ from ... import FDataGrid from ...representation._typing import NDArrayFloat, NDArrayInt from ..depth import Depth -from ..outliers import DirectionalOutlierDetector +from ..outliers import MSPlotOutlierDetector from ._baseplot import BasePlot @@ -38,7 +38,7 @@ class MagnitudeShapePlot(BasePlot): directional outlyingness (:math:`VO`) in the y-axis. The outliers are detected using an instance of - :class:`DirectionalOutlierDetector`. + :class:`MSPlotOutlierDetector`. For more information see :footcite:ts:`dai+genton_2018_visualization`. 
@@ -220,7 +220,7 @@ def __init__( raise NotImplementedError( "Only support 1 dimension on the codomain.") - self.outlier_detector = DirectionalOutlierDetector(**kwargs) + self.outlier_detector = MSPlotOutlierDetector(**kwargs) y = self.outlier_detector.fit_predict(fdatagrid) diff --git a/tests/test_outliers.py b/tests/test_outliers.py index 575b96d9f..ba351822b 100644 --- a/tests/test_outliers.py +++ b/tests/test_outliers.py @@ -1,11 +1,14 @@ -from skfda import FDataGrid -from skfda.exploratory.depth.multivariate import SimplicialDepth -from skfda.exploratory.outliers import DirectionalOutlierDetector -from skfda.exploratory.outliers import directional_outlyingness_stats import unittest import numpy as np +from skfda import FDataGrid +from skfda.exploratory.depth.multivariate import SimplicialDepth +from skfda.exploratory.outliers import ( + MSPlotOutlierDetector, + directional_outlyingness_stats, +) + class TestsDirectionalOutlyingness(unittest.TestCase): @@ -48,7 +51,7 @@ def test_asymptotic_formula(self): [-0.5, -0.5, -0.5, -1, -1, -1]] grid_points = [0, 2, 4, 6, 8, 10] fd = FDataGrid(data_matrix, grid_points) - out_detector = DirectionalOutlierDetector( + out_detector = MSPlotOutlierDetector( _force_asymptotic=True) prediction = out_detector.fit_predict(fd) np.testing.assert_allclose(prediction, From 2d28da0b5f438d0342361d5596f0df00bc49f6fd Mon Sep 17 00:00:00 2001 From: VNMabus Date: Thu, 30 Sep 2021 21:04:53 +0200 Subject: [PATCH 008/117] Rename Fisher-Rao metrics. The warping distance is now private. --- docs/modules/misc/metrics.rst | 5 +- skfda/misc/metrics/__init__.py | 8 +-- ...stic_metrics.py => _fisher_rao_metrics.py} | 63 ++++++++++--------- skfda/misc/metrics/_utils.py | 9 +-- skfda/representation/_typing.py | 3 +- tests/test_elastic.py | 20 +++--- 6 files changed, 57 insertions(+), 51 deletions(-) rename skfda/misc/metrics/{_elastic_metrics.py => _fisher_rao_metrics.py} (86%) diff --git a/docs/modules/misc/metrics.rst b/docs/modules/misc/metrics.rst index 9cb424244..d6a407b82 100644 --- a/docs/modules/misc/metrics.rst +++ b/docs/modules/misc/metrics.rst @@ -45,9 +45,8 @@ analysis and registration of functional data. 
:toctree: autosummary skfda.misc.metrics.fisher_rao_distance - skfda.misc.metrics.amplitude_distance - skfda.misc.metrics.phase_distance - skfda.misc.metrics.warping_distance + skfda.misc.metrics.fisher_rao_amplitude_distance + skfda.misc.metrics.fisher_rao_phase_distance Metric induced by a norm diff --git a/skfda/misc/metrics/__init__.py b/skfda/misc/metrics/__init__.py index d8dc69ebd..ad673d6af 100644 --- a/skfda/misc/metrics/__init__.py +++ b/skfda/misc/metrics/__init__.py @@ -1,10 +1,10 @@ """Metrics, norms and related utilities.""" -from ._elastic_metrics import ( - amplitude_distance, +from ._fisher_rao_metrics import ( + _fisher_rao_warping_distance, + fisher_rao_amplitude_distance, fisher_rao_distance, - phase_distance, - warping_distance, + fisher_rao_phase_distance, ) from ._lp_distances import ( LpDistance, diff --git a/skfda/misc/metrics/_elastic_metrics.py b/skfda/misc/metrics/_fisher_rao_metrics.py similarity index 86% rename from skfda/misc/metrics/_elastic_metrics.py rename to skfda/misc/metrics/_fisher_rao_metrics.py index 8025d4d4e..b2839cd36 100644 --- a/skfda/misc/metrics/_elastic_metrics.py +++ b/skfda/misc/metrics/_fisher_rao_metrics.py @@ -1,6 +1,6 @@ """Elastic metrics.""" -from typing import Any, TypeVar +from typing import Any, Optional, TypeVar import numpy as np import scipy.integrate @@ -12,6 +12,7 @@ from ...preprocessing.registration._warping import _normalize_scale from ...preprocessing.registration.elastic import SRSF from ...representation import FData +from ...representation._typing import NDArrayFloat from ._lp_distances import l2_distance from ._utils import _cast_to_grid @@ -22,14 +23,15 @@ def fisher_rao_distance( fdata1: T, fdata2: T, *, - eval_points: np.ndarray = None, + eval_points: Optional[NDArrayFloat] = None, _check: bool = True, -) -> np.ndarray: - r"""Compute the Fisher-Rao distance between two functional objects. +) -> NDArrayFloat: + r""" + Compute the Fisher-Rao distance between two functional objects. Let :math:`f_i` and :math:`f_j` be two functional observations, and let :math:`q_i` and :math:`q_j` be the corresponding SRSF - (see :class:`SRSF`), the fisher rao distance is defined as + (see :class:`SRSF`), the Fisher-Rao distance is defined as .. math:: d_{FR}(f_i, f_j) = \| q_i - q_j \|_2 = @@ -37,7 +39,7 @@ def fisher_rao_distance( sgn(\dot{f_j}(t))\sqrt{|\dot{f_j}(t)|} dt \right )^{\frac{1}{2}} If the observations are distributions of random variables the distance will - match with the usual fisher-rao distance in non-parametric form for + match with the usual Fisher-Rao distance in non-parametric form for probability distributions :footcite:`srivastava++_2011_ficher-rao`. If the observations are defined in a :term:`domain` different than (0,1) @@ -87,20 +89,21 @@ def fisher_rao_distance( return l2_distance(fdata1_srsf, fdata2_srsf) -def amplitude_distance( +def fisher_rao_amplitude_distance( fdata1: T, fdata2: T, *, - lam: float = 0.0, - eval_points: np.ndarray = None, + lam: float = 0, + eval_points: Optional[NDArrayFloat] = None, _check: bool = True, **kwargs: Any, -) -> np.ndarray: - r"""Compute the amplitude distance between two functional objects. +) -> NDArrayFloat: + r""" + Compute the Fisher-Rao amplitude distance between two functional objects. Let :math:`f_i` and :math:`f_j` be two functional observations, and let :math:`q_i` and :math:`q_j` be the corresponding SRSF - (see :class:`SRSF`), the amplitude distance is defined as + (see :class:`SRSF`), the Fisher-Rao amplitude distance is defined as .. 
math:: d_{A}(f_i, f_j)=min_{\gamma \in \Gamma}d_{FR}(f_i \circ \gamma,f_j) @@ -178,7 +181,7 @@ def amplitude_distance( fdata2_srsf = srsf.transform(fdata2) distance = l2_distance(fdata1_reg_srsf, fdata2_srsf) - if lam != 0.0: + if lam != 0: # L2 norm || sqrt(Dh) - 1 ||^2 warping_deriv = elastic_registration.warping_.derivative() penalty = warping_deriv(eval_points_normalized)[0, ..., 0] @@ -192,21 +195,22 @@ def amplitude_distance( return distance -def phase_distance( +def fisher_rao_phase_distance( fdata1: T, fdata2: T, *, - lam: float = 0.0, - eval_points: np.ndarray = None, + lam: float = 0, + eval_points: Optional[NDArrayFloat] = None, _check: bool = True, -) -> np.ndarray: - r"""Compute the phase distance between two functional objects. +) -> NDArrayFloat: + r""" + Compute the Fisher-Rao phase distance between two functional objects. Let :math:`f_i` and :math:`f_j` be two functional observations, and let :math:`\gamma_{ij}` the corresponding warping used in the elastic registration to align :math:`f_i` to :math:`f_j` (see - :func:`elastic_registration`). The phase distance between :math:`f_i` - and :math:`f_j` is defined as + :func:`elastic_registration`). The Fisher-Rao phase distance between + :math:`f_i` and :math:`f_j` is defined as .. math:: d_{P}(f_i, f_j) = d_{FR}(\gamma_{ij}, \gamma_{id}) = @@ -274,20 +278,21 @@ def phase_distance( return np.arccos(d) -def warping_distance( +def _fisher_rao_warping_distance( warping1: T, warping2: T, *, - eval_points: np.ndarray = None, + eval_points: Optional[NDArrayFloat] = None, _check: bool = True, -) -> np.ndarray: - r"""Compute the distance between warpings functions. +) -> NDArrayFloat: + r""" + Compute the Fisher-Rao distance between warpings functions. Let :math:`\gamma_i` and :math:`\gamma_j` be two warpings, defined in - :math:`\gamma_i:[a,b] \rightarrow [a,b]`. The distance in the - space of warping functions, :math:`\Gamma`, with the riemannian metric - given by the fisher-rao inner product can be computed using the structure - of hilbert sphere in their srsf's. + :math:`\gamma_i:[0,1] \rightarrow [0,1]`. The distance in the + space of warping functions, :math:`\Gamma`, with the Riemannian metric + given by the Fisher-Rao inner product can be computed using the structure + of Hilbert sphere in their SRSF's. .. math:: d_{\Gamma}(\gamma_i, \gamma_j) = cos^{-1} \left ( \int_0^1 @@ -296,7 +301,7 @@ def warping_distance( See :footcite:`srivastava+klassen_2016_analysis_probability` for a detailed explanation. - If the warpings are not defined in [0,1], an affine transformation is maked + If the warpings are not defined in [0,1], an affine transformation is made to change the :term:`domain`. Args: diff --git a/skfda/misc/metrics/_utils.py b/skfda/misc/metrics/_utils.py index cced983ac..2265edb5f 100644 --- a/skfda/misc/metrics/_utils.py +++ b/skfda/misc/metrics/_utils.py @@ -6,6 +6,7 @@ from ..._utils import _pairwise_symmetric from ...representation import FData, FDataGrid +from ...representation._typing import NDArrayFloat from ._typing import Metric, MetricElementType, Norm, VectorType T = TypeVar("T", bound=FData) @@ -27,7 +28,7 @@ def _check_compatible(fdata1: T, fdata2: T) -> None: def _cast_to_grid( fdata1: FData, fdata2: FData, - eval_points: np.ndarray = None, + eval_points: Optional[NDArrayFloat] = None, _check: bool = True, ) -> Tuple[FDataGrid, FDataGrid]: """Convert fdata1 and fdata2 to FDatagrid. 
@@ -124,7 +125,7 @@ class NormInducedMetric(Metric[VectorType]): def __init__(self, norm: Norm[VectorType]): self.norm = norm - def __call__(self, elem1: VectorType, elem2: VectorType) -> np.ndarray: + def __call__(self, elem1: VectorType, elem2: VectorType) -> NDArrayFloat: """Compute the induced norm between two vectors.""" return self.norm(elem1 - elem2) @@ -137,7 +138,7 @@ def pairwise_metric_optimization( metric: Any, elem1: Any, elem2: Optional[Any], -) -> np.ndarray: +) -> NDArrayFloat: r""" Optimized computation of a pairwise metric. @@ -173,7 +174,7 @@ def __call__( self, elem1: MetricElementType, elem2: Optional[MetricElementType] = None, - ) -> np.ndarray: + ) -> NDArrayFloat: """Evaluate the pairwise metric.""" optimized = pairwise_metric_optimization(self.metric, elem1, elem2) diff --git a/skfda/representation/_typing.py b/skfda/representation/_typing.py index 881ef6530..d74ead4b5 100644 --- a/skfda/representation/_typing.py +++ b/skfda/representation/_typing.py @@ -1,9 +1,10 @@ """Common types.""" from typing import Any, Optional, Sequence, Tuple, TypeVar, Union -import numpy as np from typing_extensions import Protocol +import numpy as np + try: from numpy.typing import ArrayLike except ImportError: diff --git a/tests/test_elastic.py b/tests/test_elastic.py index 47ea176c5..740b77ad7 100644 --- a/tests/test_elastic.py +++ b/tests/test_elastic.py @@ -8,11 +8,11 @@ from skfda.datasets import make_multimodal_samples, make_random_warping from skfda.misc.metrics import ( PairwiseMetric, - amplitude_distance, + _fisher_rao_warping_distance, + fisher_rao_amplitude_distance, fisher_rao_distance, + fisher_rao_phase_distance, l2_distance, - phase_distance, - warping_distance, ) from skfda.preprocessing.registration import ( ElasticRegistration, @@ -272,34 +272,34 @@ def test_fisher_rao_invariance(self) -> None: atol=0.01, ) - def test_amplitude_distance_limit(self) -> None: + def test_fisher_rao_amplitude_distance_limit(self) -> None: """Test limit of amplitude distance penalty.""" f = make_multimodal_samples(n_samples=1, random_state=1) g = make_multimodal_samples(n_samples=1, random_state=9999) - amplitude_limit = amplitude_distance(f, g, lam=1000) + amplitude_limit = fisher_rao_amplitude_distance(f, g, lam=1000) fr_distance = fisher_rao_distance(f, g) np.testing.assert_almost_equal(amplitude_limit, fr_distance) - def test_phase_distance_id(self) -> None: + def test_fisher_rao_phase_distance_id(self) -> None: """Test of phase distance invariance.""" f = make_multimodal_samples(n_samples=1, random_state=1) - phase = phase_distance(f, 2 * f) + phase = fisher_rao_phase_distance(f, 2 * f) np.testing.assert_allclose(phase, 0, atol=1e-7) - def test_warping_distance(self) -> None: + def test_fisher_rao_warping_distance(self) -> None: """Test of warping distance.""" t = np.linspace(0, 1, 1000) w1 = FDataGrid([t**5], t) w2 = FDataGrid([t**3], t) - d = warping_distance(w1, w2) + d = _fisher_rao_warping_distance(w1, w2) np.testing.assert_allclose(d, np.arccos(np.sqrt(15) / 4), atol=1e-3) - d = warping_distance(w2, w2) + d = _fisher_rao_warping_distance(w2, w2) np.testing.assert_allclose(d, 0, atol=2e-2) From 86f0e13231928facc39a8439a6566a09fdfb53b6 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sat, 2 Oct 2021 19:27:43 +0200 Subject: [PATCH 009/117] Rename Fisher Rao methods in order to be more explicit. 
--- docs/modules/exploratory/outliers.rst | 7 +- docs/modules/exploratory/stats.rst | 1 + docs/modules/misc/operators.rst | 3 +- docs/modules/preprocessing/registration.rst | 14 +- examples/plot_elastic_registration.py | 15 +- examples/plot_pairwise_alignment.py | 16 +- skfda/_utils/__init__.py | 1 + .../registration => _utils}/_warping.py | 32 +- skfda/datasets/_samples_generators.py | 11 +- skfda/exploratory/stats/__init__.py | 1 + skfda/exploratory/stats/_fisher_rao.py | 352 ++++++++ skfda/misc/metrics/__init__.py | 2 +- ...{_fisher_rao_metrics.py => _fisher_rao.py} | 21 +- skfda/misc/operators/__init__.py | 1 + skfda/misc/operators/_srvf.py | 231 +++++ skfda/preprocessing/registration/__init__.py | 6 +- .../preprocessing/registration/_fisher_rao.py | 308 +++++++ skfda/preprocessing/registration/elastic.py | 831 ------------------ tests/test_elastic.py | 39 +- tests/test_registration.py | 4 +- 20 files changed, 979 insertions(+), 917 deletions(-) rename skfda/{preprocessing/registration => _utils}/_warping.py (84%) create mode 100644 skfda/exploratory/stats/_fisher_rao.py rename skfda/misc/metrics/{_fisher_rao_metrics.py => _fisher_rao.py} (94%) create mode 100644 skfda/misc/operators/_srvf.py create mode 100644 skfda/preprocessing/registration/_fisher_rao.py delete mode 100644 skfda/preprocessing/registration/elastic.py diff --git a/docs/modules/exploratory/outliers.rst b/docs/modules/exploratory/outliers.rst index fb79a7be6..34b7cc4b3 100644 --- a/docs/modules/exploratory/outliers.rst +++ b/docs/modules/exploratory/outliers.rst @@ -23,16 +23,17 @@ as an outlier if it has points :math:`1.5 \cdot IQR` times outside the region co skfda.exploratory.outliers.BoxplotOutlierDetector -DirectionalOutlierDetector +MSPlotOutlierDetector -------------------------- -Other more novel way of outlier detection takes into account the magnitude and shape of the curves. Curves which have +Other more novel way of outlier detection is the one presented in the Magnitude-Shape plot, or +MS-plot. It takes into account the magnitude and shape of the curves. Curves which have a very different shape or magnitude are considered outliers. .. autosummary:: :toctree: autosummary - skfda.exploratory.outliers.DirectionalOutlierDetector + skfda.exploratory.outliers.MSPlotOutlierDetector For this method, it is necessary to compute the mean and variation of the directional outlyingness, which can be done with the following function. diff --git a/docs/modules/exploratory/stats.rst b/docs/modules/exploratory/stats.rst index 13eba1c95..6d9b7e8e7 100644 --- a/docs/modules/exploratory/stats.rst +++ b/docs/modules/exploratory/stats.rst @@ -18,6 +18,7 @@ measure of the location or central tendency of :term:`functional data`. 
skfda.exploratory.stats.trim_mean skfda.exploratory.stats.depth_based_median skfda.exploratory.stats.geometric_median + skfda.exploratory.stats.fisher_rao_karcher_mean Dispersion ---------- diff --git a/docs/modules/misc/operators.rst b/docs/modules/misc/operators.rst index d2a877a2e..201ccb1d8 100644 --- a/docs/modules/misc/operators.rst +++ b/docs/modules/misc/operators.rst @@ -11,4 +11,5 @@ The operators that are linear can also be used in the context of :toctree: autosummary skfda.misc.operators.Identity - skfda.misc.operators.LinearDifferentialOperator \ No newline at end of file + skfda.misc.operators.LinearDifferentialOperator + skfda.misc.operators.SRSF \ No newline at end of file diff --git a/docs/modules/preprocessing/registration.rst b/docs/modules/preprocessing/registration.rst index 67ca0cb8b..ecb9797d0 100644 --- a/docs/modules/preprocessing/registration.rst +++ b/docs/modules/preprocessing/registration.rst @@ -64,19 +64,7 @@ introduction to this topic along the usage of the corresponding functions. .. autosummary:: :toctree: autosummary - skfda.preprocessing.registration.ElasticRegistration - - -The module contains some routines related with the elastic registration, making -a transformation of the sampling, computing different means or distances based -on the elastic framework. - -.. autosummary:: - :toctree: autosummary - - skfda.preprocessing.registration.elastic.elastic_mean - skfda.preprocessing.registration.elastic.warping_mean - skfda.preprocessing.registration.elastic.SRSF + skfda.preprocessing.registration.ElasticFisherRaoRegistration Validation diff --git a/examples/plot_elastic_registration.py b/examples/plot_elastic_registration.py index e577eae37..e00465fe6 100644 --- a/examples/plot_elastic_registration.py +++ b/examples/plot_elastic_registration.py @@ -14,13 +14,14 @@ import skfda from skfda.datasets import fetch_growth, make_multimodal_samples -from skfda.preprocessing.registration import ElasticRegistration -from skfda.preprocessing.registration.elastic import elastic_mean +from skfda.exploratory.stats import fisher_rao_karcher_mean +from skfda.preprocessing.registration import ElasticFisherRaoRegistration ############################################################################## # In the example of pairwise alignment was shown the usage of -# :class:`~skfda.preprocessing.registration.ElasticRegistration` to align -# a set of functional observations to a given template or a set of templates. +# :class:`~skfda.preprocessing.registration.ElasticFisherRaoRegistration` to +# align a set of functional observations to a given template or a set of +# templates. # # In the groupwise alignment all the samples are aligned to the same template, # constructed to minimise some distance, generally a mean or a median. In the @@ -36,7 +37,7 @@ ############################################################################### # The following figure shows the -# :func:`~skfda.preprocessing.registration.elastic.elastic_mean` of the +# :func:`~skfda.exploratory.stats.fisher_rao_karcher_mean` of the # dataset and the cross-sectional mean, which correspond to the karcher-mean # under the :math:`\mathbb{L}^2` distance. 
# @@ -46,14 +47,14 @@ fig = fd.mean().plot(label="L2 mean") -elastic_mean(fd).plot(fig=fig, label="Elastic mean") +fisher_rao_karcher_mean(fd).plot(fig=fig, label="Elastic mean") fig.legend() ############################################################################## # In this case, the alignment completely reduces the amplitude variability # between the samples, aligning the maximum points correctly. -elastic_registration = ElasticRegistration() +elastic_registration = ElasticFisherRaoRegistration() fd_align = elastic_registration.fit_transform(fd) diff --git a/examples/plot_pairwise_alignment.py b/examples/plot_pairwise_alignment.py index 63f919b98..fdf74c0d1 100644 --- a/examples/plot_pairwise_alignment.py +++ b/examples/plot_pairwise_alignment.py @@ -14,10 +14,13 @@ import matplotlib.colors as clr import matplotlib.pyplot as plt import numpy as np -import skfda -from skfda.preprocessing.registration import ElasticRegistration, invert_warping +import skfda from skfda.datasets import make_multimodal_samples +from skfda.preprocessing.registration import ( + ElasticFisherRaoRegistration, + invert_warping, +) ############################################################################## # Given any two functions :math:`f` and :math:`g`, we define their @@ -53,12 +56,12 @@ # In this example :math:`g` will be used as template and :math:`f` will be # aligned to it. In the following figure it is shown the result of the # registration process, wich can be computed using -# :class:`~skfda.preprocessing.registration.ElasticRegistration`. +# :class:`~skfda.preprocessing.registration.ElasticFisherRaoRegistration`. # f, g = fd[0], fd[1] -elastic_registration = ElasticRegistration(template=g) +elastic_registration = ElasticFisherRaoRegistration(template=g) # Aligns f to g @@ -155,8 +158,7 @@ elastic_registration.warping_.plot(fig, color=c) # Plots identity -fig.axes[0].plot(t, t, color='C0', linestyle="--") - +fig.axes[0].plot(t, t, color='C0', linestyle="--") ############################################################################## @@ -196,7 +198,7 @@ # # Registration of the sets -elastic_registration = ElasticRegistration(template=g) +elastic_registration = ElasticFisherRaoRegistration(template=g) fd_registered = elastic_registration.fit_transform(fd) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 61c9714ae..979390ca3 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -20,3 +20,4 @@ check_is_univariate, nquad_vec, ) +from ._warping import invert_warping, normalize_scale, normalize_warping diff --git a/skfda/preprocessing/registration/_warping.py b/skfda/_utils/_warping.py similarity index 84% rename from skfda/preprocessing/registration/_warping.py rename to skfda/_utils/_warping.py index 00f8b3d75..5247c3941 100644 --- a/skfda/preprocessing/registration/_warping.py +++ b/skfda/_utils/_warping.py @@ -2,16 +2,18 @@ This module contains routines related to the registration procedure. 
""" +from __future__ import annotations -from typing import Optional +from typing import TYPE_CHECKING, Optional import numpy as np - from scipy.interpolate import PchipInterpolator -from ..._utils import _to_domain_range, check_is_univariate -from ...representation import FDataGrid -from ...representation._typing import ArrayLike, DomainRangeLike +from ..representation._typing import ArrayLike, DomainRangeLike, NDArrayFloat +from ._utils import _to_domain_range, check_is_univariate + +if TYPE_CHECKING: + from ..representation import FDataGrid def invert_warping( @@ -19,7 +21,8 @@ def invert_warping( *, output_points: Optional[ArrayLike] = None, ) -> FDataGrid: - r"""Compute the inverse of a diffeomorphism. + r""" + Compute the inverse of a diffeomorphism. Let :math:`\gamma : [a,b] \rightarrow [a,b]` be a function strictly increasing, calculates the corresponding inverse @@ -44,7 +47,6 @@ def invert_warping( Examples: >>> import numpy as np >>> from skfda import FDataGrid - >>> from skfda.preprocessing.registration import invert_warping We will construct the warping :math:`\gamma : [0,1] \rightarrow [0,1]` wich maps t to t^3. @@ -90,8 +92,13 @@ def invert_warping( return warping.copy(data_matrix=data_matrix, grid_points=output_points) -def _normalize_scale(t: np.ndarray, a: float = 0, b: float = 1) -> np.ndarray: - """Perfoms an afine translation to normalize an interval. +def normalize_scale( + t: NDArrayFloat, + a: float = 0, + b: float = 1, +) -> NDArrayFloat: + """ + Perfoms an afine translation to normalize an interval. Args: t: Array of dim 1 or 2 with at least 2 values. @@ -116,7 +123,8 @@ def normalize_warping( warping: FDataGrid, domain_range: Optional[DomainRangeLike] = None, ) -> FDataGrid: - r"""Rescale a warping to normalize their :term:`domain`. + r""" + Rescale a warping to normalize their :term:`domain`. Given a set of warpings :math:`\gamma_i:[a,b]\rightarrow [a,b]` it is used an affine traslation to change the domain of the transformation to @@ -138,11 +146,11 @@ def normalize_warping( else _to_domain_range(domain_range)[0] ) - data_matrix = _normalize_scale( + data_matrix = normalize_scale( warping.data_matrix[..., 0], *domain_range_tuple, ) - grid_points = _normalize_scale(warping.grid_points[0], *domain_range_tuple) + grid_points = normalize_scale(warping.grid_points[0], *domain_range_tuple) return warping.copy( data_matrix=data_matrix, diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 1fd21cc6b..9d00bcdc6 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -2,15 +2,18 @@ from typing import Callable, Optional, Sequence, Union import numpy as np -import sklearn.utils - import scipy.integrate +import sklearn.utils from scipy.stats import multivariate_normal from .. 
import FDataGrid -from .._utils import RandomStateLike, _cartesian_product, _to_grid_points +from .._utils import ( + RandomStateLike, + _cartesian_product, + _to_grid_points, + normalize_warping, +) from ..misc import covariances -from ..preprocessing.registration import normalize_warping from ..representation._typing import DomainRangeLike, GridPointsLike from ..representation.interpolation import SplineInterpolation diff --git a/skfda/exploratory/stats/__init__.py b/skfda/exploratory/stats/__init__.py index 175abf8b3..c7d1e3d66 100644 --- a/skfda/exploratory/stats/__init__.py +++ b/skfda/exploratory/stats/__init__.py @@ -1,3 +1,4 @@ +from ._fisher_rao import _fisher_rao_warping_mean, fisher_rao_karcher_mean from ._stats import ( cov, depth_based_median, diff --git a/skfda/exploratory/stats/_fisher_rao.py b/skfda/exploratory/stats/_fisher_rao.py new file mode 100644 index 000000000..0906ba466 --- /dev/null +++ b/skfda/exploratory/stats/_fisher_rao.py @@ -0,0 +1,352 @@ +from __future__ import annotations + +from typing import Any, Optional + +import numpy as np +import scipy.integrate +from fdasrsf.utility_functions import optimum_reparam + +from ..._utils import check_is_univariate, invert_warping, normalize_scale +from ...misc.operators import SRSF +from ...representation import FDataGrid +from ...representation._typing import NDArrayFloat +from ...representation.interpolation import SplineInterpolation + +############################################################################### +# Based on the original implementation of J. Derek Tucker in # +# *fdasrsf_python* (https://github.com/jdtuck/fdasrsf_python) # +# and *ElasticFDA.jl* (https://github.com/jdtuck/ElasticFDA.jl). # +############################################################################### + + +def _elastic_alignment_array( + template_data: NDArrayFloat, + q_data: NDArrayFloat, + eval_points: NDArrayFloat, + penalty: float, + grid_dim: int, +) -> NDArrayFloat: + """ + Wrap the :func:`optimum_reparam` function of fdasrsf. + + Selects the corresponding routine depending on the dimensions of the + arrays. + + Args: + template_data: Array with the srsf of the template. + q_data: Array with the srsf of the curves + to be aligned. + eval_points: Discretisation points of the functions. + penalty: Penalisation term. + grid_dim: Dimension of the grid used in the alignment algorithm. + + Returns: + Array with the same shape than q_data with the srsf of + the functions aligned to the template(s). + + """ + return optimum_reparam( + np.ascontiguousarray(template_data.T), + np.ascontiguousarray(eval_points), + np.ascontiguousarray(q_data.T), + method="DP2", + lam=penalty, + grid_dim=grid_dim, + ).T + + +def _fisher_rao_warping_mean( + warping: FDataGrid, + *, + max_iter: int = 100, + tol: float = 1e-6, + step_size: float = 0.3, +) -> FDataGrid: + r""" + Compute the karcher mean of a set of warpings. + + Let :math:`\gamma_i i=1...n` be a set of warping functions + :math:`\gamma_i:[a,b] \rightarrow [a,b]` in :math:`\Gamma`, i.e., + monotone increasing and with the restriction :math:`\gamma_i(a)=a \, + \gamma_i(b)=b`. + + The karcher mean :math:`\bar \gamma` is defined as the warping that + minimises locally the sum of Fisher-Rao squared distances + :footcite:`srivastava+klassen_2016_analysis_orbit`. + + .. 
math:: + \bar \gamma = argmin_{\gamma \in \Gamma} \sum_{i=1}^{n} + d_{FR}^2(\gamma, \gamma_i) + + The computation is performed using the structure of Hilbert Sphere obtained + after a transformation of the warpings, see + :footcite:`srivastava++_2011_ficher-rao_orbit`. + + Args: + warping: Set of warpings. + max_iter: Maximum number of interations. Defaults to 100. + tol: Convergence criterion, if the norm of the mean of the + shooting vectors, :math:`| \bar v | 1e-10: + vmean += theta / np.sin(theta) * (psi_i - np.cos(theta) * mu) + + # Mean of shooting vectors + vmean /= warping.n_samples + v_norm = np.sqrt(scipy.integrate.simps(np.square(vmean))) + + # Convergence criterion + if v_norm < tol: + break + + # Calculate exponential map of mu + a = np.cos(step_size * v_norm) + b = np.sin(step_size * v_norm) / v_norm + mu = a * mu + b * vmean + + # Recover mean in original gamma space + warping_mean_ret = scipy.integrate.cumtrapz( + np.square(mu, out=mu)[0], + x=eval_points, + initial=0, + ) + + # Affine traslation to original scale + warping_mean_ret = normalize_scale( + warping_mean_ret, + a=original_eval_points[0], + b=original_eval_points[-1], + ) + + monotone_interpolation = SplineInterpolation( + interpolation_order=3, + monotone=True, + ) + + return FDataGrid( + [warping_mean_ret], + grid_points=original_eval_points, + interpolation=monotone_interpolation, + ) + + +def fisher_rao_karcher_mean( + fdatagrid: FDataGrid, + *, + penalty: float = 0, + center: bool = True, + max_iter: int = 20, + tol: float = 1e-3, + initial: Optional[float] = None, + grid_dim: int = 7, + **kwargs: Any, +) -> FDataGrid: + r""" + Compute the Karcher mean under the elastic metric. + + Calculates the Karcher mean of a set of functional samples in the amplitude + space :math:`\mathcal{A}=\mathcal{F}/\Gamma`. + + Let :math:`q_i` the corresponding SRSF of the observation :math:`f_i`. + The space :math:`\mathcal{A}` is defined using the equivalence classes + :math:`[q_i]=\{ q_i \circ \gamma \| \gamma \in \Gamma \}`, where + :math:`\Gamma` denotes the space of warping functions. The karcher mean + in this space is defined as + + .. math:: + [\mu_q] = argmin_{[q] \in \mathcal{A}} \sum_{i=1}^n + d_{\lambda}^2([q],[q_i]) + + Once :math:`[\mu_q]` is obtained it is selected the element of the + equivalence class which makes the mean of the warpings employed be the + identity. + + See :footcite:`srivastava+klassen_2016_analysis_karcher` and + :footcite:`srivastava++_2011_ficher-rao_karcher`. + + Args: + fdatagrid: Set of functions to compute the + mean. + penalty: Penalisation term. Defaults to 0. + center: If ``True`` it is computed the mean of the warpings and + used to select a central mean. Defaults ``True``. + max_iter: Maximum number of iterations. Defaults to 20. + tol: Convergence criterion, the algorithm will stop if + :math:`|mu_{(\nu)} - mu_{(\nu - 1)}|_2 / | mu_{(\nu-1)} |_2 < tol`. + initial: Value of the mean at the starting point. By default + takes the average of the initial points of the samples. + grid_dim: Dimension of the grid used in the alignment + algorithm. Defaults 7. + kwargs: Named options to be pased to :func:`_fisher_rao_warping_mean`. + + Returns: + FDatagrid with the mean of the functions. + + Raises: + ValueError: If the object is multidimensional or the shape of the srsf + do not match with the fdatagrid. + + References: + .. 
footbibliography:: + + """ + check_is_univariate(fdatagrid) + + srsf_transformer = SRSF(initial_value=0) + fdatagrid_srsf = srsf_transformer.fit_transform(fdatagrid) + eval_points = fdatagrid.grid_points[0] + + eval_points_normalized = normalize_scale(eval_points) + y_scale = eval_points[-1] - eval_points[0] + + interpolation = SplineInterpolation(interpolation_order=3, monotone=True) + + # Discretisation points + fdatagrid_normalized = FDataGrid( + fdatagrid(eval_points) / y_scale, + grid_points=eval_points_normalized, + ) + + srsf = fdatagrid_srsf(eval_points)[..., 0] + + # Initialize with function closest to the L2 mean with the L2 distance + centered = (srsf.T - srsf.mean(axis=0, keepdims=True).T).T + + distances = scipy.integrate.simps( + np.square(centered, out=centered), + eval_points_normalized, + axis=1, + ) + + # Initialization of iteration + mu = srsf[np.argmin(distances)] + mu_aux = np.empty(mu.shape) + mu_1 = np.empty(mu.shape) + + # Main iteration + for _ in range(max_iter): + + gammas_matrix = _elastic_alignment_array( + mu, + srsf, + eval_points_normalized, + penalty, + grid_dim, + ) + + gammas = FDataGrid( + gammas_matrix, + grid_points=eval_points_normalized, + interpolation=interpolation, + ) + + fdatagrid_normalized = fdatagrid_normalized.compose(gammas) + srsf = srsf_transformer.transform( + fdatagrid_normalized, + ).data_matrix[..., 0] + + # Next iteration + mu_1 = srsf.mean(axis=0, out=mu_1) + + # Convergence criterion + mu_norm = np.sqrt( + scipy.integrate.simps( + np.square(mu, out=mu_aux), + eval_points_normalized, + ), + ) + + mu_diff = np.sqrt( + scipy.integrate.simps( + np.square(mu - mu_1, out=mu_aux), + eval_points_normalized, + ), + ) + + if mu_diff / mu_norm < tol: + break + + mu = mu_1 + + if initial is None: + initial = fdatagrid.data_matrix[:, 0].mean() + + srsf_transformer.set_params(initial_value=initial) + + # Karcher mean orbit in space L2/Gamma + karcher_mean = srsf_transformer.inverse_transform( + fdatagrid.copy( + data_matrix=[mu], + grid_points=eval_points, + sample_names=("Karcher mean",), + ), + ) + + if center: + # Gamma mean in Hilbert Sphere + mean_normalized = _fisher_rao_warping_mean(gammas, **kwargs) + + gamma_mean = FDataGrid( + normalize_scale( + mean_normalized.data_matrix[..., 0], + a=eval_points[0], + b=eval_points[-1], + ), + grid_points=eval_points, + ) + + gamma_inverse = invert_warping(gamma_mean) + + karcher_mean = karcher_mean.compose(gamma_inverse) + + # Return center of the orbit + return karcher_mean diff --git a/skfda/misc/metrics/__init__.py b/skfda/misc/metrics/__init__.py index ad673d6af..9d55f478d 100644 --- a/skfda/misc/metrics/__init__.py +++ b/skfda/misc/metrics/__init__.py @@ -1,6 +1,6 @@ """Metrics, norms and related utilities.""" -from ._fisher_rao_metrics import ( +from ._fisher_rao import ( _fisher_rao_warping_distance, fisher_rao_amplitude_distance, fisher_rao_distance, diff --git a/skfda/misc/metrics/_fisher_rao_metrics.py b/skfda/misc/metrics/_fisher_rao.py similarity index 94% rename from skfda/misc/metrics/_fisher_rao_metrics.py rename to skfda/misc/metrics/_fisher_rao.py index b2839cd36..bdf6b61a5 100644 --- a/skfda/misc/metrics/_fisher_rao_metrics.py +++ b/skfda/misc/metrics/_fisher_rao.py @@ -5,14 +5,11 @@ import numpy as np import scipy.integrate -from ...preprocessing.registration import ( - ElasticRegistration, - normalize_warping, -) -from ...preprocessing.registration._warping import _normalize_scale -from ...preprocessing.registration.elastic import SRSF +from ..._utils import normalize_scale, 
normalize_warping +from ...preprocessing.registration import ElasticFisherRaoRegistration from ...representation import FData from ...representation._typing import NDArrayFloat +from ..operators import SRSF from ._lp_distances import l2_distance from ._utils import _cast_to_grid @@ -69,7 +66,7 @@ def fisher_rao_distance( ) # Both should have the same grid points - eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) + eval_points_normalized = normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) fdata1 = fdata1.copy( @@ -155,7 +152,7 @@ def fisher_rao_amplitude_distance( ) # Both should have the same grid points - eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) + eval_points_normalized = normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) fdata1 = fdata1.copy( @@ -167,7 +164,7 @@ def fisher_rao_amplitude_distance( domain_range=(0, 1), ) - elastic_registration = ElasticRegistration( + elastic_registration = ElasticFisherRaoRegistration( template=fdata2, penalty=lam, output_points=eval_points_normalized, @@ -216,6 +213,8 @@ def fisher_rao_phase_distance( d_{P}(f_i, f_j) = d_{FR}(\gamma_{ij}, \gamma_{id}) = arcos \left ( \int_0^1 \sqrt {\dot \gamma_{ij}(t)} dt \right ) + where :math:`\gamma_{id}` is the identity warping. + See :footcite:`srivastava+klassen_2016_analysis_phase` for a detailed explanation. @@ -247,7 +246,7 @@ def fisher_rao_phase_distance( ) # Rescale in the interval (0,1) - eval_points_normalized = _normalize_scale(fdata1.grid_points[0]) + eval_points_normalized = normalize_scale(fdata1.grid_points[0]) # Calculate the corresponding srsf and normalize to (0,1) fdata1 = fdata1.copy( @@ -259,7 +258,7 @@ def fisher_rao_phase_distance( domain_range=(0, 1), ) - elastic_registration = ElasticRegistration( + elastic_registration = ElasticFisherRaoRegistration( penalty=lam, template=fdata2, output_points=eval_points_normalized, diff --git a/skfda/misc/operators/__init__.py b/skfda/misc/operators/__init__.py index 7cac49f18..3825aea7c 100644 --- a/skfda/misc/operators/__init__.py +++ b/skfda/misc/operators/__init__.py @@ -8,3 +8,4 @@ gramian_matrix, gramian_matrix_optimization, ) +from ._srvf import SRSF diff --git a/skfda/misc/operators/_srvf.py b/skfda/misc/operators/_srvf.py new file mode 100644 index 000000000..1b1af8232 --- /dev/null +++ b/skfda/misc/operators/_srvf.py @@ -0,0 +1,231 @@ +from __future__ import annotations + +from typing import Optional + +import numpy as np +import scipy.integrate +from sklearn.base import BaseEstimator, TransformerMixin + +from ..._utils import check_is_univariate +from ...representation import FDataGrid +from ...representation._typing import ArrayLike +from ._operators import Operator + + +class SRSF( + Operator[FDataGrid, FDataGrid], + BaseEstimator, # type: ignore + TransformerMixin, # type: ignore +): + r"""Square-Root Slope Function (SRSF) transform. + + Let :math:`f : [a,b] \rightarrow \mathbb{R}` be an absolutely continuous + function, the SRSF transform is defined as + + .. math:: + SRSF(f(t)) = sgn(f(t)) \sqrt{|\dot f(t)|} = q(t) + + This representation it is used to compute the extended non-parametric + Fisher-Rao distance between functions, wich under the SRSF representation + becomes the usual :math:`\mathbb{L}^2` distance between functions. + See :footcite:`srivastava+klassen_2016_analysis_square`. + + The inverse SRSF transform is defined as + + .. math:: + f(t) = f(a) + \int_{a}^t q(t)|q(t)|dt . 
+ + This transformation is a mapping up to constant. Given the SRSF and the + initial value :math:`f(a)` the original function can be obtained, for this + reason it is necessary to store the value :math:`f(a)` during the fit, + which is dropped due to derivation. If it is applied the inverse + transformation without fit the estimator it is assumed that :math:`f(a)=0`. + + Args: + eval_points: (array_like, optional): Set of points where the + functions are evaluated, by default uses the sample points of + the :class:`FDataGrid ` transformed. + initial_value (float, optional): Initial value to apply in the + inverse transformation. If `None` there are stored the initial + values of the functions during the transformation to apply + during the inverse transformation. Defaults None. + + Attributes: + eval_points: Set of points where the + functions are evaluated, by default uses the grid points of the + fdatagrid. + initial_value: Initial value to apply in the + inverse transformation. If `None` there are stored the initial + values of the functions during the transformation to apply + during the inverse transformation. Defaults None. + + Note: + Due to the use of derivatives it is recommended that the samples are + sufficiently smooth, or have passed a smoothing preprocessing before, + in order to achieve good results. + + References: + .. footbibliography:: + + Examples: + Create a toy dataset and apply the transformation and its inverse. + + >>> from skfda.datasets import make_sinusoidal_process + >>> from skfda.misc.operators import SRSF + >>> fd = make_sinusoidal_process(error_std=0, random_state=0) + >>> srsf = SRSF() + >>> srsf + SRSF(...) + + Fits the estimator (to apply the inverse transform) and apply the SRSF + + >>> q = srsf.fit_transform(fd) + + Apply the inverse transform. + + >>> fd_pull_back = srsf.inverse_transform(q) + + The original and the pull back `fd` are almost equal + + >>> zero = fd - fd_pull_back + >>> zero.data_matrix.flatten().round(3) + array([ 0., 0., 0., ..., -0., -0., -0.]) + + """ + + def __init__( + self, + output_points: Optional[ArrayLike] = None, + initial_value: Optional[float] = None, + ) -> None: + self.output_points = output_points + self.initial_value = initial_value + + def __call__(self, vector: FDataGrid) -> FDataGrid: + return self.fit_transform(vector) + + def fit(self, X: FDataGrid, y: None = None) -> SRSF: + """ + Return self. This transformer does not need to be fitted. + + Args: + X: Present for API conventions. + y: Present for API conventions. + + Returns: + (Estimator): self + + """ + return self + + def transform(self, X: FDataGrid, y: None = None) -> FDataGrid: + r""" + Compute the square-root slope function (SRSF) transform. + + Let :math:`f : [a,b] \rightarrow \mathbb{R}` be an absolutely + continuous function, the SRSF transform is defined as + :footcite:`srivastava+klassen_2016_analysis_square`: + + .. math:: + + SRSF(f(t)) = sgn(f(t)) \sqrt{\dot f(t)|} = q(t) + + Args: + X: Functions to be transformed. + y: Present for API conventions. + + Returns: + SRSF functions. + + Raises: + ValueError: If functions are not univariate. 
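(A complementary sketch of the forward transform through the public API; it assumes this patch is applied, so that SRSF is importable from skfda.misc.operators.)

    import numpy as np
    from skfda import FDataGrid
    from skfda.misc.operators import SRSF  # new import path added here

    t = np.linspace(0, 1, 101)
    fd = FDataGrid(data_matrix=[t ** 2], grid_points=t)
    q = SRSF().fit_transform(fd)

    # f'(t) = 2t, hence SRSF(f)(t) = sqrt(2t); boundary points are skipped
    # because the numerical derivative is least accurate there.
    np.testing.assert_allclose(
        q.data_matrix[0, 1:-1, 0], np.sqrt(2 * t[1:-1]), atol=1e-2,
    )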
+ + """ + check_is_univariate(X) + + if self.output_points is None: + output_points = X.grid_points[0] + else: + output_points = np.asarray(self.output_points) + + g = X.derivative() + + # Evaluation with the corresponding interpolation + data_matrix = g(output_points)[..., 0] + + # SRSF(f) = sign(f) * sqrt|Df| (avoiding multiple allocation) + sign_g = np.sign(data_matrix) + data_matrix = np.abs(data_matrix, out=data_matrix) + data_matrix = np.sqrt(data_matrix, out=data_matrix) + data_matrix *= sign_g + + # Store the values of the transformation + if self.initial_value is None: + a = X.domain_range[0][0] + self.initial_value_ = X(a).reshape(X.n_samples, 1, X.dim_codomain) + + return X.copy(data_matrix=data_matrix, grid_points=output_points) + + def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: + r""" + Compute the inverse SRSF transform. + + Given the srsf and the initial value the original function can be + obtained as :footcite:`srivastava+klassen_2016_analysis_square`: + + .. math:: + f(t) = f(a) + \int_{a}^t q(t)|q(t)|dt + + where :math:`q(t)=SRSF(f(t))`. + + If it is applied this inverse transformation without fitting the + estimator it is assumed that :math:`f(a)=0`. + + Args: + X: SRSF to be transformed. + y: Present for API conventions. + + Returns: + Functions in the original space. + + Raises: + ValueError: If functions are multidimensional. + """ + check_is_univariate(X) + + stored_initial_value = getattr(self, 'initial_value_', None) + + if self.initial_value is None and stored_initial_value is None: + raise AttributeError( + "When initial_value=None is expected a " + "previous transformation of the data to " + "store the initial values to apply in the " + "inverse transformation. Also it is possible " + "to fix these values setting the attribute" + "initial value without a previous " + "transformation.", + ) + + if self.output_points is None: + output_points = X.grid_points[0] + else: + output_points = np.asarray(self.output_points) + + data_matrix = X(output_points) + + data_matrix *= np.abs(data_matrix) + + f_data_matrix = scipy.integrate.cumtrapz( + data_matrix, + x=output_points, + axis=1, + initial=0, + ) + + # If the transformer was fitted, sum the initial value + if self.initial_value is None: + f_data_matrix += self.initial_value_ + else: + f_data_matrix += self.initial_value + + return X.copy(data_matrix=f_data_matrix, grid_points=output_points) diff --git a/skfda/preprocessing/registration/__init__.py b/skfda/preprocessing/registration/__init__.py index 1894f4761..904d3b2e8 100644 --- a/skfda/preprocessing/registration/__init__.py +++ b/skfda/preprocessing/registration/__init__.py @@ -4,7 +4,9 @@ functional data, in basis as well in discretized form. """ -from . import elastic, validation +from ..._utils import invert_warping, normalize_warping +from . 
import validation +from ._fisher_rao import ElasticFisherRaoRegistration, ElasticRegistration from ._landmark_registration import ( landmark_registration, landmark_registration_warping, @@ -12,5 +14,3 @@ landmark_shift_deltas, ) from ._shift_registration import ShiftRegistration -from ._warping import invert_warping, normalize_warping -from .elastic import ElasticRegistration diff --git a/skfda/preprocessing/registration/_fisher_rao.py b/skfda/preprocessing/registration/_fisher_rao.py new file mode 100644 index 000000000..5ab761c2c --- /dev/null +++ b/skfda/preprocessing/registration/_fisher_rao.py @@ -0,0 +1,308 @@ + +from __future__ import annotations + +import warnings +from typing import Any, Callable, Optional, Union + +from sklearn.utils.validation import check_is_fitted + +from ... import FDataGrid +from ..._utils import check_is_univariate, invert_warping, normalize_scale +from ...exploratory.stats import fisher_rao_karcher_mean +from ...exploratory.stats._fisher_rao import _elastic_alignment_array +from ...misc.operators import SRSF +from ...representation._typing import ArrayLike +from ...representation.interpolation import SplineInterpolation +from .base import RegistrationTransformer + +_MeanType = Callable[[FDataGrid], FDataGrid] + + +class ElasticFisherRaoRegistration(RegistrationTransformer): + r"""Align a FDatagrid using the SRSF framework. + + Let :math:`f` be a function of the functional data object wich will be + aligned to the template :math:`g`. Calculates the warping wich minimises + the Fisher-Rao distance between :math:`g` and the registered function + :math:`f^*(t)=f(\gamma^*(t))=f \circ \gamma^*`. + + .. math:: + \gamma^* = argmin_{\gamma \in \Gamma} d_{\lambda}(f \circ + \gamma, g) + + Where :math:`d_{\lambda}` denotes the extended Fisher-Rao distance with a + penalty term, used to control the amount of warping. + + .. math:: + d_{\lambda}^2(f \circ \gamma, g) = \| SRSF(f \circ \gamma) + \sqrt{\dot{\gamma}} - SRSF(g)\|_{\mathbb{L}^2}^2 + \lambda + \mathcal{R}(\gamma) + + In the implementation it is used as penalty term + + .. math:: + \mathcal{R}(\gamma) = \|\sqrt{\dot{\gamma}}- 1 \|_{\mathbb{L}^2}^2 + + Wich restrict the amount of elasticity employed in the alignment. + + The registered function :math:`f^*(t)` can be calculated using the + composition :math:`f^*(t)=f(\gamma^*(t))`. + + If the template is not specified it is used the Karcher mean of the set of + functions under the elastic metric to perform the alignment, also known as + `elastic mean`, wich is the local minimum of the sum of squares of elastic + distances. See :func:`~elastic_mean`. + + In :footcite:`srivastava+klassen_2016_analysis_elastic` are described + extensively the algorithms employed and the SRSF framework. + + Args: + template (str, :class:`FDataGrid` or callable, optional): Template to + align the curves. Can contain 1 sample to align all the curves to + it or the same number of samples than the fdatagrid. By default + `elastic mean`, in which case :func:`elastic_mean` is called. + penalty_term (float, optional): Controls the amount of elasticity. + Defaults to 0. + output_points (array_like, optional): Set of points where the + functions are evaluated, by default uses the sample points of the + fdatagrid which will be transformed. + grid_dim (int, optional): Dimension of the grid used in the DP + alignment algorithm. Defaults 7. + + Attributes: + template\_: Template learned during fitting, + used for alignment in :meth:`transform`. 
+ warping\_: Warping applied during the last + transformation. + + References: + .. footbibliography:: + + Examples: + Elastic registration of with train/test sets. + + >>> from skfda.preprocessing.registration import ( + ... ElasticFisherRaoRegistration, + ... ) + >>> from skfda.datasets import make_multimodal_samples + >>> X_train = make_multimodal_samples(n_samples=15, random_state=0) + >>> X_test = make_multimodal_samples(n_samples=3, random_state=1) + + Fit the transformer, which learns the elastic mean of the train + set as template. + + >>> elastic_registration = ElasticFisherRaoRegistration() + >>> elastic_registration.fit(X_train) + ElasticFisherRaoRegistration(...) + + Registration of the test set. + + >>> elastic_registration.transform(X_test) + FDataGrid(...) + + """ + + def __init__( + self, + *, + template: Union[FDataGrid, _MeanType] = fisher_rao_karcher_mean, + penalty: float = 0, + output_points: Optional[ArrayLike] = None, + grid_dim: int = 7, + ) -> None: + self.template = template + self.penalty = penalty + self.output_points = output_points + self.grid_dim = grid_dim + + def fit(self, X: FDataGrid, y: None = None) -> RegistrationTransformer: + """Fit the transformer. + + Learns the template used during the transformation. + + Args: + X: Functional observations used as training samples. If the + template provided is a FDataGrid this argument is ignored, as + it is not necessary to learn the template from the training + data. + y: Present for API conventions. + + Returns: + self. + + """ + if isinstance(self.template, FDataGrid): + self.template_ = self.template # Template already constructed + else: + self.template_ = self.template(X) + + # Constructs the SRSF of the template + srsf = SRSF(output_points=self.output_points, initial_value=0) + self._template_srsf = srsf.fit_transform(self.template_) + + return self + + def transform(self, X: FDataGrid, y: None = None) -> FDataGrid: + """Apply elastic registration to the data. + + Args: + X: Functional data to be registered. + y: Present for API conventions. + + Returns: + Registered samples. 
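(An illustrative sketch, not part of the patch, of how the penalty term constrains the learned warpings: larger values of ``penalty`` keep them closer to the identity. The chosen values are arbitrary.)

    import numpy as np
    from skfda.datasets import make_multimodal_samples
    from skfda.preprocessing.registration import ElasticFisherRaoRegistration

    X = make_multimodal_samples(n_samples=5, random_state=0)
    grid = X.grid_points[0]

    for penalty in (0.0, 10.0):
        reg = ElasticFisherRaoRegistration(penalty=penalty)
        reg.fit_transform(X)
        # Maximum deviation of the fitted warpings from the identity;
        # it is expected to shrink as the penalty grows.
        print(penalty, np.abs(reg.warping_.data_matrix[..., 0] - grid).max())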
+ + """ + check_is_fitted(self, '_template_srsf') + check_is_univariate(X) + + if ( + len(self._template_srsf) != 1 + and len(X) != len(self._template_srsf) + ): + + raise ValueError( + "The template should contain one sample to align " + "all the curves to the same function or the " + "same number of samples than X.", + ) + + srsf = SRSF(output_points=self.output_points, initial_value=0) + fdatagrid_srsf = srsf.fit_transform(X) + + # Points of discretization + if self.output_points is None: + output_points = fdatagrid_srsf.grid_points[0] + else: + output_points = self.output_points + + # Discretizacion in evaluation points + q_data = fdatagrid_srsf(output_points)[..., 0] + template_data = self._template_srsf(output_points)[..., 0] + + if q_data.shape[0] == 1: + q_data = q_data[0] + + if template_data.shape[0] == 1: + template_data = template_data[0] + + # Values of the warping + gamma = _elastic_alignment_array( + template_data, + q_data, + normalize_scale(output_points), + self.penalty, + self.grid_dim, + ) + + # Normalize warping to original interval + gamma = normalize_scale( + gamma, + a=output_points[0], + b=output_points[-1], + ) + + # Interpolation + interpolation = SplineInterpolation( + interpolation_order=3, + monotone=True, + ) + + self.warping_ = FDataGrid( + gamma, + output_points, + interpolation=interpolation, + ) + + return X.compose(self.warping_, eval_points=output_points) + + def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: + r""" + Reverse the registration procedure previosly applied. + + Let :math:`gamma(t)` the warping applied to construct a registered + functional datum :math:`f^*(t)=f(\gamma(t))`. + + Given a functional datum :math:`f^*(t) it is computed + :math:`\gamma^{-1}(t)` to reverse the registration procedure + :math:`f(t)=f^*(\gamma^{-1}(t))`. + + Args: + X: Functional data to apply the reverse + transform. + y: Present for API conventions. + + Returns: + Functional data compose by the inverse warping. + + Raises: + ValueError: If the warpings :math:`\gamma` were not build via + :meth:`transform` or if the number of samples of `X` is + different than the number of samples of the dataset + previously transformed. + + Examples: + Center the datasets taking into account the misalignment. + + >>> from skfda.preprocessing.registration import ( + ... ElasticFisherRaoRegistration, + ... ) + >>> from skfda.datasets import make_multimodal_samples + >>> X = make_multimodal_samples(random_state=0) + + Registration of the dataset. + + >>> elastic_registration = ElasticFisherRaoRegistration() + >>> X = elastic_registration.fit_transform(X) + + Substract the elastic mean build as template during the + registration and reverse the transformation. + + >>> X = X - elastic_registration.template_ + >>> X_center = elastic_registration.inverse_transform(X) + >>> X_center + FDataGrid(...) 
+ + + See also: + :func:`invert_warping` + + """ + warping = getattr(self, 'warping_', None) + + if warping is None: + raise ValueError( + "Data must be previosly transformed to apply the " + "inverse transform", + ) + elif len(X) != len(warping): + raise ValueError( + "Data must contain the same number of samples " + "than the dataset previously transformed", + ) + + inverse_warping = invert_warping(warping) + + return X.compose(inverse_warping, eval_points=self.output_points) + + +class ElasticRegistration(ElasticFisherRaoRegistration): + + def __init__( + self, + template: Union[FDataGrid, _MeanType] = fisher_rao_karcher_mean, + penalty: float = 0, + output_points: Optional[ArrayLike] = None, + grid_dim: int = 7, + ) -> None: + warnings.warn( + "ElasticRegistration is deprecated. " + "Use ElasticFisherRaoRegistration instead.", + DeprecationWarning, + ) + super().__init__( + template=template, + penalty=penalty, + output_points=output_points, + grid_dim=grid_dim, + ) diff --git a/skfda/preprocessing/registration/elastic.py b/skfda/preprocessing/registration/elastic.py deleted file mode 100644 index c90280c85..000000000 --- a/skfda/preprocessing/registration/elastic.py +++ /dev/null @@ -1,831 +0,0 @@ - -from __future__ import annotations - -from typing import Callable, Optional, Union - -import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_is_fitted - -import scipy.integrate -from fdasrsf.utility_functions import optimum_reparam - -from ... import FDataGrid -from ..._utils import check_is_univariate -from ...representation._typing import ArrayLike -from ...representation.interpolation import SplineInterpolation -from ._warping import _normalize_scale, invert_warping -from .base import RegistrationTransformer - -############################################################################### -# Based on the original implementation of J. Derek Tucker in # -# *fdasrsf_python* (https://github.com/jdtuck/fdasrsf_python) # -# and *ElasticFDA.jl* (https://github.com/jdtuck/ElasticFDA.jl). # -############################################################################### - -_MeanType = Callable[[FDataGrid], FDataGrid] - - -class SRSF(BaseEstimator, TransformerMixin): # type: ignore - r"""Square-Root Slope Function (SRSF) transform. - - Let :math:`f : [a,b] \rightarrow \mathbb{R}` be an absolutely continuous - function, the SRSF transform is defined as - - .. math:: - SRSF(f(t)) = sgn(f(t)) \sqrt{|\dot f(t)|} = q(t) - - This representation it is used to compute the extended non-parametric - Fisher-Rao distance between functions, wich under the SRSF representation - becomes the usual :math:`\mathbb{L}^2` distance between functions. - See :footcite:`srivastava+klassen_2016_analysis_square`. - - The inverse SRSF transform is defined as - - .. math:: - f(t) = f(a) + \int_{a}^t q(t)|q(t)|dt . - - This transformation is a mapping up to constant. Given the SRSF and the - initial value :math:`f(a)` the original function can be obtained, for this - reason it is necessary to store the value :math:`f(a)` during the fit, - which is dropped due to derivation. If it is applied the inverse - transformation without fit the estimator it is assumed that :math:`f(a)=0`. - - Args: - eval_points: (array_like, optional): Set of points where the - functions are evaluated, by default uses the sample points of - the :class:`FDataGrid ` transformed. - initial_value (float, optional): Initial value to apply in the - inverse transformation. 
If `None` there are stored the initial - values of the functions during the transformation to apply - during the inverse transformation. Defaults None. - - Attributes: - eval_points: Set of points where the - functions are evaluated, by default uses the grid points of the - fdatagrid. - initial_value: Initial value to apply in the - inverse transformation. If `None` there are stored the initial - values of the functions during the transformation to apply - during the inverse transformation. Defaults None. - - Note: - Due to the use of derivatives it is recommended that the samples are - sufficiently smooth, or have passed a smoothing preprocessing before, - in order to achieve good results. - - References: - .. footbibliography:: - - Examples: - Create a toy dataset and apply the transformation and its inverse. - - >>> from skfda.datasets import make_sinusoidal_process - >>> from skfda.preprocessing.registration.elastic import SRSF - >>> fd = make_sinusoidal_process(error_std=0, random_state=0) - >>> srsf = SRSF() - >>> srsf - SRSF(...) - - Fits the estimator (to apply the inverse transform) and apply the SRSF - - >>> q = srsf.fit_transform(fd) - - Apply the inverse transform. - - >>> fd_pull_back = srsf.inverse_transform(q) - - The original and the pull back `fd` are almost equal - - >>> zero = fd - fd_pull_back - >>> zero.data_matrix.flatten().round(3) - array([ 0., 0., 0., ..., -0., -0., -0.]) - - """ - - def __init__( - self, - output_points: Optional[ArrayLike] = None, - initial_value: Optional[float] = None, - ) -> None: - self.output_points = output_points - self.initial_value = initial_value - - def fit(self, X: FDataGrid, y: None = None) -> SRSF: - """ - Return self. This transformer does not need to be fitted. - - Args: - X: Present for API conventions. - y: Present for API conventions. - - Returns: - (Estimator): self - - """ - return self - - def transform(self, X: FDataGrid, y: None = None) -> FDataGrid: - r"""Compute the square-root slope function (SRSF) transform. - - Let :math:`f : [a,b] \rightarrow \mathbb{R}` be an absolutely - continuous function, the SRSF transform is defined as - :footcite:`srivastava+klassen_2016_analysis_square`: - - .. math:: - - SRSF(f(t)) = sgn(f(t)) \sqrt{\dot f(t)|} = q(t) - - Args: - X: Functions to be transformed. - y: Present for API conventions. - - Returns: - SRSF functions. - - Raises: - ValueError: If functions are not univariate. - - """ - check_is_univariate(X) - - if self.output_points is None: - output_points = X.grid_points[0] - else: - output_points = np.asarray(self.output_points) - - g = X.derivative() - - # Evaluation with the corresponding interpolation - data_matrix = g(output_points)[..., 0] - - # SRSF(f) = sign(f) * sqrt|Df| (avoiding multiple allocation) - sign_g = np.sign(data_matrix) - data_matrix = np.abs(data_matrix, out=data_matrix) - data_matrix = np.sqrt(data_matrix, out=data_matrix) - data_matrix *= sign_g - - # Store the values of the transformation - if self.initial_value is None: - a = X.domain_range[0][0] - self.initial_value_ = X(a).reshape(X.n_samples, 1, X.dim_codomain) - - return X.copy(data_matrix=data_matrix, grid_points=output_points) - - def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: - r"""Compute the inverse SRSF transform. - - Given the srsf and the initial value the original function can be - obtained as :footcite:`srivastava+klassen_2016_analysis_square`: - - .. math:: - f(t) = f(a) + \int_{a}^t q(t)|q(t)|dt - - where :math:`q(t)=SRSF(f(t))`. 
- - If it is applied this inverse transformation without fitting the - estimator it is assumed that :math:`f(a)=0`. - - Args: - X: SRSF to be transformed. - y: Present for API conventions. - - Returns: - Functions in the original space. - - Raises: - ValueError: If functions are multidimensional. - """ - check_is_univariate(X) - - stored_initial_value = getattr(self, 'initial_value_', None) - - if self.initial_value is None and stored_initial_value is None: - raise AttributeError( - "When initial_value=None is expected a " - "previous transformation of the data to " - "store the initial values to apply in the " - "inverse transformation. Also it is possible " - "to fix these values setting the attribute" - "initial value without a previous " - "transformation.", - ) - - if self.output_points is None: - output_points = X.grid_points[0] - else: - output_points = np.asarray(self.output_points) - - data_matrix = X(output_points) - - data_matrix *= np.abs(data_matrix) - - f_data_matrix = scipy.integrate.cumtrapz( - data_matrix, - x=output_points, - axis=1, - initial=0, - ) - - # If the transformer was fitted, sum the initial value - if self.initial_value is None: - f_data_matrix += self.initial_value_ - else: - f_data_matrix += self.initial_value - - return X.copy(data_matrix=f_data_matrix, grid_points=output_points) - - -def _elastic_alignment_array( - template_data: np.ndarray, - q_data: np.ndarray, - eval_points: np.ndarray, - penalty: float, - grid_dim: int, -) -> np.ndarray: - """ - Wrap the :func:`optimum_reparam` function of fdasrsf. - - Selects the corresponding routine depending on the dimensions of the - arrays. - - Args: - template_data: Array with the srsf of the template. - q_data: Array with the srsf of the curves - to be aligned. - eval_points: Discretisation points of the functions. - penalty: Penalisation term. - grid_dim: Dimension of the grid used in the alignment algorithm. - - Returns: - Array with the same shape than q_data with the srsf of - the functions aligned to the template(s). - - """ - return optimum_reparam( - np.ascontiguousarray(template_data.T), - np.ascontiguousarray(eval_points), - np.ascontiguousarray(q_data.T), - method="DP2", - lam=penalty, grid_dim=grid_dim, - ).T - - -def warping_mean( - warping: FDataGrid, - *, - max_iter: int = 100, - tol: float = 1e-6, - step_size: float = 0.3, -) -> FDataGrid: - r"""Compute the karcher mean of a set of warpings. - - Let :math:`\gamma_i i=1...n` be a set of warping functions - :math:`\gamma_i:[a,b] \rightarrow [a,b]` in :math:`\Gamma`, i.e., - monotone increasing and with the restriction :math:`\gamma_i(a)=a \, - \gamma_i(b)=b`. - - The karcher mean :math:`\bar \gamma` is defined as the warping that - minimises locally the sum of Fisher-Rao squared distances - :footcite:`srivastava+klassen_2016_analysis_orbit`. - - .. math:: - \bar \gamma = argmin_{\gamma \in \Gamma} \sum_{i=1}^{n} - d_{FR}^2(\gamma, \gamma_i) - - The computation is performed using the structure of Hilbert Sphere obtained - after a transformation of the warpings, see - :footcite:`srivastava++_2011_ficher-rao_orbit`. - - Args: - warping: Set of warpings. - max_iter: Maximum number of interations. Defaults to 100. 
- tol: Convergence criterion, if the norm of the mean of the - shooting vectors, :math:`| \bar v | 1e-10: - vmean += theta / np.sin(theta) * (psi_i - np.cos(theta) * mu) - - # Mean of shooting vectors - vmean /= warping.n_samples - v_norm = np.sqrt(scipy.integrate.simps(np.square(vmean))) - - # Convergence criterion - if v_norm < tol: - break - - # Calculate exponential map of mu - a = np.cos(step_size * v_norm) - b = np.sin(step_size * v_norm) / v_norm - mu = a * mu + b * vmean - - # Recover mean in original gamma space - warping_mean = scipy.integrate.cumtrapz( - np.square(mu, out=mu)[0], - x=eval_points, - initial=0, - ) - - # Affine traslation to original scale - warping_mean = _normalize_scale( - warping_mean, - a=original_eval_points[0], - b=original_eval_points[-1], - ) - - monotone_interpolation = SplineInterpolation( - interpolation_order=3, - monotone=True, - ) - - return FDataGrid( - [warping_mean], - grid_points=original_eval_points, - interpolation=monotone_interpolation, - ) - - -def elastic_mean( - fdatagrid: FDataGrid, - *, - penalty: float = 0, - center: bool = True, - max_iter: int = 20, - tol: float = 1e-3, - initial: Optional[float] = None, - grid_dim: int = 7, - **kwargs, -) -> FDataGrid: - r"""Compute the karcher mean under the elastic metric. - - Calculates the karcher mean of a set of functional samples in the amplitude - space :math:`\mathcal{A}=\mathcal{F}/\Gamma`. - - Let :math:`q_i` the corresponding SRSF of the observation :math:`f_i`. - The space :math:`\mathcal{A}` is defined using the equivalence classes - :math:`[q_i]=\{ q_i \circ \gamma \| \gamma \in \Gamma \}`, where - :math:`\Gamma` denotes the space of warping functions. The karcher mean - in this space is defined as - - .. math:: - [\mu_q] = argmin_{[q] \in \mathcal{A}} \sum_{i=1}^n - d_{\lambda}^2([q],[q_i]) - - Once :math:`[\mu_q]` is obtained it is selected the element of the - equivalence class which makes the mean of the warpings employed be the - identity. - - See :footcite:`srivastava+klassen_2016_analysis_karcher` and - :footcite:`srivastava++_2011_ficher-rao_karcher`. - - Args: - fdatagrid: Set of functions to compute the - mean. - penalty: Penalisation term. Defaults to 0. - center: If ``True`` it is computed the mean of the warpings and - used to select a central mean. Defaults ``True``. - max_iter: Maximum number of iterations. Defaults to 20. - tol: Convergence criterion, the algorithm will stop if - :math:`|mu_{(\nu)} - mu_{(\nu - 1)}|_2 / | mu_{(\nu-1)} |_2 < tol`. - initial: Value of the mean at the starting point. By default - takes the average of the initial points of the samples. - grid_dim: Dimension of the grid used in the alignment - algorithm. Defaults 7. - kwargs: Named options to be pased to :func:`warping_mean`. - - Returns: - FDatagrid with the mean of the functions. - - Raises: - ValueError: If the object is multidimensional or the shape of the srsf - do not match with the fdatagrid. - - References: - .. 
footbibliography:: - - """ - check_is_univariate(fdatagrid) - - srsf_transformer = SRSF(initial_value=0) - fdatagrid_srsf = srsf_transformer.fit_transform(fdatagrid) - eval_points = fdatagrid.grid_points[0] - - eval_points_normalized = _normalize_scale(eval_points) - y_scale = eval_points[-1] - eval_points[0] - - interpolation = SplineInterpolation(interpolation_order=3, monotone=True) - - # Discretisation points - fdatagrid_normalized = FDataGrid( - fdatagrid(eval_points) / y_scale, - grid_points=eval_points_normalized, - ) - - srsf = fdatagrid_srsf(eval_points)[..., 0] - - # Initialize with function closest to the L2 mean with the L2 distance - centered = (srsf.T - srsf.mean(axis=0, keepdims=True).T).T - - distances = scipy.integrate.simps( - np.square(centered, out=centered), - eval_points_normalized, axis=1, - ) - - # Initialization of iteration - mu = srsf[np.argmin(distances)] - mu_aux = np.empty(mu.shape) - mu_1 = np.empty(mu.shape) - - # Main iteration - for _ in range(max_iter): - - gammas_matrix = _elastic_alignment_array( - mu, - srsf, - eval_points_normalized, - penalty, - grid_dim, - ) - - gammas = FDataGrid( - gammas_matrix, - grid_points=eval_points_normalized, - interpolation=interpolation, - ) - - fdatagrid_normalized = fdatagrid_normalized.compose(gammas) - srsf = srsf_transformer.transform( - fdatagrid_normalized, - ).data_matrix[..., 0] - - # Next iteration - mu_1 = srsf.mean(axis=0, out=mu_1) - - # Convergence criterion - mu_norm = np.sqrt( - scipy.integrate.simps( - np.square(mu, out=mu_aux), - eval_points_normalized, - ), - ) - - mu_diff = np.sqrt( - scipy.integrate.simps( - np.square(mu - mu_1, out=mu_aux), - eval_points_normalized, - ), - ) - - if mu_diff / mu_norm < tol: - break - - mu = mu_1 - - if initial is None: - initial = fdatagrid.data_matrix[:, 0].mean() - - srsf_transformer.set_params(initial_value=initial) - - # Karcher mean orbit in space L2/Gamma - karcher_mean = srsf_transformer.inverse_transform( - fdatagrid.copy( - data_matrix=[mu], - grid_points=eval_points, - sample_names=("Karcher mean",), - ), - ) - - if center: - # Gamma mean in Hilbert Sphere - mean_normalized = warping_mean(gammas, **kwargs) - - gamma_mean = FDataGrid( - _normalize_scale( - mean_normalized.data_matrix[..., 0], - a=eval_points[0], - b=eval_points[-1], - ), - grid_points=eval_points, - ) - - gamma_inverse = invert_warping(gamma_mean) - - karcher_mean = karcher_mean.compose(gamma_inverse) - - # Return center of the orbit - return karcher_mean - - -class ElasticRegistration(RegistrationTransformer): - r"""Align a FDatagrid using the SRSF framework. - - Let :math:`f` be a function of the functional data object wich will be - aligned to the template :math:`g`. Calculates the warping wich minimises - the Fisher-Rao distance between :math:`g` and the registered function - :math:`f^*(t)=f(\gamma^*(t))=f \circ \gamma^*`. - - .. math:: - \gamma^* = argmin_{\gamma \in \Gamma} d_{\lambda}(f \circ - \gamma, g) - - Where :math:`d_{\lambda}` denotes the extended Fisher-Rao distance with a - penalty term, used to control the amount of warping. - - .. math:: - d_{\lambda}^2(f \circ \gamma, g) = \| SRSF(f \circ \gamma) - \sqrt{\dot{\gamma}} - SRSF(g)\|_{\mathbb{L}^2}^2 + \lambda - \mathcal{R}(\gamma) - - In the implementation it is used as penalty term - - .. math:: - \mathcal{R}(\gamma) = \|\sqrt{\dot{\gamma}}- 1 \|_{\mathbb{L}^2}^2 - - Wich restrict the amount of elasticity employed in the alignment. 
- - The registered function :math:`f^*(t)` can be calculated using the - composition :math:`f^*(t)=f(\gamma^*(t))`. - - If the template is not specified it is used the Karcher mean of the set of - functions under the elastic metric to perform the alignment, also known as - `elastic mean`, wich is the local minimum of the sum of squares of elastic - distances. See :func:`~elastic_mean`. - - In :footcite:`srivastava+klassen_2016_analysis_elastic` are described - extensively the algorithms employed and the SRSF framework. - - Args: - template (str, :class:`FDataGrid` or callable, optional): Template to - align the curves. Can contain 1 sample to align all the curves to - it or the same number of samples than the fdatagrid. By default - `elastic mean`, in which case :func:`elastic_mean` is called. - penalty_term (float, optional): Controls the amount of elasticity. - Defaults to 0. - output_points (array_like, optional): Set of points where the - functions are evaluated, by default uses the sample points of the - fdatagrid which will be transformed. - grid_dim (int, optional): Dimension of the grid used in the DP - alignment algorithm. Defaults 7. - - Attributes: - template\_: Template learned during fitting, - used for alignment in :meth:`transform`. - warping\_: Warping applied during the last - transformation. - - References: - .. footbibliography:: - - Examples: - Elastic registration of with train/test sets. - - >>> from skfda.preprocessing.registration import \ - ... ElasticRegistration - >>> from skfda.datasets import make_multimodal_samples - >>> X_train = make_multimodal_samples(n_samples=15, random_state=0) - >>> X_test = make_multimodal_samples(n_samples=3, random_state=1) - - Fit the transformer, which learns the elastic mean of the train - set as template. - - >>> elastic_registration = ElasticRegistration() - >>> elastic_registration.fit(X_train) - ElasticRegistration(...) - - Registration of the test set. - - >>> elastic_registration.transform(X_test) - FDataGrid(...) - - """ - - def __init__( - self, - template: Union[FDataGrid, _MeanType] = elastic_mean, - penalty: float = 0, - output_points: Optional[ArrayLike] = None, - grid_dim: int = 7, - ) -> None: - self.template = template - self.penalty = penalty - self.output_points = output_points - self.grid_dim = grid_dim - - def fit(self, X: FDataGrid, y: None = None) -> RegistrationTransformer: - """Fit the transformer. - - Learns the template used during the transformation. - - Args: - X: Functional observations used as training samples. If the - template provided is a FDataGrid this argument is ignored, as - it is not necessary to learn the template from the training - data. - y: Present for API conventions. - - Returns: - self. - - """ - if isinstance(self.template, FDataGrid): - self.template_ = self.template # Template already constructed - else: - self.template_ = self.template(X) - - # Constructs the SRSF of the template - srsf = SRSF(output_points=self.output_points, initial_value=0) - self._template_srsf = srsf.fit_transform(self.template_) - - return self - - def transform(self, X: FDataGrid, y: None = None) -> FDataGrid: - """Apply elastic registration to the data. - - Args: - X: Functional data to be registered. - y: Present for API conventions. - - Returns: - Registered samples. 
- - """ - check_is_fitted(self, '_template_srsf') - check_is_univariate(X) - - if ( - len(self._template_srsf) != 1 - and len(X) != len(self._template_srsf) - ): - - raise ValueError( - "The template should contain one sample to align " - "all the curves to the same function or the " - "same number of samples than X.", - ) - - srsf = SRSF(output_points=self.output_points, initial_value=0) - fdatagrid_srsf = srsf.fit_transform(X) - - # Points of discretization - if self.output_points is None: - output_points = fdatagrid_srsf.grid_points[0] - else: - output_points = self.output_points - - # Discretizacion in evaluation points - q_data = fdatagrid_srsf(output_points)[..., 0] - template_data = self._template_srsf(output_points)[..., 0] - - if q_data.shape[0] == 1: - q_data = q_data[0] - - if template_data.shape[0] == 1: - template_data = template_data[0] - - # Values of the warping - gamma = _elastic_alignment_array( - template_data, - q_data, - _normalize_scale(output_points), - self.penalty, - self.grid_dim, - ) - - # Normalize warping to original interval - gamma = _normalize_scale( - gamma, - a=output_points[0], - b=output_points[-1], - ) - - # Interpolation - interpolation = SplineInterpolation( - interpolation_order=3, - monotone=True, - ) - - self.warping_ = FDataGrid( - gamma, - output_points, - interpolation=interpolation, - ) - - return X.compose(self.warping_, eval_points=output_points) - - def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: - r"""Reverse the registration procedure previosly applied. - - Let :math:`gamma(t)` the warping applied to construct a registered - functional datum :math:`f^*(t)=f(\gamma(t))`. - - Given a functional datum :math:`f^*(t) it is computed - :math:`\gamma^{-1}(t)` to reverse the registration procedure - :math:`f(t)=f^*(\gamma^{-1}(t))`. - - Args: - X: Functional data to apply the reverse - transform. - y: Present for API conventions. - - Returns: - Functional data compose by the inverse warping. - - Raises: - ValueError: If the warpings :math:`\gamma` were not build via - :meth:`transform` or if the number of samples of `X` is - different than the number of samples of the dataset - previously transformed. - - Examples: - - Center the datasets taking into account the misalignment. - - >>> from skfda.preprocessing.registration import \ - ... ElasticRegistration - >>> from skfda.datasets import make_multimodal_samples - >>> X = make_multimodal_samples(random_state=0) - - Registration of the dataset. - - >>> elastic_registration = ElasticRegistration() - >>> X = elastic_registration.fit_transform(X) - - Substract the elastic mean build as template during the - registration and reverse the transformation. - - >>> X = X - elastic_registration.template_ - >>> X_center = elastic_registration.inverse_transform(X) - >>> X_center - FDataGrid(...) 
- - - See also: - :func:`invert_warping` - - """ - - warping = getattr(self, 'warping_', None) - - if warping is None: - raise ValueError( - "Data must be previosly transformed to apply the " - "inverse transform", - ) - elif len(X) != len(warping): - raise ValueError( - "Data must contain the same number of samples " - "than the dataset previously transformed", - ) - - inverse_warping = invert_warping(warping) - - return X.compose(inverse_warping, eval_points=self.output_points) diff --git a/tests/test_elastic.py b/tests/test_elastic.py index 740b77ad7..e42e81b1d 100644 --- a/tests/test_elastic.py +++ b/tests/test_elastic.py @@ -5,7 +5,12 @@ import numpy as np from skfda import FDataGrid +from skfda._utils import invert_warping, normalize_warping from skfda.datasets import make_multimodal_samples, make_random_warping +from skfda.exploratory.stats import ( + _fisher_rao_warping_mean, + fisher_rao_karcher_mean, +) from skfda.misc.metrics import ( PairwiseMetric, _fisher_rao_warping_distance, @@ -14,22 +19,14 @@ fisher_rao_phase_distance, l2_distance, ) -from skfda.preprocessing.registration import ( - ElasticRegistration, - invert_warping, - normalize_warping, -) -from skfda.preprocessing.registration.elastic import ( - SRSF, - elastic_mean, - warping_mean, -) +from skfda.misc.operators import SRSF +from skfda.preprocessing.registration import ElasticFisherRaoRegistration metric = PairwiseMetric(l2_distance) pairwise_fisher_rao = PairwiseMetric(fisher_rao_distance) -class TestElasticRegistration(unittest.TestCase): +class TestElasticFisherRaoRegistration(unittest.TestCase): """Test elastic registration.""" def setUp(self) -> None: @@ -110,7 +107,7 @@ def test_srsf_conversion(self) -> None: def test_template_alignment(self) -> None: """Test alignment to 1 template.""" - reg = ElasticRegistration(template=self.template) + reg = ElasticFisherRaoRegistration(template=self.template) register = reg.fit_transform(self.unimodal_samples) distances = metric(self.template, register) @@ -118,7 +115,7 @@ def test_template_alignment(self) -> None: def test_one_to_one_alignment(self) -> None: """Test alignment to 1 sample to a template.""" - reg = ElasticRegistration(template=self.template) + reg = ElasticFisherRaoRegistration(template=self.template) register = reg.fit_transform(self.unimodal_samples[0]) distances = metric(self.template, register) @@ -127,7 +124,7 @@ def test_one_to_one_alignment(self) -> None: def test_set_alignment(self) -> None: """Test alignment 3 curves to set with 3 templates.""" # Should give same result than test_template_alignment - reg = ElasticRegistration(template=self.template_rep) + reg = ElasticFisherRaoRegistration(template=self.template_rep) register = reg.fit_transform(self.unimodal_samples) distances = metric(self.template, register) @@ -136,7 +133,7 @@ def test_set_alignment(self) -> None: def test_default_alignment(self) -> None: """Test alignment by default.""" # Should give same result than test_template_alignment - reg = ElasticRegistration() + reg = ElasticFisherRaoRegistration() register = reg.fit_transform(self.unimodal_samples) values = register([-0.25, -0.1, 0, 0.1, 0.25]) @@ -158,7 +155,7 @@ def test_default_alignment(self) -> None: def test_callable_alignment(self) -> None: """Test alignment by default.""" # Should give same result than test_template_alignment - reg = ElasticRegistration(template=elastic_mean) + reg = ElasticFisherRaoRegistration(template=fisher_rao_karcher_mean) register = reg.fit_transform(self.unimodal_samples) values = register([-0.25, 
-0.1, 0, 0.1, 0.25]) @@ -178,7 +175,7 @@ def test_callable_alignment(self) -> None: def test_simmetry_of_aligment(self) -> None: """Check registration using inverse composition.""" - reg = ElasticRegistration(template=self.template) + reg = ElasticFisherRaoRegistration(template=self.template) reg.fit_transform(self.unimodal_samples) warping = reg.warping_ inverse = invert_warping(warping) @@ -189,7 +186,7 @@ def test_simmetry_of_aligment(self) -> None: def test_raises(self) -> None: """Test that the assertions raise when appropriate.""" - reg = ElasticRegistration() + reg = ElasticFisherRaoRegistration() # Inverse transform without previous transform with np.testing.assert_raises(ValueError): @@ -201,7 +198,7 @@ def test_raises(self) -> None: reg.inverse_transform(self.unimodal_samples[0]) # FDataGrid as template with n != 1 and n!= n_samples to transform - reg = ElasticRegistration(template=self.unimodal_samples).fit( + reg = ElasticFisherRaoRegistration(template=self.unimodal_samples).fit( self.unimodal_samples[0], ) with np.testing.assert_raises(ValueError): @@ -209,7 +206,7 @@ def test_raises(self) -> None: def test_score(self) -> None: """Test score method of the transformer.""" - reg = ElasticRegistration() + reg = ElasticFisherRaoRegistration() reg.fit(self.unimodal_samples) score = reg.score(self.unimodal_samples) np.testing.assert_almost_equal(score, 0.999389) @@ -217,7 +214,7 @@ def test_score(self) -> None: def test_warping_mean(self) -> None: """Test the warping_mean function.""" warping = make_random_warping(start=-1, random_state=0) - mean = warping_mean(warping) + mean = _fisher_rao_warping_mean(warping) values = mean([-1, -0.5, 0, 0.5, 1]) expected = [[[-1], [-0.376241], [0.136193], [0.599291], [1]]] np.testing.assert_array_almost_equal(values, expected) diff --git a/tests/test_registration.py b/tests/test_registration.py index 0398899b3..7b36f533e 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -4,7 +4,7 @@ from sklearn.exceptions import NotFittedError from skfda import FDataGrid -from skfda._utils import _check_estimator +from skfda._utils import invert_warping, normalize_warping from skfda.datasets import ( make_multimodal_landmarks, make_multimodal_samples, @@ -13,12 +13,10 @@ from skfda.exploratory.stats import mean from skfda.preprocessing.registration import ( ShiftRegistration, - invert_warping, landmark_registration, landmark_registration_warping, landmark_shift, landmark_shift_deltas, - normalize_warping, ) from skfda.preprocessing.registration.validation import ( AmplitudePhaseDecomposition, From 0a51974c68e7c89bbef313259fdbb2f32b229d78 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 3 Oct 2021 16:35:35 +0200 Subject: [PATCH 010/117] Improve covariance typing. 
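(For downstream code, the reorganisation of the registration and metrics modules above amounts roughly to the following import migration, taken from the updated tests; the commented lines are the old, now deprecated or removed, locations.)

    # from skfda.preprocessing.registration import ElasticRegistration
    # from skfda.preprocessing.registration import invert_warping, normalize_warping
    # from skfda.preprocessing.registration.elastic import SRSF, elastic_mean, warping_mean

    from skfda._utils import invert_warping, normalize_warping
    from skfda.exploratory.stats import fisher_rao_karcher_mean
    from skfda.misc.operators import SRSF
    from skfda.preprocessing.registration import ElasticFisherRaoRegistration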
--- skfda/misc/covariances.py | 293 +++++++++++++++++++++++--------------- 1 file changed, 179 insertions(+), 114 deletions(-) diff --git a/skfda/misc/covariances.py b/skfda/misc/covariances.py index 2ea24e301..c712f70b5 100644 --- a/skfda/misc/covariances.py +++ b/skfda/misc/covariances.py @@ -1,19 +1,29 @@ +from __future__ import annotations + import abc -import numbers +from typing import Callable, Sequence, Tuple, Union import matplotlib.pyplot as plt - import numpy as np import sklearn.gaussian_process.kernels as sklearn_kern +from matplotlib.figure import Figure from ..exploratory.visualization._utils import _create_figure, _figure_to_svg +from ..representation._typing import ArrayLike, NDArrayFloat -def _squared_norms(x, y): +def _squared_norms(x: NDArrayFloat, y: NDArrayFloat) -> NDArrayFloat: return ((x[np.newaxis, :, :] - y[:, np.newaxis, :]) ** 2).sum(2) -def _transform_to_2d(t): +CovarianceLike = Union[ + float, + NDArrayFloat, + Callable[[ArrayLike, ArrayLike], NDArrayFloat], +] + + +def _transform_to_2d(t: ArrayLike) -> NDArrayFloat: """Transform 1d arrays in column vectors.""" t = np.asarray(t) @@ -26,18 +36,21 @@ def _transform_to_2d(t): return t -def _execute_covariance(covariance, x, y): - """Execute a covariance function. - """ +def _execute_covariance( + covariance: CovarianceLike, + x: ArrayLike, + y: ArrayLike, +) -> NDArrayFloat: + """Execute a covariance function.""" x = _transform_to_2d(x) y = _transform_to_2d(y) - if isinstance(covariance, numbers.Number): - return covariance + if isinstance(covariance, (int, float)): + return np.array(covariance) else: if callable(covariance): result = covariance(x, y) - elif hasattr(covariance, "shape"): + elif isinstance(covariance, np.ndarray): result = covariance else: # GPy kernel @@ -49,62 +62,73 @@ def _execute_covariance(covariance, x, y): class Covariance(abc.ABC): - """Abstract class for covariance functions""" + """Abstract class for covariance functions.""" + + _parameters_str: Sequence[Tuple[str, str]] + _latex_formula: str @abc.abstractmethod - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: pass - def heatmap(self, limits=(-1, 1)): - """ - Return a heatmap plot of the covariance function. 
- - """ - + def heatmap(self, limits: Tuple[float, float] = (-1, 1)) -> Figure: + """Return a heatmap plot of the covariance function.""" x = np.linspace(*limits, 1000) cov_matrix = self(x, x) fig = _create_figure() ax = fig.add_subplot(1, 1, 1) - ax.imshow(cov_matrix, extent=[limits[0], limits[1], - limits[1], limits[0]]) + ax.imshow( + cov_matrix, + extent=[limits[0], limits[1], limits[1], limits[0]], + ) ax.set_title(f"Covariance function in [{limits[0]}, {limits[1]}]") return fig - def _sample_trajectories_plot(self): + def _sample_trajectories_plot(self) -> Figure: from ..datasets import make_gaussian_process fd = make_gaussian_process( - start=-1, n_samples=10, cov=self, random_state=0) + start=-1, + n_samples=10, + cov=self, + random_state=0, + ) fig = fd.plot() fig.axes[0].set_title("Sample trajectories") return fig - def __repr__(self): + def __repr__(self) -> str: - params = ', '.join(f'{n}={getattr(self, n)}' - for n, _ in self._parameters) + params_str = ', '.join( + f'{n}={getattr(self, n)}' for n, _ in self._parameters_str + ) - return (f"{self.__module__}.{type(self).__qualname__}(" - f"{params}" - f")") + return ( + f"{self.__module__}.{type(self).__qualname__}(" + f"{params_str}" + f")" + ) - def _latex_content(self): - params = ''.join(fr'{l} &= {getattr(self, n)} \\' - for n, l in self._parameters) + def _latex_content(self) -> str: + params_str = ''.join( + fr'{l} &= {getattr(self, n)} \\' for n, l in self._parameters_str + ) - return (fr"{self._latex_formula} \\" - r"\text{where:}" - r"\begin{aligned}" - fr"\qquad{params}" - r"\end{aligned}") + return ( + fr"{self._latex_formula} \\" + r"\text{where:}" + r"\begin{aligned}" + fr"\qquad{params_str}" + r"\end{aligned}" + ) - def _repr_latex_(self): + def _repr_latex_(self) -> str: return fr"\(\displaystyle {self._latex_content()}\)" - def _repr_html_(self): + def _repr_html_(self) -> str: fig = self.heatmap() heatmap = _figure_to_svg(fig) plt.close(fig) @@ -115,18 +139,20 @@ def _repr_html_(self): row_style = '' - def column_style(percent, margin_top=0): - return (f'style="display: inline-block; ' - f'margin:0; ' - f'margin-top: {margin_top}; ' - f'width:{percent}%; ' - f'height:auto;' - f'vertical-align: middle"') - - html = f""" + def column_style(percent: float, margin_top: str = "0") -> str: + return ( + f'style="display: inline-block; ' + f'margin:0; ' + f'margin-top: {margin_top}; ' + f'width:{percent}%; ' + f'height:auto;' + f'vertical-align: middle"' + ) + + return fr"""
- \\[{self._latex_content()}\\] + \[{self._latex_content()}\]
@@ -139,12 +165,12 @@ def column_style(percent, margin_top=0):
""" - return html - - def to_sklearn(self): - """Convert it to a sklearn kernel, if there is one""" - raise NotImplementedError(f"{type(self).__name__} covariance not " - f"implemented in scikit-learn") + def to_sklearn(self) -> sklearn_kern.Kernel: + """Convert it to a sklearn kernel, if there is one.""" + raise NotImplementedError( + f"{type(self).__name__} covariance not " + f"implemented in scikit-learn", + ) class Brownian(Covariance): @@ -197,25 +223,32 @@ class Brownian(Covariance): Brownian() """ - _latex_formula = (r"K(x, x') = \sigma^2 \frac{|x - \mathcal{O}| + " - r"|x' - \mathcal{O}| - |x - x'|}{2}") + _latex_formula = ( + r"K(x, x') = \sigma^2 \frac{|x - \mathcal{O}| + " + r"|x' - \mathcal{O}| - |x - x'|}{2}" + ) - _parameters = [("variance", r"\sigma^2"), - ("origin", r"\mathcal{O}")] + _parameters_str = [ + ("variance", r"\sigma^2"), + ("origin", r"\mathcal{O}"), + ] - def __init__(self, *, variance: float = 1., origin=0.): + def __init__(self, *, variance: float = 1, origin: float = 0) -> None: self.variance = variance self.origin = origin - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: x = _transform_to_2d(x) - self.origin y = _transform_to_2d(y) - self.origin sum_norms = np.add.outer( np.linalg.norm(x, axis=-1), - np.linalg.norm(y, axis=-1)) + np.linalg.norm(y, axis=-1), + ) norm_sub = np.linalg.norm( - x[:, np.newaxis, :] - y[np.newaxis, :, :], axis=-1) + x[:, np.newaxis, :] - y[np.newaxis, :, :], + axis=-1, + ) return self.variance * (sum_norms - norm_sub) / 2 @@ -264,25 +297,29 @@ class Linear(Covariance): Linear() """ + _latex_formula = r"K(x, x') = \sigma^2 (x^T x' + c)" - _parameters = [("variance", r"\sigma^2"), - ("intercept", r"c")] + _parameters_str = [ + ("variance", r"\sigma^2"), + ("intercept", "c"), + ] - def __init__(self, *, variance: float=1., intercept: float=0.): + def __init__(self, *, variance: float = 1, intercept: float = 0) -> None: self.variance = variance self.intercept = intercept - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: x = _transform_to_2d(x) y = _transform_to_2d(y) return self.variance * (x @ y.T + self.intercept) - def to_sklearn(self): - """Convert it to a sklearn kernel, if there is one""" - return (self.variance * - (sklearn_kern.DotProduct(0) + self.intercept)) + def to_sklearn(self) -> sklearn_kern.Kernel: + return ( + self.variance + * (sklearn_kern.DotProduct(0) + self.intercept) + ) class Polynomial(Covariance): @@ -330,33 +367,44 @@ class Polynomial(Covariance): Polynomial() """ - _latex_formula = r"K(x, x') = \sigma^2 (\alpha x^T x' + c)^d" - _parameters = [("variance", r"\sigma^2"), - ("intercept", r"c"), - ("slope", r"\alpha"), - ("degree", r"d")] + _latex_formula = r"K(x, x') = \sigma^2 (\alpha x^T x' + c)^d" - def __init__(self, *, variance: float=1., intercept: float=0., - slope: float=1., degree: float=2.): + _parameters_str = [ + ("variance", r"\sigma^2"), + ("intercept", "c"), + ("slope", r"\alpha"), + ("degree", "d"), + ] + + def __init__( + self, + *, + variance: float = 1, + intercept: float = 0, + slope: float = 1, + degree: float = 2, + ) -> None: self.variance = variance self.intercept = intercept self.slope = slope self.degree = degree - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: x = _transform_to_2d(x) y = _transform_to_2d(y) - return self.variance * (self.slope * x @ y.T - + self.intercept) ** self.degree + return ( + self.variance + * (self.slope * x @ y.T + 
self.intercept) ** self.degree + ) - def to_sklearn(self): - """Convert it to a sklearn kernel, if there is one""" - return (self.variance * - (self.slope * - sklearn_kern.DotProduct(0) + + self.intercept) - ** self.degree) + def to_sklearn(self) -> sklearn_kern.Kernel: + return ( + self.variance + * (self.slope * sklearn_kern.DotProduct(0) + self.intercept) + ** self.degree + ) class Gaussian(Covariance): @@ -402,17 +450,22 @@ class Gaussian(Covariance): Gaussian() """ - _latex_formula = (r"K(x, x') = \sigma^2 \exp\left(-\frac{\|x - x'\|^2}{2l^2}" - r"\right)") - _parameters = [("variance", r"\sigma^2"), - ("length_scale", r"l")] + _latex_formula = ( + r"K(x, x') = \sigma^2 \exp\left(-\frac{\|x - x'\|^2}{2l^2}" + r"\right)" + ) - def __init__(self, *, variance: float=1., length_scale: float=1.): + _parameters_str = [ + ("variance", r"\sigma^2"), + ("length_scale", "l"), + ] + + def __init__(self, *, variance: float = 1, length_scale: float = 1): self.variance = variance self.length_scale = length_scale - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: x = _transform_to_2d(x) y = _transform_to_2d(y) @@ -420,10 +473,10 @@ def __call__(self, x, y): return self.variance * np.exp(-x_y / (2 * self.length_scale ** 2)) - def to_sklearn(self): - """Convert it to a sklearn kernel, if there is one""" - return (self.variance * - sklearn_kern.RBF(length_scale=self.length_scale)) + def to_sklearn(self) -> sklearn_kern.Kernel: + return ( + self.variance * sklearn_kern.RBF(length_scale=self.length_scale) + ) class Exponential(Covariance): @@ -469,27 +522,38 @@ class Exponential(Covariance): Exponential() """ - _latex_formula = (r"K(x, x') = \sigma^2 \exp\left(-\frac{||x - x'||}{l}" - r"\right)") - - _parameters = [("variance", r"\sigma^2"), - ("length_scale", r"l")] - def __init__(self, *, variance: float=1., length_scale: float=1.): + _latex_formula = ( + r"K(x, x') = \sigma^2 \exp\left(-\frac{||x - x'||}{l}" + r"\right)" + ) + + _parameters_str = [ + ("variance", r"\sigma^2"), + ("length_scale", "l"), + ] + + def __init__( + self, + *, + variance: float = 1, + length_scale: float = 1, + ) -> None: self.variance = variance self.length_scale = length_scale - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: x = _transform_to_2d(x) y = _transform_to_2d(y) x_y = _squared_norms(x, y) return self.variance * np.exp(-np.sqrt(x_y) / (self.length_scale)) - def to_sklearn(self): - """Convert it to a sklearn kernel, if there is one""" - return (self.variance * - sklearn_kern.Matern(length_scale=self.length_scale, nu=0.5)) + def to_sklearn(self) -> sklearn_kern.Kernel: + return ( + self.variance + * sklearn_kern.Matern(length_scale=self.length_scale, nu=0.5) + ) class WhiteNoise(Covariance): @@ -539,18 +603,19 @@ class WhiteNoise(Covariance): """ - _latex_formula = (r"K(x, x')= \begin{cases} \sigma^2, \quad x = x' \\" - r"0, \quad x \neq x'\\ \end{cases}") + _latex_formula = ( + r"K(x, x')= \begin{cases} \sigma^2, \quad x = x' \\" + r"0, \quad x \neq x'\\ \end{cases}" + ) - _parameters = [("variance", r"\sigma^2")] + _parameters_str = [("variance", r"\sigma^2")] - def __init__(self, *, variance: float = 1.): + def __init__(self, *, variance: float = 1): self.variance = variance - def __call__(self, x, y): + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: x = _transform_to_2d(x) return self.variance * np.eye(x.shape[0]) - def to_sklearn(self): - """Convert it to a sklearn kernel, if there is one""" + def 
to_sklearn(self) -> sklearn_kern.Kernel: return sklearn_kern.WhiteKernel(noise_level=self.variance) From eb490391d132c8a745f02000fb94b58b7ef09215 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 3 Oct 2021 20:37:41 +0200 Subject: [PATCH 011/117] Add Matern covariance. --- docs/modules/misc/covariances.rst | 1 + skfda/misc/covariances.py | 123 ++++++++++++++++++++++++++++++ tests/test_covariances.py | 98 ++++++++++++++++-------- 3 files changed, 190 insertions(+), 32 deletions(-) diff --git a/docs/modules/misc/covariances.rst b/docs/modules/misc/covariances.rst index f27137ef8..f7976d88d 100644 --- a/docs/modules/misc/covariances.rst +++ b/docs/modules/misc/covariances.rst @@ -14,4 +14,5 @@ processes. These functions can be used as covariances in skfda.misc.covariances.Gaussian skfda.misc.covariances.Linear skfda.misc.covariances.Polynomial + skfda.misc.covariances.Matern skfda.misc.covariances.WhiteNoise \ No newline at end of file diff --git a/skfda/misc/covariances.py b/skfda/misc/covariances.py index c712f70b5..7acda93fa 100644 --- a/skfda/misc/covariances.py +++ b/skfda/misc/covariances.py @@ -7,6 +7,7 @@ import numpy as np import sklearn.gaussian_process.kernels as sklearn_kern from matplotlib.figure import Figure +from scipy.special import gamma, kv from ..exploratory.visualization._utils import _create_figure, _figure_to_svg from ..representation._typing import ArrayLike, NDArrayFloat @@ -619,3 +620,125 @@ def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: def to_sklearn(self) -> sklearn_kern.Kernel: return sklearn_kern.WhiteKernel(noise_level=self.variance) + + +class Matern(Covariance): + r""" + Matérn covariance function. + + The covariance function is + + .. math:: + K(x, x') = \sigma^2 \frac{2^{1-\nu}}{\Gamma(\nu)} + \left( \frac{\sqrt{2\nu}|x - x'|}{l} \right)^{\nu} + K_{\nu}\left( \frac{\sqrt{2\nu}|x - x'|}{l} \right) + + where :math:`\sigma^2` is the variance, :math:`l` is the length scale + and :math:`\nu` controls the smoothness of the related Gaussian process. + The trajectories of a Gaussian process with Matérn covariance is + :math:`\lceil \nu \rceil - 1` times differentiable. + + + Heatmap plot of the covariance function: + + .. jupyter-execute:: + + from skfda.misc.covariances import Matern + import matplotlib.pyplot as plt + + Matern().heatmap(limits=(0, 1)) + plt.show() + + Example of Gaussian process trajectories using this covariance: + + .. jupyter-execute:: + + from skfda.misc.covariances import Matern + from skfda.datasets import make_gaussian_process + import matplotlib.pyplot as plt + + gp = make_gaussian_process( + n_samples=10, cov=Matern(), random_state=0) + gp.plot() + plt.show() + + Default representation in a Jupyter notebook: + + .. 
jupyter-execute:: + + from skfda.misc.covariances import Matern + + Matern() + + """ + _latex_formula = ( + r"K(x, x') = \sigma^2 \frac{2^{1-\nu}}{\Gamma(\nu)}" + r"\left( \frac{\sqrt{2\nu}|x - x'|}{l} \right)^{\nu}" + r"K_{\nu}\left( \frac{\sqrt{2\nu}|x - x'|}{l} \right)" + ) + + _parameters_str = [ + ("variance", r"\sigma^2"), + ("length_scale", "l"), + ("nu", r"\nu"), + ] + + def __init__( + self, + *, + variance: float = 1, + length_scale: float = 1, + nu: float = 1.5, + ) -> None: + self.variance = variance + self.length_scale = length_scale + self.nu = nu + + def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: + x = _transform_to_2d(x) + y = _transform_to_2d(y) + + x_y_squared = _squared_norms(x, y) + x_y = np.sqrt(x_y_squared) + + p = self.nu - 0.5 + if p.is_integer(): + # Formula for half-integers + p = int(p) + body = np.sqrt(2 * p + 1) * x_y / self.length_scale + exponential = np.exp(-body) + power_list = np.full(shape=(p,) + body.shape, fill_value=2 * body) + power_list = np.cumprod(power_list, axis=0) + power_list = np.concatenate( + (power_list[::-1], [np.ones_like(body)]), + ) + power_list = np.moveaxis(power_list, 0, -1) + numerator = np.cumprod(np.arange(p, 0, -1)) + numerator = np.concatenate(([1], numerator)) + denom1 = np.cumprod(np.arange(2 * p, p, -1)) + denom1 = np.concatenate((denom1[::-1], [1])) + denom2 = np.cumprod(np.arange(1, p + 1)) + denom2 = np.concatenate(([1], denom2)) + + sum_terms = power_list * numerator / (denom1 * denom2) + return self.variance * exponential * np.sum(sum_terms, axis=-1) + elif self.nu == np.inf: + return self.variance * np.exp(-x_y_squared / (2 * self.length_scale ** 2)) + else: + # General formula + scaling = 2**(1 - self.nu) / gamma(self.nu) + body = np.sqrt(2 * self.nu) * x_y / self.length_scale + power = body**self.nu + bessel = kv(self.nu, body) + + with np.errstate(invalid='ignore'): + eval_cov = self.variance * scaling * power * bessel + + # Values with nan are where the distance is 0 + return np.nan_to_num(eval_cov, nan=self.variance) + + def to_sklearn(self) -> sklearn_kern.Kernel: + return ( + self.variance + * sklearn_kern.Matern(length_scale=self.length_scale, nu=self.nu) + ) diff --git a/tests/test_covariances.py b/tests/test_covariances.py index a4e29024d..3f6b1be1f 100644 --- a/tests/test_covariances.py +++ b/tests/test_covariances.py @@ -1,70 +1,104 @@ import unittest import numpy as np + import skfda class TestsSklearn(unittest.TestCase): - def setUp(self): + def setUp(self) -> None: unittest.TestCase.setUp(self) self.x = np.linspace(-1, 1, 1000)[:, np.newaxis] - def _test_compare_sklearn(self, cov: skfda.misc.covariances.Covariance): + def _test_compare_sklearn( + self, + cov: skfda.misc.covariances.Covariance, + ) -> None: cov_sklearn = cov.to_sklearn() cov_matrix = cov(self.x, self.x) cov_sklearn_matrix = cov_sklearn(self.x) np.testing.assert_array_almost_equal(cov_matrix, cov_sklearn_matrix) - def test_linear(self): + def test_linear(self) -> None: - for variance in [1, 2]: - for intercept in [0, 1, 2]: + for variance in (1, 2): + for intercept in (0, 1, 2): with self.subTest(variance=variance, intercept=intercept): cov = skfda.misc.covariances.Linear( variance=variance, intercept=intercept) self._test_compare_sklearn(cov) - def test_polynomial(self): - - for variance in [1, 2]: - for intercept in [0, 1, 2]: - for slope in [1, 2]: - for degree in [1, 2, 3]: - with self.subTest(variance=variance, - intercept=intercept, - slope=slope, - degree=degree): + def test_polynomial(self) -> None: + + for variance 
in (1, 2): + for intercept in (0, 1, 2): + for slope in (1, 2): + for degree in (1, 2, 3): + with self.subTest( + variance=variance, + intercept=intercept, + slope=slope, + degree=degree, + ): cov = skfda.misc.covariances.Polynomial( - variance=variance, intercept=intercept, - slope=slope, degree=degree) + variance=variance, + intercept=intercept, + slope=slope, + degree=degree, + ) self._test_compare_sklearn(cov) - def test_gaussian(self): + def test_gaussian(self) -> None: - for variance in [1, 2]: - for length_scale in [0.5, 1, 2]: - with self.subTest(variance=variance, - length_scale=length_scale): + for variance in (1, 2): + for length_scale in (0.5, 1, 2): + with self.subTest( + variance=variance, + length_scale=length_scale, + ): cov = skfda.misc.covariances.Gaussian( - variance=variance, length_scale=length_scale) + variance=variance, + length_scale=length_scale, + ) self._test_compare_sklearn(cov) - def test_exponential(self): + def test_exponential(self) -> None: - for variance in [1, 2]: - for length_scale in [0.5, 1, 2]: - with self.subTest(variance=variance, - length_scale=length_scale): + for variance in (1, 2): + for length_scale in (0.5, 1, 2): + with self.subTest( + variance=variance, + length_scale=length_scale, + ): cov = skfda.misc.covariances.Exponential( - variance=variance, length_scale=length_scale) + variance=variance, + length_scale=length_scale, + ) self._test_compare_sklearn(cov) - def test_white_noise(self): - - for variance in [1, 2]: + def test_matern(self) -> None: + + for variance in (1, 2): + for length_scale in (0.5, 1, 2): + for nu in (0.5, 1, 1.5, 2, 2.5, 3.5, 4.5, np.inf): + with self.subTest( + variance=variance, + length_scale=length_scale, + nu=nu, + ): + cov = skfda.misc.covariances.Matern( + variance=variance, + length_scale=length_scale, + nu=nu, + ) + self._test_compare_sklearn(cov) + + def test_white_noise(self) -> None: + + for variance in (1, 2): with self.subTest(variance=variance): cov = skfda.misc.covariances.WhiteNoise(variance=variance) self._test_compare_sklearn(cov) From ff15eca94d083ddb1bb6ed97927290799c77124f Mon Sep 17 00:00:00 2001 From: VNMabus Date: Tue, 5 Oct 2021 18:45:42 +0200 Subject: [PATCH 012/117] Rename fisher rao registration. --- docs/modules/preprocessing/registration.rst | 2 +- examples/plot_elastic_registration.py | 6 ++--- examples/plot_pairwise_alignment.py | 8 +++---- skfda/misc/metrics/_fisher_rao.py | 6 ++--- skfda/preprocessing/registration/__init__.py | 2 +- .../preprocessing/registration/_fisher_rao.py | 16 +++++++------- tests/test_elastic.py | 22 +++++++++---------- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/modules/preprocessing/registration.rst b/docs/modules/preprocessing/registration.rst index ecb9797d0..838bcb896 100644 --- a/docs/modules/preprocessing/registration.rst +++ b/docs/modules/preprocessing/registration.rst @@ -64,7 +64,7 @@ introduction to this topic along the usage of the corresponding functions. .. 
autosummary:: :toctree: autosummary - skfda.preprocessing.registration.ElasticFisherRaoRegistration + skfda.preprocessing.registration.FisherRaoElasticRegistration Validation diff --git a/examples/plot_elastic_registration.py b/examples/plot_elastic_registration.py index e00465fe6..43dd4caa8 100644 --- a/examples/plot_elastic_registration.py +++ b/examples/plot_elastic_registration.py @@ -15,11 +15,11 @@ import skfda from skfda.datasets import fetch_growth, make_multimodal_samples from skfda.exploratory.stats import fisher_rao_karcher_mean -from skfda.preprocessing.registration import ElasticFisherRaoRegistration +from skfda.preprocessing.registration import FisherRaoElasticRegistration ############################################################################## # In the example of pairwise alignment was shown the usage of -# :class:`~skfda.preprocessing.registration.ElasticFisherRaoRegistration` to +# :class:`~skfda.preprocessing.registration.FisherRaoElasticRegistration` to # align a set of functional observations to a given template or a set of # templates. # @@ -54,7 +54,7 @@ # In this case, the alignment completely reduces the amplitude variability # between the samples, aligning the maximum points correctly. -elastic_registration = ElasticFisherRaoRegistration() +elastic_registration = FisherRaoElasticRegistration() fd_align = elastic_registration.fit_transform(fd) diff --git a/examples/plot_pairwise_alignment.py b/examples/plot_pairwise_alignment.py index fdf74c0d1..33b60f3b8 100644 --- a/examples/plot_pairwise_alignment.py +++ b/examples/plot_pairwise_alignment.py @@ -18,7 +18,7 @@ import skfda from skfda.datasets import make_multimodal_samples from skfda.preprocessing.registration import ( - ElasticFisherRaoRegistration, + FisherRaoElasticRegistration, invert_warping, ) @@ -56,12 +56,12 @@ # In this example :math:`g` will be used as template and :math:`f` will be # aligned to it. In the following figure it is shown the result of the # registration process, wich can be computed using -# :class:`~skfda.preprocessing.registration.ElasticFisherRaoRegistration`. +# :class:`~skfda.preprocessing.registration.FisherRaoElasticRegistration`. 
# f, g = fd[0], fd[1] -elastic_registration = ElasticFisherRaoRegistration(template=g) +elastic_registration = FisherRaoElasticRegistration(template=g) # Aligns f to g @@ -198,7 +198,7 @@ # # Registration of the sets -elastic_registration = ElasticFisherRaoRegistration(template=g) +elastic_registration = FisherRaoElasticRegistration(template=g) fd_registered = elastic_registration.fit_transform(fd) diff --git a/skfda/misc/metrics/_fisher_rao.py b/skfda/misc/metrics/_fisher_rao.py index bdf6b61a5..2d94af7d3 100644 --- a/skfda/misc/metrics/_fisher_rao.py +++ b/skfda/misc/metrics/_fisher_rao.py @@ -6,7 +6,7 @@ import scipy.integrate from ..._utils import normalize_scale, normalize_warping -from ...preprocessing.registration import ElasticFisherRaoRegistration +from ...preprocessing.registration import FisherRaoElasticRegistration from ...representation import FData from ...representation._typing import NDArrayFloat from ..operators import SRSF @@ -164,7 +164,7 @@ def fisher_rao_amplitude_distance( domain_range=(0, 1), ) - elastic_registration = ElasticFisherRaoRegistration( + elastic_registration = FisherRaoElasticRegistration( template=fdata2, penalty=lam, output_points=eval_points_normalized, @@ -258,7 +258,7 @@ def fisher_rao_phase_distance( domain_range=(0, 1), ) - elastic_registration = ElasticFisherRaoRegistration( + elastic_registration = FisherRaoElasticRegistration( penalty=lam, template=fdata2, output_points=eval_points_normalized, diff --git a/skfda/preprocessing/registration/__init__.py b/skfda/preprocessing/registration/__init__.py index 904d3b2e8..dd5a91482 100644 --- a/skfda/preprocessing/registration/__init__.py +++ b/skfda/preprocessing/registration/__init__.py @@ -6,7 +6,7 @@ from ..._utils import invert_warping, normalize_warping from . import validation -from ._fisher_rao import ElasticFisherRaoRegistration, ElasticRegistration +from ._fisher_rao import ElasticRegistration, FisherRaoElasticRegistration from ._landmark_registration import ( landmark_registration, landmark_registration_warping, diff --git a/skfda/preprocessing/registration/_fisher_rao.py b/skfda/preprocessing/registration/_fisher_rao.py index 5ab761c2c..7f19df145 100644 --- a/skfda/preprocessing/registration/_fisher_rao.py +++ b/skfda/preprocessing/registration/_fisher_rao.py @@ -18,7 +18,7 @@ _MeanType = Callable[[FDataGrid], FDataGrid] -class ElasticFisherRaoRegistration(RegistrationTransformer): +class FisherRaoElasticRegistration(RegistrationTransformer): r"""Align a FDatagrid using the SRSF framework. Let :math:`f` be a function of the functional data object wich will be @@ -82,7 +82,7 @@ class ElasticFisherRaoRegistration(RegistrationTransformer): Elastic registration of with train/test sets. >>> from skfda.preprocessing.registration import ( - ... ElasticFisherRaoRegistration, + ... FisherRaoElasticRegistration, ... ) >>> from skfda.datasets import make_multimodal_samples >>> X_train = make_multimodal_samples(n_samples=15, random_state=0) @@ -91,9 +91,9 @@ class ElasticFisherRaoRegistration(RegistrationTransformer): Fit the transformer, which learns the elastic mean of the train set as template. - >>> elastic_registration = ElasticFisherRaoRegistration() + >>> elastic_registration = FisherRaoElasticRegistration() >>> elastic_registration.fit(X_train) - ElasticFisherRaoRegistration(...) + FisherRaoElasticRegistration(...) Registration of the test set. @@ -245,14 +245,14 @@ def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: Center the datasets taking into account the misalignment. 
>>> from skfda.preprocessing.registration import ( - ... ElasticFisherRaoRegistration, + ... FisherRaoElasticRegistration, ... ) >>> from skfda.datasets import make_multimodal_samples >>> X = make_multimodal_samples(random_state=0) Registration of the dataset. - >>> elastic_registration = ElasticFisherRaoRegistration() + >>> elastic_registration = FisherRaoElasticRegistration() >>> X = elastic_registration.fit_transform(X) Substract the elastic mean build as template during the @@ -286,7 +286,7 @@ def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: return X.compose(inverse_warping, eval_points=self.output_points) -class ElasticRegistration(ElasticFisherRaoRegistration): +class ElasticRegistration(FisherRaoElasticRegistration): def __init__( self, @@ -297,7 +297,7 @@ def __init__( ) -> None: warnings.warn( "ElasticRegistration is deprecated. " - "Use ElasticFisherRaoRegistration instead.", + "Use FisherRaoElasticRegistration instead.", DeprecationWarning, ) super().__init__( diff --git a/tests/test_elastic.py b/tests/test_elastic.py index e42e81b1d..9527e4491 100644 --- a/tests/test_elastic.py +++ b/tests/test_elastic.py @@ -20,13 +20,13 @@ l2_distance, ) from skfda.misc.operators import SRSF -from skfda.preprocessing.registration import ElasticFisherRaoRegistration +from skfda.preprocessing.registration import FisherRaoElasticRegistration metric = PairwiseMetric(l2_distance) pairwise_fisher_rao = PairwiseMetric(fisher_rao_distance) -class TestElasticFisherRaoRegistration(unittest.TestCase): +class TestFisherRaoElasticRegistration(unittest.TestCase): """Test elastic registration.""" def setUp(self) -> None: @@ -107,7 +107,7 @@ def test_srsf_conversion(self) -> None: def test_template_alignment(self) -> None: """Test alignment to 1 template.""" - reg = ElasticFisherRaoRegistration(template=self.template) + reg = FisherRaoElasticRegistration(template=self.template) register = reg.fit_transform(self.unimodal_samples) distances = metric(self.template, register) @@ -115,7 +115,7 @@ def test_template_alignment(self) -> None: def test_one_to_one_alignment(self) -> None: """Test alignment to 1 sample to a template.""" - reg = ElasticFisherRaoRegistration(template=self.template) + reg = FisherRaoElasticRegistration(template=self.template) register = reg.fit_transform(self.unimodal_samples[0]) distances = metric(self.template, register) @@ -124,7 +124,7 @@ def test_one_to_one_alignment(self) -> None: def test_set_alignment(self) -> None: """Test alignment 3 curves to set with 3 templates.""" # Should give same result than test_template_alignment - reg = ElasticFisherRaoRegistration(template=self.template_rep) + reg = FisherRaoElasticRegistration(template=self.template_rep) register = reg.fit_transform(self.unimodal_samples) distances = metric(self.template, register) @@ -133,7 +133,7 @@ def test_set_alignment(self) -> None: def test_default_alignment(self) -> None: """Test alignment by default.""" # Should give same result than test_template_alignment - reg = ElasticFisherRaoRegistration() + reg = FisherRaoElasticRegistration() register = reg.fit_transform(self.unimodal_samples) values = register([-0.25, -0.1, 0, 0.1, 0.25]) @@ -155,7 +155,7 @@ def test_default_alignment(self) -> None: def test_callable_alignment(self) -> None: """Test alignment by default.""" # Should give same result than test_template_alignment - reg = ElasticFisherRaoRegistration(template=fisher_rao_karcher_mean) + reg = FisherRaoElasticRegistration(template=fisher_rao_karcher_mean) register = 
reg.fit_transform(self.unimodal_samples) values = register([-0.25, -0.1, 0, 0.1, 0.25]) @@ -175,7 +175,7 @@ def test_callable_alignment(self) -> None: def test_simmetry_of_aligment(self) -> None: """Check registration using inverse composition.""" - reg = ElasticFisherRaoRegistration(template=self.template) + reg = FisherRaoElasticRegistration(template=self.template) reg.fit_transform(self.unimodal_samples) warping = reg.warping_ inverse = invert_warping(warping) @@ -186,7 +186,7 @@ def test_simmetry_of_aligment(self) -> None: def test_raises(self) -> None: """Test that the assertions raise when appropriate.""" - reg = ElasticFisherRaoRegistration() + reg = FisherRaoElasticRegistration() # Inverse transform without previous transform with np.testing.assert_raises(ValueError): @@ -198,7 +198,7 @@ def test_raises(self) -> None: reg.inverse_transform(self.unimodal_samples[0]) # FDataGrid as template with n != 1 and n!= n_samples to transform - reg = ElasticFisherRaoRegistration(template=self.unimodal_samples).fit( + reg = FisherRaoElasticRegistration(template=self.unimodal_samples).fit( self.unimodal_samples[0], ) with np.testing.assert_raises(ValueError): @@ -206,7 +206,7 @@ def test_raises(self) -> None: def test_score(self) -> None: """Test score method of the transformer.""" - reg = ElasticFisherRaoRegistration() + reg = FisherRaoElasticRegistration() reg.fit(self.unimodal_samples) score = reg.score(self.unimodal_samples) np.testing.assert_almost_equal(score, 0.999389) From 98bf97076e0179b2e0f9e248be75a5a8dde917e4 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Tue, 5 Oct 2021 22:54:30 +0200 Subject: [PATCH 013/117] Rename landmark_shift to landmark_shift_registration. --- docs/modules/preprocessing/registration.rst | 2 +- examples/plot_landmark_shift.py | 14 +++--- skfda/preprocessing/registration/__init__.py | 1 + .../registration/_landmark_registration.py | 44 +++++++++++++------ tests/test_registration.py | 24 +++++++--- 5 files changed, 60 insertions(+), 25 deletions(-) diff --git a/docs/modules/preprocessing/registration.rst b/docs/modules/preprocessing/registration.rst index 838bcb896..3be271f67 100644 --- a/docs/modules/preprocessing/registration.rst +++ b/docs/modules/preprocessing/registration.rst @@ -36,7 +36,7 @@ by performing a translation in the time scale. See the .. autosummary:: :toctree: autosummary - skfda.preprocessing.registration.landmark_shift + skfda.preprocessing.registration.landmark_shift_registration skfda.preprocessing.registration.landmark_shift_deltas diff --git a/examples/plot_landmark_shift.py b/examples/plot_landmark_shift.py index ec1722581..4723398e0 100644 --- a/examples/plot_landmark_shift.py +++ b/examples/plot_landmark_shift.py @@ -22,7 +22,8 @@ # :func:`~skfda.datasets.make_multimodal_samples`, which in this case will be # used to generate gaussian-like samples with a mode near to 0. # Each sample will be shifted to align their modes to a reference point using -# the function :func:`~skfda.preprocessing.registration.landmark_shift`. +# the function +# :func:`~skfda.preprocessing.registration.landmark_shift_registration`. fd = skfda.datasets.make_multimodal_samples(random_state=1) fd.extrapolation = 'bounds' #  See extrapolation for a detailed explanation. @@ -63,7 +64,7 @@ # The following figure shows the result of shifting the curves to align their # landmarks at 0. 
-fd_registered = skfda.preprocessing.registration.landmark_shift( +fd_registered = skfda.preprocessing.registration.landmark_shift_registration( fd, landmarks, location=0, @@ -81,14 +82,14 @@ # the point that minimizes the maximum amount of shift. # Curves aligned restricting the domain -fd_restricted = skfda.preprocessing.registration.landmark_shift( +fd_restricted = skfda.preprocessing.registration.landmark_shift_registration( fd, landmarks, restrict_domain=True, ) # Curves aligned to default point without restrict domain -fd_extrapolated = skfda.preprocessing.registration.landmark_shift( +fd_extrapolated = skfda.preprocessing.registration.landmark_shift_registration( fd, landmarks, ) @@ -127,7 +128,10 @@ # or by default will be chosen the point that minimizes the maximum amount # of displacement. -fd_registered = skfda.preprocessing.registration.landmark_shift(fd, landmarks) +fd_registered = skfda.preprocessing.registration.landmark_shift_registration( + fd, + landmarks, +) fd_registered.plot() diff --git a/skfda/preprocessing/registration/__init__.py b/skfda/preprocessing/registration/__init__.py index dd5a91482..a4ebed94a 100644 --- a/skfda/preprocessing/registration/__init__.py +++ b/skfda/preprocessing/registration/__init__.py @@ -12,5 +12,6 @@ landmark_registration_warping, landmark_shift, landmark_shift_deltas, + landmark_shift_registration, ) from ._shift_registration import ShiftRegistration diff --git a/skfda/preprocessing/registration/_landmark_registration.py b/skfda/preprocessing/registration/_landmark_registration.py index 2bdafaaa6..11bd8db70 100644 --- a/skfda/preprocessing/registration/_landmark_registration.py +++ b/skfda/preprocessing/registration/_landmark_registration.py @@ -4,12 +4,13 @@ """ from __future__ import annotations -from typing import Callable, Optional, Sequence, Union +import warnings +from typing import Any, Callable, Optional, Sequence, Union import numpy as np from ...representation import FData, FDataGrid -from ...representation._typing import ArrayLike, GridPointsLike +from ...representation._typing import ArrayLike, GridPointsLike, NDArrayFloat from ...representation.extrapolation import ExtrapolationLike from ...representation.interpolation import SplineInterpolation @@ -21,7 +22,7 @@ def landmark_shift_deltas( fd: FData, landmarks: ArrayLike, location: Union[_FixedLocation, _LocationCallable, None] = None, -) -> np.ndarray: +) -> NDArrayFloat: r"""Return the corresponding shifts to align the landmarks of the curves. Let :math:`t^*` the time where the landmarks of the curves will be @@ -83,7 +84,7 @@ def landmark_shift_deltas( f" length than the number of samples ({fd.n_samples})", ) - loc_array: Union[float, Sequence[float], np.ndarray] + loc_array: Union[float, Sequence[float], NDArrayFloat] # Parses location if location is None: @@ -102,6 +103,20 @@ def landmark_shift_deltas( def landmark_shift( + *args: Any, + **kwargs: Any, +) -> FDataGrid: + + warnings.warn( + "Function 'landmark_shift' has been renamed. " + "Use 'landmark_shift_registration' instead.", + DeprecationWarning, + ) + + return landmark_shift_registration(*args, **kwargs) + + +def landmark_shift_registration( fd: FData, landmarks: ArrayLike, location: Union[_FixedLocation, _LocationCallable, None] = None, @@ -110,16 +125,17 @@ def landmark_shift( extrapolation: Optional[ExtrapolationLike] = None, grid_points: Optional[GridPointsLike] = None, ) -> FDataGrid: - r"""Perform a shift of the curves to align the landmarks. + r""" + Perform a shift of the curves to align the landmarks. 
- Let :math:`t^*` the time where the landmarks of the curves will be - aligned, :math:`t_i` the location of the landmarks for each curve - and :math:`\delta_i= t_i - t^*`. + Let :math:`t^*` the time where the landmarks of the curves will be + aligned, :math:`t_i` the location of the landmarks for each curve + and :math:`\delta_i= t_i - t^*`. - The registered samples will have their feature aligned. + The registered samples will have their feature aligned. - .. math:: - x_i^*(t^*)=x_i(t^* + \delta_i)=x_i(t_i) + .. math:: + x_i^*(t^*)=x_i(t^* + \delta_i)=x_i(t_i) Args: fd: Functional data object. @@ -149,7 +165,9 @@ def landmark_shift( Examples: >>> from skfda.datasets import make_multimodal_landmarks >>> from skfda.datasets import make_multimodal_samples - >>> from skfda.preprocessing.registration import landmark_shift + >>> from skfda.preprocessing.registration import ( + ... landmark_shift_registration, + ... ) We will create a data with landmarks as example @@ -159,7 +177,7 @@ def landmark_shift( The function will return the sample registered - >>> landmark_shift(fd, landmarks) + >>> landmark_shift_registration(fd, landmarks) FDataGrid(...) """ diff --git a/tests/test_registration.py b/tests/test_registration.py index 7b36f533e..7944602dd 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -15,8 +15,8 @@ ShiftRegistration, landmark_registration, landmark_registration_warping, - landmark_shift, landmark_shift_deltas, + landmark_shift_registration, ) from skfda.preprocessing.registration.validation import ( AmplitudePhaseDecomposition, @@ -110,7 +110,7 @@ def test_landmark_shift_deltas(self): shifts = landmark_shift_deltas(fd, landmarks).round(3) np.testing.assert_almost_equal(shifts, [0.25, -0.25, -0.231]) - def test_landmark_shift(self): + def test_landmark_shift_registration(self): fd = make_multimodal_samples(n_samples=3, random_state=1) landmarks = make_multimodal_landmarks(n_samples=3, random_state=1) @@ -119,28 +119,40 @@ def test_landmark_shift(self): original_modes = fd(landmarks.reshape((3, 1, 1)), aligned=False) # Test default location - fd_registered = landmark_shift(fd, landmarks) + fd_registered = landmark_shift_registration(fd, landmarks) center = (landmarks.max() + landmarks.min()) / 2 reg_modes = fd_registered(center) # Test callable location np.testing.assert_almost_equal(reg_modes, original_modes, decimal=2) - fd_registered = landmark_shift(fd, landmarks, location=np.mean) + fd_registered = landmark_shift_registration( + fd, + landmarks, + location=np.mean, + ) center = np.mean(landmarks) reg_modes = fd_registered(center) np.testing.assert_almost_equal(reg_modes, original_modes, decimal=2) # Test integer location - fd_registered = landmark_shift(fd, landmarks, location=0) + fd_registered = landmark_shift_registration( + fd, + landmarks, + location=0, + ) center = np.mean(landmarks) reg_modes = fd_registered(0) np.testing.assert_almost_equal(reg_modes, original_modes, decimal=2) # Test array location - fd_registered = landmark_shift(fd, landmarks, location=[0, 0.1, 0.2]) + fd_registered = landmark_shift_registration( + fd, + landmarks, + location=[0, 0.1, 0.2], + ) reg_modes = fd_registered([[0], [.1], [.2]], aligned=False) np.testing.assert_almost_equal(reg_modes, original_modes, decimal=2) From ca5924e8a694b1d4dc4b9a5b89ae0bbbb6045719 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Tue, 5 Oct 2021 23:18:41 +0200 Subject: [PATCH 014/117] Rename landmark_registration to landmark_elastic_registration --- docs/modules/preprocessing/registration.rst | 4 
+-- examples/plot_landmark_registration.py | 8 ++--- skfda/preprocessing/registration/__init__.py | 3 +- .../registration/_landmark_registration.py | 30 ++++++++++++++----- tests/test_registration.py | 19 ++++++------ 5 files changed, 41 insertions(+), 23 deletions(-) diff --git a/docs/modules/preprocessing/registration.rst b/docs/modules/preprocessing/registration.rst index 3be271f67..72ebf2fa8 100644 --- a/docs/modules/preprocessing/registration.rst +++ b/docs/modules/preprocessing/registration.rst @@ -47,8 +47,8 @@ See the :ref:`sphx_glr_auto_examples_plot_landmark_registration.py` example. .. autosummary:: :toctree: autosummary - skfda.preprocessing.registration.landmark_registration - skfda.preprocessing.registration.landmark_registration_warping + skfda.preprocessing.registration.landmark_elastic_registration + skfda.preprocessing.registration.landmark_elastic_registration_warping Elastic Registration diff --git a/examples/plot_landmark_registration.py b/examples/plot_landmark_registration.py index dbcfc7d51..250da7cef 100644 --- a/examples/plot_landmark_registration.py +++ b/examples/plot_landmark_registration.py @@ -66,7 +66,7 @@ # # After the identification of the landmarks asociated with the features of # each of our curves we can construct the warping function with the function -# :func:`~skfda.preprocessing.registration.landmark_registration_warping`. +# :func:`~skfda.preprocessing.registration.landmark_elastic_registration_warping`. # # Let :math:`h_i` be the warping function corresponding with the curve # :math:`i`, :math:`t_{ij}` the time where the curve :math:`i` has their @@ -79,7 +79,7 @@ # # In this case we will place the landmarks at -0.5 and 0.5. -warping = skfda.preprocessing.registration.landmark_registration_warping( +warping = skfda.preprocessing.registration.landmark_elastic_registration_warping( fd, landmarks, location=[-0.5, 0.5], @@ -109,13 +109,13 @@ # # If we do not need the warping function we can obtain the registered curves # directly using the function -# :func:`~skfda.preprocessing.registration.landmark_registration`. +# :func:`~skfda.preprocessing.registration.landmark_elastic_registration`. # # If the position of the new location of the landmarks is not specified the # mean position is taken. # -fd_registered = skfda.preprocessing.registration.landmark_registration( +fd_registered = skfda.preprocessing.registration.landmark_elastic_registration( fd, landmarks, ) diff --git a/skfda/preprocessing/registration/__init__.py b/skfda/preprocessing/registration/__init__.py index a4ebed94a..287dc2309 100644 --- a/skfda/preprocessing/registration/__init__.py +++ b/skfda/preprocessing/registration/__init__.py @@ -8,8 +8,9 @@ from . 
import validation from ._fisher_rao import ElasticRegistration, FisherRaoElasticRegistration from ._landmark_registration import ( + landmark_elastic_registration, + landmark_elastic_registration_warping, landmark_registration, - landmark_registration_warping, landmark_shift, landmark_shift_deltas, landmark_shift_registration, diff --git a/skfda/preprocessing/registration/_landmark_registration.py b/skfda/preprocessing/registration/_landmark_registration.py index 11bd8db70..e96b5296b 100644 --- a/skfda/preprocessing/registration/_landmark_registration.py +++ b/skfda/preprocessing/registration/_landmark_registration.py @@ -191,7 +191,7 @@ def landmark_shift_registration( ) -def landmark_registration_warping( +def landmark_elastic_registration_warping( fd: FData, landmarks: ArrayLike, *, @@ -235,7 +235,7 @@ def landmark_registration_warping( >>> from skfda.datasets import make_multimodal_landmarks >>> from skfda.datasets import make_multimodal_samples >>> from skfda.preprocessing.registration import ( - ... landmark_registration_warping) + ... landmark_elastic_registration_warping) We will create a data with landmarks as example @@ -247,7 +247,7 @@ def landmark_registration_warping( The function will return the corresponding warping function - >>> warping = landmark_registration_warping(fd, landmarks) + >>> warping = landmark_elastic_registration_warping(fd, landmarks) >>> warping FDataGrid(...) @@ -319,6 +319,20 @@ def landmark_registration_warping( def landmark_registration( + *args: Any, + **kwargs: Any, +) -> FDataGrid: + + warnings.warn( + "Function 'landmark_registration' has been renamed. " + "Use 'landmark_elastic_registration' instead.", + DeprecationWarning, + ) + + return landmark_elastic_registration(*args, **kwargs) + + +def landmark_elastic_registration( fd: FData, landmarks: ArrayLike, *, @@ -357,7 +371,9 @@ def landmark_registration( Examples: >>> from skfda.datasets import make_multimodal_landmarks >>> from skfda.datasets import make_multimodal_samples - >>> from skfda.preprocessing.registration import landmark_registration + >>> from skfda.preprocessing.registration import ( + ... landmark_elastic_registration, + ... ) >>> from skfda.representation.basis import BSpline We will create a data with landmarks as example @@ -370,17 +386,17 @@ def landmark_registration( The function will return the registered curves - >>> landmark_registration(fd, landmarks) + >>> landmark_elastic_registration(fd, landmarks) FDataGrid(...) This method will work for FDataBasis as for FDataGrids >>> fd = fd.to_basis(BSpline(n_basis=12)) - >>> landmark_registration(fd, landmarks) + >>> landmark_elastic_registration(fd, landmarks) FDataGrid(...) 
""" - warping = landmark_registration_warping( + warping = landmark_elastic_registration_warping( fd, landmarks, location=location, diff --git a/tests/test_registration.py b/tests/test_registration.py index 7944602dd..3d2bbe728 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -13,8 +13,8 @@ from skfda.exploratory.stats import mean from skfda.preprocessing.registration import ( ShiftRegistration, - landmark_registration, - landmark_registration_warping, + landmark_elastic_registration, + landmark_elastic_registration_warping, landmark_shift_deltas, landmark_shift_registration, ) @@ -157,25 +157,26 @@ def test_landmark_shift_registration(self): np.testing.assert_almost_equal(reg_modes, original_modes, decimal=2) - def test_landmark_registration_warping(self): + def test_landmark_elastic_registration_warping(self): fd = make_multimodal_samples(n_samples=3, n_modes=2, random_state=9) landmarks = make_multimodal_landmarks(n_samples=3, n_modes=2, random_state=9) landmarks = landmarks.squeeze() # Default location - warping = landmark_registration_warping(fd, landmarks) + warping = landmark_elastic_registration_warping(fd, landmarks) center = (landmarks.max(axis=0) + landmarks.min(axis=0)) / 2 np.testing.assert_almost_equal( warping(center)[..., 0], landmarks, decimal=1) # Fixed location center = [.3, .6] - warping = landmark_registration_warping(fd, landmarks, location=center) + warping = landmark_elastic_registration_warping( + fd, landmarks, location=center) np.testing.assert_almost_equal( warping(center)[..., 0], landmarks, decimal=3) - def test_landmark_registration(self): + def test_landmark_elastic_registration(self): fd = make_multimodal_samples(n_samples=3, n_modes=2, random_state=9) landmarks = make_multimodal_landmarks(n_samples=3, n_modes=2, random_state=9) @@ -184,14 +185,14 @@ def test_landmark_registration(self): original_values = fd(landmarks.reshape(3, 2), aligned=False) # Default location - fd_reg = landmark_registration(fd, landmarks) + fd_reg = landmark_elastic_registration(fd, landmarks) center = (landmarks.max(axis=0) + landmarks.min(axis=0)) / 2 np.testing.assert_almost_equal(fd_reg(center), original_values, decimal=2) # Fixed location center = [.3, .6] - fd_reg = landmark_registration(fd, landmarks, location=center) + fd_reg = landmark_elastic_registration(fd, landmarks, location=center) np.testing.assert_array_almost_equal(fd_reg(center), original_values, decimal=2) @@ -400,7 +401,7 @@ def test_mse_decomposition(self) -> None: fd = make_multimodal_samples(n_samples=3, random_state=1) landmarks = make_multimodal_landmarks(n_samples=3, random_state=1) landmarks = landmarks.squeeze() - warping = landmark_registration_warping(fd, landmarks) + warping = landmark_elastic_registration_warping(fd, landmarks) fd_registered = fd.compose(warping) scorer = AmplitudePhaseDecomposition() ret = scorer.stats(fd, fd_registered) From 3ae75f9b969db03ccb9016517c23e74a336246c7 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Wed, 6 Oct 2021 00:13:59 +0200 Subject: [PATCH 015/117] Rename ShiftRegistration to LeastSquaresShiftRegistration --- docs/modules/preprocessing/registration.rst | 2 +- examples/plot_shift_registration.py | 10 +-- skfda/preprocessing/registration/__init__.py | 5 +- .../preprocessing/registration/_fisher_rao.py | 2 +- ...ration.py => _lstsq_shift_registration.py} | 62 +++++++++++++++---- .../preprocessing/registration/validation.py | 32 ++++++---- tests/test_registration.py | 33 +++++----- tutorial/plot_skfda_sklearn.py | 3 +- 8 files changed, 100 
insertions(+), 49 deletions(-) rename skfda/preprocessing/registration/{_shift_registration.py => _lstsq_shift_registration.py} (88%) diff --git a/docs/modules/preprocessing/registration.rst b/docs/modules/preprocessing/registration.rst index 72ebf2fa8..5e0ef379c 100644 --- a/docs/modules/preprocessing/registration.rst +++ b/docs/modules/preprocessing/registration.rst @@ -20,7 +20,7 @@ is shown the basic usage of this method. .. autosummary:: :toctree: autosummary - skfda.preprocessing.registration.ShiftRegistration + skfda.preprocessing.registration.LeastSquaresShiftRegistration Landmark Registration diff --git a/examples/plot_shift_registration.py b/examples/plot_shift_registration.py index 4b6c128e6..b87ec0339 100644 --- a/examples/plot_shift_registration.py +++ b/examples/plot_shift_registration.py @@ -14,7 +14,7 @@ import matplotlib.pyplot as plt from skfda.datasets import make_sinusoidal_process -from skfda.preprocessing.registration import ShiftRegistration +from skfda.preprocessing.registration import LeastSquaresShiftRegistration from skfda.representation.basis import Fourier ############################################################################## @@ -41,14 +41,14 @@ ############################################################################## # We will use the -# :func:`~skfda.preprocessing.registration.ShiftRegistration` transformer, -# which is suitable due to the periodicity of the dataset and the small -# amount of amplitude variation. +# :func:`~skfda.preprocessing.registration.LeastSquaresShiftRegistration` +# transformer, which is suitable due to the periodicity of the dataset and +# the small amount of amplitude variation. # # We can observe how the sinusoidal pattern is easily distinguishable # once the alignment has been made. -shift_registration = ShiftRegistration() +shift_registration = LeastSquaresShiftRegistration() fd_registered = shift_registration.fit_transform(fd_basis) fd_registered.plot() diff --git a/skfda/preprocessing/registration/__init__.py b/skfda/preprocessing/registration/__init__.py index 287dc2309..8f857ae08 100644 --- a/skfda/preprocessing/registration/__init__.py +++ b/skfda/preprocessing/registration/__init__.py @@ -15,4 +15,7 @@ landmark_shift_deltas, landmark_shift_registration, ) -from ._shift_registration import ShiftRegistration +from ._lstsq_shift_registration import ( + LeastSquaresShiftRegistration, + ShiftRegistration, +) diff --git a/skfda/preprocessing/registration/_fisher_rao.py b/skfda/preprocessing/registration/_fisher_rao.py index 7f19df145..ea25aae5f 100644 --- a/skfda/preprocessing/registration/_fisher_rao.py +++ b/skfda/preprocessing/registration/_fisher_rao.py @@ -296,7 +296,7 @@ def __init__( grid_dim: int = 7, ) -> None: warnings.warn( - "ElasticRegistration is deprecated. " + "ElasticRegistration has been renamed. 
" "Use FisherRaoElasticRegistration instead.", DeprecationWarning, ) diff --git a/skfda/preprocessing/registration/_shift_registration.py b/skfda/preprocessing/registration/_lstsq_shift_registration.py similarity index 88% rename from skfda/preprocessing/registration/_shift_registration.py rename to skfda/preprocessing/registration/_lstsq_shift_registration.py index 0416e2e49..d6f88667d 100644 --- a/skfda/preprocessing/registration/_shift_registration.py +++ b/skfda/preprocessing/registration/_lstsq_shift_registration.py @@ -1,6 +1,7 @@ """Class to apply Shift Registration to functional data""" from __future__ import annotations +import warnings from typing import Callable, Optional, Tuple, TypeVar, Union import numpy as np @@ -11,7 +12,7 @@ from ..._utils import check_is_univariate from ...misc._math import inner_product from ...misc.metrics._lp_norms import l2_norm -from ...representation._typing import ArrayLike, GridPointsLike +from ...representation._typing import ArrayLike, GridPointsLike, NDArrayFloat from ...representation.extrapolation import ExtrapolationLike from .base import RegistrationTransformer @@ -19,8 +20,8 @@ TemplateFunction = Callable[[FDataGrid], FDataGrid] -class ShiftRegistration(RegistrationTransformer): - r"""Register a functional dataset using shift alignment. +class LeastSquaresShiftRegistration(RegistrationTransformer): + r"""Register data using shift alignment by least squares criterion. Realizes the registration of a set of curves using a shift aligment :footcite:`ramsay+silverman_2005_functional_shift`. @@ -92,7 +93,9 @@ class ShiftRegistration(RegistrationTransformer): the method. Examples: - >>> from skfda.preprocessing.registration import ShiftRegistration + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) >>> from skfda.datasets import make_sinusoidal_process >>> from skfda.representation.basis import Fourier @@ -101,7 +104,7 @@ class ShiftRegistration(RegistrationTransformer): >>> fd = make_sinusoidal_process(n_samples=10, error_std=0, ... random_state=1) - >>> reg = ShiftRegistration(extrapolation="periodic") + >>> reg = LeastSquaresShiftRegistration(extrapolation="periodic") >>> fd_registered = reg.fit_transform(fd) >>> fd_registered FDataGrid(...) @@ -153,7 +156,7 @@ def _compute_deltas( self, fd: FData, template: Union[Literal["mean"], FData, TemplateFunction], - ) -> Tuple[np.ndarray, FDataGrid]: + ) -> Tuple[NDArrayFloat, FDataGrid]: """Compute the shifts to perform the registration. Args: @@ -249,7 +252,8 @@ def _compute_deltas( return delta, template_iter def fit_transform(self, X: T, y: None = None) -> T: - """Fit the estimator and transform the data. + """ + Fit the estimator and transform the data. Args: X: Functional dataset to be transformed. @@ -271,7 +275,7 @@ def fit_transform(self, X: T, y: None = None) -> T: grid_points=self.grid_points, ) - def fit(self, X: FData, y: None = None) -> ShiftRegistration: + def fit(self, X: FData, y: None = None) -> LeastSquaresShiftRegistration: """Fit the estimator. Args: @@ -307,7 +311,8 @@ def fit(self, X: FData, y: None = None) -> ShiftRegistration: return self def transform(self, X: FData, y: None = None) -> FDataGrid: - """Register the data. + """ + Register the data. Transforms the data using the template previously learned during fitting. @@ -346,7 +351,8 @@ def transform(self, X: FData, y: None = None) -> FDataGrid: ) def inverse_transform(self, X: FData, y: None = None) -> FDataGrid: - """Applies the inverse transformation. 
+ """ + Apply the inverse transformation. Applies the opossite shift used in the last call to `transform`. @@ -361,14 +367,16 @@ def inverse_transform(self, X: FData, y: None = None) -> FDataGrid: Creates a synthetic functional dataset. - >>> from skfda.preprocessing.registration import ShiftRegistration + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) >>> from skfda.datasets import make_sinusoidal_process >>> fd = make_sinusoidal_process(error_std=0, random_state=1) >>> fd.extrapolation = 'periodic' Dataset registration and centering. - >>> reg = ShiftRegistration() + >>> reg = LeastSquaresShiftRegistration() >>> fd_registered = reg.fit_transform(fd) >>> fd_centered = fd_registered - fd_registered.mean() @@ -397,3 +405,33 @@ def inverse_transform(self, X: FData, y: None = None) -> FDataGrid: extrapolation=self.extrapolation, grid_points=self.grid_points, ) + + +class ShiftRegistration(LeastSquaresShiftRegistration): + + def __init__( + self, + max_iter: int = 5, + tol: float = 1e-2, + template: Union[Literal["mean"], FData, TemplateFunction] = "mean", + extrapolation: Optional[ExtrapolationLike] = None, + step_size: float = 1, + restrict_domain: bool = False, + initial: Union[Literal["zeros"], ArrayLike] = "zeros", + grid_points: Optional[GridPointsLike] = None, + ) -> None: + warnings.warn( + "ShiftRegistration has been renamed. " + "Use LeastSquaresShiftRegistration instead.", + DeprecationWarning, + ) + super().__init__( + max_iter=max_iter, + tol=tol, + template=template, + extrapolation=extrapolation, + step_size=step_size, + restrict_domain=restrict_domain, + initial=initial, + grid_points=grid_points, + ) diff --git a/skfda/preprocessing/registration/validation.py b/skfda/preprocessing/registration/validation.py index 92bc99bbf..61e99eb3d 100644 --- a/skfda/preprocessing/registration/validation.py +++ b/skfda/preprocessing/registration/validation.py @@ -215,15 +215,17 @@ class AmplitudePhaseDecomposition( >>> from skfda.preprocessing.registration.validation import \ ... AmplitudePhaseDecomposition - >>> from skfda.preprocessing.registration import ShiftRegistration + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) >>> from skfda.datasets import make_sinusoidal_process >>> X = make_sinusoidal_process(error_std=0, random_state=0) Fit the registration procedure. - >>> shift_registration = ShiftRegistration() + >>> shift_registration = LeastSquaresShiftRegistration() >>> shift_registration.fit(X) - ShiftRegistration(...) + LeastSquaresShiftRegistration(...) Compute the :math:`R^2` correlation index @@ -369,15 +371,17 @@ class LeastSquares(RegistrationScorer): >>> from skfda.preprocessing.registration.validation import \ ... LeastSquares - >>> from skfda.preprocessing.registration import ShiftRegistration + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) >>> from skfda.datasets import make_sinusoidal_process >>> X = make_sinusoidal_process(error_std=0, random_state=0) Fit the registration procedure. - >>> shift_registration = ShiftRegistration() + >>> shift_registration = LeastSquaresShiftRegistration() >>> shift_registration.fit(X) - ShiftRegistration(...) + LeastSquaresShiftRegistration(...) Compute the least squares score. >>> scorer = LeastSquares() @@ -481,15 +485,17 @@ class SobolevLeastSquares(RegistrationScorer): >>> from skfda.preprocessing.registration.validation import \ ... 
SobolevLeastSquares - >>> from skfda.preprocessing.registration import ShiftRegistration + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) >>> from skfda.datasets import make_sinusoidal_process >>> X = make_sinusoidal_process(error_std=0, random_state=0) Fit the registration procedure. - >>> shift_registration = ShiftRegistration() + >>> shift_registration = LeastSquaresShiftRegistration() >>> shift_registration.fit(X) - ShiftRegistration(...) + LeastSquaresShiftRegistration(...) Compute the sobolev least squares score. >>> scorer = SobolevLeastSquares() @@ -574,15 +580,17 @@ class PairwiseCorrelation(RegistrationScorer): >>> from skfda.preprocessing.registration.validation import \ ... PairwiseCorrelation - >>> from skfda.preprocessing.registration import ShiftRegistration + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) >>> from skfda.datasets import make_sinusoidal_process >>> X = make_sinusoidal_process(error_std=0, random_state=0) Fit the registration procedure. - >>> shift_registration = ShiftRegistration() + >>> shift_registration = LeastSquaresShiftRegistration() >>> shift_registration.fit(X) - ShiftRegistration(...) + LeastSquaresShiftRegistration(...) Compute the pairwise correlation score. >>> scorer = PairwiseCorrelation() diff --git a/tests/test_registration.py b/tests/test_registration.py index 3d2bbe728..869511d4b 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -12,7 +12,7 @@ ) from skfda.exploratory.stats import mean from skfda.preprocessing.registration import ( - ShiftRegistration, + LeastSquaresShiftRegistration, landmark_elastic_registration, landmark_elastic_registration_warping, landmark_shift_deltas, @@ -197,7 +197,7 @@ def test_landmark_elastic_registration(self): decimal=2) -class TestShiftRegistration(unittest.TestCase): +class TestLeastSquaresShiftRegistration(unittest.TestCase): """Test shift registration""" def setUp(self): @@ -208,7 +208,7 @@ def setUp(self): def test_fit_transform(self): - reg = ShiftRegistration() + reg = LeastSquaresShiftRegistration() # Test fit transform with FDataGrid fd_reg = reg.fit_transform(self.fd) @@ -233,7 +233,7 @@ def test_fit_and_transform(self): fd = make_sinusoidal_process(n_samples=2, error_std=0, random_state=10) - reg = ShiftRegistration() + reg = LeastSquaresShiftRegistration() response = reg.fit(self.fd) # Check attributes and returned value @@ -246,7 +246,7 @@ def test_fit_and_transform(self): def test_inverse_transform(self): - reg = ShiftRegistration() + reg = LeastSquaresShiftRegistration() fd = reg.fit_transform(self.fd) fd = reg.inverse_transform(fd) @@ -255,7 +255,7 @@ def test_inverse_transform(self): def test_raises(self): - reg = ShiftRegistration() + reg = LeastSquaresShiftRegistration() # Test not fitted with np.testing.assert_raises(NotFittedError): @@ -294,16 +294,16 @@ def test_raises(self): def test_template(self): - reg = ShiftRegistration() + reg = LeastSquaresShiftRegistration() fd_registered_1 = reg.fit_transform(self.fd) - reg_2 = ShiftRegistration(template=reg.template_) + reg_2 = LeastSquaresShiftRegistration(template=reg.template_) fd_registered_2 = reg_2.fit_transform(self.fd) - reg_3 = ShiftRegistration(template=mean) + reg_3 = LeastSquaresShiftRegistration(template=mean) fd_registered_3 = reg_3.fit_transform(self.fd) - reg_4 = ShiftRegistration(template=reg.template_) + reg_4 = LeastSquaresShiftRegistration(template=reg.template_) fd_registered_4 = 
reg_4.fit(self.fd).transform(self.fd) np.testing.assert_array_almost_equal(fd_registered_1.data_matrix, @@ -318,13 +318,13 @@ def test_template(self): fd_registered_4.data_matrix) def test_restrict_domain(self) -> None: - reg = ShiftRegistration(restrict_domain=True) + reg = LeastSquaresShiftRegistration(restrict_domain=True) fd_registered_1 = reg.fit_transform(self.fd) np.testing.assert_array_almost_equal( np.array(fd_registered_1.domain_range).round(3), [[0.022, 0.969]]) - reg2 = ShiftRegistration( + reg2 = LeastSquaresShiftRegistration( restrict_domain=True, template=reg.template_.copy(domain_range=self.fd.domain_range), ) @@ -334,21 +334,22 @@ def test_restrict_domain(self) -> None: fd_registered_2.data_matrix, fd_registered_1.data_matrix, decimal=3) - reg3 = ShiftRegistration(restrict_domain=True, template=mean) + reg3 = LeastSquaresShiftRegistration( + restrict_domain=True, template=mean) fd_registered_3 = reg3.fit_transform(self.fd) np.testing.assert_array_almost_equal( fd_registered_3.data_matrix, fd_registered_1.data_matrix) def test_initial_estimation(self): - reg = ShiftRegistration(initial=[-0.02161235, 0.03032652]) + reg = LeastSquaresShiftRegistration(initial=[-0.02161235, 0.03032652]) reg.fit_transform(self.fd) # Only needed 1 iteration until convergence self.assertEqual(reg.n_iter_, 1) def test_custom_grid_points(self): - reg = ShiftRegistration(grid_points=np.linspace(0, 1, 50)) + reg = LeastSquaresShiftRegistration(grid_points=np.linspace(0, 1, 50)) reg.fit_transform(self.fd) @@ -358,7 +359,7 @@ class TestRegistrationValidation(unittest.TestCase): def setUp(self) -> None: """Initialize the samples.""" self.X = make_sinusoidal_process(error_std=0, random_state=0) - self.shift_registration = ShiftRegistration().fit(self.X) + self.shift_registration = LeastSquaresShiftRegistration().fit(self.X) def test_amplitude_phase_score(self) -> None: """Test basic usage of AmplitudePhaseDecomposition.""" diff --git a/tutorial/plot_skfda_sklearn.py b/tutorial/plot_skfda_sklearn.py index 1b51bf765..aa33451af 100644 --- a/tutorial/plot_skfda_sklearn.py +++ b/tutorial/plot_skfda_sklearn.py @@ -196,6 +196,7 @@ # to classify the data. from skfda.preprocessing.dim_reduction import variable_selection as vs +from skfda.preprocessing.registration import LeastSquaresShiftRegistration from sklearn.pipeline import Pipeline from sklearn.svm import SVC @@ -204,7 +205,7 @@ X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) pipeline = Pipeline([ - ("registration", skfda.preprocessing.registration.ShiftRegistration()), + ("registration", LeastSquaresShiftRegistration()), ("dim_reduction", vs.RKHSVariableSelection(n_features_to_select=3)), ("classifier", SVC()), ]) From abf00eb8616fcdbffb14b0381954cd274f6b87a2 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 8 Oct 2021 18:12:14 +0200 Subject: [PATCH 016/117] Improve style of registration tests. 
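These tests exercise the estimator renamed in the previous patch; a minimal usage sketch of ``LeastSquaresShiftRegistration`` (the expected shift values are taken from the assertions in the updated tests, everything else follows the test setup):

    import numpy as np

    from skfda.datasets import make_sinusoidal_process
    from skfda.preprocessing.registration import LeastSquaresShiftRegistration

    # Two noiseless sinusoidal curves with random phases, as in the test setup.
    fd = make_sinusoidal_process(n_samples=2, error_std=0, random_state=1)
    fd.extrapolation = "periodic"

    reg = LeastSquaresShiftRegistration()
    fd_registered = reg.fit_transform(fd)

    # Estimated per-sample shifts; the tests below expect approximately these.
    np.testing.assert_allclose(reg.deltas_.round(3), [-0.022, 0.03])
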
--- tests/test_registration.py | 219 +++++++++++++++++++++++-------------- 1 file changed, 136 insertions(+), 83 deletions(-) diff --git a/tests/test_registration.py b/tests/test_registration.py index 869511d4b..9b25f6719 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -29,63 +29,77 @@ class TestWarping(unittest.TestCase): - """Test warpings functions""" - - def setUp(self): - """Initialization of samples""" + """Test warpings functions.""" + def setUp(self) -> None: + """Initialize samples.""" self.time = np.linspace(-1, 1, 50) interpolation = SplineInterpolation(3, monotone=True) - self.polynomial = FDataGrid([self.time**3, self.time**5], - self.time, interpolation=interpolation) - - def test_invert_warping(self): + self.polynomial = FDataGrid( + [self.time**3, self.time**5], + self.time, + interpolation=interpolation, + ) + def test_invert_warping(self) -> None: + """Test that the composition with invert warping is the identity.""" inverse = invert_warping(self.polynomial) # Check if identity - id = self.polynomial.compose(inverse) + identity = self.polynomial.compose(inverse) - np.testing.assert_array_almost_equal([self.time, self.time], - id.data_matrix[..., 0], - decimal=3) - - def test_standard_normalize_warping(self): - """Test normalization to (0, 1)""" + np.testing.assert_array_almost_equal( + [self.time, self.time], + identity.data_matrix[..., 0], + decimal=3, + ) + def test_standard_normalize_warping(self) -> None: + """Test normalization to (0, 1).""" normalized = normalize_warping(self.polynomial, (0, 1)) # Test new domain range (0, 1) np.testing.assert_array_equal(normalized.domain_range, [(0, 1)]) - np.testing.assert_array_almost_equal(normalized.grid_points[0], - np.linspace(0, 1, 50)) - np.testing.assert_array_almost_equal( - normalized(0)[..., 0], [[0.], [0.]]) + normalized.grid_points[0], + np.linspace(0, 1, 50), + ) np.testing.assert_array_almost_equal( - normalized(1)[..., 0], [[1.], [1.]]) + normalized(0)[..., 0], + [[0], [0]], + ) - def test_standard_normalize_warping_default_value(self): - """Test normalization """ + np.testing.assert_array_almost_equal( + normalized(1)[..., 0], + [[1.0], [1.0]], + ) + def test_standard_normalize_warping_default_value(self) -> None: + """Test normalization.""" normalized = normalize_warping(self.polynomial) # Test new domain range (0, 1) np.testing.assert_array_equal(normalized.domain_range, [(-1, 1)]) - np.testing.assert_array_almost_equal(normalized.grid_points[0], - np.linspace(-1, 1, 50)) + np.testing.assert_array_almost_equal( + normalized.grid_points[0], + np.linspace(-1, 1, 50), + ) np.testing.assert_array_almost_equal( - normalized(-1)[..., 0], [[-1], [-1]]) + normalized(-1)[..., 0], + [[-1], [-1]], + ) np.testing.assert_array_almost_equal( - normalized(1)[..., 0], [[1.], [1.]]) + normalized(1)[..., 0], + [[1.0], [1.0]], + ) - def test_normalize_warping(self): - """Test normalization to (a, b)""" + def test_normalize_warping(self) -> None: + """Test normalization to (a, b).""" a = -4 b = 3 domain = (a, b) @@ -94,15 +108,17 @@ def test_normalize_warping(self): # Test new domain range (0, 1) np.testing.assert_array_equal(normalized.domain_range, [domain]) - np.testing.assert_array_almost_equal(normalized.grid_points[0], - np.linspace(*domain, 50)) + np.testing.assert_array_almost_equal( + normalized.grid_points[0], + np.linspace(*domain, 50), + ) np.testing.assert_array_equal(normalized(a)[..., 0], [[a], [a]]) np.testing.assert_array_equal(normalized(b)[..., 0], [[b], [b]]) - def 
test_landmark_shift_deltas(self): - + def test_landmark_shift_deltas(self) -> None: + """Test landmark shift deltas.""" fd = make_multimodal_samples(n_samples=3, random_state=1) landmarks = make_multimodal_landmarks(n_samples=3, random_state=1) landmarks = landmarks.squeeze() @@ -110,14 +126,16 @@ def test_landmark_shift_deltas(self): shifts = landmark_shift_deltas(fd, landmarks).round(3) np.testing.assert_almost_equal(shifts, [0.25, -0.25, -0.231]) - def test_landmark_shift_registration(self): - + def test_landmark_shift_registration(self) -> None: + """Test landmark shift registration.""" fd = make_multimodal_samples(n_samples=3, random_state=1) landmarks = make_multimodal_landmarks(n_samples=3, random_state=1) landmarks = landmarks.squeeze() - original_modes = fd(landmarks.reshape((3, 1, 1)), - aligned=False) + original_modes = fd( + landmarks.reshape((3, 1, 1)), + aligned=False, + ) # Test default location fd_registered = landmark_shift_registration(fd, landmarks) center = (landmarks.max() + landmarks.min()) / 2 @@ -153,33 +171,47 @@ def test_landmark_shift_registration(self): landmarks, location=[0, 0.1, 0.2], ) - reg_modes = fd_registered([[0], [.1], [.2]], aligned=False) + reg_modes = fd_registered([[0], [0.1], [0.2]], aligned=False) np.testing.assert_almost_equal(reg_modes, original_modes, decimal=2) - def test_landmark_elastic_registration_warping(self): + def test_landmark_elastic_registration_warping(self) -> None: + """Test the warpings in landmark elastic registration.""" fd = make_multimodal_samples(n_samples=3, n_modes=2, random_state=9) - landmarks = make_multimodal_landmarks(n_samples=3, n_modes=2, - random_state=9) + landmarks = make_multimodal_landmarks( + n_samples=3, + n_modes=2, + random_state=9, + ) landmarks = landmarks.squeeze() # Default location warping = landmark_elastic_registration_warping(fd, landmarks) center = (landmarks.max(axis=0) + landmarks.min(axis=0)) / 2 np.testing.assert_almost_equal( - warping(center)[..., 0], landmarks, decimal=1) + warping(center)[..., 0], + landmarks, + decimal=1, + ) # Fixed location - center = [.3, .6] + center = [0.3, 0.6] warping = landmark_elastic_registration_warping( fd, landmarks, location=center) np.testing.assert_almost_equal( - warping(center)[..., 0], landmarks, decimal=3) + warping(center)[..., 0], + landmarks, + decimal=3, + ) - def test_landmark_elastic_registration(self): + def test_landmark_elastic_registration(self) -> None: + """Test landmark elastic registration.""" fd = make_multimodal_samples(n_samples=3, n_modes=2, random_state=9) - landmarks = make_multimodal_landmarks(n_samples=3, n_modes=2, - random_state=9) + landmarks = make_multimodal_landmarks( + n_samples=3, + n_modes=2, + random_state=9, + ) landmarks = landmarks.squeeze() original_values = fd(landmarks.reshape(3, 2), aligned=False) @@ -187,26 +219,35 @@ def test_landmark_elastic_registration(self): # Default location fd_reg = landmark_elastic_registration(fd, landmarks) center = (landmarks.max(axis=0) + landmarks.min(axis=0)) / 2 - np.testing.assert_almost_equal(fd_reg(center), original_values, - decimal=2) + np.testing.assert_almost_equal( + fd_reg(center), + original_values, + decimal=2, + ) # Fixed location center = [.3, .6] fd_reg = landmark_elastic_registration(fd, landmarks, location=center) - np.testing.assert_array_almost_equal(fd_reg(center), original_values, - decimal=2) + np.testing.assert_array_almost_equal( + fd_reg(center), + original_values, + decimal=2, + ) class TestLeastSquaresShiftRegistration(unittest.TestCase): - """Test 
shift registration""" + """Test shift registration.""" - def setUp(self): - """Initialization of samples""" - self.fd = make_sinusoidal_process(n_samples=2, error_std=0, - random_state=1) + def setUp(self) -> None: + """Initialize samples.""" + self.fd = make_sinusoidal_process( + n_samples=2, + error_std=0, + random_state=1, + ) self.fd.extrapolation = "periodic" - def test_fit_transform(self): + def test_fit_transform(self) -> None: reg = LeastSquaresShiftRegistration() @@ -228,11 +269,8 @@ def test_fit_transform(self): deltas = reg.deltas_.round(3) np.testing.assert_array_almost_equal(deltas, [-0.022, 0.03]) - def test_fit_and_transform(self): - """Test wrapper of shift_registration_deltas""" - - fd = make_sinusoidal_process(n_samples=2, error_std=0, random_state=10) - + def test_fit_and_transform(self) -> None: + """Test wrapper of shift_registration_deltas.""" reg = LeastSquaresShiftRegistration() response = reg.fit(self.fd) @@ -240,20 +278,22 @@ def test_fit_and_transform(self): self.assertTrue(hasattr(reg, 'template_')) self.assertTrue(response is reg) - fd_registered = reg.transform(fd) deltas = reg.deltas_.round(3) np.testing.assert_allclose(deltas, [0.071, -0.072]) - def test_inverse_transform(self): + def test_inverse_transform(self) -> None: reg = LeastSquaresShiftRegistration() fd = reg.fit_transform(self.fd) fd = reg.inverse_transform(fd) - np.testing.assert_array_almost_equal(fd.data_matrix, - self.fd.data_matrix, decimal=3) + np.testing.assert_array_almost_equal( + fd.data_matrix, + self.fd.data_matrix, + decimal=3, + ) - def test_raises(self): + def test_raises(self) -> None: reg = LeastSquaresShiftRegistration() @@ -292,7 +332,7 @@ def test_raises(self): with np.testing.assert_raises(ValueError): reg.fit_transform(self.fd) - def test_template(self): + def test_template(self) -> None: reg = LeastSquaresShiftRegistration() fd_registered_1 = reg.fit_transform(self.fd) @@ -306,23 +346,31 @@ def test_template(self): reg_4 = LeastSquaresShiftRegistration(template=reg.template_) fd_registered_4 = reg_4.fit(self.fd).transform(self.fd) - np.testing.assert_array_almost_equal(fd_registered_1.data_matrix, - fd_registered_3.data_matrix) + np.testing.assert_array_almost_equal( + fd_registered_1.data_matrix, + fd_registered_3.data_matrix, + ) # With the template fixed could vary the convergence - np.testing.assert_array_almost_equal(fd_registered_1.data_matrix, - fd_registered_2.data_matrix, - decimal=3) + np.testing.assert_array_almost_equal( + fd_registered_1.data_matrix, + fd_registered_2.data_matrix, + decimal=3, + ) - np.testing.assert_array_almost_equal(fd_registered_2.data_matrix, - fd_registered_4.data_matrix) + np.testing.assert_array_almost_equal( + fd_registered_2.data_matrix, + fd_registered_4.data_matrix, + ) def test_restrict_domain(self) -> None: reg = LeastSquaresShiftRegistration(restrict_domain=True) fd_registered_1 = reg.fit_transform(self.fd) np.testing.assert_array_almost_equal( - np.array(fd_registered_1.domain_range).round(3), [[0.022, 0.969]]) + np.array(fd_registered_1.domain_range).round(3), + [[0.022, 0.969]], + ) reg2 = LeastSquaresShiftRegistration( restrict_domain=True, @@ -331,24 +379,30 @@ def test_restrict_domain(self) -> None: fd_registered_2 = reg2.fit_transform(self.fd) np.testing.assert_array_almost_equal( - fd_registered_2.data_matrix, fd_registered_1.data_matrix, - decimal=3) + fd_registered_2.data_matrix, + fd_registered_1.data_matrix, + decimal=3, + ) reg3 = LeastSquaresShiftRegistration( - restrict_domain=True, template=mean) + 
restrict_domain=True, + template=mean, + ) fd_registered_3 = reg3.fit_transform(self.fd) np.testing.assert_array_almost_equal( - fd_registered_3.data_matrix, fd_registered_1.data_matrix) + fd_registered_3.data_matrix, + fd_registered_1.data_matrix, + ) - def test_initial_estimation(self): + def test_initial_estimation(self) -> None: reg = LeastSquaresShiftRegistration(initial=[-0.02161235, 0.03032652]) reg.fit_transform(self.fd) # Only needed 1 iteration until convergence self.assertEqual(reg.n_iter_, 1) - def test_custom_grid_points(self): + def test_custom_grid_points(self) -> None: reg = LeastSquaresShiftRegistration(grid_points=np.linspace(0, 1, 50)) reg.fit_transform(self.fd) @@ -411,7 +465,7 @@ def test_mse_decomposition(self) -> None: np.testing.assert_allclose(ret.r_squared, 0.9910806875) np.testing.assert_allclose(ret.c_r, 0.9593073773) - def test_raises_amplitude_phase(self): + def test_raises_amplitude_phase(self) -> None: scorer = AmplitudePhaseDecomposition() # Inconsistent number of functions registered @@ -424,5 +478,4 @@ def test_raises_amplitude_phase(self): if __name__ == '__main__': - print() unittest.main() From 93d21bb8e886174ffed7b06b1c4264104311a65a Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 8 Oct 2021 18:33:20 +0200 Subject: [PATCH 017/117] Fix test bug. --- tests/test_registration.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_registration.py b/tests/test_registration.py index 9b25f6719..506115fa0 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -271,6 +271,12 @@ def test_fit_transform(self) -> None: def test_fit_and_transform(self) -> None: """Test wrapper of shift_registration_deltas.""" + fd = make_sinusoidal_process( + n_samples=2, + error_std=0, + random_state=10, + ) + reg = LeastSquaresShiftRegistration() response = reg.fit(self.fd) @@ -278,6 +284,7 @@ def test_fit_and_transform(self) -> None: self.assertTrue(hasattr(reg, 'template_')) self.assertTrue(response is reg) + reg.transform(fd) deltas = reg.deltas_.round(3) np.testing.assert_allclose(deltas, [0.071, -0.072]) From 96acd62fbf5539f24546b34f4fb47f24aaa513c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Fri, 8 Oct 2021 20:05:05 +0200 Subject: [PATCH 018/117] correct code formatting in inverse_transform --- .../dim_reduction/feature_extraction/_fpca.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 0f9cf28ab..3f476f5d4 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -152,7 +152,7 @@ def _fit_basis( ) n_samples = X.n_samples # necessary in inverse_transform - self.n_samples_fitted_ = X.n_samples + self.n_samples_ = X.n_samples # check that the number of components is smaller than the sample size if self.n_components > X.n_samples: @@ -332,7 +332,7 @@ def _fit_grid( n_samples, n_points_discretization = fd_data.shape # necessary for inverse_transform - self.n_samples_fitted_ = n_samples + self.n_samples_ = n_samples # if centering is True then subtract the mean function to each function # in FDataBasis @@ -491,37 +491,43 @@ def fit_transform( def inverse_transform( self, - pc_score: np.ndarray, + pc_scores: np.ndarray, ) -> FData: """ - Compute the reconstruction of samples given their ``n_components`` first principal 
components score i.e. a projection coefficient onto the fitted functional principal components. - In other words, it maps a coefficient vector, from the fitted functional principal components space, back to the input functional space. - Typically, ``pc_score`` might be an array returned by ``transform`` or ``fit_transform`` method. + Compute the recovery from the fitted principal components scores. + + In other words, + it maps ``pc_scores``, from the fitted functional PCs' space, + back to the input functional space. + ``pc_score`` might be an array returned by ``transform`` or ``fit_transform`` method. Args: - pc_score: ndarray of shape (n_samples, n_components). The principal components scores from which to perform the inverse transformation. + pc_score: ndarray (n_samples, n_components). Returns: A FData object in the functional input space. """ - # check if the instance is fitted. + # check the instance is fitted. # input format check: - if isinstance(pc_score, np.ndarray): - if pc_score.ndim == 1: - pc_score = pc_score[np.newaxis, :] - - if pc_score.shape[1] != self.n_components: - raise AttributeError("pc_score must be a numpy array with n_samples rows and n_components columns.") + if isinstance(pc_scores, np.ndarray): + if pc_scores.ndim == 1: + pc_scores = pc_score[np.newaxis, :] + + if pc_scores.shape[1] != self.n_components: + raise AttributeError( + "pc_score must be a numpy array " + "with n_samples rows and n_components columns.") else: raise AttributeError("pc_score is not a numpy array.") - # inverse_transform is slightly different wether .fit is applied to FDataGrid or FDataBasis + # inverse_transform is slightly different wether + # .fit was applied to FDataGrid or FDataBasis object if isinstance(self.components_, FDataGrid): # reconstruct the discretized functions - x_hat = (pc_score @ self.components_.data_matrix[:,:,0]) \ - @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_fitted_)) + x_hat = (pc_scores @ self.components_.data_matrix[:,:,0]) \ + @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_)) x_hat += self.mean_.data_matrix.reshape((1,self.mean_.grid_points[0].shape[0])) # format as FDataGrid according to fitted data format @@ -529,16 +535,10 @@ def inverse_transform( argument_names=self.mean_.argument_names) elif isinstance(self.components_, FDataBasis): # reconstruct the basis coefficients - x_hat = (pc_score @ self.components_.coefficients) \ - @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_fitted_)) + x_hat = (pc_scores @ self.components_.coefficients) \ + @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_)) x_hat += self.mean_.coefficients.reshape((1,self.mean_.coefficients.shape[1])) # format as FDataBasis according to fitted data format return FDataBasis(basis=self.mean_.basis, coefficients = x_hat, argument_names=self.mean_.argument_names) - - - - - - - + \ No newline at end of file From 98c367d82f0be95e00135a68d75f01bed652724c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sat, 9 Oct 2021 17:39:07 +0200 Subject: [PATCH 019/117] correct code formatting with flake8 according to setup.cfg --- .../dim_reduction/feature_extraction/_fpca.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 3f476f5d4..ddc924232 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ 
b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -496,13 +496,13 @@ def inverse_transform( """ Compute the recovery from the fitted principal components scores. - In other words, + In other words, it maps ``pc_scores``, from the fitted functional PCs' space, back to the input functional space. - ``pc_score`` might be an array returned by ``transform`` or ``fit_transform`` method. + ``pc_scores`` might be an array returned by ``transform`` method. Args: - pc_score: ndarray (n_samples, n_components). + pc_scores: ndarray (n_samples, n_components). Returns: A FData object in the functional input space. @@ -513,7 +513,7 @@ def inverse_transform( # input format check: if isinstance(pc_scores, np.ndarray): if pc_scores.ndim == 1: - pc_scores = pc_score[np.newaxis, :] + pc_scores = pc_scores[np.newaxis, :] if pc_scores.shape[1] != self.n_components: raise AttributeError( @@ -522,23 +522,25 @@ def inverse_transform( else: raise AttributeError("pc_score is not a numpy array.") - # inverse_transform is slightly different wether + # inverse_transform is slightly different wether # .fit was applied to FDataGrid or FDataBasis object if isinstance(self.components_, FDataGrid): # reconstruct the discretized functions - x_hat = (pc_scores @ self.components_.data_matrix[:,:,0]) \ + x_hat = (pc_scores @ self.components_.data_matrix[:, :, 0]) \ @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_)) - x_hat += self.mean_.data_matrix.reshape((1,self.mean_.grid_points[0].shape[0])) + x_hat += self.mean_.data_matrix.reshape( + (1, self.mean_.grid_points[0].shape[0])) # format as FDataGrid according to fitted data format - return FDataGrid(data_matrix=x_hat, grid_points=self.mean_.grid_points[0], - argument_names=self.mean_.argument_names) + return FDataGrid(data_matrix=x_hat, + grid_points=self.mean_.grid_points[0], + argument_names=self.mean_.argument_names) elif isinstance(self.components_, FDataBasis): # reconstruct the basis coefficients x_hat = (pc_scores @ self.components_.coefficients) \ @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_)) - x_hat += self.mean_.coefficients.reshape((1,self.mean_.coefficients.shape[1])) + x_hat += self.mean_.coefficients.reshape( + (1, self.mean_.coefficients.shape[1])) # format as FDataBasis according to fitted data format - return FDataBasis(basis=self.mean_.basis, coefficients = x_hat, - argument_names=self.mean_.argument_names) - \ No newline at end of file + return FDataBasis(basis=self.mean_.basis, coefficients=x_hat, + argument_names=self.mean_.argument_names) From a924091c27c01738781a96f473b62b1cfb268be0 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 10 Oct 2021 13:33:28 +0200 Subject: [PATCH 020/117] Pass keyword parameters to matplotlib functions in Graphplot --- skfda/exploratory/visualization/representation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skfda/exploratory/visualization/representation.py b/skfda/exploratory/visualization/representation.py index edd3e5a62..a56dcc309 100644 --- a/skfda/exploratory/visualization/representation.py +++ b/skfda/exploratory/visualization/representation.py @@ -256,6 +256,7 @@ def __init__( self.group_names = group_names self.legend = legend self.colormap = colormap + self.kwargs = kwargs if domain_range is None: self.domain_range = self.fdata.domain_range @@ -330,6 +331,7 @@ def _plot( self.artists[j, i] = axes[i].plot( eval_points, mat[j, ..., i].T, + **self.kwargs, **color_dict, )[0] @@ -365,6 +367,7 @@ def _plot( X, Y, Z[h, ..., k], + **self.kwargs, **color_dict, ) From 
59031951781368e3fa6c5c210c95fdc0aa6c7fe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:21:44 +0200 Subject: [PATCH 021/117] Typo comment _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index ddc924232..e17503f8d 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -522,7 +522,7 @@ def inverse_transform( else: raise AttributeError("pc_score is not a numpy array.") - # inverse_transform is slightly different wether + # inverse_transform is slightly different whether # .fit was applied to FDataGrid or FDataBasis object if isinstance(self.components_, FDataGrid): # reconstruct the discretized functions From f2048c3738f2cb7b8c187b958c7b685fa05d8bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:24:07 +0200 Subject: [PATCH 022/117] Update skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index e17503f8d..c9bc23597 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -518,7 +518,8 @@ def inverse_transform( if pc_scores.shape[1] != self.n_components: raise AttributeError( "pc_score must be a numpy array " - "with n_samples rows and n_components columns.") + "with n_samples rows and n_components columns.", + ) else: raise AttributeError("pc_score is not a numpy array.") From fc0276b83d99c3850248cd1fd9c61ed7748ab1f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:24:38 +0200 Subject: [PATCH 023/117] Style corrections in _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index c9bc23597..9ee4af4dd 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -527,8 +527,10 @@ def inverse_transform( # .fit was applied to FDataGrid or FDataBasis object if isinstance(self.components_, FDataGrid): # reconstruct the discretized functions - x_hat = (pc_scores @ self.components_.data_matrix[:, :, 0]) \ + x_hat = ( + (pc_scores @ self.components_.data_matrix[:, :, 0]) @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_)) + ) x_hat += self.mean_.data_matrix.reshape( (1, self.mean_.grid_points[0].shape[0])) From 
a38d5143be89d8c39b4da0b4acf81e2e2fa4d5e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:25:36 +0200 Subject: [PATCH 024/117] Styleguide corrections in _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- .../dim_reduction/feature_extraction/_fpca.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 9ee4af4dd..ee9836ff4 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -535,9 +535,11 @@ def inverse_transform( (1, self.mean_.grid_points[0].shape[0])) # format as FDataGrid according to fitted data format - return FDataGrid(data_matrix=x_hat, - grid_points=self.mean_.grid_points[0], - argument_names=self.mean_.argument_names) + return FDataGrid( + data_matrix=x_hat, + grid_points=self.mean_.grid_points[0], + argument_names=self.mean_.argument_names, + ) elif isinstance(self.components_, FDataBasis): # reconstruct the basis coefficients x_hat = (pc_scores @ self.components_.coefficients) \ From 90a679fd67bc4e635ac42e50f9d6c8d12e39b70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:26:00 +0200 Subject: [PATCH 025/117] Styleguide corrections in _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index ee9836ff4..2c494f1f5 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -542,8 +542,10 @@ def inverse_transform( ) elif isinstance(self.components_, FDataBasis): # reconstruct the basis coefficients - x_hat = (pc_scores @ self.components_.coefficients) \ + x_hat = ( + (pc_scores @ self.components_.coefficients) @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_)) + ) x_hat += self.mean_.coefficients.reshape( (1, self.mean_.coefficients.shape[1])) # format as FDataBasis according to fitted data format From 62695e8d6dd606048c0c04808db236d9ec5f06ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:26:17 +0200 Subject: [PATCH 026/117] Styleguide corrections in _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 2c494f1f5..4dc229efb 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -547,7 +547,8 @@ def inverse_transform( @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_)) ) x_hat += self.mean_.coefficients.reshape( - (1, 
self.mean_.coefficients.shape[1])) + (1, self.mean_.coefficients.shape[1]), + ) # format as FDataBasis according to fitted data format return FDataBasis(basis=self.mean_.basis, coefficients=x_hat, argument_names=self.mean_.argument_names) From b793d876ed15c0937e49bccf07cf5890dba5c00c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:26:38 +0200 Subject: [PATCH 027/117] Style corrections in _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- .../dim_reduction/feature_extraction/_fpca.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 4dc229efb..8cb795671 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -550,5 +550,8 @@ def inverse_transform( (1, self.mean_.coefficients.shape[1]), ) # format as FDataBasis according to fitted data format - return FDataBasis(basis=self.mean_.basis, coefficients=x_hat, - argument_names=self.mean_.argument_names) + return FDataBasis( + basis=self.mean_.basis, + coefficients=x_hat, + argument_names=self.mean_.argument_names, + ) From 41ac129ba477dea8999c5c0e2f5363063f3a7029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sun, 10 Oct 2021 16:26:58 +0200 Subject: [PATCH 028/117] Style corrections in _fpca.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 8cb795671..5f587a758 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -532,7 +532,8 @@ def inverse_transform( @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_)) ) x_hat += self.mean_.data_matrix.reshape( - (1, self.mean_.grid_points[0].shape[0])) + (1, self.mean_.grid_points[0].shape[0]), + ) # format as FDataGrid according to fitted data format return FDataGrid( From 133eb7cb839e4cb6a9161e756c82de27a48190d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Mon, 11 Oct 2021 18:05:21 +0200 Subject: [PATCH 029/117] code typo in inverse_transform --- .../dim_reduction/feature_extraction/_fpca.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 5f587a758..cc5cc29b5 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -505,7 +505,7 @@ def inverse_transform( pc_scores: ndarray (n_samples, n_components). Returns: - A FData object in the functional input space. + A FData object. """ # check the instance is fitted. 
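# A minimal usage sketch of the inverse_transform method discussed above,
# assuming the FPCA import path and the make_gaussian_process helper that
# appear elsewhere in this patch series; scores returned by transform or
# fit_transform are mapped back to the input functional space.
import skfda
from skfda.preprocessing.dim_reduction.feature_extraction import FPCA

X = skfda.datasets.make_gaussian_process(
    n_samples=20,
    n_features=100,
    random_state=0,
)

fpca = FPCA(n_components=3)
scores = fpca.fit_transform(X)           # ndarray of shape (20, 3)
X_hat = fpca.inverse_transform(scores)   # FDataGrid recovered from the scores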
@@ -517,19 +517,19 @@ def inverse_transform( if pc_scores.shape[1] != self.n_components: raise AttributeError( - "pc_score must be a numpy array " + "pc_scores must be a numpy array " "with n_samples rows and n_components columns.", ) else: - raise AttributeError("pc_score is not a numpy array.") + raise AttributeError("pc_scores is not a numpy array.") # inverse_transform is slightly different whether # .fit was applied to FDataGrid or FDataBasis object if isinstance(self.components_, FDataGrid): # reconstruct the discretized functions x_hat = ( - (pc_scores @ self.components_.data_matrix[:, :, 0]) - @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_)) + (pc_scores @ (self.components_.data_matrix[:, :, 0]) + @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_))) ) x_hat += self.mean_.data_matrix.reshape( (1, self.mean_.grid_points[0].shape[0]), @@ -544,8 +544,8 @@ def inverse_transform( elif isinstance(self.components_, FDataBasis): # reconstruct the basis coefficients x_hat = ( - (pc_scores @ self.components_.coefficients) - @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_)) + (pc_scores @ (self.components_.coefficients) + @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_))) ) x_hat += self.mean_.coefficients.reshape( (1, self.mean_.coefficients.shape[1]), From cc9bbf850558692de7fe1e0ec7595c51d52235ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Mon, 11 Oct 2021 18:10:22 +0200 Subject: [PATCH 030/117] add inverse_transform tests for FDataGrid --- tests/test_fpca.py | 60 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/tests/test_fpca.py b/tests/test_fpca.py index ef0db4fcb..12f29b4d2 100644 --- a/tests/test_fpca.py +++ b/tests/test_fpca.py @@ -2,9 +2,10 @@ import unittest import numpy as np +from numpy.lib.index_tricks import nd_grid from skfda import FDataBasis, FDataGrid -from skfda.datasets import fetch_weather +from skfda.datasets import fetch_weather, make_multimodal_samples from skfda.misc.operators import LinearDifferentialOperator from skfda.misc.regularization import TikhonovRegularization from skfda.preprocessing.dim_reduction.feature_extraction import FPCA @@ -449,6 +450,63 @@ def test_grid_fpca_regularization_fit_result(self) -> None: rtol=1e-2, ) + def test_grid_fpca_inverse_transform(self) -> None: + """Compare the reconstructions to fitting non-random data.""" + + # Randomly, draw a true function that generates the dataset. 
+ def draw_one_random_fun(n_grid) -> FDataGrid: + modes_location = np.random.uniform(-10., 10., size=50) + noise = 10**-2 + fd_random = make_multimodal_samples( + start=0., + stop=15., + n_samples=int(1), + points_per_dim=n_grid, + n_modes=modes_location.size, + noise=noise, + modes_location=modes_location, + random_state=42 + ) + return fd_random + + # test function w.r.t n_samples, n_grid + def test_vs_dim(n_samples, n_grid, base_fun): + fd_random_all_equal = base_fun + # Concatenate random FDataGrid 'n_sample's times + for _ in range(1, n_samples - 1): + fd_random_all_equal = fd_random_all_equal.concatenate(base_fun) + + # Take the allowed maximum number of components + # In almost high dimension: n_components=n_samples-1 < n_samples + # In low dimension: n_components=n_grid << n_samples + fpca = FPCA(n_components=np.min([n_samples - 1, n_grid])) + + # Project the non-random dataset on FPCs + pc_scores_fd_random_all_equal = fpca.fit_transform( + fd_random_all_equal + ) + # Project the pc scores back to the input functional space + fd_random_all_equal_hat = fpca.inverse_transform( + pc_scores_fd_random_all_equal + ) + + # Compare fitting data to the reconstructed ones + np.testing.assert_allclose( + fd_random_all_equal.data_matrix, + fd_random_all_equal_hat.data_matrix + ) + + # Low dimensional case (n_samples>n_grid) + n_samples = int(10**3) + n_grid = int(10**2) + true_fun = draw_one_random_fun(n_grid) + test_vs_dim(n_samples=n_samples, n_grid=n_grid, base_fun=true_fun) + + # (almotst) High dimensional case (n_samples Date: Tue, 12 Oct 2021 01:18:44 +0200 Subject: [PATCH 031/117] Add cosine similarity. --- docs/Makefile | 7 +- docs/modules/misc.rst | 45 ++++ .../preprocessing/dim_reduction/fpca.rst | 2 +- skfda/misc/__init__.py | 2 + skfda/misc/_math.py | 177 ++++++++++++-- tests/test_math.py | 215 +++++++++++++++--- 6 files changed, 394 insertions(+), 54 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index ad2c23326..071754233 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -55,13 +55,14 @@ clean: rm -rf modules/autosummary rm -rf modules/exploratory/visualization/autosummary rm -rf modules/exploratory/autosummary + rm -rf modules/inference/autosummary rm -rf modules/math/autosummary - rm -rf modules/preprocessing/autosummary - rm -rf modules/representation/autosummary rm -rf modules/misc/autosummary rm -rf modules/ml/autosummary rm -rf modules/ml/clustering/autosummary - rm -rf modules/inference/autosummary + rm -rf modules/preprocessing/autosummary + rm -rf modules/preprocessing/dim_reduction/autosummary + rm -rf modules/representation/autosummary rm -rf backreferences .PHONY: html diff --git a/docs/modules/misc.rst b/docs/modules/misc.rst index e72660025..bf12d16ba 100644 --- a/docs/modules/misc.rst +++ b/docs/modules/misc.rst @@ -3,9 +3,54 @@ Miscellaneous Miscellaneous functions and objects. +This module groups classes and functions useful to work with functional data +but which do not belong to other categories. + +Mathematical operations +----------------------- + +Some math operations between functional data objects are directly available +in this module. +The most important ones are the ones that efficiently compute the inner +product between functions: + +.. autosummary:: + :toctree: autosummary + + skfda.misc.inner_product + skfda.misc.inner_product_matrix + +A concept related with the inner product is that of the cosine similarity +between functions: + +.. 
autosummary:: + :toctree: autosummary + + skfda.misc.cosine_similarity + skfda.misc.cosine_similarity_matrix + +Submodules +---------- + +In addition the following modules provide useful functionality to work with +functional data: + +- :doc:`misc/covariances`: Contains covariance functions to use with + and :func:`~skfda.datasets.make_gaussian_process` +- :doc:`misc/metrics`: Contains functional data metrics, suitable to being + used with several metric-based machine-learning tools. +- :doc:`misc/operators`: Contains operators, or functions over functions. +- :doc:`misc/regularization`: Contains regularization functions, usable in + contexts such as + :class:`linear regression `, + :class:`FPCA `, + or :class:`basis smoothing `. + + .. toctree:: :maxdepth: 4 :caption: Modules: + :hidden: misc/covariances misc/metrics diff --git a/docs/modules/preprocessing/dim_reduction/fpca.rst b/docs/modules/preprocessing/dim_reduction/fpca.rst index c6cc9bfd8..3ba0e5ad5 100644 --- a/docs/modules/preprocessing/dim_reduction/fpca.rst +++ b/docs/modules/preprocessing/dim_reduction/fpca.rst @@ -21,4 +21,4 @@ FPCA for functional data in both representations .. autosummary:: :toctree: autosummary - skfda.preprocessing.dim_reduction.projection.FPCA + skfda.preprocessing.dim_reduction.feature_extraction.FPCA diff --git a/skfda/misc/__init__.py b/skfda/misc/__init__.py index 83431f624..46e52bc3a 100644 --- a/skfda/misc/__init__.py +++ b/skfda/misc/__init__.py @@ -1,5 +1,7 @@ from . import covariances, kernels, lstsq, metrics, operators, regularization from ._math import ( + cosine_similarity, + cosine_similarity_matrix, cumsum, exp, inner_product, diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index 71637aca5..ad1b4b231 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -14,12 +14,12 @@ from .._utils import _same_domain, nquad_vec from ..representation import FData, FDataBasis, FDataGrid -from ..representation._typing import DomainRange +from ..representation._typing import ArrayLike, DomainRange, NDArrayFloat from ..representation.basis import Basis Vector = TypeVar( "Vector", - bound=Union[np.ndarray, Basis, Callable[[np.ndarray], np.ndarray]], + bound=Union[NDArrayFloat, Basis, Callable[[ArrayLike], NDArrayFloat]], ) @@ -216,8 +216,9 @@ def inner_product( _matrix: bool = False, _domain_range: Optional[DomainRange] = None, **kwargs: Any, -) -> np.ndarray: - r"""Return the usual (:math:`L_2`) inner product. +) -> NDArrayFloat: + r""" + Return the usual (:math:`L_2`) inner product. Calculates the inner product between matching samples in two FDataGrid objects. @@ -225,12 +226,12 @@ def inner_product( For two samples x and y the inner product is defined as: .. math:: - = \sum_i x_i y_i + \langle x, y \rangle = \sum_i x_i y_i for multivariate data and .. math:: - = \int_a^b x(t)y(t)dt + \langle x, y \rangle = \int_a^b x(t)y(t)dt for functional data. @@ -242,8 +243,7 @@ def inner_product( arg2: Second sample. Returns: - numpy.darray: Vector with the inner products of each pair of - samples. + Vector with the inner products of each pair of samples. Examples: This function can compute the multivariate inner product. 
@@ -335,7 +335,7 @@ def _inner_product_fdatagrid( arg2: FDataGrid, *, _matrix: bool = False, -) -> np.ndarray: +) -> NDArrayFloat: if not np.array_equal( arg1.grid_points, @@ -388,9 +388,9 @@ def _inner_product_fdatabasis( arg2: Union[FDataBasis, Basis], *, _matrix: bool = False, - inner_product_matrix: Optional[np.ndarray] = None, + inner_product_matrix: Optional[NDArrayFloat] = None, force_numerical: bool = False, -) -> np.ndarray: +) -> NDArrayFloat: if not _same_domain(arg1, arg2): raise ValueError("Both Objects should have the same domain_range") @@ -448,12 +448,12 @@ def _inner_product_fdatabasis( def _inner_product_integrate( - arg1: Callable[[np.ndarray], np.ndarray], - arg2: Callable[[np.ndarray], np.ndarray], + arg1: Callable[[ArrayLike], NDArrayFloat], + arg2: Callable[[ArrayLike], NDArrayFloat], *, _matrix: bool = False, _domain_range: Optional[DomainRange] = None, -) -> np.ndarray: +) -> NDArrayFloat: domain_range: DomainRange @@ -477,7 +477,7 @@ def _inner_product_integrate( len_arg1 = len(arg1(left_domain)) len_arg2 = len(arg2(left_domain)) - def integrand(*args: np.ndarray) -> np.ndarray: # noqa: WPS430 + def integrand(*args: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 f1 = arg1(args)[:, 0, :] f2 = arg2(args)[:, 0, :] @@ -504,20 +504,22 @@ def inner_product_matrix( arg1: Vector, arg2: Optional[Vector] = None, **kwargs: Any, -) -> np.ndarray: +) -> NDArrayFloat: """ Return the inner product matrix between is arguments. - If arg2 is ``None`` returns the Gram matrix. - Args: arg1: First sample. - arg2: Second sample. + arg2: Second sample. If it is ``None`` returns the inner product + between the samples in ``arg1``. kwargs: Keyword arguments for inner product. Returns: Inner product matrix between samples. + See also: + :func:`inner_product` + """ if isinstance(arg1, Basis): arg1 = arg1.to_basis() @@ -528,3 +530,140 @@ def inner_product_matrix( arg2 = arg1 return inner_product(arg1, arg2, _matrix=True, **kwargs) + + +def cosine_similarity( + arg1: Vector, + arg2: Vector, +) -> NDArrayFloat: + r""" + Return the cosine similarity. + + Calculates the cosine similarity between matching samples in two + FDataGrid objects. + + For two samples x and y the cosine similarity is defined as: + + .. math:: + \cos \text{sim}(x, y) = \frac{\langle x, y \rangle}{ + \sqrt{\langle x, x \rangle \langle y, y \rangle}} + + where :math:`\langle {}\cdot{}, {}\cdot{} \rangle` is the inner product. + + The two arguments must have the same number of samples, or one should + contain only one sample (and will be broadcasted). + + Args: + arg1: First sample. + arg2: Second sample. + + Returns: + Vector with the cosine similarity of each pair of samples. + + Examples: + This function can compute the multivariate cosine similarity. + + >>> import numpy as np + >>> from skfda.misc import cosine_similarity + >>> + >>> array1 = np.array([1, 2, 3]) + >>> array2 = np.array([4, 5, 6]) + >>> cosine_similarity(array1, array2) + 0.9746318461970762 + + If the arrays contain more than one sample + + >>> array1 = np.array([[1, 2, 3], [2, 3, 4]]) + >>> array2 = np.array([[4, 5, 6], [1, 1, 1]]) + >>> cosine_similarity(array1, array2) + array([ 0.97463185, 0.96490128]) + + The cosine similarity of the :math:'f(x) = x` and the constant + :math:`y=1` defined over the interval [0,1] is the area of the + triangle delimited by the the lines y = 0, x = 1 and y = x; 0.5, + multiplied by :math:`\sqrt{3}`. 
+ + >>> import skfda + >>> + >>> x = np.linspace(0,1,1000) + >>> + >>> fd1 = skfda.FDataGrid(x,x) + >>> fd2 = skfda.FDataGrid(np.ones(len(x)),x) + >>> cosine_similarity(fd1, fd2) + array([ 0.8660254]) + + If the FDataGrid object contains more than one sample + + >>> fd1 = skfda.FDataGrid([x, np.ones(len(x))], x) + >>> fd2 = skfda.FDataGrid([np.ones(len(x)), x] ,x) + >>> cosine_similarity(fd1, fd2).round(2) + array([ 0.87, 0.87]) + + If one argument contains only one sample it is + broadcasted. + + >>> fd1 = skfda.FDataGrid([x, np.ones(len(x))], x) + >>> fd2 = skfda.FDataGrid([np.ones(len(x))] ,x) + >>> cosine_similarity(fd1, fd2).round(2) + array([ 0.87, 1. ]) + + It also work with basis objects + + >>> basis = skfda.representation.basis.Monomial(n_basis=3) + >>> + >>> fd1 = skfda.FDataBasis(basis, [0, 1, 0]) + >>> fd2 = skfda.FDataBasis(basis, [1, 0, 0]) + >>> cosine_similarity(fd1, fd2) + array([ 0.8660254]) + + >>> basis = skfda.representation.basis.Monomial(n_basis=3) + >>> + >>> fd1 = skfda.FDataBasis(basis, [[0, 1, 0], [0, 0, 1]]) + >>> fd2 = skfda.FDataBasis(basis, [1, 0, 0]) + >>> cosine_similarity(fd1, fd2) + array([ 0.8660254 , 0.74535599]) + + >>> basis = skfda.representation.basis.Monomial(n_basis=3) + >>> + >>> fd1 = skfda.FDataBasis(basis, [[0, 1, 0], [0, 0, 1]]) + >>> fd2 = skfda.FDataBasis(basis, [[1, 0, 0], [0, 1, 0]]) + >>> cosine_similarity(fd1, fd2) + array([ 0.8660254 , 0.96824584]) + + """ + inner_prod = inner_product(arg1, arg2) + norm1 = np.sqrt(inner_product(arg1, arg1)) + norm2 = np.sqrt(inner_product(arg2, arg2)) + + return inner_prod / norm1 / norm2 + + +def cosine_similarity_matrix( + arg1: Vector, + arg2: Optional[Vector] = None, +) -> NDArrayFloat: + """ + Return the cosine similarity matrix between is arguments. + + Args: + arg1: First sample. + arg2: Second sample. If it is ``None`` returns the cosine similarity + between the samples in ``arg1``. + + Returns: + Cosine similarity matrix between samples. 
+ + See also: + :func:`cosine_similarity` + + """ + inner_matrix = inner_product_matrix(arg1, arg2) + + if arg2 is None or arg2 is arg1: + norm1 = np.sqrt(np.diag(inner_matrix)) + norm2 = norm1 + else: + norm1 = np.sqrt(inner_product(arg1, arg1)) + norm2 = np.sqrt(inner_product(arg2, arg2)) + + return inner_matrix / norm1[:, np.newaxis] / norm2[np.newaxis, :] diff --git a/tests/test_math.py b/tests/test_math.py index dee31333f..20ccc486e 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -1,54 +1,74 @@ +"""Test the math module.""" import unittest +from typing import Sequence import numpy as np import skfda from skfda._utils import _pairwise_symmetric +from skfda.representation._typing import NDArrayFloat from skfda.representation.basis import Monomial, Tensor, VectorValued -def ndm(*args): - return [x[(None,) * i + (slice(None),) + (None,) * (len(args) - i - 1)] - for i, x in enumerate(args)] +def _ndm(*args: NDArrayFloat) -> Sequence[NDArrayFloat]: + return [ + x[(None,) * i + (slice(None),) + (None,) * (len(args) - i - 1)] + for i, x in enumerate(args) + ] class InnerProductTest(unittest.TestCase): - - def test_several_variables(self): - - def f(x, y, z): + """Tests for the inner product.""" + + def test_several_variables(self) -> None: + """Test inner_product with functions of several variables.""" + def f( + x: NDArrayFloat, + y: NDArrayFloat, + z: NDArrayFloat, + ) -> NDArrayFloat: return x * y * z t = np.linspace(0, 1, 30) - x2, y2, z2 = ndm(t, 2 * t, 3 * t) + x2, y2, z2 = _ndm(t, 2 * t, 3 * t) data_matrix = f(x2, y2, z2) grid_points = [t, 2 * t, 3 * t] fd = skfda.FDataGrid( - data_matrix[np.newaxis, ...], grid_points=grid_points) + data_matrix[np.newaxis, ...], + grid_points=grid_points, + ) - basis = Tensor([Monomial(n_basis=5, domain_range=(0, 1)), - Monomial(n_basis=5, domain_range=(0, 2)), - Monomial(n_basis=5, domain_range=(0, 3))]) + basis = Tensor([ + Monomial(n_basis=5, domain_range=(0, 1)), + Monomial(n_basis=5, domain_range=(0, 2)), + Monomial(n_basis=5, domain_range=(0, 3)), + ]) fd_basis = fd.to_basis(basis) res = 8 np.testing.assert_allclose( - skfda.misc.inner_product(fd, fd), res, rtol=1e-4) + skfda.misc.inner_product(fd, fd), + res, + rtol=1e-4, + ) np.testing.assert_allclose( - skfda.misc.inner_product(fd_basis, fd_basis), res, rtol=1e-4) - - def test_vector_valued(self): - - def f(x): + skfda.misc.inner_product(fd_basis, fd_basis), + res, + rtol=1e-4, + ) + + def test_vector_valued(self) -> None: + """Test inner_product with vector valued functions.""" + def f(x: NDArrayFloat) -> NDArrayFloat: return x**2 - def g(y): + def g(y: NDArrayFloat) -> NDArrayFloat: return 3 * y t = np.linspace(0, 1, 100) @@ -58,32 +78,46 @@ def g(y): grid_points = [t] fd = skfda.FDataGrid( - data_matrix, grid_points=grid_points) + data_matrix, + grid_points=grid_points, + ) - basis = VectorValued([Monomial(n_basis=5), - Monomial(n_basis=5)]) + basis = VectorValued([ + Monomial(n_basis=5), + Monomial(n_basis=5), + ]) fd_basis = fd.to_basis(basis) res = 1 / 5 + 3 np.testing.assert_allclose( - skfda.misc.inner_product(fd, fd), res, rtol=1e-5) + skfda.misc.inner_product(fd, fd), + res, + rtol=1e-5, + ) np.testing.assert_allclose( - skfda.misc.inner_product(fd_basis, fd_basis), res, rtol=1e-5) - - def test_matrix(self): + skfda.misc.inner_product(fd_basis, fd_basis), + res, + rtol=1e-5, + ) + def test_matrix(self) -> None: + """Test inner_product_matrix function.""" basis = skfda.representation.basis.BSpline(n_basis=12) X = skfda.datasets.make_gaussian_process( - n_samples=10, 
n_features=20, + n_samples=10, + n_features=20, cov=skfda.misc.covariances.Gaussian(), - random_state=0) + random_state=0, + ) Y = skfda.datasets.make_gaussian_process( - n_samples=10, n_features=20, + n_samples=10, + n_features=20, cov=skfda.misc.covariances.Gaussian(), - random_state=1) + random_state=1, + ) X_basis = X.to_basis(basis) Y_basis = Y.to_basis(basis) @@ -99,6 +133,125 @@ def test_matrix(self): np.testing.assert_allclose(gram, gram_pairwise) +class CosineSimilarityVectorTest(unittest.TestCase): + """Tests for cosine similarity for vectors.""" + + def setUp(self) -> None: + self.arr = np.array([ + [0, 0, 1], + [1, 1, 1], + [1, 2, 3], + [1, 0, 1], + ]) + + self.arr_samelen = np.array([ + [2, 4, 1], + [7, 2, 1], + [0, 1, 0], + [3, 2, 0], + ]) + + self.arr_short = np.array([ + [2, 4, 6], + [5, 1, 7], + ]) + + def test_cosine_similarity_elementwise(self) -> None: + """Elementwise example for vectors.""" + np.testing.assert_allclose( + skfda.misc.inner_product(self.arr, self.arr_samelen), + [1, 10, 2, 3], + ) + + np.testing.assert_allclose( + skfda.misc.cosine_similarity(self.arr, self.arr_samelen), + [ + 1 / np.sqrt(21), + 10 / np.sqrt(3) / np.sqrt(54), + 2 / np.sqrt(14), + 3 / np.sqrt(2) / np.sqrt(13), + ], + ) + + def test_cosine_similarity_matrix_one(self) -> None: + """Matrix example for vectors with one input.""" + + for arr2 in (None, self.arr): + + np.testing.assert_allclose( + skfda.misc.inner_product_matrix(self.arr, arr2), + [ + [1, 1, 3, 1], + [1, 3, 6, 2], + [3, 6, 14, 4], + [1, 2, 4, 2], + ], + ) + + np.testing.assert_allclose( + skfda.misc.cosine_similarity_matrix(self.arr, arr2), + [ + [ + 1, + 1 / np.sqrt(3), + 3 / np.sqrt(14), + 1 / np.sqrt(2), + ], + [ + 1 / np.sqrt(3), + 3 / np.sqrt(3 * 3), + 6 / np.sqrt(3 * 14), + 2 / np.sqrt(3 * 2), + ], + [ + 3 / np.sqrt(14), + 6 / np.sqrt(14 * 3), + 14 / np.sqrt(14 * 14), + 4 / np.sqrt(14 * 2), + ], + [ + 1 / np.sqrt(2), + 2 / np.sqrt(2 * 3), + 4 / np.sqrt(2 * 14), + 2 / np.sqrt(2 * 2), + ], + ], + ) + + def test_cosine_similarity_matrix_two(self) -> None: + """Matrix example for vectors with two inputs.""" + np.testing.assert_allclose( + skfda.misc.inner_product_matrix(self.arr, self.arr_short), + [ + [6, 7], + [12, 13], + [28, 28], + [8, 12], + ], + ) + + np.testing.assert_allclose( + skfda.misc.cosine_similarity_matrix(self.arr, self.arr_short), + [ + [ + 6 / np.sqrt(56), + 7 / np.sqrt(75), + ], + [ + 12 / np.sqrt(3) / np.sqrt(56), + 13 / np.sqrt(3) / np.sqrt(75), + ], + [ + 28 / np.sqrt(14) / np.sqrt(56), + 28 / np.sqrt(14) / np.sqrt(75), + ], + [ + 8 / np.sqrt(2) / np.sqrt(56), + 12 / np.sqrt(2) / np.sqrt(75), + ], + ], + ) + + if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] unittest.main() From b177557047981f6020a9dba67571e5372de15ea2 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Tue, 12 Oct 2021 19:41:01 +0200 Subject: [PATCH 032/117] Add angular distance. --- docs/modules/misc/metrics.rst | 12 +++++ skfda/misc/metrics/__init__.py | 1 + skfda/misc/metrics/_angular.py | 83 +++++++++++++++++++++++++++++ skfda/misc/metrics/_lp_distances.py | 15 +++--- skfda/misc/metrics/_typing.py | 9 ++-- 5 files changed, 109 insertions(+), 11 deletions(-) create mode 100644 skfda/misc/metrics/_angular.py diff --git a/docs/modules/misc/metrics.rst b/docs/modules/misc/metrics.rst index d6a407b82..5f662289a 100644 --- a/docs/modules/misc/metrics.rst +++ b/docs/modules/misc/metrics.rst @@ -34,6 +34,18 @@ value of ``p`` must be explicitly passed in each call. 
skfda.misc.metrics.lp_norm skfda.misc.metrics.lp_distance + +Angular distance +---------------- + +The angular distance (using the normalized "angle" between functions given +by the inner product) is also available, and useful in some contexts. + +.. autosummary:: + :toctree: autosummary + + skfda.misc.metrics.angular_distance + Elastic distances ----------------- diff --git a/skfda/misc/metrics/__init__.py b/skfda/misc/metrics/__init__.py index 9d55f478d..a2e2ac88a 100644 --- a/skfda/misc/metrics/__init__.py +++ b/skfda/misc/metrics/__init__.py @@ -1,5 +1,6 @@ """Metrics, norms and related utilities.""" +from ._angular import angular_distance from ._fisher_rao import ( _fisher_rao_warping_distance, fisher_rao_amplitude_distance, diff --git a/skfda/misc/metrics/_angular.py b/skfda/misc/metrics/_angular.py new file mode 100644 index 000000000..18316dc85 --- /dev/null +++ b/skfda/misc/metrics/_angular.py @@ -0,0 +1,83 @@ +from __future__ import annotations + +from typing import Optional, TypeVar, Union + +import numpy as np +from typing_extensions import Final + +from ...representation import FData +from ...representation._typing import NDArrayFloat +from .._math import cosine_similarity, cosine_similarity_matrix +from ._utils import pairwise_metric_optimization + +T = TypeVar("T", bound=Union[NDArrayFloat, FData]) + + +class AngularDistance(): + r""" + Calculate the angular distance between two objects. + + For each pair of observations x and y the angular distance between them is + defined as the normalized "angle" between them: + + .. math:: + d(x, y) = \frac{\arccos \left(\frac{\langle x, y \rangle}{ + \sqrt{\langle x, x \rangle \langle y, y \rangle}} \right)}{\pi} + + where :math:`\langle {}\cdot{}, {}\cdot{} \rangle` is the inner product. + This distance is defined in the interval [0, 1]. + + Args: + e1: First object. + e2: Second object. + + Returns: + Numpy vector where the i-th coordinate has the angular distance between + the i-th element of the first object and the i-th element of the second + one. + + Examples: + Computes the angular distances between an object containing functional + data corresponding to the functions y = 1 and y = x defined over the + interval [0, 1] and another ones containing data of the functions y + = 0 and y = x/2. The result then is an array of size 2 with the + computed l2 distance between the functions in the same position in + both. + + >>> import skfda + >>> import numpy as np + >>> + >>> x = np.linspace(0, 1, 1001) + >>> fd = skfda.FDataGrid([np.ones(len(x)), x], x) + >>> fd2 = skfda.FDataGrid([2*np.ones(len(x)), np.cos(x)], x) + >>> + >>> skfda.misc.metrics.angular_distance(fd, fd2).round(2) + array([ 0. 
, 0.22]) + + """ + + def __call__( + self, + e1: T, + e2: T, + ) -> NDArrayFloat: + """Compute the distance.""" + return np.arccos(cosine_similarity(e1, e2)) / np.pi + + def __repr__(self) -> str: + return ( + "{type(self).__name__}()" + ) + + +angular_distance: Final = AngularDistance() + + +@pairwise_metric_optimization.register +def _pairwise_metric_optimization_angular( + metric: AngularDistance, + elem1: Union[NDArrayFloat, FData], + elem2: Optional[Union[NDArrayFloat, FData]], +) -> NDArrayFloat: + + return np.arccos(cosine_similarity_matrix(elem1, elem2)) / np.pi diff --git a/skfda/misc/metrics/_lp_distances.py b/skfda/misc/metrics/_lp_distances.py index badf3f76c..d59d50a35 100644 --- a/skfda/misc/metrics/_lp_distances.py +++ b/skfda/misc/metrics/_lp_distances.py @@ -1,5 +1,6 @@ """Implementation of Lp distances.""" +from __future__ import annotations import math from typing import Optional, TypeVar, Union @@ -8,6 +9,7 @@ from typing_extensions import Final from ...representation import FData +from ...representation._typing import NDArrayFloat from ._lp_norms import LpNorm from ._typing import Norm from ._utils import NormInducedMetric, pairwise_metric_optimization @@ -16,7 +18,8 @@ class LpDistance(NormInducedMetric[FData]): - r"""Lp distance for FDataGrid objects. + r""" + Lp distance for functional data objects. Calculates the distance between two functional objects. @@ -24,7 +27,7 @@ class LpDistance(NormInducedMetric[FData]): as: .. math:: - d(f, g) = d(g, f) = \| f - g \|_p + d(x, y) = \| x - y \|_p where :math:`\| {}\cdot{} \|_p` denotes the :func:`Lp norm `. @@ -74,7 +77,7 @@ class LpDistance(NormInducedMetric[FData]): def __init__( self, p: float, - vector_norm: Union[Norm[np.ndarray], float, None] = None, + vector_norm: Union[Norm[NDArrayFloat], float, None] = None, ) -> None: self.p = p @@ -100,7 +103,7 @@ def _pairwise_metric_optimization_lp_fdata( metric: LpDistance, elem1: FData, elem2: Optional[FData], -) -> np.ndarray: +) -> NDArrayFloat: from ...misc import inner_product, inner_product_matrix vector_norm = metric.vector_norm @@ -141,8 +144,8 @@ def lp_distance( fdata2: T, *, p: float, - vector_norm: Union[Norm[np.ndarray], float, None] = None, -) -> np.ndarray: + vector_norm: Union[Norm[NDArrayFloat], float, None] = None, +) -> NDArrayFloat: r""" Lp distance for FDataGrid objects. 
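A minimal usage sketch, assuming that ``lp_distance``, ``LpDistance`` and the new ``angular_distance`` are all exported from ``skfda.misc.metrics`` as the module documentation above indicates:

import numpy as np
import skfda
from skfda.misc.metrics import LpDistance, angular_distance, lp_distance

x = np.linspace(0, 1, 1001)
fd1 = skfda.FDataGrid([np.ones(len(x)), x], x)
fd2 = skfda.FDataGrid([2 * np.ones(len(x)), np.cos(x)], x)

# p is keyword-only in the functional form, matching the signature above.
d_lp = lp_distance(fd1, fd2, p=2)

# Equivalent metric object, with p fixed at construction time.
d_lp_obj = LpDistance(p=2)(fd1, fd2)

# Normalized "angle" between paired samples, in the interval [0, 1].
d_ang = angular_distance(fd1, fd2)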
diff --git a/skfda/misc/metrics/_typing.py b/skfda/misc/metrics/_typing.py index 1b3e76c84..8b44bd216 100644 --- a/skfda/misc/metrics/_typing.py +++ b/skfda/misc/metrics/_typing.py @@ -4,10 +4,9 @@ from builtins import isinstance from typing import Any, TypeVar, Union, overload -import numpy as np from typing_extensions import Final, Literal, Protocol -from ...representation._typing import Vector +from ...representation._typing import NDArrayFloat, Vector VectorType = TypeVar("VectorType", contravariant=True, bound=Vector) MetricElementType = TypeVar("MetricElementType", contravariant=True) @@ -29,7 +28,7 @@ class Norm(Protocol[VectorType]): """Protocol for a norm of a vector.""" @abstractmethod - def __call__(self, __vector: VectorType) -> np.ndarray: # noqa: WPS112 + def __call__(self, __vector: VectorType) -> NDArrayFloat: # noqa: WPS112 """Compute the norm of a vector.""" @@ -41,8 +40,8 @@ def __call__( self, __e1: MetricElementType, # noqa: WPS112 __e2: MetricElementType, # noqa: WPS112 - ) -> np.ndarray: - """Compute the norm of a vector.""" + ) -> NDArrayFloat: + """Compute the metric between two vectors.""" _NonStringMetric = TypeVar( From 0cd0d3422c0900b3d80b1269313d2a4df157b555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Tue, 12 Oct 2021 20:01:06 +0200 Subject: [PATCH 033/117] corrected inverse_transform for FDataBasis --- .../dim_reduction/feature_extraction/_fpca.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index cc5cc29b5..cb274a9c4 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -220,9 +220,6 @@ def _fit_basis( lower=True, ) - # this matrix is needed to compute inverse_transform - self._l_inv_j_t = l_inv_j_t - # the final matrix, C(L-1Jt)t for svd or (L-1Jt)-1CtC(L-1Jt)t for PCA final_matrix = ( X.coefficients @ np.transpose(l_inv_j_t) / np.sqrt(n_samples) @@ -525,12 +522,13 @@ def inverse_transform( # inverse_transform is slightly different whether # .fit was applied to FDataGrid or FDataBasis object + # Does not work (boundary problem in x_hat and bias reconstruction) if isinstance(self.components_, FDataGrid): - # reconstruct the discretized functions - x_hat = ( - (pc_scores @ (self.components_.data_matrix[:, :, 0]) - @ (np.diag(np.sqrt(self.weights)) / np.sqrt(self.n_samples_))) + x_hat = np.matmul( + pc_scores, + self.components_.data_matrix[:,:,0] ) + # uncenter x_hat += self.mean_.data_matrix.reshape( (1, self.mean_.grid_points[0].shape[0]), ) @@ -543,10 +541,7 @@ def inverse_transform( ) elif isinstance(self.components_, FDataBasis): # reconstruct the basis coefficients - x_hat = ( - (pc_scores @ (self.components_.coefficients) - @ (np.transpose(self._l_inv_j_t) / np.sqrt(self.n_samples_))) - ) + x_hat = np.dot(pc_scores, self.components_.coefficients) x_hat += self.mean_.coefficients.reshape( (1, self.mean_.coefficients.shape[1]), ) From e841ce97a8ecbdfc30dab161f4543a84f39ee5f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Tue, 12 Oct 2021 20:10:49 +0200 Subject: [PATCH 034/117] code style issue --- skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index cb274a9c4..32be82b3d 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -526,7 +526,7 @@ def inverse_transform( if isinstance(self.components_, FDataGrid): x_hat = np.matmul( pc_scores, - self.components_.data_matrix[:,:,0] + self.components_.data_matrix[:, :, 0] ) # uncenter x_hat += self.mean_.data_matrix.reshape( From 28f27f2bd00d2a1bfed66abd7761156e7318f7aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Tue, 12 Oct 2021 20:44:15 +0200 Subject: [PATCH 035/117] random offsets in data generation for FDataBasis case --- tests/test_fpca.py | 87 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 10 deletions(-) diff --git a/tests/test_fpca.py b/tests/test_fpca.py index 12f29b4d2..60e3e797a 100644 --- a/tests/test_fpca.py +++ b/tests/test_fpca.py @@ -2,6 +2,7 @@ import unittest import numpy as np +from numpy.core.fromnumeric import size from numpy.lib.index_tricks import nd_grid from skfda import FDataBasis, FDataGrid @@ -9,7 +10,7 @@ from skfda.misc.operators import LinearDifferentialOperator from skfda.misc.regularization import TikhonovRegularization from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.representation.basis import Fourier +from skfda.representation.basis import Fourier, BSpline class FPCATestCase(unittest.TestCase): @@ -451,8 +452,10 @@ def test_grid_fpca_regularization_fit_result(self) -> None: ) def test_grid_fpca_inverse_transform(self) -> None: - """Compare the reconstructions to fitting non-random data.""" - + """Compare the reconstructions.data_matrix to fitting data.""" + + seed = 42 + np.random.seed(seed) # Randomly, draw a true function that generates the dataset. def draw_one_random_fun(n_grid) -> FDataGrid: modes_location = np.random.uniform(-10., 10., size=50) @@ -460,12 +463,12 @@ def draw_one_random_fun(n_grid) -> FDataGrid: fd_random = make_multimodal_samples( start=0., stop=15., - n_samples=int(1), + n_samples=1, points_per_dim=n_grid, n_modes=modes_location.size, noise=noise, modes_location=modes_location, - random_state=42 + random_state=seed ) return fd_random @@ -497,16 +500,80 @@ def test_vs_dim(n_samples, n_grid, base_fun): ) # Low dimensional case (n_samples>n_grid) - n_samples = int(10**3) - n_grid = int(10**2) + n_samples = 10**3 + n_grid = 10**2 true_fun = draw_one_random_fun(n_grid) test_vs_dim(n_samples=n_samples, n_grid=n_grid, base_fun=true_fun) - # (almotst) High dimensional case (n_samples None: + """Compare the coef reconstructions to fitting data.""" + + seed = 42 + np.random.seed(seed) + # Draw a true function in a given basis with random coef. 
+ def draw_one_random_fun(basis): + coef = np.random.uniform(-10., 10., size=basis.n_basis) + fd_random = FDataBasis( + basis=basis, + coefficients=coef + ) + return fd_random + + # test function w.t.t n_samples and basis + def test_vs_dim(n_samples, base_fun): + fd_random = base_fun.copy() + offset = np.random.uniform(-5., 5., size=n_samples) + # Random offsetting base_fun and form dataset fd_random + for i in range(n_samples): + fd_i = base_fun.copy() + fd_i.coefficients += offset[i] + fd_random = fd_random.concatenate(fd_i) + + # Take the allowed maximum number of components + # In almost high dimension: n_components=n_samples-1 < n_samples + # In low dimension: n_components=n_basis< Date: Wed, 13 Oct 2021 00:20:47 +0200 Subject: [PATCH 036/117] Optimize pairwise Fisher-Rao distance. --- skfda/misc/metrics/_angular.py | 2 +- skfda/misc/metrics/_fisher_rao.py | 99 +++++++++++++++++++++++-------- skfda/misc/metrics/_utils.py | 3 +- 3 files changed, 76 insertions(+), 28 deletions(-) diff --git a/skfda/misc/metrics/_angular.py b/skfda/misc/metrics/_angular.py index 18316dc85..b0cf3e519 100644 --- a/skfda/misc/metrics/_angular.py +++ b/skfda/misc/metrics/_angular.py @@ -66,7 +66,7 @@ def __call__( def __repr__(self) -> str: return ( - "{type(self).__name__}()" + f"{type(self).__name__}()" ) diff --git a/skfda/misc/metrics/_fisher_rao.py b/skfda/misc/metrics/_fisher_rao.py index 2d94af7d3..f63bb80f8 100644 --- a/skfda/misc/metrics/_fisher_rao.py +++ b/skfda/misc/metrics/_fisher_rao.py @@ -1,28 +1,57 @@ """Elastic metrics.""" -from typing import Any, Optional, TypeVar +from typing import Any, Optional, Tuple, TypeVar, Union import numpy as np import scipy.integrate +from typing_extensions import Final from ..._utils import normalize_scale, normalize_warping from ...preprocessing.registration import FisherRaoElasticRegistration -from ...representation import FData +from ...representation import FData, FDataGrid from ...representation._typing import NDArrayFloat from ..operators import SRSF from ._lp_distances import l2_distance -from ._utils import _cast_to_grid +from ._utils import PairwiseMetric, _cast_to_grid, pairwise_metric_optimization T = TypeVar("T", bound=FData) -def fisher_rao_distance( +def _transformation_for_fisher_rao( fdata1: T, fdata2: T, *, eval_points: Optional[NDArrayFloat] = None, _check: bool = True, -) -> NDArrayFloat: +) -> Tuple[FDataGrid, FDataGrid]: + fdata1, fdata2 = _cast_to_grid( + fdata1, + fdata2, + eval_points=eval_points, + _check=_check, + ) + + # Both should have the same grid points + eval_points_normalized = normalize_scale(fdata1.grid_points[0]) + + # Calculate the corresponding srsf and normalize to (0,1) + fdata1 = fdata1.copy( + grid_points=eval_points_normalized, + domain_range=(0, 1), + ) + fdata2 = fdata2.copy( + grid_points=eval_points_normalized, + domain_range=(0, 1), + ) + + srsf = SRSF(initial_value=0) + fdata1_srsf = srsf.fit_transform(fdata1) + fdata2_srsf = srsf.transform(fdata2) + + return fdata1_srsf, fdata2_srsf + + +class FisherRaoDistance(): r""" Compute the Fisher-Rao distance between two functional objects. @@ -58,32 +87,50 @@ def fisher_rao_distance( .. 
footbibliography:: """ - fdata1, fdata2 = _cast_to_grid( - fdata1, - fdata2, - eval_points=eval_points, - _check=_check, - ) - # Both should have the same grid points - eval_points_normalized = normalize_scale(fdata1.grid_points[0]) + def __call__( + self, + fdata1: T, + fdata2: T, + *, + eval_points: Optional[NDArrayFloat] = None, + _check: bool = True, + ) -> NDArrayFloat: + """Compute the distance.""" + # Return the L2 distance of the SRSF + return l2_distance(*_transformation_for_fisher_rao( + fdata1, + fdata2, + eval_points=eval_points, + _check=_check, + )) + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}()" + ) + + +fisher_rao_distance: Final = FisherRaoDistance() + + +@pairwise_metric_optimization.register +def _pairwise_metric_optimization_fisher_rao( + metric: FisherRaoDistance, + elem1: T, + elem2: Optional[T], +) -> NDArrayFloat: - # Calculate the corresponding srsf and normalize to (0,1) - fdata1 = fdata1.copy( - grid_points=eval_points_normalized, - domain_range=(0, 1), - ) - fdata2 = fdata2.copy( - grid_points=eval_points_normalized, - domain_range=(0, 1), + new_elem2 = elem1.copy() if elem2 is None else elem2 + + new_elem1, new_elem2 = _transformation_for_fisher_rao( + elem1, + new_elem2, ) - srsf = SRSF(initial_value=0) - fdata1_srsf = srsf.fit_transform(fdata1) - fdata2_srsf = srsf.transform(fdata2) + pairwise = PairwiseMetric(l2_distance) - # Return the L2 distance of the SRSF - return l2_distance(fdata1_srsf, fdata2_srsf) + return pairwise(new_elem1, None if elem2 is None else new_elem2) def fisher_rao_amplitude_distance( diff --git a/skfda/misc/metrics/_utils.py b/skfda/misc/metrics/_utils.py index 2265edb5f..f2394c194 100644 --- a/skfda/misc/metrics/_utils.py +++ b/skfda/misc/metrics/_utils.py @@ -31,7 +31,8 @@ def _cast_to_grid( eval_points: Optional[NDArrayFloat] = None, _check: bool = True, ) -> Tuple[FDataGrid, FDataGrid]: - """Convert fdata1 and fdata2 to FDatagrid. + """ + Convert fdata1 and fdata2 to FDatagrid. Checks if the fdatas passed as argument are unidimensional and compatible and converts them to FDatagrid to compute their distances. From 27cae061e08e831cb05602b00013bb09d68b861d Mon Sep 17 00:00:00 2001 From: VNMabus Date: Thu, 14 Oct 2021 03:16:40 +0200 Subject: [PATCH 037/117] Add tranformation metric --- docs/modules/misc/metrics.rst | 12 +++++ skfda/misc/metrics/__init__.py | 1 + skfda/misc/metrics/_utils.py | 81 +++++++++++++++++++++++++++++++++- 3 files changed, 92 insertions(+), 2 deletions(-) diff --git a/docs/modules/misc/metrics.rst b/docs/modules/misc/metrics.rst index 5f662289a..b0823c15b 100644 --- a/docs/modules/misc/metrics.rst +++ b/docs/modules/misc/metrics.rst @@ -86,3 +86,15 @@ of objets. The following class can compute that efficiently: :toctree: autosummary skfda.misc.metrics.PairwiseMetric + + +Transformation metric +--------------------- + +Some metrics, such as those based in derivatives, can be expressed as a +transformation followed by another metric: + +.. 
autosummary:: + :toctree: autosummary + + skfda.misc.metrics.TransformationMetric diff --git a/skfda/misc/metrics/__init__.py b/skfda/misc/metrics/__init__.py index a2e2ac88a..b2fc981f2 100644 --- a/skfda/misc/metrics/__init__.py +++ b/skfda/misc/metrics/__init__.py @@ -19,5 +19,6 @@ from ._utils import ( NormInducedMetric, PairwiseMetric, + TransformationMetric, pairwise_metric_optimization, ) diff --git a/skfda/misc/metrics/_utils.py b/skfda/misc/metrics/_utils.py index f2394c194..be047dbfe 100644 --- a/skfda/misc/metrics/_utils.py +++ b/skfda/misc/metrics/_utils.py @@ -1,12 +1,12 @@ """Utilities for norms and metrics.""" -from typing import Any, Generic, Optional, Tuple, TypeVar +from typing import Any, Callable, Generic, Optional, Tuple, TypeVar import multimethod import numpy as np from ..._utils import _pairwise_symmetric from ...representation import FData, FDataGrid -from ...representation._typing import NDArrayFloat +from ...representation._typing import NDArrayFloat, Vector from ._typing import Metric, MetricElementType, Norm, VectorType T = TypeVar("T", bound=FData) @@ -187,3 +187,80 @@ def __call__( def __repr__(self) -> str: return f"{type(self).__name__}(metric={self.metric})" + + +Original = TypeVar("Original", bound=Vector) +Transformed = TypeVar("Transformed", bound=Vector) + + +class TransformationMetric(Generic[Original, Transformed], Metric[Original]): + """ + Compute a distance after transforming the data. + + This is a convenience function to compute a metric after a transformation + is applied to the data. It can be used, for example, to compute + Sobolev-like metrics. + + Args: + e1: First object. + e2: Second object. + + Returns: + Distance. + + Examples: + Compute the L2 distance between the function derivatives. + >>> import skfda + >>> from skfda.misc.metrics import l2_distance, TransformationMetric + + >>> x = np.linspace(0, 1, 1001) + >>> fd = skfda.FDataGrid([x], x) + >>> fd2 = skfda.FDataGrid([x/2], x) + + >>> dist = TransformationMetric( + ... transformation=lambda x: x.derivative(), + ... metric=l2_distance, + ... ) + >>> dist(fd, fd2) + array([ 0.5]) + + """ + + def __init__( + self, + transformation: Callable[[Original], Transformed], + metric: Metric[Transformed], + ): + self.transformation = transformation + self.metric = metric + + def __call__( + self, + e1: Original, + e2: Original, + ) -> NDArrayFloat: + """Compute the distance.""" + e1_trans = self.transformation(e1) + e2_trans = self.transformation(e2) + + return self.metric(e1_trans, e2_trans) + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}()" + ) + + +@pairwise_metric_optimization.register +def _pairwise_metric_optimization_transformation_distance( + metric: TransformationMetric[Any, Any], + e1: T, + e2: Optional[T], +) -> NDArrayFloat: + + e1_trans = metric.transformation(e1) + e2_trans = None if e2 is None else metric.transformation(e2) + + pairwise = PairwiseMetric(metric.metric) + + return pairwise(e1_trans, e2_trans) From 2c754b73da291e7e5d253774d41b429899c014e7 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Thu, 14 Oct 2021 12:30:57 +0200 Subject: [PATCH 038/117] Fix typo. 
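As a usage note for the TransformationMetric class introduced in the previous patch: the
following is a minimal sketch (illustrative data, not taken from the test suite) of how it
can be combined with PairwiseMetric to obtain a derivative-based, Sobolev-like distance
matrix. It relies only on names defined above (TransformationMetric, PairwiseMetric,
l2_distance) and on FDataGrid.derivative.

    import numpy as np
    import skfda
    from skfda.misc.metrics import (
        PairwiseMetric,
        TransformationMetric,
        l2_distance,
    )

    x = np.linspace(0, 1, 1001)
    # Three toy curves discretized on a common grid.
    fd = skfda.FDataGrid([x, x ** 2, np.sin(2 * np.pi * x)], x)

    # L2 distance between first derivatives.
    derivative_distance = TransformationMetric(
        transformation=lambda data: data.derivative(),
        metric=l2_distance,
    )

    # Distance between two individual samples.
    d01 = derivative_distance(fd[0], fd[1])

    # Full pairwise matrix: the optimization registered above transforms
    # each sample once instead of once per pair.
    pairwise = PairwiseMetric(derivative_distance)
    distance_matrix = pairwise(fd)  # shape (3, 3)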
--- skfda/misc/metrics/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skfda/misc/metrics/_utils.py b/skfda/misc/metrics/_utils.py index be047dbfe..460fa1526 100644 --- a/skfda/misc/metrics/_utils.py +++ b/skfda/misc/metrics/_utils.py @@ -210,6 +210,7 @@ class TransformationMetric(Generic[Original, Transformed], Metric[Original]): Examples: Compute the L2 distance between the function derivatives. + >>> import skfda >>> from skfda.misc.metrics import l2_distance, TransformationMetric From 99ceb18289664940ec323e41e5a3fd94fcb7650f Mon Sep 17 00:00:00 2001 From: VNMabus Date: Thu, 14 Oct 2021 14:09:01 +0200 Subject: [PATCH 039/117] Try to fix tests. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d8a7354cd..d272c11a4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ test=pytest [tool:pytest] addopts = --doctest-modules doctest_optionflags = NORMALIZE_WHITESPACE ELLIPSIS -norecursedirs = '.*', 'build', 'dist' '*.egg' 'venv' .svn _build docs/auto_examples examples docs/auto_tutorial tutorial +norecursedirs = .* build dist *.egg venv .svn _build docs/auto_examples examples docs/auto_tutorial tutorial [flake8] ignore = From c93ca4d767c6821952acd47d919bbb6dd6856f36 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Thu, 14 Oct 2021 19:26:57 +0200 Subject: [PATCH 040/117] Clip values in cosine similarity. --- skfda/misc/_math.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index ad1b4b231..b0f8a482d 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -532,6 +532,16 @@ def inner_product_matrix( return inner_product(arg1, arg2, _matrix=True, **kwargs) +def _clip_cosine(array: NDArrayFloat) -> NDArrayFloat: + """Clip cosine values to prevent numerical errors.""" + small_val = 1e-6 + + # If the difference is too large, there could be a problem + assert np.all((-1 - small_val < array) & (array < 1 + small_val)) + + return np.clip(array, -1, 1) + + def cosine_similarity( arg1: Vector, arg2: Vector, @@ -635,7 +645,7 @@ def cosine_similarity( norm1 = np.sqrt(inner_product(arg1, arg1)) norm2 = np.sqrt(inner_product(arg2, arg2)) - return inner_prod / norm1 / norm2 + return _clip_cosine(inner_prod / norm1 / norm2) def cosine_similarity_matrix( @@ -666,4 +676,6 @@ def cosine_similarity_matrix( norm1 = np.sqrt(inner_product(arg1, arg1)) norm2 = np.sqrt(inner_product(arg2, arg2)) - return inner_matrix / norm1[:, np.newaxis] / norm2[np.newaxis, :] + return _clip_cosine( + inner_matrix / norm1[:, np.newaxis] / norm2[np.newaxis, :], + ) From 9500b3e9a3ce1f445084c1473c4113a139e0c23e Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 15 Oct 2021 23:59:47 +0200 Subject: [PATCH 041/117] Allow lp_distance with numpy arrays. --- skfda/misc/metrics/_lp_distances.py | 8 ++++-- skfda/misc/metrics/_lp_norms.py | 38 +++++++++++++++++------------ 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/skfda/misc/metrics/_lp_distances.py b/skfda/misc/metrics/_lp_distances.py index d59d50a35..b1d25d8b2 100644 --- a/skfda/misc/metrics/_lp_distances.py +++ b/skfda/misc/metrics/_lp_distances.py @@ -14,10 +14,10 @@ from ._typing import Norm from ._utils import NormInducedMetric, pairwise_metric_optimization -T = TypeVar("T", bound=FData) +T = TypeVar("T", NDArrayFloat, FData) -class LpDistance(NormInducedMetric[FData]): +class LpDistance(NormInducedMetric[Union[NDArrayFloat, FData]]): r""" Lp distance for functional data objects. 
@@ -86,6 +86,10 @@ def __init__( super().__init__(norm) + # This method is retyped here to work with either arrays or functions + def __call__(self, elem1: T, elem2: T) -> NDArrayFloat: # noqa: WPS612 + return super().__call__(elem1, elem2) + def __repr__(self) -> str: return ( f"{type(self).__name__}(" diff --git a/skfda/misc/metrics/_lp_norms.py b/skfda/misc/metrics/_lp_norms.py index e2db76cda..b7a1c796c 100644 --- a/skfda/misc/metrics/_lp_norms.py +++ b/skfda/misc/metrics/_lp_norms.py @@ -7,11 +7,14 @@ import scipy.integrate from typing_extensions import Final +from skfda.representation._typing import NDArrayFloat + from ...representation import FData, FDataBasis +from ...representation._typing import NDArrayFloat from ._typing import Norm -class LpNorm(Norm[FData]): +class LpNorm(): r""" Norm of all the observations in a FDataGrid object. @@ -86,7 +89,7 @@ class LpNorm(Norm[FData]): def __init__( self, p: float, - vector_norm: Union[Norm[np.ndarray], float, None] = None, + vector_norm: Union[Norm[NDArrayFloat], float, None] = None, ) -> None: # Checks that the lp normed is well defined @@ -102,10 +105,13 @@ def __repr__(self) -> str: f"p={self.p}, vector_norm={self.vector_norm})" ) - def __call__(self, fdata: FData) -> np.ndarray: + def __call__(self, vector: Union[NDArrayFloat, FData]) -> NDArrayFloat: """Compute the Lp norm of a functional data object.""" from ...misc import inner_product + if isinstance(vector, np.ndarray): + return np.linalg.norm(vector, ord=self.p, axis=-1) + vector_norm = self.vector_norm if vector_norm is None: @@ -113,27 +119,27 @@ def __call__(self, fdata: FData) -> np.ndarray: # Special case, the inner product is heavily optimized if self.p == vector_norm == 2: - return np.sqrt(inner_product(fdata, fdata)) + return np.sqrt(inner_product(vector, vector)) - if isinstance(fdata, FDataBasis): + if isinstance(vector, FDataBasis): if self.p != 2: raise NotImplementedError - start, end = fdata.domain_range[0] + start, end = vector.domain_range[0] integral = scipy.integrate.quad_vec( - lambda x: np.power(np.abs(fdata(x)), self.p), + lambda x: np.power(np.abs(vector(x)), self.p), start, end, ) res = np.sqrt(integral[0]).flatten() else: - data_matrix = fdata.data_matrix + data_matrix = vector.data_matrix original_shape = data_matrix.shape data_matrix = data_matrix.reshape(-1, original_shape[-1]) data_matrix = (np.linalg.norm( - fdata.data_matrix, + vector.data_matrix, ord=vector_norm, axis=-1, keepdims=True, @@ -149,13 +155,13 @@ def __call__(self, fdata: FData) -> np.ndarray: axis=tuple(range(1, data_matrix.ndim)), ) - elif fdata.dim_domain == 1: + elif vector.dim_domain == 1: # Computes the norm, approximating the integral with Simpson's # rule. res = scipy.integrate.simps( data_matrix[..., 0] ** self.p, - x=fdata.grid_points, + x=vector.grid_points, ) ** (1 / self.p) else: @@ -174,11 +180,11 @@ def __call__(self, fdata: FData) -> np.ndarray: def lp_norm( - fdata: FData, + vector: Union[NDArrayFloat, FData], *, p: float, - vector_norm: Union[Norm[np.ndarray], float, None] = None, -) -> np.ndarray: + vector_norm: Union[Norm[NDArrayFloat], float, None] = None, +) -> NDArrayFloat: r"""Calculate the norm of all the observations in a FDataGrid object. For each observation f the Lp norm is defined as: @@ -218,7 +224,7 @@ def lp_norm( :class:`LpNorm` in those cases. Args: - fdata: FData object. + vector: Vector object. p: p of the lp norm. Must be greater or equal than 1. If ``p=math.inf`` it is used the L infinity metric. Defaults to 2. 
@@ -261,4 +267,4 @@ def lp_norm( :class:`LpNorm` """ - return LpNorm(p=p, vector_norm=vector_norm)(fdata) + return LpNorm(p=p, vector_norm=vector_norm)(vector) From 3917ae3d480d26383ad55e623d4d9ffa6fbdebca Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 18 Oct 2021 01:37:35 +0200 Subject: [PATCH 042/117] mRMR (no tests yet). --- skfda/_utils/__init__.py | 2 + skfda/_utils/_utils.py | 36 ++ .../variable_selection/__init__.py | 4 +- .../variable_selection/maxima_hunting.py | 78 ++-- .../dim_reduction/variable_selection/mrmr.py | 397 ++++++++++++++++++ .../recursive_maxima_hunting.py | 36 +- 6 files changed, 488 insertions(+), 65 deletions(-) create mode 100644 skfda/preprocessing/dim_reduction/variable_selection/mrmr.py diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 979390ca3..4c52efada 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -8,6 +8,8 @@ _classifier_fit_depth_methods, _classifier_get_classes, _classifier_get_depth_methods, + _compute_dependence, + _DependenceMeasure, _evaluate_grid, _int_to_real, _pairwise_symmetric, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index b901099eb..f718a55c5 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -728,3 +728,39 @@ def _classifier_fit_depth_methods( ) return classes, class_depth_methods_ + + +_DependenceMeasure = Callable[[np.ndarray, np.ndarray], np.ndarray] + + +def _compute_dependence( + X: np.ndarray, + y: np.ndarray, + *, + dependence_measure: _DependenceMeasure, +) -> np.ndarray: + """ + Compute dependence between points and target. + + Computes the dependence of each point in each trajectory in X with the + corresponding class label in Y. + + """ + from dcor import rowwise + + # Move n_samples to the end + # The shape is now input_shape + n_samples + n_output + X = np.moveaxis(X, 0, -2) + + input_shape = X.shape[:-2] + + # Join input in a list for rowwise + X = X.reshape(-1, X.shape[-2], X.shape[-1]) + + if y.ndim == 1: + y = np.atleast_2d(y).T + Y = np.array([y] * len(X)) + + dependence_results = rowwise(dependence_measure, X, Y) + + return dependence_results.reshape(input_shape) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py index ecb8cb0f6..6d85fff30 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/__init__.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/__init__.py @@ -1,5 +1,5 @@ -from . import maxima_hunting - +from . 
import maxima_hunting, mrmr, recursive_maxima_hunting from ._rkvs import RKHSVariableSelection from .maxima_hunting import MaximaHunting +from .mrmr import MinimumRedundancyMaximumRelevance from .recursive_maxima_hunting import RecursiveMaximaHunting diff --git a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py index 74f2c35f4..57097f1cf 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/maxima_hunting.py @@ -1,53 +1,23 @@ """Maxima Hunting dimensionality reduction and related methods.""" from __future__ import annotations -from typing import Callable, Optional +from typing import Callable, Optional, Union import numpy as np +import scipy.signal import sklearn.base import sklearn.utils -import scipy.signal -from dcor import rowwise, u_distance_correlation_sqr +from dcor import u_distance_correlation_sqr +from ...._utils import _compute_dependence, _DependenceMeasure from ....representation import FDataGrid +from ....representation._typing import NDArrayFloat, NDArrayInt -_DependenceMeasure = Callable[[np.ndarray, np.ndarray], np.ndarray] -_LocalMaximaSelector = Callable[[np.ndarray], np.ndarray] - - -def _compute_dependence( - X: np.ndarray, - y: np.ndarray, - *, - dependence_measure: _DependenceMeasure, -) -> np.ndarray: - """ - Compute dependence between points and target. - - Computes the dependence of each point in each trajectory in X with the - corresponding class label in Y. - - """ - # Move n_samples to the end - # The shape is now input_shape + n_samples + n_output - X = np.moveaxis(X, 0, -2) - - input_shape = X.shape[:-2] +_LocalMaximaSelector = Callable[[FDataGrid], NDArrayInt] - # Join input in a list for rowwise - X = X.reshape(-1, X.shape[-2], X.shape[-1]) - if y.ndim == 1: - y = np.atleast_2d(y).T - Y = np.array([y] * len(X)) - - dependence_results = rowwise(dependence_measure, X, Y) - - return dependence_results.reshape(input_shape) - - -def select_local_maxima(X: np.ndarray, *, order: int = 1) -> np.ndarray: +def select_local_maxima(X: FDataGrid, *, order: int = 1) -> NDArrayInt: r""" Compute local maxima of an array. @@ -65,11 +35,12 @@ def select_local_maxima(X: np.ndarray, *, order: int = 1) -> np.ndarray: Indexes of the local maxima. Examples: + >>> from skfda import FDataGrid >>> from skfda.preprocessing.dim_reduction.variable_selection.\ ... maxima_hunting import select_local_maxima >>> import numpy as np - >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) + >>> x = FDataGrid(np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2])) >>> select_local_maxima(x).astype(np.int_) array([ 0, 5, 7, 10]) @@ -77,22 +48,24 @@ def select_local_maxima(X: np.ndarray, *, order: int = 1) -> np.ndarray: if a point is still a maxima, effectively eliminating small local maxima. 
- >>> x = np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2]) + >>> x = FDataGrid(np.array([2, 1, 1, 1, 2, 3, 3, 3, 2, 3, 4, 3, 2])) >>> select_local_maxima(x, order=3).astype(np.int_) array([ 0, 5, 10]) """ + X_array = X.data_matrix[0, ..., 0] + indexes = scipy.signal.argrelextrema( - X, + X_array, comparator=np.greater_equal, order=order, )[0] # Discard flat - maxima = X[indexes] + maxima = X_array[indexes] - left_points = np.take(X, indexes - 1, mode='clip') - right_points = np.take(X, indexes + 1, mode='clip') + left_points = np.take(X_array, indexes - 1, mode='clip') + right_points = np.take(X_array, indexes + 1, mode='clip') is_not_flat = (maxima > left_points) | (maxima > right_points) @@ -194,7 +167,11 @@ def __init__( self.dependence_measure = dependence_measure self.local_maxima_selector = local_maxima_selector - def fit(self, X: FDataGrid, y: np.ndarray) -> MaximaHunting: # noqa: D102 + def fit( + self, + X: FDataGrid, + y: Union[NDArrayInt, NDArrayFloat], + ) -> MaximaHunting: # noqa: D102 self.features_shape_ = X.data_matrix.shape[1:] self.dependence_ = _compute_dependence( @@ -203,14 +180,19 @@ def fit(self, X: FDataGrid, y: np.ndarray) -> MaximaHunting: # noqa: D102 dependence_measure=self.dependence_measure, ) - self.indexes_ = self.local_maxima_selector(self.dependence_) + self.indexes_ = self.local_maxima_selector( + FDataGrid( + self.dependence_, + grid_points=X.grid_points, + ), + ) sorting_indexes = np.argsort(self.dependence_[self.indexes_])[::-1] self.sorted_indexes_ = self.indexes_[sorting_indexes] return self - def get_support(self, indices: bool = False) -> np.ndarray: # noqa: D102 + def get_support(self, indices: bool = False) -> NDArrayInt: # noqa: D102 if indices: return self.indexes_ @@ -221,8 +203,8 @@ def get_support(self, indices: bool = False) -> np.ndarray: # noqa: D102 def transform( # noqa: D102 self, X: FDataGrid, - y: Optional[np.ndarray] = None, - ) -> np.ndarray: + y: Optional[Union[NDArrayInt, NDArrayFloat]] = None, + ) -> NDArrayFloat: sklearn.utils.validation.check_is_fitted(self) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py new file mode 100644 index 000000000..73107c0ea --- /dev/null +++ b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py @@ -0,0 +1,397 @@ +from __future__ import annotations + +import operator +from typing import Callable, NamedTuple, Optional, Tuple, Union, overload + +import numpy as np +import sklearn.base +import sklearn.utils.validation +from sklearn.metrics import mutual_info_score +from typing_extensions import Final, Literal + +from ...._utils import _compute_dependence, _DependenceMeasure +from ....representation._typing import NDArrayFloat, NDArrayInt +from ....representation.grid import FDataGrid + +_Criterion = Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat] + + +class Method(NamedTuple): + """Predefined mRMR method.""" + + relevance_dependence_measure: _DependenceMeasure + redundancy_dependence_measure: _DependenceMeasure + criterion: _Criterion + + +def mutual_information( + x: NDArrayFloat, + y: NDArrayFloat, +) -> NDArrayFloat: + + x = np.ravel(x) + y = np.ravel(y) + + # Calculate bins with the Sturges rule + bins = int(1 + np.ceil(np.log2(len(x)))) + c_xy = np.histogram2d(x, y, bins)[0] + mi = mutual_info_score(None, None, contingency=c_xy) + return mi + + +MID: Final = Method( + relevance_dependence_measure=mutual_information, + redundancy_dependence_measure=mutual_information, + criterion=operator.sub, +) 
+ + +MIQ: Final = Method( + relevance_dependence_measure=mutual_information, + redundancy_dependence_measure=mutual_information, + criterion=operator.truediv, +) + + +MethodName = Literal["MID", "MIQ"] + + +def _parse_method(name: MethodName) -> Method: + if name == "MID": + return MID + elif name == "MIQ": + return MIQ + + +def _mrmr( + X: NDArrayFloat, + Y: Union[NDArrayInt, NDArrayFloat], + n_features_to_select: int = 1, + relevance_dependence_measure: _DependenceMeasure = mutual_information, + redundancy_dependence_measure: _DependenceMeasure = mutual_information, + criterion: _Criterion = operator.truediv, +) -> Tuple[NDArrayInt, NDArrayFloat, NDArrayFloat]: + indexes = list(range(X.shape[1])) + + selected_features = [] + scores = [] + selected_relevances = [] + + relevances = _compute_dependence( + X[..., np.newaxis], + Y, + dependence_measure=relevance_dependence_measure, + ) + redundancies = np.zeros((X.shape[1], X.shape[1])) + + max_index = np.argmax(relevances) + selected_features.append(indexes[max_index]) + scores.append(relevances[max_index]) + selected_relevances.append(relevances[max_index]) + + indexes = np.delete(indexes, max_index) + + # TODO: Vectorize + for i in range(1, n_features_to_select): + + # Calculate redundancies of the last selected variable + last_selected = selected_features[i - 1] + + for j in range(X.shape[1]): + if not redundancies[last_selected, j]: + redundancies[last_selected, j] = redundancy_dependence_measure( + X[:, last_selected, np.newaxis], + X[:, j, np.newaxis], + ) + redundancies[j, last_selected] = redundancies[last_selected, j] + + W = np.mean( + redundancies[np.ix_(selected_features[:i], indexes)], + axis=0, + ) + + coef = criterion(relevances[indexes], W) + + max_index = np.argmax(coef) + selected_features.append(indexes[max_index]) + scores.append(coef[max_index]) + selected_relevances.append(relevances[max_index]) + + indexes = np.delete(indexes, max_index) + + return ( + np.asarray(selected_features), + np.asarray(scores), + np.asarray(relevances), + ) + + +class MinimumRedundancyMaximumRelevance( + sklearn.base.BaseEstimator, # type: ignore + sklearn.base.TransformerMixin, # type: ignore +): + """ + Minimum redundancy maximum relevance (mRMR) method. + + Parameters: + n_features_to_select: Number of features to select. + method: Predefined method to use (MID or MIQ). + dependence_measure: Dependence measure to use both for relevance and + for redundancy. + relevance_dependence_measure: Dependence measure used to compute + relevance. + redundancy_dependence_measure: Dependence measure used to compute + redundancy. + criterion: Criterion to combine relevance and redundancy. Common + choices include the difference and the quotient. + + Examples: + >>> from skfda.preprocessing.dim_reduction import variable_selection + >>> from skfda.datasets import make_gaussian_process + >>> import skfda + >>> import numpy as np + >>> import operator + >>> import dcor + + We create trajectories from two classes, one with zero mean and the + other with a peak-like mean. Both have Brownian covariance. + + >>> n_samples = 1000 + >>> n_features = 100 + >>> + >>> def mean_1(t): + ... return ( + ... np.abs(t - 0.25) + ... - 2 * np.abs(t - 0.5) + ... + np.abs(t - 0.75) + ... ) + >>> + >>> X_0 = make_gaussian_process( + ... n_samples=n_samples // 2, + ... n_features=n_features, + ... random_state=0, + ... ) + >>> X_1 = make_gaussian_process( + ... n_samples=n_samples // 2, + ... n_features=n_features, + ... mean=mean_1, + ... random_state=1, + ... 
) + >>> X = skfda.concatenate((X_0, X_1)) + >>> + >>> y = np.zeros(n_samples) + >>> y [n_samples // 2:] = 1 + + Select the relevant points to distinguish the two classes + + >>> mrmr = variable_selection.MinimumRedundancyMaximumRelevance( + ... n_features_to_select=3, + ... dependence_measure=dcor.u_distance_correlation_sqr, + ... criterion=operator.truediv, + ... ) + >>> _ = mrmr.fit(X, y) + >>> point_mask = mrmr.get_support() + >>> points = X.grid_points[0][point_mask] + + Apply the learned dimensionality reduction + + >>> X_dimred = mrmr.transform(X) + >>> len(X.grid_points[0]) + 100 + >>> X_dimred.shape + (1000, 3) + + """ + + @overload + def __init__( + self, + *, + n_features_to_select: int = 1, + ) -> None: + pass + + @overload + def __init__( + self, + *, + n_features_to_select: int = 1, + method: Union[Method, MethodName], + ) -> None: + pass + + @overload + def __init__( + self, + *, + n_features_to_select: int = 1, + dependence_measure: _DependenceMeasure, + criterion: _Criterion, + ) -> None: + pass + + @overload + def __init__( + self, + *, + n_features_to_select: int = 1, + relevance_dependence_measure: _DependenceMeasure, + redundancy_dependence_measure: _DependenceMeasure, + criterion: _Criterion, + ) -> None: + pass + + def __init__( + self, + *, + n_features_to_select: int = 1, + method: Union[Method, MethodName, None] = None, + dependence_measure: Optional[_DependenceMeasure] = None, + relevance_dependence_measure: Optional[_DependenceMeasure] = None, + redundancy_dependence_measure: Optional[_DependenceMeasure] = None, + criterion: Optional[_Criterion] = None, + ) -> None: + self.n_features_to_select = n_features_to_select + self.method = method + self.dependence_measure = dependence_measure + self.relevance_dependence_measure = relevance_dependence_measure + self.redundancy_dependence_measure = redundancy_dependence_measure + self.criterion = criterion + + def _validate_parameters(self) -> None: + method = MIQ if all( + p is None for p in ( + self.method, + self.dependence_measure, + self.relevance_dependence_measure, + self.redundancy_dependence_measure, + self.criterion, + ) + ) else self.method + + if method: + if ( + self.dependence_measure + or self.relevance_dependence_measure + or self.redundancy_dependence_measure + or self.criterion + ): + raise ValueError( + "The 'method' parameter and the parameters " + "'dependency_measure', 'relevance_dependence_measure' " + "'redundancy_dependence_measure' and 'criterion' are " + "incompatible", + ) + + method = ( + _parse_method(method) + if isinstance(method, str) else method + ) + + self.relevance_dependence_measure_ = ( + method.relevance_dependence_measure + ) + self.redundancy_dependence_measure_ = ( + method.redundancy_dependence_measure + ) + self.criterion_ = ( + method.criterion + ) + + else: + if self.criterion is None: + raise ValueError( + "You must specify a criterion parameter", + ) + + self.criterion_ = ( + self.criterion + ) + + if self.dependence_measure: + if ( + self.relevance_dependence_measure + or self.redundancy_dependence_measure + ): + raise ValueError( + "The 'dependency_measure' parameter and the " + "parameters 'relevance_dependence_measure' " + "and 'redundancy_dependence_measure' " + "are incompatible", + ) + + self.relevance_dependence_measure_ = ( + self.dependence_measure + ) + self.redundancy_dependence_measure_ = ( + self.dependence_measure + ) + else: + if not self.relevance_dependence_measure: + raise ValueError( + "Missing parameter 'relevance_dependence_measure'", + ) + if not 
self.redundancy_dependence_measure: + raise ValueError( + "Missing parameter 'redundancy_dependence_measure'", + ) + self.relevance_dependence_measure_ = ( + self.relevance_dependence_measure + ) + self.redundancy_dependence_measure_ = ( + self.redundancy_dependence_measure + ) + + def fit( + self, + X: FDataGrid, + y: Union[NDArrayInt, NDArrayFloat], + ) -> MinimumRedundancyMaximumRelevance: + + self._validate_parameters() + + X_array = X.data_matrix[..., 0] + + X_array, y = sklearn.utils.validation.check_X_y(X_array, y) + + self.features_shape_ = X_array.shape[1:] + + self.results_ = _mrmr( + X=X_array, + Y=y, + n_features_to_select=self.n_features_to_select, + relevance_dependence_measure=self.relevance_dependence_measure_, + redundancy_dependence_measure=self.redundancy_dependence_measure_, + criterion=self.criterion_, + )[0] + + return self + + def transform( + self, + X: FDataGrid, + y: None = None, + ) -> NDArrayFloat: + + X_array = X.data_matrix[..., 0] + + sklearn.utils.validation.check_is_fitted(self) + + X_array = sklearn.utils.validation.check_array(X_array) + + if X_array.shape[1:] != self.features_shape_: + raise ValueError( + "The trajectories have a different number of " + "points than the ones fitted", + ) + + return X_array[:, self.results_] + + def get_support(self, indices: bool = False) -> NDArrayInt: + indexes_unraveled = self.results_ + if indices: + return indexes_unraveled + else: + mask = np.zeros(self.features_shape_[0], dtype=bool) + mask[self.results_] = True + return mask diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index 3c100e859..762fa0d2c 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -1,19 +1,19 @@ import abc import copy -import dcor import numbers import random +import numpy as np +import numpy.linalg as linalg +import numpy.ma as ma import scipy.stats import sklearn.base import sklearn.utils -import numpy as np -import numpy.linalg as linalg -import numpy.ma as ma +import dcor +from ...._utils import _compute_dependence from ....representation import FDataGrid -from .maxima_hunting import _compute_dependence def _transform_to_2d(t): @@ -844,17 +844,23 @@ class RecursiveMaximaHunting( >>> n_features = 100 >>> >>> def mean_1(t): - ... return (np.abs(t - 0.25) - ... - 2 * np.abs(t - 0.5) - ... + np.abs(t - 0.75)) + ... return ( + ... np.abs(t - 0.25) + ... - 2 * np.abs(t - 0.5) + ... + np.abs(t - 0.75) + ... ) >>> - >>> X_0 = make_gaussian_process(n_samples=n_samples // 2, - ... n_features=n_features, - ... random_state=0) - >>> X_1 = make_gaussian_process(n_samples=n_samples // 2, - ... n_features=n_features, - ... mean=mean_1, - ... random_state=1) + >>> X_0 = make_gaussian_process( + ... n_samples=n_samples // 2, + ... n_features=n_features, + ... random_state=0, + ... ) + >>> X_1 = make_gaussian_process( + ... n_samples=n_samples // 2, + ... n_features=n_features, + ... mean=mean_1, + ... random_state=1, + ... ) >>> X = skfda.concatenate((X_0, X_1)) >>> >>> y = np.zeros(n_samples) From 3e6632bcff932a7da06d1a115aa67fc6c0b309e6 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Tue, 19 Oct 2021 19:43:36 +0200 Subject: [PATCH 043/117] Improve mutual information estimation. 
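For reference, the greedy step implemented by _mrmr (and driven by the mutual information
estimators improved in this commit) selects at each iteration the variable that maximizes
the chosen criterion between its relevance with the target and its mean redundancy with
the set S of already selected variables:

    x_{j^*} = \operatorname{arg\,max}_{x_j \notin S}
        c\left( D_{rel}(x_j, y),
                \frac{1}{|S|} \sum_{x_i \in S} D_{red}(x_j, x_i) \right)

where c is the difference for the MID method and the quotient for MIQ, and D_{rel} and
D_{red} default to the estimators below (mutual_info_classif for discrete targets,
mutual_info_regression for floating-point targets).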
--- .../dim_reduction/variable_selection/mrmr.py | 76 +++++++++++++------ 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py index 73107c0ea..d3c0f8925 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py @@ -1,15 +1,27 @@ from __future__ import annotations import operator -from typing import Callable, NamedTuple, Optional, Tuple, Union, overload +from typing import ( + Any, + Callable, + Dict, + NamedTuple, + Optional, + Tuple, + Union, + overload, +) import numpy as np import sklearn.base import sklearn.utils.validation -from sklearn.metrics import mutual_info_score +from sklearn.feature_selection import ( + mutual_info_classif, + mutual_info_regression, +) from typing_extensions import Final, Literal -from ...._utils import _compute_dependence, _DependenceMeasure +from ...._utils import RandomStateLike, _compute_dependence, _DependenceMeasure from ....representation._typing import NDArrayFloat, NDArrayInt from ....representation.grid import FDataGrid @@ -26,17 +38,23 @@ class Method(NamedTuple): def mutual_information( x: NDArrayFloat, - y: NDArrayFloat, + y: Union[NDArrayInt, NDArrayFloat], + n_neighbors: Optional[int] = None, + random_state: RandomStateLike = None, ) -> NDArrayFloat: + """Compute mutual information.""" + y = y.ravel() + + method = ( + mutual_info_regression if issubclass(y.dtype.type, np.floating) + else mutual_info_classif + ) - x = np.ravel(x) - y = np.ravel(y) + extra_args: Dict[str, Any] = {} + if n_neighbors is not None: + extra_args['n_neighbors'] = n_neighbors - # Calculate bins with the Sturges rule - bins = int(1 + np.ceil(np.log2(len(x)))) - c_xy = np.histogram2d(x, y, bins)[0] - mi = mutual_info_score(None, None, contingency=c_xy) - return mi + return method(x, y, random_state=random_state, **extra_args) MID: Final = Method( @@ -84,12 +102,12 @@ def _mrmr( ) redundancies = np.zeros((X.shape[1], X.shape[1])) - max_index = np.argmax(relevances) + max_index = int(np.argmax(relevances)) selected_features.append(indexes[max_index]) scores.append(relevances[max_index]) selected_relevances.append(relevances[max_index]) - indexes = np.delete(indexes, max_index) + indexes.remove(max_index) # TODO: Vectorize for i in range(1, n_features_to_select): @@ -112,12 +130,12 @@ def _mrmr( coef = criterion(relevances[indexes], W) - max_index = np.argmax(coef) + max_index = int(np.argmax(coef)) selected_features.append(indexes[max_index]) scores.append(coef[max_index]) selected_relevances.append(relevances[max_index]) - indexes = np.delete(indexes, max_index) + indexes.remove(max_index) return ( np.asarray(selected_features), @@ -179,15 +197,15 @@ class MinimumRedundancyMaximumRelevance( ... ) >>> X = skfda.concatenate((X_0, X_1)) >>> - >>> y = np.zeros(n_samples) + >>> y = np.zeros(n_samples, dtype=np.int_) >>> y [n_samples // 2:] = 1 - Select the relevant points to distinguish the two classes + Select the relevant points to distinguish the two classes. You + may specify a method such as MIQ (the default) or MID. >>> mrmr = variable_selection.MinimumRedundancyMaximumRelevance( ... n_features_to_select=3, - ... dependence_measure=dcor.u_distance_correlation_sqr, - ... criterion=operator.truediv, + ... method="MID", ... 
) >>> _ = mrmr.fit(X, y) >>> point_mask = mrmr.get_support() @@ -201,6 +219,18 @@ class MinimumRedundancyMaximumRelevance( >>> X_dimred.shape (1000, 3) + It is also possible to specify the measure of dependence used (or + even different ones for relevance and redundancy) as well as the + function to combine relevance and redundancy (usually the division + or subtraction operations). + + >>> mrmr = variable_selection.MinimumRedundancyMaximumRelevance( + ... n_features_to_select=3, + ... dependence_measure=dcor.u_distance_correlation_sqr, + ... criterion=operator.truediv, + ... ) + >>> _ = mrmr.fit(X, y) + """ @overload @@ -391,7 +421,7 @@ def get_support(self, indices: bool = False) -> NDArrayInt: indexes_unraveled = self.results_ if indices: return indexes_unraveled - else: - mask = np.zeros(self.features_shape_[0], dtype=bool) - mask[self.results_] = True - return mask + + mask = np.zeros(self.features_shape_[0], dtype=bool) + mask[self.results_] = True + return mask From 2588786cf9facae07e36a290b941001676de7be7 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Wed, 20 Oct 2021 10:12:35 +0200 Subject: [PATCH 044/117] Improve docs. --- docs/conf.py | 8 ++ docs/modules/preprocessing/dim_reduction.rst | 19 ++--- tutorial/plot_basis_representation.py | 77 +++++++++++++------- tutorial/plot_getting_data.py | 41 +++++++---- tutorial/plot_introduction.py | 9 ++- tutorial/plot_skfda_sklearn.py | 4 + 6 files changed, 105 insertions(+), 53 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index a59de0212..13ef76f12 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,6 +20,7 @@ import os import sys +import warnings import pkg_resources # -- Extensions to the Napoleon GoogleDocstring class --------------------- @@ -274,6 +275,13 @@ def __call__(self, filename: str) -> str: 'within_subsection_order': SkfdaExplicitSubOrder, } +warnings.filterwarnings( + "ignore", + category=UserWarning, + message='Matplotlib is currently using agg, which is a' + ' non-GUI backend, so cannot show the figure.', +) + autosummary_generate = True autodoc_typehints = "description" napoleon_use_rtype = True diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index d45a85909..7f1b7a0d7 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -21,6 +21,7 @@ following: skfda.preprocessing.dim_reduction.variable_selection.MaximaHunting skfda.preprocessing.dim_reduction.variable_selection.RecursiveMaximaHunting skfda.preprocessing.dim_reduction.variable_selection.RKHSVariableSelection + skfda.preprocessing.dim_reduction.variable_selection.MinimumRedundancyMaximumRelevance .. toctree:: :hidden: @@ -29,15 +30,15 @@ following: dim_reduction/recursive_maxima_hunting -Projection ----------- -Another way to reduce the dimension is through projection. For example, in -functional principal component analysis, we project the data samples -into a smaller sample of functions that preserve the maximum sample +Feature extraction +------------------ +Other dimensionality reduction methods construct new features from +existing ones. For example, in functional principal component +analysis, we project the data samples into a smaller sample of +functions that preserve most of the original variance. -.. toctree:: - :maxdepth: 4 - :caption: Modules: +.. 
autosummary:: + :toctree: autosummary - dim_reduction/fpca \ No newline at end of file + skfda.preprocessing.dim_reduction.feature_extraction.FPCA \ No newline at end of file diff --git a/tutorial/plot_basis_representation.py b/tutorial/plot_basis_representation.py index e22763b52..641659c21 100644 --- a/tutorial/plot_basis_representation.py +++ b/tutorial/plot_basis_representation.py @@ -30,11 +30,17 @@ # operations. ############################################################################## -# In order to show these operations, we create the first FDatagrid and plot -# it. +# In order to show the vector operations, we create two FDatagrids with +# two functions each, +# :math:`\mathbf{X}_1 = \{x_{1i}: \mathbb{R} \to \mathbb{R}\}, i=1,2` and +# :math:`\mathbf{X}_2 = \{x_{2i}: \mathbb{R} \to \mathbb{R}\}, i=1,2`, +# and plot them. import numpy as np import skfda +import matplotlib.pyplot as plt + +fig, axes = plt.subplots(1, 2, figsize=(8, 3)) t = np.linspace(0, 1, 100) @@ -46,37 +52,44 @@ grid_points=t, ) -fd.plot() - -############################################################################## -# Functions can be multiplied by an scalar. This only changes the scale of -# the functions, but not their shape. - -scalar_mul = 3 * fd - -scalar_mul.plot() - -############################################################################## -# We need two objects to show the sum. Thus we create a second FDatagrid and -# plot it. +fd.plot(axes=axes[0]) +axes[0].set_title(r"$\mathbf{X}_1$") fd2 = skfda.FDataGrid( data_matrix=[ 3 * t**2, # First function - np.log(t), # Second function + np.log(t + 0.1), # Second function ], grid_points=t, ) -fd2.plot() +fd2.plot(axes=axes[1]) +axes[1].set_title(r"$\mathbf{X}_2$") + +plt.show() ############################################################################## -# We can now plot the sum of both :class:`~skfda.representation.grid.FDataGrid` -# objects. +# Functions can be multiplied by an scalar. This only changes the scale of +# the functions, but not their shape. Note that all the functions in the +# dataset are affected. +# +# It is also possible to add two functions together. If you do that with +# two :class:`~skfda.representation.grid.FDataGrid` objects with the same +# length, the corresponding functions will be added. + +fig, axes = plt.subplots(1, 2, figsize=(8, 3)) + +scalar_mul = 3 * fd + +scalar_mul.plot(axes=axes[0]) +axes[0].set_title(r"$3 \mathbf{X}_1$") fd_sum = fd + fd2 -fd_sum.plot() +fd_sum.plot(axes=axes[1]) +axes[1].set_title(r"$\mathbf{X}_1 + \mathbf{X}_2$") + +plt.show() ############################################################################## # Infinite (Schauder) basis @@ -117,7 +130,7 @@ # a truncated monomial basis (and thus it is a polynomial): # # .. math:: -# x(t) = 3 + 2x - 4x^2 + x^3 +# x(t) = 3 + 2t - 4t^2 + t^3 basis = skfda.representation.basis.Monomial( n_basis=4, @@ -132,6 +145,7 @@ ) fd_basis.plot() +plt.show() ############################################################################## # Conversion between FDataGrid and FDataBasis @@ -156,8 +170,6 @@ # can see that as more basis functions are used, the basis representation # provides a better representation of the real data. 
-import matplotlib.pyplot as plt - max_basis = 9 X, y = skfda.datasets.fetch_phoneme(return_X_y=True) @@ -178,6 +190,7 @@ ax.set_title(f"{n_basis} basis functions") fig.tight_layout() +plt.show() ############################################################################## # List of available basis functions @@ -198,6 +211,7 @@ X = X[:5] X.plot() +plt.show() ############################################################################## # Monomial basis @@ -214,9 +228,9 @@ # # As a basis for functional data analysis, however, it has several issues that # usually make preferrable to use other basis instead. First, the usual basis -# :math:`\{1, x, x^2, x^3, \ldots\}` is not orthogonal under the standard -# inner product in :math:`L^2`, that is :math:`\langle x, y \rangle = -# \int_{\mathcal{T}} x(t) y(t) dt`. This inhibits some +# :math:`\{1, t, t^2, t^3, \ldots\}` is not orthogonal under the standard +# inner product in :math:`L^2`, that is :math:`\langle x_1, x_2 \rangle = +# \int_{\mathcal{T}} x_1(t) x_2(t) dt`. This inhibits some # performance optimizations that are available for operations that require # inner products. It is possible to find an orthogonal basis of polynomials, # but it will not be as easy to understand, losing many of its advantages. @@ -231,6 +245,7 @@ basis = skfda.representation.basis.Monomial(n_basis=5) basis.plot() +plt.show() ############################################################################## # We now show how the previous observations are represented using the first @@ -238,6 +253,7 @@ X_basis = X.to_basis(basis) X_basis.plot() +plt.show() ############################################################################## # Fourier basis @@ -264,6 +280,7 @@ basis = skfda.representation.basis.Fourier(n_basis=5) basis.plot() +plt.show() ############################################################################## # We now show how the previous observations are represented using the first @@ -271,6 +288,7 @@ X_basis = X.to_basis(basis) X_basis.plot() +plt.show() ############################################################################## # B-spline basis @@ -298,6 +316,7 @@ basis = skfda.representation.basis.BSpline(n_basis=5) basis.plot() +plt.show() ############################################################################## # We now show how the previous observations are represented using the first @@ -305,6 +324,7 @@ X_basis = X.to_basis(basis) X_basis.plot() +plt.show() ############################################################################## # Constant basis @@ -360,6 +380,7 @@ # We only plot the first function fd_basis[0].plot() +plt.show() ############################################################################## # Finite element basis @@ -409,6 +430,7 @@ ]) plt.triplot(vertices[:, 0], vertices[:, 1], cells) +plt.show() ############################################################################## # We now represent the digits dataset in this basis. @@ -422,6 +444,7 @@ # We only plot the first function fd_basis[0].plot() +plt.show() ############################################################################## # Vector-valued basis @@ -445,6 +468,7 @@ X, y = skfda.datasets.fetch_weather(return_X_y=True) X.plot() +plt.show() ############################################################################## # We will express this dataset as a basis expansion. 
Temperatures @@ -464,3 +488,4 @@ X_basis = X.to_basis(basis) X_basis.plot() +plt.show() diff --git a/tutorial/plot_getting_data.py b/tutorial/plot_getting_data.py index ef0e48ff5..4c6522b8d 100644 --- a/tutorial/plot_getting_data.py +++ b/tutorial/plot_getting_data.py @@ -29,26 +29,30 @@ # points. # This kind of functional data is easily representable in scikit-fda using # the :class:`~skfda.representation.grid.FDataGrid` class. -# # The :class:`~skfda.representation.grid.FDataGrid` has two important -# attributes: ``data_matrix`` and ``grid_points``. The attribute -# ``grid_points`` is a tuple with the same length as the number of domain -# dimensions (that is, one for curves, two for surfaces...). Each of its -# elements is a 1D numpy :class:`~numpy.ndarray` containing the measurement -# points for that particular dimension. The attribute ``data_matrix`` is a +# attributes: ``data_matrix`` and ``grid_points``. +# +# The attribute ``grid_points`` is a tuple with the same length as the +# number of domain dimensions (that is, one for curves, two for surfaces...). +# Each of its elements is a 1D numpy :class:`~numpy.ndarray` containing the +# grid points for that particular dimension, +# .. math:: +# ((t_1, \ldots, t_{M_i}))_{i=1}^p, +# where :math:`M_i` is the number of measurement points for each "argument" +# or domain coordinate of the function :math:`i` and :math:`p` is the domain +# dimension. +# +# The attribute ``data_matrix`` is a # numpy :class:`~numpy.ndarray` containing the measured values of the # functions in the grid spanned by the grid points. For functions -# :math:`\{f_i: \mathbb{R}^p \to \mathbb{R}^q\}_{i=1}^N` this is a tensor -# with dimensions :math:`N \times M_1 \times \ldots \times M_p \times q`, -# where :math:`M_i` is the number of measurement points for the domain -# dimension :math:`i`. +# :math:`\{x_i: \mathbb{R}^p \to \mathbb{R}^q\}_{i=1}^N` this is a tensor +# with dimensions :math:`N \times M_1 \times \ldots \times M_p \times q`. ############################################################################## # In order to create a :class:`~skfda.representation.grid.FDataGrid`, these # attributes may be provided. The attributes are converted to # :class:`~numpy.ndarray` when necessary. - -############################################################################## +# # .. note:: # # The grid points can be omitted, @@ -65,10 +69,11 @@ ############################################################################## # The following example shows the creation of a # :class:`~skfda.representation.grid.FDataGrid` with two functions (curves) -# :math:`\{f_i: \mathbb{R} \to \mathbb{R}\}, i=1,2` measured at the same +# :math:`\{x_i: \mathbb{R} \to \mathbb{R}\}, i=1,2` measured at the same # (non-equispaced) points. import skfda +import matplotlib.pyplot as plt grid_points = [0, 0.2, 0.5, 0.9, 1] # Grid points of the curves data_matrix = [ @@ -82,6 +87,7 @@ ) fd.plot() +plt.show() ############################################################################## # Advanced example @@ -90,7 +96,7 @@ # In order to better understand the FDataGrid structure, you can consider the # following example, in which a :class:`~skfda.representation.grid.FDataGrid` # object is created, containing just one function (vector-valued surface) -# :math:`f: \mathbb{R}^2 \to \mathbb{R}^4`. +# :math:`x: \mathbb{R}^2 \to \mathbb{R}^4`. 
grid_points_surface = [ @@ -133,6 +139,7 @@ ) fd.plot() +plt.show() ############################################################################## # Importing data @@ -183,6 +190,7 @@ # Plot the first 2 observations fd[0].plot() fd[1].plot() +plt.show() ############################################################################## @@ -200,6 +208,7 @@ X, y = skfda.datasets.fetch_growth(return_X_y=True) X.plot(group=y) +plt.show() ############################################################################## # Datasets from CRAN @@ -227,6 +236,7 @@ data = skfda.datasets.fetch_cran("MCO", "fda.usc") data["MCO"]["intact"].plot() +plt.show() ############################################################################## # Datasets from the UEA & UCR Time Series Classification Repository @@ -247,12 +257,14 @@ # Load ArrowHead dataset from UCR dataset = skfda.datasets.fetch_ucr("ArrowHead") dataset["data"].plot() +plt.show() ############################################################################## # Load BasicMotions dataset from UEA dataset = skfda.datasets.fetch_ucr("BasicMotions") dataset["data"].plot() +plt.show() ############################################################################## # Synthetic data @@ -289,6 +301,7 @@ ) fd.plot() +plt.show() ############################################################################## # In order to know all the available functionalities to load existing and diff --git a/tutorial/plot_introduction.py b/tutorial/plot_introduction.py index bda6df5b7..801d63381 100644 --- a/tutorial/plot_introduction.py +++ b/tutorial/plot_introduction.py @@ -89,7 +89,7 @@ # In :term:`FDA`, the inputs or parameters of a function are assumed to be # continuous parameters, and so are the outputs, or values of the function. # Thus, it is usual to restrict our functional observations to be functions -# :math:`\{f_i: \mathcal{T} \subseteq \mathbb{R}^p \to \mathbb{R}^q\}_{i=1}^N`. +# :math:`\{x_i: \mathcal{T} \subseteq \mathbb{R}^p \to \mathbb{R}^q\}_{i=1}^N`. # In this case both the domain and codomain are (subsets of) vector spaces of # real numbers, and one could talk of the dimension of each of them as a # vector space (in this case the domain dimension is :math:`p` and the @@ -98,7 +98,7 @@ # The most common case of functional observation, and the one that has # received more attention in the functional data literature, is the case of # functions -# :math:`\{f_i: \mathcal{T} \subseteq \mathbb{R} \to \mathbb{R}\}_{i=1}^N` +# :math:`\{x_i: \mathcal{T} \subseteq \mathbb{R} \to \mathbb{R}\}_{i=1}^N` # (curves or trajectories). ############################################################################## @@ -109,10 +109,12 @@ # :math:`[0, 18]` and both the domain and codomain have a dimension of one. import skfda +import matplotlib.pyplot as plt X, y = skfda.datasets.fetch_growth(return_X_y=True) X.plot() +plt.show() ############################################################################## # Functions where the domain dimension is greater than one ( @@ -130,8 +132,7 @@ # is two. We can see that by default each coordinate of the values of the # function is plotted as a separate coordinate function. 
-import skfda - X, y = skfda.datasets.fetch_weather(return_X_y=True) X.plot() +plt.show() diff --git a/tutorial/plot_skfda_sklearn.py b/tutorial/plot_skfda_sklearn.py index aa33451af..ed67e4f59 100644 --- a/tutorial/plot_skfda_sklearn.py +++ b/tutorial/plot_skfda_sklearn.py @@ -75,6 +75,7 @@ import skfda from sklearn.model_selection import train_test_split +import matplotlib.pyplot as plt X, y = skfda.datasets.fetch_growth(return_X_y=True) @@ -83,6 +84,7 @@ classifier = skfda.ml.classification.NearestCentroid() classifier.fit(X_train, y_train) classifier.centroids_.plot() +plt.show() ############################################################################## # Transformers @@ -118,6 +120,7 @@ X_smooth = smoother.fit_transform(X) X_smooth.plot() +plt.show() ############################################################################## # Predictors (classifiers, regressors, clusterers...) @@ -156,6 +159,7 @@ y_pred = clusterer.fit_predict(X) X.plot(group=y_pred) +plt.show() ############################################################################## # Metaestimators From c301a8a8a832b2e752ea8b8e922b169fc7a7e8f6 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Wed, 20 Oct 2021 10:24:21 +0200 Subject: [PATCH 045/117] Small typing fix. --- tutorial/plot_getting_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorial/plot_getting_data.py b/tutorial/plot_getting_data.py index 4c6522b8d..4ab4b4d33 100644 --- a/tutorial/plot_getting_data.py +++ b/tutorial/plot_getting_data.py @@ -110,7 +110,7 @@ # 0.2 [ # Value at (0.2, 0) - [1, 2, 3, 4], + [1, 2, 3, 4.1], # Value at (0.2, 1.5) [0, 1, -1.3, 2], ], @@ -124,7 +124,7 @@ # 0.7 [ # Value at (0.7, 0) - [0, 0, 1, 1], + [0, 0, 1.1, 1], # Value at (0.7, 1.5) [-3, 5, -0.5, -2], ], From 7bd3556d5d9b76580eebfe9caf06ae015fe4e619 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 21 Oct 2021 22:53:55 +0200 Subject: [PATCH 046/117] Nearest Centroid and Depth --- docs/modules/ml/classification.rst | 22 +++++++++++++++++++- docs/modules/ml/clustering.rst | 2 +- docs/modules/preprocessing/dim_reduction.rst | 4 ++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docs/modules/ml/classification.rst b/docs/modules/ml/classification.rst index d87071a81..c2d894fbf 100644 --- a/docs/modules/ml/classification.rst +++ b/docs/modules/ml/classification.rst @@ -5,7 +5,6 @@ Classification Module with classes to perform classification of functional data. - Nearest Neighbors ----------------- @@ -21,8 +20,29 @@ it is explained the basic usage of these estimators. skfda.ml.classification.KNeighborsClassifier skfda.ml.classification.RadiusNeighborsClassifier + +Nearest Centroid +---------------- + +This module contains `nearest centroid +`_ estimators to +perform classification. + +.. autosummary:: + :toctree: autosummary + skfda.ml.classification.NearestCentroid skfda.ml.classification.DTMClassifier + + +Depth +----- + +This module contains depth based estimators to perform classification. + +.. autosummary:: + :toctree: autosummary + skfda.ml.classification.DDClassifier skfda.ml.classification.DDGClassifier skfda.ml.classification.MaximumDepthClassifier diff --git a/docs/modules/ml/clustering.rst b/docs/modules/ml/clustering.rst index c6099bfdb..ec2ee052a 100644 --- a/docs/modules/ml/clustering.rst +++ b/docs/modules/ml/clustering.rst @@ -43,7 +43,7 @@ clusters given a metric between their elements, in order to cluster together elements that are close from each other. 
This is repeated until a desired number of clusters is obtained. The resulting hierarchy of clusters can be represented as a tree, called a dendogram. The following hierarchical -clusterings are supported: +clusterings are supported: .. autosummary:: :toctree: autosummary diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index 7f1b7a0d7..119dcfaba 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -22,7 +22,7 @@ following: skfda.preprocessing.dim_reduction.variable_selection.RecursiveMaximaHunting skfda.preprocessing.dim_reduction.variable_selection.RKHSVariableSelection skfda.preprocessing.dim_reduction.variable_selection.MinimumRedundancyMaximumRelevance - + .. toctree:: :hidden: :maxdepth: 4 @@ -34,7 +34,7 @@ Feature extraction ------------------ Other dimensionality reduction methods construct new features from existing ones. For example, in functional principal component -analysis, we project the data samples into a smaller sample of +analysis, we project the data samples into a smaller sample of functions that preserve most of the original variance. From 18baf7e9111b025d10b13e9dbda160d17eee539a Mon Sep 17 00:00:00 2001 From: VNMabus Date: Fri, 22 Oct 2021 19:15:45 +0200 Subject: [PATCH 047/117] Small typing fix in BasisSmoother. --- skfda/preprocessing/smoothing/_basis.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/skfda/preprocessing/smoothing/_basis.py b/skfda/preprocessing/smoothing/_basis.py index 77d51191f..2a128a512 100644 --- a/skfda/preprocessing/smoothing/_basis.py +++ b/skfda/preprocessing/smoothing/_basis.py @@ -11,13 +11,11 @@ import numpy as np from typing_extensions import Final -import scipy.linalg - from ..._utils import _cartesian_product, _to_grid_points from ...misc.lstsq import LstsqMethod, solve_regularized_weighted_lstsq from ...misc.regularization import TikhonovRegularization from ...representation import FData, FDataBasis, FDataGrid -from ...representation._typing import GridPointsLike +from ...representation._typing import GridPointsLike, NDArrayFloat from ...representation.basis import Basis from ._linear import _LinearSmoother @@ -210,7 +208,7 @@ def __init__( basis: Basis, *, smoothing_parameter: float = 1.0, - weights: Optional[np.ndarray] = None, + weights: Optional[NDArrayFloat] = None, regularization: Optional[TikhonovRegularization[FDataGrid]] = None, output_points: Optional[GridPointsLike] = None, method: LstsqMethod = 'svd', @@ -228,8 +226,8 @@ def _coef_matrix( self, input_points: GridPointsLike, *, - data_matrix: Optional[np.ndarray] = None, - ) -> np.ndarray: + data_matrix: Optional[NDArrayFloat] = None, + ) -> NDArrayFloat: """Get the matrix that gives the coefficients.""" from ...misc.regularization import compute_penalty_matrix @@ -260,7 +258,7 @@ def _hat_matrix( self, input_points: GridPointsLike, output_points: GridPointsLike, - ) -> np.ndarray: + ) -> NDArrayFloat: basis_values_output = self.basis.evaluate( _cartesian_product( _to_grid_points(output_points), From bd1233207aa741b01c817265b3d8d425dcc249eb Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sat, 23 Oct 2021 23:30:53 +0200 Subject: [PATCH 048/117] Move hovering functionality to base plot. 
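
Hover annotations are now created and connected inside BasePlot itself:
plot() registers a 'motion_notify_event' callback, so every display that
inherits from BasePlot shows the index and coordinates of the point under
the pointer, not only displays wrapped in MultipleDisplay.

A minimal usage sketch (dataset and plot class reused from the examples in
this series; the snippet is illustrative and not part of the diff below):

    import matplotlib.pyplot as plt
    import skfda
    from skfda.exploratory.visualization import MagnitudeShapePlot

    X, _ = skfda.datasets.fetch_growth(return_X_y=True)
    # plot() attaches the hover callback; moving the mouse over a scattered
    # point pops an annotation tag with its sample index and coordinates.
    MagnitudeShapePlot(X).plot()
    plt.show()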
--- skfda/exploratory/visualization/_baseplot.py | 128 +++++++++++++++++- .../visualization/_multiple_display.py | 120 ---------------- 2 files changed, 127 insertions(+), 121 deletions(-) diff --git a/skfda/exploratory/visualization/_baseplot.py b/skfda/exploratory/visualization/_baseplot.py index 56eeab59b..caf3a0a6b 100644 --- a/skfda/exploratory/visualization/_baseplot.py +++ b/skfda/exploratory/visualization/_baseplot.py @@ -10,8 +10,12 @@ import matplotlib.pyplot as plt import numpy as np +from matplotlib.artist import Artist from matplotlib.axes import Axes +from matplotlib.backend_bases import LocationEvent, MouseEvent +from matplotlib.collections import PathCollection from matplotlib.figure import Figure +from matplotlib.text import Annotation from ._utils import _figure_to_svg, _get_figure_and_axes, _set_figure_layout @@ -44,6 +48,7 @@ def __init__( self.axes = axes self.n_rows = n_rows self.n_cols = n_cols + self._tag = self._create_annotation() def _plot( self, @@ -56,7 +61,7 @@ def plot( self, ) -> Figure: """ - Abstract method used to plot the object and its data. + Plot the object and its data. Returns: Figure: figure object in which the displays and @@ -73,6 +78,12 @@ def plot( ) self._plot(fig, axes) + + self._hover_event_id = fig.canvas.mpl_connect( + 'motion_notify_event', + self.hover, + ) + return fig @property @@ -117,3 +128,118 @@ def _repr_svg_(self) -> str: self.fig = self.plot() plt.close(self.fig) return _figure_to_svg(self.fig) + + def _create_annotation(self) -> Annotation: + tag = Annotation( + "", + xy=(0, 0), + xytext=(20, 20), + textcoords="offset points", + bbox={ + "boxstyle": "round", + "fc": "w", + }, + arrowprops={ + "arrowstyle": "->", + }, + ) + + tag.get_bbox_patch().set_facecolor(color='khaki') + intensity = 0.8 + tag.get_bbox_patch().set_alpha(intensity) + + return tag + + def _update_annotation( + self, + tag: Annotation, + *, + axes: Axes, + sample_number: int, + position: Tuple[float, float], + ) -> None: + """ + Auxiliary method used to update the hovering annotations. + + Method used to update the annotations that appear while + hovering a scattered point. The annotations indicate + the index and coordinates of the point hovered. + Args: + tag: Annotation to update. + axes: Axes were the annotation belongs. + sample_number: Number of the current sample. + """ + xdata_graph, ydata_graph = position + + tag.xy = (xdata_graph, ydata_graph) + text = f"{sample_number}: ({xdata_graph:.2f}, {ydata_graph:.2f})" + tag.set_text(text) + + x_axis = axes.get_xlim() + y_axis = axes.get_ylim() + + label_xpos = 20 + label_ypos = 20 + if (xdata_graph - x_axis[0]) > (x_axis[1] - xdata_graph): + label_xpos = -80 + + if (ydata_graph - y_axis[0]) > (y_axis[1] - ydata_graph): + label_ypos = -20 + + if tag.figure: + tag.remove() + tag.figure = None + axes.add_artist(tag) + tag.set_transform(axes.transData) + tag.set_position((label_xpos, label_ypos)) + + def _sample_artist_from_event( + self, + event: LocationEvent, + ) -> Optional[Tuple[int, Artist]]: + """Get the number of sample and artist under a location event.""" + if self.artists is None: + return None + + try: + i = self.axes_.index(event.inaxes) + except ValueError: + return None + + for j, artist in enumerate(self.artists[:, i]): + if not isinstance(artist, PathCollection): + return None + + if artist.contains(event)[0]: + return j, artist + + return None + + def hover(self, event: MouseEvent) -> None: + """ + Activate the annotation when hovering a point. 
+ + Callback method that activates the annotation when hovering + a specific point in a graph. The annotation is a description + of the point containing its coordinates. + Args: + event: event object containing the artist of the point + hovered. + + """ + found_artist = self._sample_artist_from_event(event) + + if event.inaxes is not None and found_artist is not None: + sample_number, artist = found_artist + + self._update_annotation( + self._tag, + axes=event.inaxes, + sample_number=sample_number, + position=artist.get_offsets()[0], + ) + self._tag.set_visible(True) + self.fig_.canvas.draw_idle() + elif self._tag.get_visible(): + self._tag.set_visible(False) + self.fig_.canvas.draw_idle() diff --git a/skfda/exploratory/visualization/_multiple_display.py b/skfda/exploratory/visualization/_multiple_display.py index 79316a1d8..d96620ce6 100644 --- a/skfda/exploratory/visualization/_multiple_display.py +++ b/skfda/exploratory/visualization/_multiple_display.py @@ -88,7 +88,6 @@ def __init__( self.sliders: List[Widget] = [] self.criteria: List[List[int]] = [] self.selected_sample: Optional[int] = None - self._tag = self._create_annotation() if len(criteria) != 0 and not isinstance(criteria[0], Sequence): criteria = (criteria,) @@ -206,70 +205,6 @@ def _create_sliders( label=label, ) - def _create_annotation(self) -> Annotation: - tag = Annotation( - "", - xy=(0, 0), - xytext=(20, 20), - textcoords="offset points", - bbox={ - "boxstyle": "round", - "fc": "w", - }, - arrowprops={ - "arrowstyle": "->", - }, - ) - - tag.get_bbox_patch().set_facecolor(color='khaki') - intensity = 0.8 - tag.get_bbox_patch().set_alpha(intensity) - - return tag - - def _update_annotation( - self, - tag: Annotation, - *, - axes: Axes, - sample_number: int, - position: Tuple[float, float], - ) -> None: - """ - Auxiliary method used to update the hovering annotations. - - Method used to update the annotations that appear while - hovering a scattered point. The annotations indicate - the index and coordinates of the point hovered. - Args: - tag: Annotation to update. - axes: Axes were the annotation belongs. - sample_number: Number of the current sample. - """ - xdata_graph, ydata_graph = position - - tag.xy = (xdata_graph, ydata_graph) - text = f"{sample_number}: ({xdata_graph:.2f}, {ydata_graph:.2f})" - tag.set_text(text) - - x_axis = axes.get_xlim() - y_axis = axes.get_ylim() - - label_xpos = 20 - label_ypos = 20 - if (xdata_graph - x_axis[0]) > (x_axis[1] - xdata_graph): - label_xpos = -80 - - if (ydata_graph - y_axis[0]) > (y_axis[1] - ydata_graph): - label_ypos = -20 - - if tag.figure: - tag.remove() - tag.figure = None - axes.add_artist(tag) - tag.set_transform(axes.transData) - tag.set_position((label_xpos, label_ypos)) - def plot(self) -> Figure: """ Plot Multiple Display method. 
@@ -303,68 +238,13 @@ def plot(self) -> Figure: disp.plot() int_index = end_index - self.fig.canvas.mpl_connect('motion_notify_event', self.hover) self.fig.canvas.mpl_connect('pick_event', self.pick) - self._tag.set_visible(False) - self.fig.suptitle("Multiple display") self.fig.tight_layout() return self.fig - def _sample_artist_from_event( - self, - event: LocationEvent, - ) -> Optional[Tuple[int, Artist]]: - """Get the number of sample and artist under a location event.""" - for d in self.displays: - if d.artists is None: - continue - - try: - i = d.axes_.index(event.inaxes) - except ValueError: - continue - - for j, artist in enumerate(d.artists[:, i]): - if not isinstance(artist, PathCollection): - return None - - if artist.contains(event)[0]: - return j, artist - - return None - - def hover(self, event: MouseEvent) -> None: - """ - Activate the annotation when hovering a point. - - Callback method that activates the annotation when hovering - a specific point in a graph. The annotation is a description - of the point containing its coordinates. - Args: - event: event object containing the artist of the point - hovered. - - """ - found_artist = self._sample_artist_from_event(event) - - if event.inaxes is not None and found_artist is not None: - sample_number, artist = found_artist - - self._update_annotation( - self._tag, - axes=event.inaxes, - sample_number=sample_number, - position=artist.get_offsets()[0], - ) - self._tag.set_visible(True) - self.fig.canvas.draw_idle() - elif self._tag.get_visible(): - self._tag.set_visible(False) - self.fig.canvas.draw_idle() - def pick(self, event: Event) -> None: """ Activate interactive functionality when picking a point. From 6c94f528cc664a47f2e6dc148e7f28473555c9c9 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 25 Oct 2021 00:19:39 +0200 Subject: [PATCH 049/117] Add n_components parameter to CoefficientsTransformer. --- .../basis/_coefficients_transformer.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/skfda/representation/basis/_coefficients_transformer.py b/skfda/representation/basis/_coefficients_transformer.py index 1f5f42db6..17fbc5952 100644 --- a/skfda/representation/basis/_coefficients_transformer.py +++ b/skfda/representation/basis/_coefficients_transformer.py @@ -1,9 +1,11 @@ from __future__ import annotations -import numpy as np +from typing import Optional + from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.validation import check_is_fitted +from .._typing import NDArrayFloat from ._fdatabasis import FDataBasis @@ -29,9 +31,16 @@ class CoefficientsTransformer( >>> transformer.fit_transform(fd) array([[ 0.5, 1. , 2. , 0.5], [ 1.5, 1. , 4. , 0.5]]) + >>> transformer = CoefficientsTransformer(n_components=2) + >>> transformer.fit_transform(fd) + array([[ 0.5, 1. ], + [ 1.5, 1. ]]) """ + def __init__(self, n_components: Optional[int] = None) -> None: + self.n_components = n_components + def fit( # noqa: D102 self, X: FDataBasis, @@ -46,10 +55,10 @@ def transform( # noqa: D102 self, X: FDataBasis, y: None = None, - ) -> np.ndarray: + ) -> NDArrayFloat: check_is_fitted(self) assert X.basis == self.basis_ - return X.coefficients.copy() + return X.coefficients[:, :self.n_components].copy() From 719c063892468c61a337f42f675e4d7d4eb3d34b Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 25 Oct 2021 01:17:17 +0200 Subject: [PATCH 050/117] Improve plot annotations. Sample names are shown when available. Annotation clipping removed. 
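
A short sketch of the new behaviour (dataset choice is arbitrary, reused
from the examples; the snippet is illustrative and not part of the diff):

    import matplotlib.pyplot as plt
    import skfda
    from skfda.exploratory.visualization import MagnitudeShapePlot

    X, _ = skfda.datasets.fetch_growth(return_X_y=True)
    # The first constructor argument is now named "fdata" instead of
    # "fdatagrid".
    msplot = MagnitudeShapePlot(fdata=X)
    msplot.plot()
    plt.show()
    # When X.sample_names holds names, the hover tag reads
    # "<index> (<name>): (<magnitude>, <shape>)"; otherwise it falls back to
    # "<index>: (<magnitude>, <shape>)". The annotation is created with
    # annotation_clip=False, so it is no longer cut off near the axes edge.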
--- skfda/exploratory/visualization/_baseplot.py | 27 +++++-- .../visualization/_magnitude_shape_plot.py | 71 +++++-------------- 2 files changed, 38 insertions(+), 60 deletions(-) diff --git a/skfda/exploratory/visualization/_baseplot.py b/skfda/exploratory/visualization/_baseplot.py index caf3a0a6b..56294e206 100644 --- a/skfda/exploratory/visualization/_baseplot.py +++ b/skfda/exploratory/visualization/_baseplot.py @@ -17,6 +17,7 @@ from matplotlib.figure import Figure from matplotlib.text import Annotation +from ...representation import FData from ._utils import _figure_to_svg, _get_figure_and_axes, _set_figure_layout @@ -142,6 +143,8 @@ def _create_annotation(self) -> Annotation: arrowprops={ "arrowstyle": "->", }, + annotation_clip=False, + clip_on=False, ) tag.get_bbox_patch().set_facecolor(color='khaki') @@ -156,6 +159,7 @@ def _update_annotation( *, axes: Axes, sample_number: int, + fdata: Optional[FData], position: Tuple[float, float], ) -> None: """ @@ -172,7 +176,19 @@ def _update_annotation( xdata_graph, ydata_graph = position tag.xy = (xdata_graph, ydata_graph) - text = f"{sample_number}: ({xdata_graph:.2f}, {ydata_graph:.2f})" + + sample_name = ( + fdata.sample_names[sample_number] + if fdata is not None + else None + ) + + sample_descr = f" ({sample_name})" if sample_name is not None else "" + + text = ( + f"{sample_number}{sample_descr}: " + f"({xdata_graph:.2f}, {ydata_graph:.2f})" + ) tag.set_text(text) x_axis = axes.get_xlim() @@ -196,8 +212,8 @@ def _update_annotation( def _sample_artist_from_event( self, event: LocationEvent, - ) -> Optional[Tuple[int, Artist]]: - """Get the number of sample and artist under a location event.""" + ) -> Optional[Tuple[int, Optional[FData], Artist]]: + """Get the number, fdata and artist under a location event.""" if self.artists is None: return None @@ -211,7 +227,7 @@ def _sample_artist_from_event( return None if artist.contains(event)[0]: - return j, artist + return j, getattr(self, "fdata", None), artist return None @@ -230,12 +246,13 @@ def hover(self, event: MouseEvent) -> None: found_artist = self._sample_artist_from_event(event) if event.inaxes is not None and found_artist is not None: - sample_number, artist = found_artist + sample_number, fdata, artist = found_artist self._update_annotation( self._tag, axes=event.inaxes, sample_number=sample_number, + fdata=fdata, position=artist.get_offsets()[0], ) self._tag.set_visible(True) diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index e9e45f21b..25a165928 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -7,7 +7,7 @@ """ from __future__ import annotations -from typing import Optional, Sequence, Union +from typing import Any, Optional, Sequence, Union import matplotlib import matplotlib.pyplot as plt @@ -43,12 +43,12 @@ class MagnitudeShapePlot(BasePlot): For more information see :footcite:ts:`dai+genton_2018_visualization`. Args: - fdatagrid (FDataGrid): Object containing the data. + fdata (FDataGrid): Object containing the data. multivariate_depth (:ref:`depth measure `, optional): Method used to order the data. Defaults to :class:`projection depth `. pointwise_weights (array_like, optional): an array containing the - weights of each points of discretisati on where values have + weights of each points of discretisation where values have been recorded. 
alpha (float, optional): Denotes the quantile to choose the cutoff value for detecting outliers Defaults to 0.993, which is used @@ -75,7 +75,7 @@ class MagnitudeShapePlot(BasePlot): Attributes: points(numpy.ndarray): 2-dimensional matrix where each row contains the points plotted in the graph. - outliers (1-D array, (fdatagrid.n_samples,)): Contains 1 or 0 to denote + outliers (1-D array, (fdata.n_samples,)): Contains 1 or 0 to denote if a sample is an outlier or not, respecively. colormap(matplotlib.pyplot.LinearSegmentedColormap, optional): Colormap from which the colors of the plot are extracted. Defaults to @@ -113,7 +113,7 @@ class MagnitudeShapePlot(BasePlot): >>> fd = skfda.FDataGrid(data_matrix, grid_points) >>> MagnitudeShapePlot(fd) MagnitudeShapePlot( - FDataGrid=FDataGrid( + fdata=FDataGrid( array([[[ 1. ], [ 1. ], [ 2. ], @@ -163,70 +163,31 @@ class MagnitudeShapePlot(BasePlot): def __init__( self, - fdatagrid: FDataGrid, + fdata: FDataGrid, chart: Union[Figure, Axes, None] = None, *, fig: Optional[Figure] = None, axes: Optional[Sequence[Axes]] = None, - **kwargs, + **kwargs: Any, ) -> None: - """Initialization of the MagnitudeShapePlot class. - - Args: - fdatagrid: Object containing the data. - multivariate_depth (:ref:`depth measure `, - optional): Method used to order the data. Defaults to - :class:`projection depth - `. - pointwise_weights: an array containing the - weights of each points of discretisati on where values have - been recorded. - alpha: Denotes the quantile to choose the cutoff - value for detecting outliers Defaults to 0.993, which is used - in the classical boxplot. - assume_centered: If True, the support of the - robust location and the covariance estimates is computed, and a - covariance estimate is recomputed from it, without centering - the data. Useful to work with data whose mean is significantly - equal to zero but is not exactly zero. If False, default value, - the robust location and covariance are directly computed with - the FastMCD algorithm without additional treatment. - support_fraction: The proportion of points to be included in the - support of the raw MCD estimate. - Default is None, which implies that the minimum value of - support_fraction will be used within the algorithm: - [n_sample + n_features + 1] / 2 - random_state: If int, random_state is the seed used by the random - number generator; If RandomState instance, random_state is - the random number generator; If None, the random number - generator is the RandomState instance used by np.random. - By default, it is 0. - chart: figure over with the graphs are plotted or axis over - where the graphs are plotted. If None and ax is also - None, the figure is initialized. - fig: figure over with the graphs are plotted in case ax is not - specified. If None and ax is also None, the figure is - initialized. - axes: axis where the graphs are plotted. If None, see param fig. 
- - """ + BasePlot.__init__( self, chart, fig=fig, axes=axes, ) - if fdatagrid.dim_codomain > 1: + if fdata.dim_codomain > 1: raise NotImplementedError( "Only support 1 dimension on the codomain.") self.outlier_detector = MSPlotOutlierDetector(**kwargs) - y = self.outlier_detector.fit_predict(fdatagrid) + y = self.outlier_detector.fit_predict(fdata) outliers = (y == -1) - self._fdatagrid = fdatagrid + self._fdata = fdata self._outliers = outliers self._colormap = plt.cm.get_cmap('seismic') self._color = 0.2 @@ -236,8 +197,8 @@ def __init__( self.title = 'MS-Plot' @property - def fdatagrid(self) -> FDataGrid: - return self._fdatagrid + def fdata(self) -> FDataGrid: + return self._fdata @property def multivariate_depth(self) -> Optional[Depth[NDArrayFloat]]: @@ -297,7 +258,7 @@ def outliercol(self, value: float) -> None: @property def n_samples(self) -> int: - return self.fdatagrid.n_samples + return self.fdata.n_samples def _plot( self, @@ -309,7 +270,7 @@ def _plot( (self.n_samples, 1), dtype=Artist, ) - colors = np.zeros((self.fdatagrid.n_samples, 4)) + colors = np.zeros((self.fdata.n_samples, 4)) colors[np.where(self.outliers == 1)] = self.colormap(self.outliercol) colors[np.where(self.outliers == 0)] = self.colormap(self.color) @@ -332,7 +293,7 @@ def __repr__(self) -> str: """Return repr(self).""" return ( f"MagnitudeShapePlot(" - f"\nFDataGrid={repr(self.fdatagrid)}," + f"\nfdata={repr(self.fdata)}," f"\nmultivariate_depth={self.multivariate_depth}," f"\npointwise_weights={repr(self.pointwise_weights)}," f"\nalpha={repr(self.alpha)}," From a9953a3104195ce9796397d094b17b1b90fd090f Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 25 Oct 2021 02:13:22 +0200 Subject: [PATCH 051/117] Fix examples. --- examples/plot_magnitude_shape.py | 34 ++++++--- examples/plot_magnitude_shape_synthetic.py | 89 +++++++++++++++------- 2 files changed, 83 insertions(+), 40 deletions(-) diff --git a/examples/plot_magnitude_shape.py b/examples/plot_magnitude_shape.py index 777b1524e..987d05bef 100644 --- a/examples/plot_magnitude_shape.py +++ b/examples/plot_magnitude_shape.py @@ -39,9 +39,11 @@ nlabels = len(label_names) label_colors = colormap(np.arange(nlabels) / (nlabels - 1)) -fd_temperatures.plot(group=target.codes, - group_colors=label_colors, - group_names=label_names) +fd_temperatures.plot( + group=target.codes, + group_colors=label_colors, + group_names=label_names, +) ############################################################################## # The MS-Plot is generated. In order to show the results, the @@ -50,8 +52,10 @@ # between outliers or not. In particular the tones of the default colormap, # (which is 'seismic' and can be customized), are assigned. -msplot = MagnitudeShapePlot(fdatagrid=fd_temperatures, - multivariate_depth=SimplicialDepth()) +msplot = MagnitudeShapePlot( + fd_temperatures, + multivariate_depth=SimplicialDepth(), +) color = 0.3 outliercol = 0.7 @@ -64,9 +68,11 @@ # To show the utility of the plot, the curves are plotted according to the # distinction made by the MS-Plot (outliers or not) with the same colors. 
-fd_temperatures.plot(group=msplot.outliers.astype(int), - group_colors=msplot.colormap([color, outliercol]), - group_names=['nonoutliers', 'outliers']) +fd_temperatures.plot( + group=msplot.outliers.astype(int), + group_colors=msplot.colormap([color, outliercol]), + group_names=['nonoutliers', 'outliers'], +) ############################################################################## # We can observe that most of the curves pointed as outliers belong either to @@ -82,8 +88,10 @@ # :func:`~skfda.exploratory.depth.IntegratedDepth` in the # MS-Plot. -msplot = MagnitudeShapePlot(fdatagrid=fd_temperatures, - multivariate_depth=IntegratedDepth().multivariate_depth) +msplot = MagnitudeShapePlot( + fd_temperatures, + multivariate_depth=IntegratedDepth().multivariate_depth, +) msplot.color = color msplot.outliercol = outliercol @@ -120,5 +128,7 @@ ############################################################################## # We now plot the curves with their corresponding color: -fd_temperatures.plot(group=labels, - group_colors=colormap([color, outliercol, 0.9])) +fd_temperatures.plot( + group=labels, + group_colors=colormap([color, outliercol, 0.9]), +) diff --git a/examples/plot_magnitude_shape_synthetic.py b/examples/plot_magnitude_shape_synthetic.py index 4242bc9f5..18af299c4 100644 --- a/examples/plot_magnitude_shape_synthetic.py +++ b/examples/plot_magnitude_shape_synthetic.py @@ -12,10 +12,10 @@ import matplotlib.pyplot as plt import numpy as np + import skfda from skfda.exploratory.visualization import MagnitudeShapePlot - ############################################################################## # First, we generate a synthetic dataset following [DaWe18]_ @@ -23,68 +23,89 @@ n_samples = 200 fd = skfda.datasets.make_gaussian_process( - n_samples=n_samples, n_features=100, + n_samples=n_samples, + n_features=100, cov=skfda.misc.covariances.Exponential(), mean=lambda t: 4 * t, - random_state=random_state) + random_state=random_state, +) ############################################################################## # We now add the outliers magnitude_outlier = skfda.datasets.make_gaussian_process( - n_samples=1, n_features=100, + n_samples=1, + n_features=100, cov=skfda.misc.covariances.Exponential(), mean=lambda t: 4 * t + 20, - random_state=random_state) + random_state=random_state, +) shape_outlier_shift = skfda.datasets.make_gaussian_process( - n_samples=1, n_features=100, + n_samples=1, + n_features=100, cov=skfda.misc.covariances.Exponential(), mean=lambda t: 4 * t + 10 * (t > 0.4), - random_state=random_state) + random_state=random_state, +) shape_outlier_peak = skfda.datasets.make_gaussian_process( - n_samples=1, n_features=100, + n_samples=1, + n_features=100, cov=skfda.misc.covariances.Exponential(), mean=lambda t: 4 * t - 10 * ((0.25 < t) & (t < 0.3)), - random_state=random_state) + random_state=random_state, +) shape_outlier_sin = skfda.datasets.make_gaussian_process( - n_samples=1, n_features=100, + n_samples=1, + n_features=100, cov=skfda.misc.covariances.Exponential(), mean=lambda t: 4 * t + 2 * np.sin(18 * t), - random_state=random_state) + random_state=random_state, +) shape_outlier_slope = skfda.datasets.make_gaussian_process( - n_samples=1, n_features=100, + n_samples=1, + n_features=100, cov=skfda.misc.covariances.Exponential(), mean=lambda t: 10 * t, - random_state=random_state) + random_state=random_state, +) magnitude_shape_outlier = skfda.datasets.make_gaussian_process( - n_samples=1, n_features=100, + n_samples=1, + n_features=100, 
cov=skfda.misc.covariances.Exponential(), mean=lambda t: 4 * t + 2 * np.sin(18 * t) - 20, - random_state=random_state) + random_state=random_state, +) -fd = fd.concatenate(magnitude_outlier, shape_outlier_shift, - shape_outlier_peak, shape_outlier_sin, - shape_outlier_slope, magnitude_shape_outlier) +fd = fd.concatenate( + magnitude_outlier, + shape_outlier_shift, + shape_outlier_peak, + shape_outlier_sin, + shape_outlier_slope, + magnitude_shape_outlier, +) ############################################################################## # The data is plotted to show the curves we are working with. labels = [0] * n_samples + [1] * 6 -fd.plot(group=labels, - group_colors=['lightgrey', 'black']) +fd.plot( + group=labels, + group_colors=['lightgrey', 'black'], +) ############################################################################## # The MS-Plot is generated. In order to show the results, the # :func:`~skfda.exploratory.visualization.MagnitudeShapePlot.plot` # method is used. -msplot = MagnitudeShapePlot(fdatagrid=fd) +msplot = MagnitudeShapePlot(fd) msplot.plot() @@ -93,19 +114,31 @@ # in a different color labels = [0] * n_samples + [1, 2, 3, 4, 5, 6] -colors = ['lightgrey', 'orange', 'blue', 'black', - 'green', 'brown', 'lightblue'] - -fd.plot(group=labels, - group_colors=colors) +colors = [ + 'lightgrey', + 'orange', + 'blue', + 'black', + 'green', + 'brown', + 'lightblue', +] + +fd.plot( + group=labels, + group_colors=colors, +) ############################################################################## # We now show the points in the MS-plot using the same colors fig = plt.figure() ax = fig.add_subplot(1, 1, 1) -ax.scatter(msplot.points[:, 0].ravel(), msplot.points[:, 1].ravel(), - c=colors[0:1] * n_samples + colors[1:]) +ax.scatter( + msplot.points[:, 0].ravel(), + msplot.points[:, 1].ravel(), + c=colors[:1] * n_samples + colors[1:], +) ax.set_title("MS-Plot") ax.set_xlabel("magnitude outlyingness") ax.set_ylabel("shape outlyingness") From 33cab7488187dbe41cdfb432f1a78a3cf42c8053 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 25 Oct 2021 21:00:05 +0200 Subject: [PATCH 052/117] inheritance and private method _centroid --- .../classification/_centroid_classifiers.py | 51 +++++-------------- 1 file changed, 12 insertions(+), 39 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 33a7c9b38..a9f408312 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -1,9 +1,8 @@ """Centroid-based models for supervised classification.""" from __future__ import annotations -from typing import Callable, Generic, Optional, TypeVar +from typing import Any, Callable, Generic, Optional, TypeVar -from numpy import ndarray from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted @@ -12,6 +11,7 @@ from ...exploratory.stats import mean, trim_mean from ...misc.metrics import Metric, PairwiseMetric, l2_distance from ...representation import FData +from ...representation._typing import NDArrayInt T = TypeVar("T", bound=FData) @@ -74,7 +74,7 @@ def __init__( self.metric = metric self.centroid = centroid - def fit(self, X: T, y: ndarray) -> NearestCentroid[T]: + def fit(self, X: T, y: NDArrayInt) -> NearestCentroid[T]: """Fit the model using X as training data and y as target values. 
Args: @@ -97,7 +97,7 @@ def fit(self, X: T, y: ndarray) -> NearestCentroid[T]: return self - def predict(self, X: T) -> ndarray: + def predict(self, X: T) -> Any: """Predict the class labels for the provided data. Args: @@ -116,11 +116,7 @@ def predict(self, X: T) -> ndarray: ] -class DTMClassifier( - BaseEstimator, # type: ignore - ClassifierMixin, # type: ignore - Generic[T], -): +class DTMClassifier(NearestCentroid[T]): """Distance to trimmed means (DTM) classification. Test samples are classified to the class that minimizes the distance of @@ -186,41 +182,18 @@ def __init__( ) -> None: self.proportiontocut = proportiontocut self.depth_method = depth_method - self.metric = metric - - def fit(self, X: T, y: ndarray) -> DTMClassifier[T]: - """Fit the model using X as training data and y as target values. - Args: - X: FDataGrid with the training data. - y: Target values of shape = (n_samples). - - Returns: - self - """ if self.depth_method is None: self.depth_method = ModifiedBandDepth() - self._clf = NearestCentroid( - metric=self.metric, - centroid=lambda fdatagrid: trim_mean( + def _centroid(fdatagrid: T) -> T: + return trim_mean( fdatagrid, self.proportiontocut, depth_method=self.depth_method, - ), - ) - self._clf.fit(X, y) + ) - return self - - def predict(self, X: T) -> ndarray: - """Predict the class labels for the provided data. - - Args: - X: FDataGrid with the test samples. - - Returns: - Array of shape (n_samples) or - (n_samples, n_outputs) with class labels for each data sample. - """ - return self._clf.predict(X) + super().__init__( + metric, + centroid=_centroid, + ) From 54949c37c905cccab8943716c95254bd557e0b1d Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 25 Oct 2021 22:50:37 +0200 Subject: [PATCH 053/117] _centroid method --- .../ml/classification/_centroid_classifiers.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index a9f408312..e4d246340 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -97,7 +97,7 @@ def fit(self, X: T, y: NDArrayInt) -> NearestCentroid[T]: return self - def predict(self, X: T) -> Any: + def predict(self, X: T) -> NDArrayInt: """Predict the class labels for the provided data. 
Args: @@ -174,6 +174,13 @@ class DTMClassifier(NearestCentroid[T]): """ + def _centroid(self, fdatagrid: T) -> T: + return trim_mean( + fdatagrid, + self.proportiontocut, + depth_method=self.depth_method, + ) + def __init__( self, proportiontocut: float, @@ -186,14 +193,7 @@ def __init__( if self.depth_method is None: self.depth_method = ModifiedBandDepth() - def _centroid(fdatagrid: T) -> T: - return trim_mean( - fdatagrid, - self.proportiontocut, - depth_method=self.depth_method, - ) - super().__init__( metric, - centroid=_centroid, + centroid=self._centroid, ) From 129685440d0b57fd36480245cca86c77ba06f8e3 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 25 Oct 2021 23:21:58 +0200 Subject: [PATCH 054/117] test un-linked copies --- .../classification/_centroid_classifiers.py | 16 ++++----- tests/test_classification.py | 33 +++++++++++++++++++ 2 files changed, 41 insertions(+), 8 deletions(-) create mode 100644 tests/test_classification.py diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index e4d246340..8c3d8e346 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -1,7 +1,7 @@ """Centroid-based models for supervised classification.""" from __future__ import annotations -from typing import Any, Callable, Generic, Optional, TypeVar +from typing import Callable, Generic, Optional, TypeVar from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted @@ -174,13 +174,6 @@ class DTMClassifier(NearestCentroid[T]): """ - def _centroid(self, fdatagrid: T) -> T: - return trim_mean( - fdatagrid, - self.proportiontocut, - depth_method=self.depth_method, - ) - def __init__( self, proportiontocut: float, @@ -197,3 +190,10 @@ def __init__( metric, centroid=self._centroid, ) + + def _centroid(self, fdatagrid: T) -> T: + return trim_mean( + fdatagrid, + self.proportiontocut, + depth_method=self.depth_method, + ) diff --git a/tests/test_classification.py b/tests/test_classification.py new file mode 100644 index 000000000..f0500ad9e --- /dev/null +++ b/tests/test_classification.py @@ -0,0 +1,33 @@ +import unittest + +import numpy as np +from sklearn.base import clone +from sklearn.model_selection import train_test_split + +from skfda.datasets import fetch_growth +from skfda.ml.classification import DTMClassifier + + +class TestClassification(unittest.TestCase): + + def setUp(self) -> None: + X, y = fetch_growth(return_X_y=True) + self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( + X, y, test_size=0.25, stratify=y, random_state=0) + + def test_dtm_independent_copy(self) -> None: + + clf = DTMClassifier(proportiontocut=0.25) + clf1 = clone(clf) + clf2 = DTMClassifier(proportiontocut=0.75) + clf1.proportiontocut = 0.75 + clf1.fit(self.X_train, self.y_train) + clf2.fit(self.X_train, self.y_train) + np.testing.assert_array_equal( + clf1.predict(self.X_test), clf2.predict(self.X_test) + ) + + +if __name__ == '__main__': + print() + unittest.main() From 5323af00c25f7eba61931001dfd86867716a8861 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 25 Oct 2021 23:33:41 +0200 Subject: [PATCH 055/117] Style --- tests/test_classification.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index f0500ad9e..95f4d39c3 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ 
-1,3 +1,5 @@ +"""Tests of classification methods.""" + import unittest import numpy as np @@ -8,15 +10,22 @@ from skfda.ml.classification import DTMClassifier -class TestClassification(unittest.TestCase): +class TestCentroidClassifiers(unittest.TestCase): + """Tests for centroid classifiers.""" def setUp(self) -> None: + """Establish train and test data sets.""" X, y = fetch_growth(return_X_y=True) self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( - X, y, test_size=0.25, stratify=y, random_state=0) + X, + y, + test_size=0.25, + stratify=y, + random_state=0, + ) def test_dtm_independent_copy(self) -> None: - + """Check that copies are un-linked.""" clf = DTMClassifier(proportiontocut=0.25) clf1 = clone(clf) clf2 = DTMClassifier(proportiontocut=0.75) @@ -24,10 +33,10 @@ def test_dtm_independent_copy(self) -> None: clf1.fit(self.X_train, self.y_train) clf2.fit(self.X_train, self.y_train) np.testing.assert_array_equal( - clf1.predict(self.X_test), clf2.predict(self.X_test) + clf1.predict(self.X_test), + clf2.predict(self.X_test), ) if __name__ == '__main__': - print() unittest.main() From 98b707c9d5bd23a0ce642dbd6f91ee801b289f28 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Mon, 25 Oct 2021 23:42:03 +0200 Subject: [PATCH 056/117] Style --- tests/test_classification.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 95f4d39c3..f3fb484b1 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -16,13 +16,17 @@ class TestCentroidClassifiers(unittest.TestCase): def setUp(self) -> None: """Establish train and test data sets.""" X, y = fetch_growth(return_X_y=True) - self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( + X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, stratify=y, random_state=0, ) + self._X_train = X_train + self._X_test = X_test + self._y_train = y_train + self._y_test = y_test def test_dtm_independent_copy(self) -> None: """Check that copies are un-linked.""" From 9d1cce0ab8edde88346852cea98826f4587751ba Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 06:58:14 +0200 Subject: [PATCH 057/117] Typo --- tests/test_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index f3fb484b1..4b95486b2 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -34,11 +34,11 @@ def test_dtm_independent_copy(self) -> None: clf1 = clone(clf) clf2 = DTMClassifier(proportiontocut=0.75) clf1.proportiontocut = 0.75 - clf1.fit(self.X_train, self.y_train) - clf2.fit(self.X_train, self.y_train) + clf1.fit(self._X_train, self._y_train) + clf2.fit(self._X_train, self._y_train) np.testing.assert_array_equal( - clf1.predict(self.X_test), - clf2.predict(self.X_test), + clf1.predict(self._X_test), + clf2.predict(self._X_test), ) From 658e68145ef620229be423e51520b2afb867659b Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 07:10:54 +0200 Subject: [PATCH 058/117] Style --- tests/test_classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_classification.py b/tests/test_classification.py index 4b95486b2..9bdfce53d 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -33,9 +33,11 @@ def test_dtm_independent_copy(self) -> None: clf = DTMClassifier(proportiontocut=0.25) clf1 = clone(clf) clf2 = DTMClassifier(proportiontocut=0.75) + 
clf1.proportiontocut = 0.75 clf1.fit(self._X_train, self._y_train) clf2.fit(self._X_train, self._y_train) + np.testing.assert_array_equal( clf1.predict(self._X_test), clf2.predict(self._X_test), From 6df5474393a6ad5dce2bdec1433b61571717b092 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 07:19:11 +0200 Subject: [PATCH 059/117] types --- tests/test_classification.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 9bdfce53d..f6ab506ff 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -9,6 +9,8 @@ from skfda.datasets import fetch_growth from skfda.ml.classification import DTMClassifier +from skfda.representation import FData + class TestCentroidClassifiers(unittest.TestCase): """Tests for centroid classifiers.""" @@ -30,15 +32,15 @@ def setUp(self) -> None: def test_dtm_independent_copy(self) -> None: """Check that copies are un-linked.""" - clf = DTMClassifier(proportiontocut=0.25) + clf: DTMClassifier[FData] = DTMClassifier(proportiontocut=0.25) clf1 = clone(clf) - clf2 = DTMClassifier(proportiontocut=0.75) + clf2: DTMClassifier[FData] = DTMClassifier(proportiontocut=0.75) clf1.proportiontocut = 0.75 clf1.fit(self._X_train, self._y_train) clf2.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( + np.testing.assert_array_equal( # type: ignore clf1.predict(self._X_test), clf2.predict(self._X_test), ) From 21bfd9be857e094668ddf43ef6ca2a710b7bdc1c Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 07:21:33 +0200 Subject: [PATCH 060/117] Blank line --- tests/test_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index f6ab506ff..e14eb4816 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -8,7 +8,6 @@ from skfda.datasets import fetch_growth from skfda.ml.classification import DTMClassifier - from skfda.representation import FData From 1395dddb36bf6bee5c311b673399f4b1e70b8ba8 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 19:50:40 +0200 Subject: [PATCH 061/117] depth classifiers refactoring --- skfda/ml/classification/_depth_classifiers.py | 216 +++++++++--------- 1 file changed, 113 insertions(+), 103 deletions(-) diff --git a/skfda/ml/classification/_depth_classifiers.py b/skfda/ml/classification/_depth_classifiers.py index acd85adf6..7c2f51f77 100644 --- a/skfda/ml/classification/_depth_classifiers.py +++ b/skfda/ml/classification/_depth_classifiers.py @@ -5,118 +5,21 @@ from typing import Generic, Optional, Sequence, TypeVar, Union import numpy as np -from numpy import ndarray from scipy.interpolate import lagrange from sklearn.base import BaseEstimator, ClassifierMixin, clone from sklearn.metrics import accuracy_score from sklearn.pipeline import make_pipeline from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted -from ..._utils import _classifier_fit_depth_methods +from ..._utils import _classifier_fit_depth_methods, _classifier_get_classes from ...exploratory.depth import Depth, ModifiedBandDepth from ...preprocessing.dim_reduction.feature_extraction import DDGTransformer +from ...representation._typing import NDArrayInt, NDArrayFloat from ...representation.grid import FData T = TypeVar("T", bound=FData) -class MaximumDepthClassifier( - BaseEstimator, # type: ignore - ClassifierMixin, # type: ignore - Generic[T], -): - """Maximum depth classifier for functional data. 
- - Test samples are classified to the class where they are deeper. - - Parameters: - depth_method: - The depth class to use when calculating the depth of a test - sample in a class. See the documentation of the depths module - for a list of available depths. By default it is ModifiedBandDepth. - Examples: - Firstly, we will import and split the Berkeley Growth Study dataset - - >>> from skfda.datasets import fetch_growth - >>> from sklearn.model_selection import train_test_split - >>> dataset = fetch_growth() - >>> fd = dataset['data'] - >>> y = dataset['target'] - >>> X_train, X_test, y_train, y_test = train_test_split( - ... fd, y, test_size=0.25, stratify=y, random_state=0) - - We will fit a Maximum depth classifier - - >>> from skfda.ml.classification import MaximumDepthClassifier - >>> clf = MaximumDepthClassifier() - >>> clf.fit(X_train, y_train) - MaximumDepthClassifier(...) - - We can predict the class of new samples - - >>> clf.predict(X_test) # Predict labels for test samples - array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) - - Finally, we calculate the mean accuracy for the test data - - >>> clf.score(X_test, y_test) - 0.875 - - See also: - :class:`~skfda.ml.classification.DDClassifier` - :class:`~skfda.ml.classification.DDGClassifier` - - References: - Ghosh, A. K. and Chaudhuri, P. (2005b). On maximum depth and - related classifiers. Scandinavian Journal of Statistics, 32, 327–350. - """ - - def __init__(self, depth_method: Optional[Depth[T]] = None) -> None: - self.depth_method = depth_method - - def fit(self, X: T, y: ndarray) -> MaximumDepthClassifier[T]: - """Fit the model using X as training data and y as target values. - - Args: - X: FDataGrid with the training data. - y: Target values of shape = (n_samples). - - Returns: - self - """ - if self.depth_method is None: - self.depth_method = ModifiedBandDepth() - - classes, class_depth_methods = _classifier_fit_depth_methods( - X, y, [self.depth_method], - ) - - self._classes = classes - self.class_depth_methods_ = class_depth_methods - - return self - - def predict(self, X: T) -> ndarray: - """Predict the class labels for the provided data. - - Args: - X: FDataGrid with the test samples. - - Returns: - Array of shape (n_samples) with class labels - for each data sample. - """ - sklearn_check_is_fitted(self) - - depths = [ - depth_method.predict(X) - for depth_method in self.class_depth_methods_ - ] - - return self._classes[np.argmax(depths, axis=0)] - - class DDClassifier( BaseEstimator, # type: ignore ClassifierMixin, # type: ignore @@ -186,7 +89,7 @@ def __init__( self.depth_method = depth_method self.degree = degree - def fit(self, X: T, y: ndarray) -> DDClassifier[T]: + def fit(self, X: T, y: NDArrayInt) -> DDClassifier[T]: """Fit the model using X as training data and y as target values. Args: @@ -244,7 +147,7 @@ def fit(self, X: T, y: ndarray) -> DDClassifier[T]: return self - def predict(self, X: T) -> ndarray: + def predict(self, X: T) -> NDArrayInt: """Predict the class labels for the provided data. Args: @@ -356,7 +259,7 @@ def __init__( self.multivariate_classifier = multivariate_classifier self.depth_method = depth_method - def fit(self, X: T, y: ndarray) -> DDGClassifier[T]: + def fit(self, X: T, y: NDArrayInt) -> DDGClassifier[T]: """Fit the model using X as training data and y as target values. 
Args: @@ -375,7 +278,7 @@ def fit(self, X: T, y: ndarray) -> DDGClassifier[T]: return self - def predict(self, X: T) -> ndarray: + def predict(self, X: T) -> NDArrayInt: """Predict the class labels for the provided data. Args: @@ -386,3 +289,110 @@ def predict(self, X: T) -> ndarray: for each data sample. """ return self._pipeline.predict(X) + + +class _ArgMaxClassifier( + BaseEstimator, # type: ignore + ClassifierMixin, # type: ignore +): + """Arg max classifier for multivariate data. + + Test samples are classified to the class that corresponds to the + index of the highest coordinate. + + Examples: + + >>> import numpy as np + >>> X = np.array([[1,5], [3,2], [4,1]]) + >>> y = np.array([1, 0, 0]) + + We will fit am ArgMax classifier + + >>> from skfda.ml.classification._depth_classifiers import _ArgMaxClassifier + >>> clf = _ArgMaxClassifier() + >>> clf.fit(X, y) + _ArgMaxClassifier(...) + + We can predict the class of new samples + + >>> clf.predict(X) # Predict labels for test samples + array([1, 0, 0]) + """ + + def fit(self, X: NDArrayFloat, y: NDArrayInt) -> _ArgMaxClassifier: + """Fit the model using X as training data and y as target values. + + Args: + X: Array with the training data. + y: Target values of shape = (n_samples). + + Returns: + self + """ + classes, _ = _classifier_get_classes(y) + self._classes = classes + return self + + def predict(self, X: NDArrayFloat) -> NDArrayInt: + """Predict the class labels for the provided data. + + Args: + X: Array with the test samples. + + Returns: + Array of shape (n_samples) with class labels + for each data sample. + """ + return self._classes[np.argmax(X, axis=1)] + + +class MaximumDepthClassifier(DDGClassifier[T]): + """Maximum depth classifier for functional data. + + Test samples are classified to the class where they are deeper. + + Parameters: + depth_method: + The depth class to use when calculating the depth of a test + sample in a class. See the documentation of the depths module + for a list of available depths. By default it is ModifiedBandDepth. + Examples: + Firstly, we will import and split the Berkeley Growth Study dataset + + >>> from skfda.datasets import fetch_growth + >>> from sklearn.model_selection import train_test_split + >>> dataset = fetch_growth() + >>> fd = dataset['data'] + >>> y = dataset['target'] + >>> X_train, X_test, y_train, y_test = train_test_split( + ... fd, y, test_size=0.25, stratify=y, random_state=0) + + We will fit a Maximum depth classifier + + >>> from skfda.ml.classification import MaximumDepthClassifier + >>> clf = MaximumDepthClassifier() + >>> clf.fit(X_train, y_train) + MaximumDepthClassifier(...) + + We can predict the class of new samples + + >>> clf.predict(X_test) # Predict labels for test samples + array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) + + Finally, we calculate the mean accuracy for the test data + + >>> clf.score(X_test, y_test) + 0.875 + + See also: + :class:`~skfda.ml.classification.DDClassifier` + :class:`~skfda.ml.classification.DDGClassifier` + + References: + Ghosh, A. K. and Chaudhuri, P. (2005b). On maximum depth and + related classifiers. Scandinavian Journal of Statistics, 32, 327–350. 
+ """ + + def __init__(self, depth_method: Optional[Depth[T]] = None) -> None: + super().__init__(_ArgMaxClassifier(), depth_method) From f334a8f2ba1df816a5b06921209c35a7feae1cb5 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 20:21:45 +0200 Subject: [PATCH 062/117] Style --- skfda/ml/classification/_depth_classifiers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skfda/ml/classification/_depth_classifiers.py b/skfda/ml/classification/_depth_classifiers.py index 7c2f51f77..74cadcb5f 100644 --- a/skfda/ml/classification/_depth_classifiers.py +++ b/skfda/ml/classification/_depth_classifiers.py @@ -14,7 +14,7 @@ from ..._utils import _classifier_fit_depth_methods, _classifier_get_classes from ...exploratory.depth import Depth, ModifiedBandDepth from ...preprocessing.dim_reduction.feature_extraction import DDGTransformer -from ...representation._typing import NDArrayInt, NDArrayFloat +from ...representation._typing import NDArrayFloat, NDArrayInt from ...representation.grid import FData T = TypeVar("T", bound=FData) @@ -295,20 +295,20 @@ class _ArgMaxClassifier( BaseEstimator, # type: ignore ClassifierMixin, # type: ignore ): - """Arg max classifier for multivariate data. + r"""Arg max classifier for multivariate data. Test samples are classified to the class that corresponds to the index of the highest coordinate. Examples: - >>> import numpy as np >>> X = np.array([[1,5], [3,2], [4,1]]) >>> y = np.array([1, 0, 0]) We will fit am ArgMax classifier - >>> from skfda.ml.classification._depth_classifiers import _ArgMaxClassifier + >>> from skfda.ml.classification._depth_classifiers import \ + ... _ArgMaxClassifier >>> clf = _ArgMaxClassifier() >>> clf.fit(X, y) _ArgMaxClassifier(...) From 0fc2e2086e84fdd8e1150baf6113af27cdb7ee09 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 22:28:38 +0200 Subject: [PATCH 063/117] tests --- .../classification/_centroid_classifiers.py | 6 +- tests/test_classification.py | 131 +++++++++++++++++- 2 files changed, 131 insertions(+), 6 deletions(-) diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index 8c3d8e346..7ed6c92f2 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -183,15 +183,15 @@ def __init__( self.proportiontocut = proportiontocut self.depth_method = depth_method - if self.depth_method is None: - self.depth_method = ModifiedBandDepth() - super().__init__( metric, centroid=self._centroid, ) def _centroid(self, fdatagrid: T) -> T: + if self.depth_method is None: + self.depth_method = ModifiedBandDepth() + return trim_mean( fdatagrid, self.proportiontocut, diff --git a/tests/test_classification.py b/tests/test_classification.py index e14eb4816..66df3f1c3 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -5,14 +5,26 @@ import numpy as np from sklearn.base import clone from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier as _KNeighborsClassifier +from sklearn.utils.estimator_checks import parametrize_with_checks from skfda.datasets import fetch_growth -from skfda.ml.classification import DTMClassifier +from skfda.misc.metrics import l2_distance +from skfda.ml.classification import ( + DDClassifier, + DDGClassifier, + DTMClassifier, + KNeighborsClassifier, + MaximumDepthClassifier, + NearestCentroid, + RadiusNeighborsClassifier, +) +from skfda.ml.classification._depth_classifiers 
import _ArgMaxClassifier from skfda.representation import FData -class TestCentroidClassifiers(unittest.TestCase): - """Tests for centroid classifiers.""" +class TestClassifiers(unittest.TestCase): + """Tests for classifiers.""" def setUp(self) -> None: """Establish train and test data sets.""" @@ -44,6 +56,119 @@ def test_dtm_independent_copy(self) -> None: clf2.predict(self._X_test), ) + def test_dtm_classifier(self) -> None: + """Check DTM classifier.""" + clf: DTMClassifier[FData] = DTMClassifier(proportiontocut=0.25) + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf.predict(self._X_test), + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + ) + + def test_centroid_classifier(self) -> None: + """Check NearestCentroid classifier.""" + clf: NearestCentroid[FData] = NearestCentroid() + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf.predict(self._X_test), + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + ) + + def test_dtm_inheritance(self) -> None: + """Check that DTM is a subclass of NearestCentroid.""" + clf1: NearestCentroid[FData] = NearestCentroid() + clf2: DTMClassifier[FData] = DTMClassifier( + proportiontocut=0.0, + metric=l2_distance, + ) + clf1.fit(self._X_train, self._y_train) + clf2.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf1.predict(self._X_test), + clf2.predict(self._X_test), + ) + + def test_maximumdepth_classifier(self) -> None: + """Check MaximumDepth classifier.""" + clf: MaximumDepthClassifier[FData] = MaximumDepthClassifier() + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf.predict(self._X_test), + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + ) + + def test_DD_classifier(self) -> None: + """Check DD classifier.""" + clf: DDClassifier[FData] = DDClassifier(degree=2) + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf.predict(self._X_test), + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + ) + + def test_DDG_classifier(self) -> None: + """Check DDG classifier.""" + clf: DDGClassifier[FData] = DDGClassifier(_KNeighborsClassifier()) + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf.predict(self._X_test), + [1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + ) + + def test_maximumdepth_inheritance(self) -> None: + """Check that MaximumDepth is a subclass of DDG.""" + clf1: DDGClassifier[FData] = DDGClassifier(_ArgMaxClassifier()) + clf2: MaximumDepthClassifier[FData] = MaximumDepthClassifier() + clf1.fit(self._X_train, self._y_train) + clf2.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf1.predict(self._X_test), + clf2.predict(self._X_test), + ) + + def test_KNeighbors_classifier(self) -> None: + """Check KNeighbors classifier.""" + clf: KNeighborsClassifier[FData] = KNeighborsClassifier() + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + clf.predict(self._X_test), + [0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], + ) + + def test_RadiusNeighbors_classifier(self) -> None: + """Check RadiusNeighbors classifier.""" + clf = RadiusNeighborsClassifier(radius=15) + clf.fit(self._X_train, self._y_train) + + np.testing.assert_array_equal( # type: ignore + 
clf.predict(self._X_test), + [0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1], + ) + + def test_RadiusNeighbors_small_raidus(self) -> None: + """Check that an error is raised if radius too small.""" + clf: RadiusNeighborsClassifier[FData] = RadiusNeighborsClassifier( + radius=1) + clf.fit(self._X_train, self._y_train) + + with np.testing.assert_raises(ValueError): + clf.predict(self._X_test) + if __name__ == '__main__': unittest.main() From 5df3d3334057a4732f02bae97b13f79f6ad2fa2c Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 22:38:29 +0200 Subject: [PATCH 064/117] Style --- tests/test_classification.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 66df3f1c3..8aa82f673 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -6,7 +6,6 @@ from sklearn.base import clone from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier as _KNeighborsClassifier -from sklearn.utils.estimator_checks import parametrize_with_checks from skfda.datasets import fetch_growth from skfda.misc.metrics import l2_distance @@ -82,7 +81,7 @@ def test_dtm_inheritance(self) -> None: """Check that DTM is a subclass of NearestCentroid.""" clf1: NearestCentroid[FData] = NearestCentroid() clf2: DTMClassifier[FData] = DTMClassifier( - proportiontocut=0.0, + proportiontocut=0, metric=l2_distance, ) clf1.fit(self._X_train, self._y_train) @@ -140,7 +139,7 @@ def test_maximumdepth_inheritance(self) -> None: def test_KNeighbors_classifier(self) -> None: """Check KNeighbors classifier.""" - clf: KNeighborsClassifier[FData] = KNeighborsClassifier() + clf = KNeighborsClassifier() clf.fit(self._X_train, self._y_train) np.testing.assert_array_equal( # type: ignore From 8f552a18908a6b97739da373d6b4ef88044ac4e2 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 22:41:37 +0200 Subject: [PATCH 065/117] Style --- tests/test_classification.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 8aa82f673..8f406195f 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -103,7 +103,7 @@ def test_maximumdepth_classifier(self) -> None: 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], ) - def test_DD_classifier(self) -> None: + def test_dd_classifier(self) -> None: """Check DD classifier.""" clf: DDClassifier[FData] = DDClassifier(degree=2) clf.fit(self._X_train, self._y_train) @@ -114,7 +114,7 @@ def test_DD_classifier(self) -> None: 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], ) - def test_DDG_classifier(self) -> None: + def test_ddg_classifier(self) -> None: """Check DDG classifier.""" clf: DDGClassifier[FData] = DDGClassifier(_KNeighborsClassifier()) clf.fit(self._X_train, self._y_train) @@ -137,7 +137,7 @@ def test_maximumdepth_inheritance(self) -> None: clf2.predict(self._X_test), ) - def test_KNeighbors_classifier(self) -> None: + def test_kneighbors_classifier(self) -> None: """Check KNeighbors classifier.""" clf = KNeighborsClassifier() clf.fit(self._X_train, self._y_train) @@ -148,7 +148,7 @@ def test_KNeighbors_classifier(self) -> None: 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], ) - def test_RadiusNeighbors_classifier(self) -> None: + def test_radiusneighbors_classifier(self) -> None: """Check RadiusNeighbors classifier.""" clf = RadiusNeighborsClassifier(radius=15) clf.fit(self._X_train, self._y_train) @@ -159,7 +159,7 @@ def 
test_RadiusNeighbors_classifier(self) -> None: 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1], ) - def test_RadiusNeighbors_small_raidus(self) -> None: + def test_radiusneighbors_small_raidus(self) -> None: """Check that an error is raised if radius too small.""" clf: RadiusNeighborsClassifier[FData] = RadiusNeighborsClassifier( radius=1) From 861b6363f007d73528e4975385a60ec9fe74a1d1 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 22:44:01 +0200 Subject: [PATCH 066/117] Style --- tests/test_classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 8f406195f..001e2f532 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -161,8 +161,7 @@ def test_radiusneighbors_classifier(self) -> None: def test_radiusneighbors_small_raidus(self) -> None: """Check that an error is raised if radius too small.""" - clf: RadiusNeighborsClassifier[FData] = RadiusNeighborsClassifier( - radius=1) + clf = RadiusNeighborsClassifier(radius=1) clf.fit(self._X_train, self._y_train) with np.testing.assert_raises(ValueError): From 670b0452fd431f09e1b287d8dae97df139fa328b Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 22:46:58 +0200 Subject: [PATCH 067/117] Style --- tests/test_classification.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 001e2f532..c8a1d4a02 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -99,8 +99,10 @@ def test_maximumdepth_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + [ + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, + ], ) def test_dd_classifier(self) -> None: From 9afeadbcba7d885b3f4ed4889bd6880decca0e00 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 26 Oct 2021 22:50:15 +0200 Subject: [PATCH 068/117] Style --- tests/test_classification.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index c8a1d4a02..6e426e4c5 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -62,8 +62,10 @@ def test_dtm_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + [ + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, + ], ) def test_centroid_classifier(self) -> None: @@ -73,8 +75,10 @@ def test_centroid_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + [ + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, + ], ) def test_dtm_inheritance(self) -> None: @@ -112,8 +116,10 @@ def test_dd_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + [ + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, + ], ) def test_ddg_classifier(self) -> None: @@ -123,8 +129,10 @@ def test_ddg_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 
0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], + [ + 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, + ], ) def test_maximumdepth_inheritance(self) -> None: @@ -146,8 +154,10 @@ def test_kneighbors_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], + [ + 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + ], ) def test_radiusneighbors_classifier(self) -> None: @@ -157,8 +167,10 @@ def test_radiusneighbors_classifier(self) -> None: np.testing.assert_array_equal( # type: ignore clf.predict(self._X_test), - [0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, - 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1], + [ + 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, + 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, + ], ) def test_radiusneighbors_small_raidus(self) -> None: From a7cced733f512dade53ec21a71278965dd13f60b Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 26 Oct 2021 23:16:23 +0200 Subject: [PATCH 069/117] Per class feature constructor --- skfda/_utils/__init__.py | 1 + skfda/_utils/_utils.py | 16 ++- .../feature_extraction/__init__.py | 1 + .../_per_class_feature_transformer.py | 105 ++++++++++++++++++ 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 4c52efada..9b3612616 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -8,6 +8,7 @@ _classifier_fit_depth_methods, _classifier_get_classes, _classifier_get_depth_methods, + _classifier_fit_feature_transformer, _compute_dependence, _DependenceMeasure, _evaluate_grid, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index f718a55c5..9a8f42d3d 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -23,7 +23,7 @@ import scipy.integrate from numpy import ndarray from pandas.api.indexers import check_array_indexer -from sklearn.base import clone +from sklearn.base import TransformerMixin, clone from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import check_classification_targets from typing_extensions import Literal, Protocol @@ -729,6 +729,20 @@ def _classifier_fit_depth_methods( return classes, class_depth_methods_ +def _classifier_fit_feature_transformer( + X: T, + y: ndarray, + transformer: TransformerMixin +) -> Tuple[ndarray, Sequence[TransformerMixin]]: + classes, y_ind = _classifier_get_classes(y) + + class_feature_transformers = [ + clone(transformer).fit(X[y_ind == cur_class], y[y_ind == cur_class]) + for cur_class in range(classes.size) + ] + + return classes, class_feature_transformers + _DependenceMeasure = Callable[[np.ndarray, np.ndarray], np.ndarray] diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 16355e236..8c8f9895b 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,3 +1,4 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer +from ._per_class_feature_transformer import PerClassFeatureTransformer from ._fpca import FPCA diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py new file mode 100644 index 
000000000..cc646a5fb --- /dev/null +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -0,0 +1,105 @@ +"""Feature extraction transformers for dimensionality reduction.""" +from __future__ import annotations +import numpy as np +from typing import TypeVar +from sklearn.base import TransformerMixin +from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted +from ....representation.grid import FData +from ...._utils import _classifier_fit_feature_transformer + +T = TypeVar("T", bound=FData) + +class PerClassFeatureTransformer(TransformerMixin): + + def __init__( + self, + transformer: TransformerMixin + ) -> None: + self.transformer= transformer + self._validate_transformer() + + def _validate_transformer( + self + ) -> None: + """ + Checks that the transformer passed is scikit-learn-like and that uses target data in fit + + Args: + None + + Returns: + None + """ + if not (hasattr(self.transformer, "fit") or hasattr(self.transformer, "fit_transform")) or not hasattr( + self.transformer, "transform" + ): + raise TypeError( + "Transformer should implement fit and " + "transform. '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) + ) + + tags = self.transformer._get_tags() + + if not(tags['stateless'] and tags['requires_y']): + raise TypeError( + "Transformer should use target data in fit." + " '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) + ) + + + def fit( + self, + X: T, + y: np.ndarray + ) -> PerClassFeatureTransformer: + """ + Fit the model on each class using X as training data and y as target values. + + Args: + X: FDataGrid with the training data. + y: Target values of shape = (n_samples). + + Returns: + self + """ + classes, class_feature_transformers = _classifier_fit_feature_transformer( + X, y, self.transformer + ) + + self._classes = classes + self._class_feature_transformers_ = class_feature_transformers + + return self + + + def transform(self, X: T) -> np.ndarray: + """ + Transform the provided data using the already fitted transformer. + + Args: + X: FDataGrid with the test samples. + + Returns: + Array of shape (n_samples, G). + """ + sklearn_check_is_fitted(self) + + return [ + feature_transformer.transform(X) + for feature_transformer in self._class_feature_transformers_ + ] + + + def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray: + """ + Fits and transforms the provided data + using the transformer specified when initializing the class. + + Args: + X: FDataGrid with the samples. + y: Target values of shape = (n_samples) + + Returns: + Array of shape (n_samples, G). + """ + return self.fit(X, y).transform(X) From 5a4991e1ee251facea1db69129fdfb32ef63dc05 Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 31 Oct 2021 18:56:11 +0100 Subject: [PATCH 070/117] Improved tests of inverse transform. Found and corrected bug in transform (weights not multiplied). Use the Simpson quadrature weigths in FPCA by default. 
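For reference, the change below swaps the trapezoidal quadrature weights for Simpson weights and multiplies by them in transform. A minimal sketch of the two weight computations, mirroring the removed and added code in _fit_grid (the grid values here are made up purely for illustration):

    import numpy as np
    import scipy.integrate

    grid_points = np.linspace(0, 1, 5)

    # Old behaviour: trapezoidal weights, half of each adjacent spacing.
    spacings = np.concatenate(((0,), np.diff(grid_points), (0,)))
    trapezoidal_weights = (spacings[:-1] + spacings[1:]) / 2

    # New behaviour: integrate each row of the identity matrix with
    # Simpson's rule, giving one quadrature weight per grid point.
    simpson_weights = scipy.integrate.simps(
        np.eye(len(grid_points)),
        grid_points,
    )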
--- skfda/misc/covariances.py | 5 +- skfda/misc/metrics/_fisher_rao.py | 5 +- .../dim_reduction/feature_extraction/_fpca.py | 55 +++--- tests/test_fpca.py | 160 +++++++++--------- 4 files changed, 115 insertions(+), 110 deletions(-) diff --git a/skfda/misc/covariances.py b/skfda/misc/covariances.py index 7acda93fa..5358a57fa 100644 --- a/skfda/misc/covariances.py +++ b/skfda/misc/covariances.py @@ -9,7 +9,6 @@ from matplotlib.figure import Figure from scipy.special import gamma, kv -from ..exploratory.visualization._utils import _create_figure, _figure_to_svg from ..representation._typing import ArrayLike, NDArrayFloat @@ -74,6 +73,8 @@ def __call__(self, x: ArrayLike, y: ArrayLike) -> NDArrayFloat: def heatmap(self, limits: Tuple[float, float] = (-1, 1)) -> Figure: """Return a heatmap plot of the covariance function.""" + from ..exploratory.visualization._utils import _create_figure + x = np.linspace(*limits, 1000) cov_matrix = self(x, x) @@ -130,6 +131,8 @@ def _repr_latex_(self) -> str: return fr"\(\displaystyle {self._latex_content()}\)" def _repr_html_(self) -> str: + from ..exploratory.visualization._utils import _figure_to_svg + fig = self.heatmap() heatmap = _figure_to_svg(fig) plt.close(fig) diff --git a/skfda/misc/metrics/_fisher_rao.py b/skfda/misc/metrics/_fisher_rao.py index f63bb80f8..469ed0479 100644 --- a/skfda/misc/metrics/_fisher_rao.py +++ b/skfda/misc/metrics/_fisher_rao.py @@ -7,7 +7,6 @@ from typing_extensions import Final from ..._utils import normalize_scale, normalize_warping -from ...preprocessing.registration import FisherRaoElasticRegistration from ...representation import FData, FDataGrid from ...representation._typing import NDArrayFloat from ..operators import SRSF @@ -191,6 +190,8 @@ def fisher_rao_amplitude_distance( .. footbibliography:: """ + from ...preprocessing.registration import FisherRaoElasticRegistration + fdata1, fdata2 = _cast_to_grid( fdata1, fdata2, @@ -285,6 +286,8 @@ def fisher_rao_phase_distance( .. footbibliography:: """ + from ...preprocessing.registration import FisherRaoElasticRegistration + fdata1, fdata2 = _cast_to_grid( fdata1, fdata2, diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 32be82b3d..6e2f7cabe 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -5,11 +5,13 @@ from typing import Callable, Optional, TypeVar, Union import numpy as np +import scipy.integrate +from scipy.linalg import solve_triangular from sklearn.base import BaseEstimator, TransformerMixin from sklearn.decomposition import PCA -from scipy.linalg import solve_triangular - +from ....misc import inner_product_matrix +from ....misc.metrics import l2_norm from ....misc.regularization import ( TikhonovRegularization, compute_penalty_matrix, @@ -341,9 +343,8 @@ def _fit_grid( # in trapezoidal rule, suppose \deltax_k = x_k - x_{k-1}, the # weight vector is as follows: [\deltax_1/2, \deltax_1/2 + # \deltax_2/2, \deltax_2/2 + \deltax_3/2, ... 
, \deltax_n/2] - differences = np.diff(X.grid_points[0]) - differences = np.concatenate(((0,), differences, (0,))) - self.weights = (differences[:-1] + differences[1:]) / 2 + identity = np.eye(len(X.grid_points[0])) + self.weights = scipy.integrate.simps(identity, X.grid_points[0]) elif callable(self.weights): self.weights = self.weights(X.grid_points[0]) # if its a FDataGrid then we need to reduce the dimension to 1-D @@ -387,6 +388,7 @@ def _fit_grid( ), sample_names=(None,) * self.n_components, ) + self.explained_variance_ratio_ = pca.explained_variance_ratio_ self.explained_variance_ = pca.explained_variance_ @@ -413,6 +415,7 @@ def _transform_grid( return ( X.data_matrix.reshape(X.data_matrix.shape[:-1]) + * self.weights @ np.transpose( self.components_.data_matrix.reshape( self.components_.data_matrix.shape[:-1], @@ -524,30 +527,22 @@ def inverse_transform( # .fit was applied to FDataGrid or FDataBasis object # Does not work (boundary problem in x_hat and bias reconstruction) if isinstance(self.components_, FDataGrid): - x_hat = np.matmul( - pc_scores, - self.components_.data_matrix[:, :, 0] - ) - # uncenter - x_hat += self.mean_.data_matrix.reshape( - (1, self.mean_.grid_points[0].shape[0]), - ) - # format as FDataGrid according to fitted data format - return FDataGrid( - data_matrix=x_hat, - grid_points=self.mean_.grid_points[0], - argument_names=self.mean_.argument_names, - ) + additional_args = { + "data_matrix": np.einsum( + 'nc,c...->n...', + pc_scores, + self.components_.data_matrix, + ), + } + elif isinstance(self.components_, FDataBasis): - # reconstruct the basis coefficients - x_hat = np.dot(pc_scores, self.components_.coefficients) - x_hat += self.mean_.coefficients.reshape( - (1, self.mean_.coefficients.shape[1]), - ) - # format as FDataBasis according to fitted data format - return FDataBasis( - basis=self.mean_.basis, - coefficients=x_hat, - argument_names=self.mean_.argument_names, - ) + + additional_args = { + "coefficients": pc_scores @ self.components_.coefficients, + } + + return self.mean_.copy( + **additional_args, + sample_names=(None,) * len(pc_scores), + ) + self.mean_ diff --git a/tests/test_fpca.py b/tests/test_fpca.py index 60e3e797a..fc9ccd29c 100644 --- a/tests/test_fpca.py +++ b/tests/test_fpca.py @@ -2,15 +2,14 @@ import unittest import numpy as np -from numpy.core.fromnumeric import size -from numpy.lib.index_tricks import nd_grid +import skfda from skfda import FDataBasis, FDataGrid -from skfda.datasets import fetch_weather, make_multimodal_samples +from skfda.datasets import fetch_weather from skfda.misc.operators import LinearDifferentialOperator from skfda.misc.regularization import TikhonovRegularization from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.representation.basis import Fourier, BSpline +from skfda.representation.basis import Basis, BSpline, Fourier class FPCATestCase(unittest.TestCase): @@ -342,7 +341,7 @@ def test_grid_fpca_transform_result(self) -> None: [234.40195237], [345.39374006], ]) - np.testing.assert_allclose(scores, results, rtol=1e-6) + np.testing.assert_allclose(scores, results) def test_grid_fpca_regularization_fit_result(self) -> None: """Compare the components in grid against the fda.usc package.""" @@ -451,94 +450,99 @@ def test_grid_fpca_regularization_fit_result(self) -> None: rtol=1e-2, ) + def draw_one_random_fun( + self, + basis: Basis, + random_state: np.random.RandomState, + ) -> FDataBasis: + """Draw a true function in a given basis with random coef.""" + coef = 
random_state.uniform(-10, 10, size=basis.n_basis) + return FDataBasis( + basis=basis, + coefficients=coef, + ) + def test_grid_fpca_inverse_transform(self) -> None: """Compare the reconstructions.data_matrix to fitting data.""" - - seed = 42 - np.random.seed(seed) - # Randomly, draw a true function that generates the dataset. - def draw_one_random_fun(n_grid) -> FDataGrid: - modes_location = np.random.uniform(-10., 10., size=50) - noise = 10**-2 - fd_random = make_multimodal_samples( - start=0., - stop=15., - n_samples=1, - points_per_dim=n_grid, - n_modes=modes_location.size, - noise=noise, - modes_location=modes_location, - random_state=seed - ) - return fd_random + random_state = np.random.RandomState(seed=42) + + def test_vs_dim( + n_samples: int, + n_grid: int, + base_fun: FDataBasis, + ) -> None: + """Test function w.r.t n_samples, n_grid.""" + # Random offsetting base_fun and form dataset fd_random + offset = random_state.uniform(-5, 5, size=n_samples) - # test function w.r.t n_samples, n_grid - def test_vs_dim(n_samples, n_grid, base_fun): - fd_random_all_equal = base_fun - # Concatenate random FDataGrid 'n_sample's times - for _ in range(1, n_samples - 1): - fd_random_all_equal = fd_random_all_equal.concatenate(base_fun) + fd_random = FDataBasis( + basis=base_fun.basis, + coefficients=base_fun.coefficients * offset[:, np.newaxis], + ).to_grid(np.linspace(0, 1, n_grid)) # Take the allowed maximum number of components # In almost high dimension: n_components=n_samples-1 < n_samples # In low dimension: n_components=n_grid << n_samples - fpca = FPCA(n_components=np.min([n_samples - 1, n_grid])) + fpca = FPCA( + n_components=min(n_samples - 1, n_grid), + ) # Project the non-random dataset on FPCs pc_scores_fd_random_all_equal = fpca.fit_transform( - fd_random_all_equal + fd_random, ) + # Project the pc scores back to the input functional space - fd_random_all_equal_hat = fpca.inverse_transform( - pc_scores_fd_random_all_equal + fd_random_hat = fpca.inverse_transform( + pc_scores_fd_random_all_equal, ) # Compare fitting data to the reconstructed ones np.testing.assert_allclose( - fd_random_all_equal.data_matrix, - fd_random_all_equal_hat.data_matrix + fd_random.data_matrix, + fd_random_hat.data_matrix, ) # Low dimensional case (n_samples>n_grid) - n_samples = 10**3 - n_grid = 10**2 - true_fun = draw_one_random_fun(n_grid) + n_samples = 1000 + n_grid = 100 + bsp = BSpline( + domain_range=(0, 50), + n_basis=100, + order=3, + ) + true_fun = self.draw_one_random_fun(bsp, random_state) test_vs_dim(n_samples=n_samples, n_grid=n_grid, base_fun=true_fun) # (almost) High dimensional case (n_samples None: """Compare the coef reconstructions to fitting data.""" + random_state = np.random.RandomState(seed=42) + + def test_vs_dim(n_samples: int, base_fun: FDataBasis) -> None: + """Test function w.t.t n_samples and basis.""" + # Random offsetting base_fun and form dataset fd_random + offset = random_state.uniform(-5, 5, size=n_samples) - seed = 42 - np.random.seed(seed) - # Draw a true function in a given basis with random coef. 
- def draw_one_random_fun(basis): - coef = np.random.uniform(-10., 10., size=basis.n_basis) fd_random = FDataBasis( - basis=basis, - coefficients=coef + basis=base_fun.basis, + coefficients=base_fun.coefficients * offset[:, np.newaxis], ) - return fd_random - - # test function w.t.t n_samples and basis - def test_vs_dim(n_samples, base_fun): - fd_random = base_fun.copy() - offset = np.random.uniform(-5., 5., size=n_samples) - # Random offsetting base_fun and form dataset fd_random - for i in range(n_samples): - fd_i = base_fun.copy() - fd_i.coefficients += offset[i] - fd_random = fd_random.concatenate(fd_i) # Take the allowed maximum number of components # In almost high dimension: n_components=n_samples-1 < n_samples # In low dimension: n_components=n_basis< Date: Mon, 1 Nov 2021 01:14:01 +0100 Subject: [PATCH 071/117] Improve registration hierarchy. --- skfda/_utils/__init__.py | 5 + skfda/_utils/_sklearn_adapter.py | 63 +++++++++++ .../preprocessing/registration/_fisher_rao.py | 45 +++----- .../registration/_lstsq_shift_registration.py | 105 ++++++------------ skfda/preprocessing/registration/base.py | 104 +++++++++++++++-- tests/test_registration.py | 7 +- 6 files changed, 210 insertions(+), 119 deletions(-) create mode 100644 skfda/_utils/_sklearn_adapter.py diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 4c52efada..fc9972af9 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -1,4 +1,9 @@ from . import constants +from ._sklearn_adapter import ( + BaseEstimator, + InductiveTransformerMixin, + TransformerMixin, +) from ._utils import ( RandomStateLike, _cartesian_product, diff --git a/skfda/_utils/_sklearn_adapter.py b/skfda/_utils/_sklearn_adapter.py new file mode 100644 index 000000000..266970f71 --- /dev/null +++ b/skfda/_utils/_sklearn_adapter.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Generic, Optional, TypeVar, overload + +import sklearn.base + +SelfType = TypeVar("SelfType") +Input = TypeVar("Input") +Output = TypeVar("Output") +Target = TypeVar("Target") + + +class BaseEstimator( + ABC, + sklearn.base.BaseEstimator, # type: ignore[misc] +): + pass + + +class TransformerMixin( + ABC, + Generic[Input, Output, Target], + sklearn.base.TransformerMixin, # type: ignore[misc] +): + + def fit( + self: SelfType, + X: Input, + y: Optional[Target] = None, + ) -> SelfType: + + return self + + @overload # type: ignore[misc] + def fit_transform( + self, + X: Input, + y: Optional[Target] = None, + ) -> Output: + pass + + def fit_transform( + self, + X: Input, + y: Optional[Target] = None, + **fit_params: Any, + ) -> Output: + + return super().fit_transform(X, y, **fit_params) + + +class InductiveTransformerMixin( + TransformerMixin[Input, Output, Target], +): + + @abstractmethod + def transform( + self: SelfType, + X: Input, + ) -> Output: + + pass diff --git a/skfda/preprocessing/registration/_fisher_rao.py b/skfda/preprocessing/registration/_fisher_rao.py index ea25aae5f..6cdcd761f 100644 --- a/skfda/preprocessing/registration/_fisher_rao.py +++ b/skfda/preprocessing/registration/_fisher_rao.py @@ -1,8 +1,8 @@ - +"""Fisher-Rao elastic registration.""" from __future__ import annotations import warnings -from typing import Any, Callable, Optional, Union +from typing import Callable, Optional, TypeVar, Union from sklearn.utils.validation import check_is_fitted @@ -13,12 +13,15 @@ from ...misc.operators import SRSF from ...representation._typing import ArrayLike from 
...representation.interpolation import SplineInterpolation -from .base import RegistrationTransformer +from .base import InductiveRegistrationTransformer _MeanType = Callable[[FDataGrid], FDataGrid] +SelfType = TypeVar("SelfType", bound="FisherRaoElasticRegistration") -class FisherRaoElasticRegistration(RegistrationTransformer): +class FisherRaoElasticRegistration( + InductiveRegistrationTransformer[FDataGrid, FDataGrid], +): r"""Align a FDatagrid using the SRSF framework. Let :math:`f` be a function of the functional data object wich will be @@ -57,16 +60,16 @@ class FisherRaoElasticRegistration(RegistrationTransformer): extensively the algorithms employed and the SRSF framework. Args: - template (str, :class:`FDataGrid` or callable, optional): Template to + template: Template to align the curves. Can contain 1 sample to align all the curves to it or the same number of samples than the fdatagrid. By default `elastic mean`, in which case :func:`elastic_mean` is called. - penalty_term (float, optional): Controls the amount of elasticity. + penalty_term: Controls the amount of elasticity. Defaults to 0. - output_points (array_like, optional): Set of points where the + output_points: Set of points where the functions are evaluated, by default uses the sample points of the fdatagrid which will be transformed. - grid_dim (int, optional): Dimension of the grid used in the DP + grid_dim: Dimension of the grid used in the DP alignment algorithm. Defaults 7. Attributes: @@ -115,22 +118,8 @@ def __init__( self.output_points = output_points self.grid_dim = grid_dim - def fit(self, X: FDataGrid, y: None = None) -> RegistrationTransformer: - """Fit the transformer. - - Learns the template used during the transformation. - - Args: - X: Functional observations used as training samples. If the - template provided is a FDataGrid this argument is ignored, as - it is not necessary to learn the template from the training - data. - y: Present for API conventions. - - Returns: - self. + def fit(self: SelfType, X: FDataGrid, y: None = None) -> SelfType: - """ if isinstance(self.template, FDataGrid): self.template_ = self.template # Template already constructed else: @@ -143,16 +132,7 @@ def fit(self, X: FDataGrid, y: None = None) -> RegistrationTransformer: return self def transform(self, X: FDataGrid, y: None = None) -> FDataGrid: - """Apply elastic registration to the data. - - Args: - X: Functional data to be registered. - y: Present for API conventions. - Returns: - Registered samples. 
- - """ check_is_fitted(self, '_template_srsf') check_is_univariate(X) @@ -287,6 +267,7 @@ def inverse_transform(self, X: FDataGrid, y: None = None) -> FDataGrid: class ElasticRegistration(FisherRaoElasticRegistration): + """Deprecated name for FisherRaoElasticRegistration.""" def __init__( self, diff --git a/skfda/preprocessing/registration/_lstsq_shift_registration.py b/skfda/preprocessing/registration/_lstsq_shift_registration.py index d6f88667d..0df35017d 100644 --- a/skfda/preprocessing/registration/_lstsq_shift_registration.py +++ b/skfda/preprocessing/registration/_lstsq_shift_registration.py @@ -1,4 +1,4 @@ -"""Class to apply Shift Registration to functional data""" +"""Shift registration of functional data by least squares.""" from __future__ import annotations import warnings @@ -14,13 +14,16 @@ from ...misc.metrics._lp_norms import l2_norm from ...representation._typing import ArrayLike, GridPointsLike, NDArrayFloat from ...representation.extrapolation import ExtrapolationLike -from .base import RegistrationTransformer +from .base import InductiveRegistrationTransformer +SelfType = TypeVar("SelfType", bound="LeastSquaresShiftRegistration[FData]") T = TypeVar("T", bound=FData) TemplateFunction = Callable[[FDataGrid], FDataGrid] -class LeastSquaresShiftRegistration(RegistrationTransformer): +class LeastSquaresShiftRegistration( + InductiveRegistrationTransformer[T, T], +): r"""Register data using shift alignment by least squares criterion. Realizes the registration of a set of curves using a shift aligment @@ -252,52 +255,26 @@ def _compute_deltas( return delta, template_iter def fit_transform(self, X: T, y: None = None) -> T: - """ - Fit the estimator and transform the data. - - Args: - X: Functional dataset to be transformed. - y: not used, present for API consistency by convention. - Returns: - Functional data registered. - - """ deltas, template = self._compute_deltas(X, self.template) self.deltas_ = deltas self.template_ = template - return X.shift( + shifted = X.shift( self.deltas_, restrict_domain=self.restrict_domain, extrapolation=self.extrapolation, grid_points=self.grid_points, ) + shifted.argument_names = None + return shifted - def fit(self, X: FData, y: None = None) -> LeastSquaresShiftRegistration: - """Fit the estimator. - - Args: - X: Functional dataset used to construct the template for - the alignment. - y: not used, present for API consistency by convention. - - Returns: - self - - Raises: - AttributeError: If this method is call when restrict_domain=True. - - """ - if self.restrict_domain: - raise AttributeError( - "fit and predict are not available when " - "restrict_domain=True, fitting and " - "transformation should be done together. Use " - "an extrapolation method with " - "restrict_domain=False or fit_predict", - ) + def fit( + self: SelfType, + X: FData, + y: None = None, + ) -> SelfType: # If the template is an FData, fit doesnt learn anything if isinstance(self.template, FData): @@ -311,26 +288,10 @@ def fit(self, X: FData, y: None = None) -> LeastSquaresShiftRegistration: return self def transform(self, X: FData, y: None = None) -> FDataGrid: - """ - Register the data. - - Transforms the data using the template previously learned during - fitting. - - Args: - X: Functional dataset to be transformed. - y: not used, present for API consistency by convention. - - Returns: - Functional data registered. - - Raises: - AttributeError: If this method is call when restrict_domain=True. 
- """ if self.restrict_domain: raise AttributeError( - "fit and predict are not available when " + "transform is not available when " "restrict_domain=True, fitting and " "transformation should be done together. Use " "an extrapolation method with " @@ -343,12 +304,14 @@ def transform(self, X: FData, y: None = None) -> FDataGrid: deltas, _ = self._compute_deltas(X, self.template_) self.deltas_ = deltas - return X.shift( + shifted = X.shift( deltas, restrict_domain=self.restrict_domain, extrapolation=self.extrapolation, grid_points=self.grid_points, ) + shifted.argument_names = None + return shifted def inverse_transform(self, X: FData, y: None = None) -> FDataGrid: """ @@ -364,26 +327,25 @@ def inverse_transform(self, X: FData, y: None = None) -> FDataGrid: Functional data registered. Examples: + Creates a synthetic functional dataset. - Creates a synthetic functional dataset. + >>> from skfda.preprocessing.registration import ( + ... LeastSquaresShiftRegistration, + ... ) + >>> from skfda.datasets import make_sinusoidal_process + >>> fd = make_sinusoidal_process(error_std=0, random_state=1) + >>> fd.extrapolation = 'periodic' - >>> from skfda.preprocessing.registration import ( - ... LeastSquaresShiftRegistration, - ... ) - >>> from skfda.datasets import make_sinusoidal_process - >>> fd = make_sinusoidal_process(error_std=0, random_state=1) - >>> fd.extrapolation = 'periodic' + Dataset registration and centering. - Dataset registration and centering. + >>> reg = LeastSquaresShiftRegistration() + >>> fd_registered = reg.fit_transform(fd) + >>> fd_centered = fd_registered - fd_registered.mean() - >>> reg = LeastSquaresShiftRegistration() - >>> fd_registered = reg.fit_transform(fd) - >>> fd_centered = fd_registered - fd_registered.mean() + Reverse the translation applied during the registration. - Reverse the translation applied during the registration. - - >>> reg.inverse_transform(fd_centered) - FDataGrid(...) + >>> reg.inverse_transform(fd_centered) + FDataGrid(...) """ deltas = getattr(self, "deltas_", None) @@ -407,7 +369,8 @@ def inverse_transform(self, X: FData, y: None = None) -> FDataGrid: ) -class ShiftRegistration(LeastSquaresShiftRegistration): +class ShiftRegistration(LeastSquaresShiftRegistration[T]): + """Deprecated name for LeastSquaresShiftRegistration.""" def __init__( self, diff --git a/skfda/preprocessing/registration/base.py b/skfda/preprocessing/registration/base.py index 8187cf670..2d2c6892b 100644 --- a/skfda/preprocessing/registration/base.py +++ b/skfda/preprocessing/registration/base.py @@ -4,23 +4,82 @@ This module contains the abstract base class for all registration methods. """ +from __future__ import annotations -from abc import ABC - -from sklearn.base import BaseEstimator, TransformerMixin +from abc import abstractmethod +from typing import Any, TypeVar, overload from ... import FData +from ..._utils import ( + BaseEstimator, + InductiveTransformerMixin, + TransformerMixin, +) + +SelfType = TypeVar("SelfType") +Input = TypeVar("Input", bound=FData) +Output = TypeVar("Output", bound=FData) class RegistrationTransformer( - ABC, - BaseEstimator, # type: ignore - TransformerMixin, # type: ignore + BaseEstimator, + TransformerMixin[Input, Output, None], ): """Base class for the registration methods.""" - def score(self, X: FData, y: None = None) -> float: - r"""Return the percentage of total variation removed. + def fit( + self: SelfType, + X: Input, + y: None = None, + ) -> SelfType: + """ + Fit the registration model. 
+ + Args: + X: Original (unregistered) training data. + y: Ignored. + + Returns: + Returns the instance itself. + + """ + return self + + @overload # type: ignore[misc] + def fit_transform( + self, + X: Input, + y: None = None, + ) -> Output: + pass + + def fit_transform( + self, + X: Input, + y: None = None, + **fit_params: Any, + ) -> Output: + """ + Fit the registration model and return the registered data. + + Args: + X: Original (unregistered) training data. + y: Ignored. + fit_params: Additional fit parameters. + + Returns: + Registered training data. + + """ + return super().fit_transform( # type: ignore[call-arg] + X, + y, + **fit_params, + ) + + def score(self, X: Input, y: None = None) -> float: + r""" + Return the percentage of total variation removed. Computes the squared multiple correlation index of the proportion of the total variation due to phase, defined as: @@ -35,11 +94,11 @@ def score(self, X: FData, y: None = None) -> float: explanation. Args: - X (FData): Functional data to be registered - y (Ignored): Ignored, only for API conventions. + X: Functional data to be registered + y: Ignored, only for API conventions. Returns: - float. + Registration score. See also: :class:`~.validation.AmplitudePhaseDecomposition` @@ -51,3 +110,26 @@ def score(self, X: FData, y: None = None) -> float: from .validation import AmplitudePhaseDecomposition return AmplitudePhaseDecomposition()(self, X, y) + + +class InductiveRegistrationTransformer( + RegistrationTransformer[Input, Output], + InductiveTransformerMixin[Input, Output, None], +): + + @abstractmethod + def transform( + self: SelfType, + X: Input, + ) -> Output: + """ + Register new data. + + Args: + X: Original (unregistered) data. + + Returns: + Registered data. + + """ + pass diff --git a/tests/test_registration.py b/tests/test_registration.py index 506115fa0..a361a0468 100644 --- a/tests/test_registration.py +++ b/tests/test_registration.py @@ -276,7 +276,7 @@ def test_fit_and_transform(self) -> None: error_std=0, random_state=10, ) - + reg = LeastSquaresShiftRegistration() response = reg.fit(self.fd) @@ -311,13 +311,10 @@ def test_raises(self) -> None: reg.fit(self.fd) reg.set_params(restrict_domain=True) - # Test use fit or transform with restrict_domain=True + # Test use transform with restrict_domain=True with np.testing.assert_raises(AttributeError): reg.transform(self.fd) - with np.testing.assert_raises(AttributeError): - reg.fit(self.fd) - # Test inverse_transform without previous transformation with np.testing.assert_raises(AttributeError): reg.inverse_transform(self.fd) From 0032858bd86493fe505aa864e150b8583b2649fa Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 1 Nov 2021 02:59:40 +0100 Subject: [PATCH 072/117] Fixed error in _fisher_rao_warping_mean with early convergence. 
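The early-convergence case can be reproduced with two identical linear curves, which is essentially what the new test_linear check below exercises; the import path for FisherRaoElasticRegistration is assumed from the rest of the package:

    import numpy as np
    from skfda import FDataGrid
    from skfda.preprocessing.registration import FisherRaoElasticRegistration

    # Two identical linear curves: the warping mean converges on the very
    # first iteration, which previously produced a wrong result.
    grid_points = np.arange(10)
    fd = FDataGrid(np.array([grid_points, grid_points]), grid_points)

    registered = FisherRaoElasticRegistration().fit_transform(fd)
    # After the fix, registration leaves the curves unchanged.
    np.testing.assert_array_almost_equal(
        registered.data_matrix[..., 0],
        fd.data_matrix[..., 0],
    )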
--- skfda/exploratory/stats/_fisher_rao.py | 12 ++++++------ tests/test_elastic.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/skfda/exploratory/stats/_fisher_rao.py b/skfda/exploratory/stats/_fisher_rao.py index 0906ba466..bc07de515 100644 --- a/skfda/exploratory/stats/_fisher_rao.py +++ b/skfda/exploratory/stats/_fisher_rao.py @@ -116,13 +116,13 @@ def _fisher_rao_warping_mean( # Find psi closest to the mean psi_centered = psi - srsf.fit_transform(warping.mean()) - psi_data = psi_centered.data_matrix[..., 0] - np.square(psi_data, out=psi_data) - d = psi_data.sum(axis=1).argmin() + psi_centered_data = psi_centered.data_matrix[..., 0] + np.square(psi_centered_data, out=psi_centered_data) + d = psi_centered_data.sum(axis=1).argmin() # Get raw values to calculate - mu = psi[d].data_matrix[0, ..., 0] - psi = psi.data_matrix[..., 0] + mu = np.atleast_2d(psi[d].data_matrix[0, ..., 0]) + psi_data = psi.data_matrix[..., 0] vmean = np.empty((1, len(eval_points))) # Construction of shooting vectors @@ -130,7 +130,7 @@ def _fisher_rao_warping_mean( vmean[0] = 0 # Compute shooting vectors - for psi_i in psi: + for psi_i in psi_data: inner = scipy.integrate.simps(mu * psi_i, x=eval_points) inner = max(min(inner, 1), -1) diff --git a/tests/test_elastic.py b/tests/test_elastic.py index 9527e4491..517441974 100644 --- a/tests/test_elastic.py +++ b/tests/test_elastic.py @@ -219,6 +219,20 @@ def test_warping_mean(self) -> None: expected = [[[-1], [-0.376241], [0.136193], [0.599291], [1]]] np.testing.assert_array_almost_equal(values, expected) + def test_linear(self) -> None: + grid_points = [i for i in range(10)] + data_matrix = np.array([grid_points, grid_points]) + fd = FDataGrid( + data_matrix=data_matrix, + grid_points=grid_points, + ) + elastic_registration = FisherRaoElasticRegistration() + fd_registered = elastic_registration.fit_transform(fd) + np.testing.assert_array_almost_equal( + fd_registered.data_matrix[..., 0], + data_matrix, + ) + class TestElasticDistances(unittest.TestCase): """Test elastic distances.""" From 7c6f33db95f94e67def9424ee745dbd1326e256a Mon Sep 17 00:00:00 2001 From: VNMabus Date: Mon, 1 Nov 2021 12:36:17 +0100 Subject: [PATCH 073/117] Add minimum Pandas version requirement. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cba429bfb..53f91eff9 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ 'matplotlib', 'multimethod>=1.5', 'numpy>=1.16', - 'pandas', + 'pandas>=1.0', 'rdata', 'scikit-datasets[cran]>=0.1.24', 'scikit-learn>=0.20', From 7df98123d7008a76d1bdcb48697b92204c5e7b87 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 2 Nov 2021 16:00:48 +0100 Subject: [PATCH 074/117] Fda Feature Union + tests --- .../feature_extraction/__init__.py | 1 + .../feature_extraction/_fda_feature_union.py | 123 ++++++++++++++++++ .../_per_class_feature_transformer.py | 47 ++++++- tests/test_fda_feature_union.py | 44 +++++++ tests/test_per_class_feature_construction.py | 41 ++++++ 5 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py create mode 100644 tests/test_fda_feature_union.py create mode 100644 tests/test_per_class_feature_construction.py diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 8c8f9895b..ec7613a5a 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,4 +1,5 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer from ._per_class_feature_transformer import PerClassFeatureTransformer +from ._fda_feature_union import FdaFeatureUnion from ._fpca import FPCA diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py new file mode 100644 index 000000000..604adb3b8 --- /dev/null +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -0,0 +1,123 @@ +"""Feature extraction union for dimensionality reduction.""" +from __future__ import annotations +from typing import Any +from numpy import ndarray +from pandas import DataFrame +from sklearn.pipeline import FeatureUnion +from ....representation.grid import FDataGrid +from ....representation.basis import FDataBasis + +class FdaFeatureUnion(FeatureUnion): + """Concatenates results of multiple functional transformer objects. + + This estimator applies a list of transformer objects in parallel to the + input data, then concatenates the results (They can be either FDataGrid + and FDataBasis objects or multivariate data itself).This is useful to + combine several feature extraction mechanisms into a single transformer. + Parameters of the transformers may be set using its name and the parameter + name separated by a '__'. A transformer may be replaced entirely by + setting the parameter with its name to another transformer, + or removed by setting to 'drop'. + + Parameters: + transformer_list: + List of tuple containing `(str, transformer)`. The first element + of the tuple is name affected to the transformer while the + second element is a scikit-learn transformer instance. + The transformer instance can also be `"drop"` for it to be + ignored. + n_jobs: + Number of jobs to run in parallel. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. + The default value is None + transformer_weights: + Multiplicative weights for features per transformer. + Keys are transformer names, values the weights. + Raises ValueError if key not present in ``transformer_list``. 
+ verbose: + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + np_array_output: + indicates if the transformed data is requested to be a NumPy array + output. By default the value is False. + + Examples: + Firstly we will import the Berkeley Growth Study data set + >>> from skfda.datasets import fetch_growth + >>> X, y= fetch_growth(return_X_y=True, as_frame=True) + >>> X = X.iloc[:, 0].values + + Then we need to import the transformers we want to use + >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA + >>> from skfda.representation import EvaluationTransformer + + Finally we import the union and apply fit and transform + >>> from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union + ... import FdaFeatureUnion + >>> union = FdaFeatureUnion([ + ... ("Eval", EvaluationTransformer()), + ... ("fpca", FPCA()), ], np_array_output=True) + >>> union.fit_transform(X) + """ + def __init__( + self, + transformer_list, + *, + n_jobs=None, + transformer_weights=None, + verbose=False, + np_array_output=False + ) -> None : + self.np_array_output = np_array_output + super().__init__(transformer_list, n_jobs=n_jobs, transformer_weights = transformer_weights, verbose=verbose) + + + + def _hstack(self, Xs) -> (ndarray | DataFrame | Any): + + if (self.np_array_output): + for i in Xs: + if(isinstance(i, FDataGrid) or isinstance(i, FDataBasis)): + raise TypeError( + "There are transformed instances of FDataGrid or FDataBasis" + " that can't be concatenated on a NumPy array." + ) + return super()._hstack(Xs) + + first_grid = True + first_basis = True + for j in Xs: + if isinstance(j, FDataGrid): + if first_grid: + curves = j + first_grid = False + else: + curves = curves.concatenate(j) + elif isinstance(j, FDataBasis): + if first_basis: + target = j + first_basis = False + else: + target = target.concatenate(j) + else: + raise TypeError( + "Transformed instance is not of type FDataGrid or FDataBasis." + "It is %s" %(type(j)) + ) + + feature_name = curves.dataset_name.lower() + " transformed" + target_name = "transformed target" + if first_grid: # There are only FDataBasis + return DataFrame({ + target_name:target + }) + elif first_basis: # There are only FDataGrids + return DataFrame({ + feature_name:curves + }) + else: + return DataFrame({ + feature_name : curves, + target_name: target, + }) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py index cc646a5fb..bb61aa3ad 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -6,10 +6,55 @@ from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted from ....representation.grid import FData from ...._utils import _classifier_fit_feature_transformer - T = TypeVar("T", bound=FData) class PerClassFeatureTransformer(TransformerMixin): + r"""Per class feature transformer for functional data. + + This class takes a transformer and performs the following map: + + .. math:: + \mathcal{X} &\rightarrow \mathbb{R}^G \\ + x &\rightarrow \textbf{t} = (T_1(x), T_2(x),...,T_k(x)) + + Where :math:`T_i(x)` is the transformation :math:`x` with respect to + the data in the :math:`i`-th group. 
+ + Note that :math:`\mathcal{X}` is possibly multivariate, that is, + :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. + + Parameters: + transformer: + The transformer that we want to apply to the given data. + It should use target data while fitting. + This is checked by looking at the 'stateless' and 'requires_y' tags + Examples: + Firstly, we will import and split the Berkeley Growth Study dataset + + >>> from skfda.datasets import fetch_growth + >>> from sklearn.model_selection import train_test_split + >>> X, y = fetch_growth(return_X_y=True, as_frame=True) + >>> X = X.iloc[:, 0].values + >>> y = y.values.codes + >>> X_train, X_test, y_train, y_test = train_test_split( + ... X, y, test_size=0.25, stratify=y, random_state=0) + + >>> from skfda.preprocessing.dim_reduction.feature_extraction + ... import PerClassFeatureTransformer + + Then we will need to select a fda transformer, and so we will + use RecursiveMaximaHunting + + >>> from skfda.preprocessing.dim_reduction.variable_selection + ... import RecursiveMaximaHunting + + Finally we need to fit the data and transform it + + >>> t.fit(X_train, y_train) + >>> x_transformed = t.transform(X_test) + + x_transformed will be a vector with the transformed data + """ def __init__( self, diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py new file mode 100644 index 000000000..a33571488 --- /dev/null +++ b/tests/test_fda_feature_union.py @@ -0,0 +1,44 @@ +"""Test to check the Fda Feature Union module""" +from pandas.core.frame import DataFrame +from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union import FdaFeatureUnion +from skfda.preprocessing.dim_reduction.feature_extraction import FPCA +from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother +from skfda.representation import EvaluationTransformer +from skfda.misc.operators import SRSF +from skfda.datasets import fetch_growth +import unittest + + +class TestFdaFeatureUnion(unittest.TestCase): + def setUp(self) -> None: + X, y= fetch_growth(return_X_y=True, as_frame=True) + self.X = X.iloc[:, 0].values + + def test_incompatible_array_output(self): + + u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("fpca", FPCA()), ], np_array_output=False) + self.assertRaises(TypeError, u.fit_transform, self.X) + + def test_incompatible_FDataGrid_output(self): + + u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("srsf",SRSF()), ], np_array_output=True) + self.assertRaises(TypeError, u.fit_transform, self.X) + + def test_correct_transformation_concat(self): + u = FdaFeatureUnion([("srsf1",SRSF()), ("smooth",NadarayaWatsonSmoother())]) + created_frame = u.fit_transform(self.X) + + t1 = SRSF().fit_transform(self.X) + t2 = NadarayaWatsonSmoother().fit_transform(self.X) + t = t1.concatenate(t2) + + true_frame = DataFrame({ + t.dataset_name.lower() + " transformed": t + }) + + self.assertEqual(True, true_frame.equals(created_frame)) + + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py new file mode 100644 index 000000000..5096f6acb --- /dev/null +++ b/tests/test_per_class_feature_construction.py @@ -0,0 +1,41 @@ +"""Test to check the per class feature transformer module""" +from skfda.preprocessing.dim_reduction.feature_extraction._per_class_feature_transformer import PerClassFeatureTransformer +from skfda.preprocessing.dim_reduction.variable_selection import RecursiveMaximaHunting 
+from skfda.ml.classification import KNeighborsClassifier +from skfda.preprocessing.dim_reduction.feature_extraction import FPCA +from skfda.datasets import fetch_growth +from skfda._utils import _classifier_get_classes + +import unittest + +import numpy as np + + +class TestPCFT(unittest.TestCase): + + # This test fails because the transformers do not have yet tags implemented + def test_transform(self): + + X, y = fetch_growth(return_X_y=True, as_frame=True) + X = X.iloc[:, 0].values + y = y.values.codes + t = PerClassFeatureTransformer(RecursiveMaximaHunting()) + t.fit_transform(X, y) + transformed = t.transform(X) + + classes, y_ind = _classifier_get_classes(y) + for cur_class in range(classes.size): + feature_transformer = RecursiveMaximaHunting().fit(X[y_ind == cur_class], y[y_ind == cur_class]) + a = feature_transformer.transform(X) + np.testing.assert_array_equal(transformed[cur_class], a) + + def test_not_transformer_argument(self): + self.assertRaises(TypeError, PerClassFeatureTransformer, KNeighborsClassifier()) + + def test_not_taget_required_fitting(self): + self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) + + + +if __name__ == '__main__': + unittest.main() From a0e00c0a8cab194cda92735a9fc68dc946b68c6f Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 2 Nov 2021 16:34:54 +0100 Subject: [PATCH 075/117] Skiped test correction --- tests/test_per_class_feature_construction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py index 5096f6acb..f286e9782 100644 --- a/tests/test_per_class_feature_construction.py +++ b/tests/test_per_class_feature_construction.py @@ -7,13 +7,14 @@ from skfda._utils import _classifier_get_classes import unittest - +import pytest import numpy as np class TestPCFT(unittest.TestCase): # This test fails because the transformers do not have yet tags implemented + @pytest.mark.skip(reason="Tags are not yet implemented on transformers") def test_transform(self): X, y = fetch_growth(return_X_y=True, as_frame=True) From 6cbde335bf2df457ac7b579281a4310dbd4044ac Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 2 Nov 2021 23:51:41 +0100 Subject: [PATCH 076/117] Style errors fixing --- skfda/_utils/__init__.py | 2 +- skfda/_utils/_utils.py | 5 +- .../feature_extraction/__init__.py | 2 +- .../feature_extraction/_fda_feature_union.py | 79 +++++++++-------- .../_per_class_feature_transformer.py | 84 +++++++++++-------- tests/test_fda_feature_union.py | 53 +++++++----- tests/test_per_class_feature_construction.py | 36 ++++---- 7 files changed, 151 insertions(+), 110 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 9b3612616..3246c763f 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -8,10 +8,10 @@ _classifier_fit_depth_methods, _classifier_get_classes, _classifier_get_depth_methods, - _classifier_fit_feature_transformer, _compute_dependence, _DependenceMeasure, _evaluate_grid, + _fit_feature_transformer, _int_to_real, _pairwise_symmetric, _reshape_eval_points, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9a8f42d3d..b4b3d381b 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -729,10 +729,11 @@ def _classifier_fit_depth_methods( return classes, class_depth_methods_ -def _classifier_fit_feature_transformer( + +def _fit_feature_transformer( X: T, y: ndarray, - transformer: TransformerMixin + transformer: TransformerMixin, ) -> Tuple[ndarray, 
Sequence[TransformerMixin]]: classes, y_ind = _classifier_get_classes(y) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index ec7613a5a..1167a18a8 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,5 +1,5 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer -from ._per_class_feature_transformer import PerClassFeatureTransformer from ._fda_feature_union import FdaFeatureUnion from ._fpca import FPCA +from ._per_class_feature_transformer import PerClassFeatureTransformer diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 604adb3b8..7867cdeb1 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -1,18 +1,19 @@ """Feature extraction union for dimensionality reduction.""" from __future__ import annotations -from typing import Any -from numpy import ndarray + from pandas import DataFrame from sklearn.pipeline import FeatureUnion -from ....representation.grid import FDataGrid + from ....representation.basis import FDataBasis +from ....representation.grid import FDataGrid + class FdaFeatureUnion(FeatureUnion): """Concatenates results of multiple functional transformer objects. This estimator applies a list of transformer objects in parallel to the - input data, then concatenates the results (They can be either FDataGrid - and FDataBasis objects or multivariate data itself).This is useful to + input data, then concatenates the results (They can be either FDataGrid + and FDataBasis objects or multivariate data itself).This is useful to combine several feature extraction mechanisms into a single transformer. Parameters of the transformers may be set using its name and the parameter name separated by a '__'. A transformer may be replaced entirely by @@ -28,7 +29,8 @@ class FdaFeatureUnion(FeatureUnion): ignored. n_jobs: Number of jobs to run in parallel. - ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` + context. ``-1`` means using all processors. The default value is None transformer_weights: @@ -38,28 +40,29 @@ class FdaFeatureUnion(FeatureUnion): verbose: If True, the time elapsed while fitting each transformer will be printed as it is completed. - np_array_output: + np_array_output: indicates if the transformed data is requested to be a NumPy array output. By default the value is False. - + Examples: Firstly we will import the Berkeley Growth Study data set >>> from skfda.datasets import fetch_growth >>> X, y= fetch_growth(return_X_y=True, as_frame=True) >>> X = X.iloc[:, 0].values - + Then we need to import the transformers we want to use >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA >>> from skfda.representation import EvaluationTransformer - + Finally we import the union and apply fit and transform - >>> from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union - ... import FdaFeatureUnion + >>> from skfda.preprocessing.dim_reduction.feature_extraction. + ... _fda_feature_union import FdaFeatureUnion >>> union = FdaFeatureUnion([ ... ("Eval", EvaluationTransformer()), - ... 
("fpca", FPCA()), ], np_array_output=True) + ... ("fpca", FPCA()), ], np_array_output=True) >>> union.fit_transform(X) """ + def __init__( self, transformer_list, @@ -67,22 +70,26 @@ def __init__( n_jobs=None, transformer_weights=None, verbose=False, - np_array_output=False - ) -> None : + np_array_output=False, + ) -> None: self.np_array_output = np_array_output - super().__init__(transformer_list, n_jobs=n_jobs, transformer_weights = transformer_weights, verbose=verbose) - - + super().__init__( + transformer_list, + n_jobs=n_jobs, + transformer_weights=transformer_weights, + verbose=verbose, + ) - def _hstack(self, Xs) -> (ndarray | DataFrame | Any): + def _hstack(self, Xs): if (self.np_array_output): for i in Xs: - if(isinstance(i, FDataGrid) or isinstance(i, FDataBasis)): + if isinstance(i, FDataGrid or FDataBasis): raise TypeError( - "There are transformed instances of FDataGrid or FDataBasis" - " that can't be concatenated on a NumPy array." - ) + "There are transformed instances of FDataGrid or " + "FDataBasis that can't be concatenated on a NumPy " + "array.", + ) return super()._hstack(Xs) first_grid = True @@ -100,24 +107,24 @@ def _hstack(self, Xs) -> (ndarray | DataFrame | Any): first_basis = False else: target = target.concatenate(j) - else: + else: raise TypeError( - "Transformed instance is not of type FDataGrid or FDataBasis." - "It is %s" %(type(j)) + "Transformed instance is not of type FDataGrid or" + " FDataBasis. It is " + type(j), ) feature_name = curves.dataset_name.lower() + " transformed" - target_name = "transformed target" - if first_grid: # There are only FDataBasis - return DataFrame({ - target_name:target - }) - elif first_basis: # There are only FDataGrids + target_name = "transformed target" + if first_grid: # There are only FDataBasis return DataFrame({ - feature_name:curves + target_name: target, }) - else: + elif first_basis: # There are only FDataGrids return DataFrame({ - feature_name : curves, - target_name: target, + feature_name: curves, }) + + return DataFrame({ + feature_name: curves, + target_name: target, + }) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py index bb61aa3ad..55a9270be 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py @@ -1,13 +1,18 @@ """Feature extraction transformers for dimensionality reduction.""" from __future__ import annotations -import numpy as np + from typing import TypeVar + +import numpy as np from sklearn.base import TransformerMixin from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted + +from ...._utils import _fit_feature_transformer from ....representation.grid import FData -from ...._utils import _classifier_fit_feature_transformer + T = TypeVar("T", bound=FData) + class PerClassFeatureTransformer(TransformerMixin): r"""Per class feature transformer for functional data. @@ -19,7 +24,7 @@ class PerClassFeatureTransformer(TransformerMixin): Where :math:`T_i(x)` is the transformation :math:`x` with respect to the data in the :math:`i`-th group. - + Note that :math:`\mathcal{X}` is possibly multivariate, that is, :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. 
@@ -41,7 +46,7 @@ class PerClassFeatureTransformer(TransformerMixin): >>> from skfda.preprocessing.dim_reduction.feature_extraction ... import PerClassFeatureTransformer - + Then we will need to select a fda transformer, and so we will use RecursiveMaximaHunting @@ -53,21 +58,22 @@ class PerClassFeatureTransformer(TransformerMixin): >>> t.fit(X_train, y_train) >>> x_transformed = t.transform(X_test) - x_transformed will be a vector with the transformed data + x_transformed will be a vector with the transformed data """ def __init__( self, - transformer: TransformerMixin + transformer: TransformerMixin, ) -> None: - self.transformer= transformer + self.transformer = transformer self._validate_transformer() - + def _validate_transformer( - self + self, ) -> None: """ - Checks that the transformer passed is scikit-learn-like and that uses target data in fit + Check that the transformer passed is\ + scikit-learn-like and that uses target data in fit. Args: None @@ -75,30 +81,36 @@ def _validate_transformer( Returns: None """ - if not (hasattr(self.transformer, "fit") or hasattr(self.transformer, "fit_transform")) or not hasattr( - self.transformer, "transform" - ): - raise TypeError( - "Transformer should implement fit and " - "transform. '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) - ) - + if not (hasattr(self.transformer, "fit") + and hasattr(self.transformer, "fit_transform") + and hasattr(self.transformer, "transform") + ): + + raise TypeError( + "Transformer should implement fit and " + "transform. " + str(self.transformer) + + " (type " + str(type(self.transformer)) + ")" + " doesn't", + ) + tags = self.transformer._get_tags() - - if not(tags['stateless'] and tags['requires_y']): - raise TypeError( - "Transformer should use target data in fit." - " '%s' (type %s) doesn't" % (self.transformer, type(self.transformer)) - ) - - + + if not (tags['stateless'] and tags['requires_y']): + raise TypeError( + "Transformer should use target data in fit." + + str(self.transformer) + + " (type " + str(type(self.transformer)) + ")" + " doesn't", + ) + def fit( self, X: T, - y: np.ndarray + y: np.ndarray, ) -> PerClassFeatureTransformer: """ - Fit the model on each class using X as training data and y as target values. + Fit the model on each class using X as\ + training data and y as target values. Args: X: FDataGrid with the training data. @@ -107,16 +119,17 @@ def fit( Returns: self """ - classes, class_feature_transformers = _classifier_fit_feature_transformer( - X, y, self.transformer + classes, class_feature_transformers = _fit_feature_transformer( + X, + y, + self.transformer, ) - + self._classes = classes self._class_feature_transformers_ = class_feature_transformers return self - def transform(self, X: T) -> np.ndarray: """ Transform the provided data using the already fitted transformer. @@ -128,16 +141,15 @@ def transform(self, X: T) -> np.ndarray: Array of shape (n_samples, G). """ sklearn_check_is_fitted(self) - + return [ - feature_transformer.transform(X) + feature_transformer.transform(X) for feature_transformer in self._class_feature_transformers_ ] - def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray: """ - Fits and transforms the provided data + Fits and transforms the provided data\ using the transformer specified when initializing the class. 
Args: diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index a33571488..03ea49b30 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -1,31 +1,45 @@ -"""Test to check the Fda Feature Union module""" +"""Test to check the Fda Feature Union module.""" + +import unittest + from pandas.core.frame import DataFrame -from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union import FdaFeatureUnion + +from skfda.datasets import fetch_growth +from skfda.misc.operators import SRSF from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.preprocessing.smoothing.kernel_smoothers import NadarayaWatsonSmoother +from skfda.preprocessing.smoothing.kernel_smoothers\ + import NadarayaWatsonSmoother +from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union\ + import FdaFeatureUnion from skfda.representation import EvaluationTransformer -from skfda.misc.operators import SRSF -from skfda.datasets import fetch_growth -import unittest class TestFdaFeatureUnion(unittest.TestCase): + def setUp(self) -> None: - X, y= fetch_growth(return_X_y=True, as_frame=True) + X = fetch_growth(return_X_y=True, as_frame=True)[0] self.X = X.iloc[:, 0].values - + def test_incompatible_array_output(self): - - u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("fpca", FPCA()), ], np_array_output=False) + + u = FdaFeatureUnion( + [("EvaluationT", EvaluationTransformer()), ("fpca", FPCA())], + np_array_output=False, + ) self.assertRaises(TypeError, u.fit_transform, self.X) - - def test_incompatible_FDataGrid_output(self): - - u = FdaFeatureUnion([("EvaluationT", EvaluationTransformer()), ("srsf",SRSF()), ], np_array_output=True) + + def test_incompatible_fdatagrid_output(self): + + u = FdaFeatureUnion( + [("EvaluationT", EvaluationTransformer()), ("srsf", SRSF())], + np_array_output=True, + ) self.assertRaises(TypeError, u.fit_transform, self.X) - + def test_correct_transformation_concat(self): - u = FdaFeatureUnion([("srsf1",SRSF()), ("smooth",NadarayaWatsonSmoother())]) + u = FdaFeatureUnion( + [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], + ) created_frame = u.fit_transform(self.X) t1 = SRSF().fit_transform(self.X) @@ -33,12 +47,11 @@ def test_correct_transformation_concat(self): t = t1.concatenate(t2) true_frame = DataFrame({ - t.dataset_name.lower() + " transformed": t + t.dataset_name.lower() + " transformed": t, }) + result = True + self.assertEqual(result, true_frame.equals(created_frame)) - self.assertEqual(True, true_frame.equals(created_frame)) - - if __name__ == '__main__': unittest.main() diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py index f286e9782..c51271602 100644 --- a/tests/test_per_class_feature_construction.py +++ b/tests/test_per_class_feature_construction.py @@ -1,22 +1,24 @@ """Test to check the per class feature transformer module""" -from skfda.preprocessing.dim_reduction.feature_extraction._per_class_feature_transformer import PerClassFeatureTransformer -from skfda.preprocessing.dim_reduction.variable_selection import RecursiveMaximaHunting +from skfda.datasets import fetch_growth from skfda.ml.classification import KNeighborsClassifier +from skfda.preprocessing.dim_reduction.feature_extraction.\ + _per_class_feature_transformer import PerClassFeatureTransformer +from skfda.preprocessing.dim_reduction.variable_selection \ + import RecursiveMaximaHunting from 
skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.datasets import fetch_growth from skfda._utils import _classifier_get_classes -import unittest -import pytest import numpy as np +import pytest +import unittest class TestPCFT(unittest.TestCase): - + # This test fails because the transformers do not have yet tags implemented @pytest.mark.skip(reason="Tags are not yet implemented on transformers") - def test_transform(self): - + def test_transform(self): + X, y = fetch_growth(return_X_y=True, as_frame=True) X = X.iloc[:, 0].values y = y.values.codes @@ -26,17 +28,23 @@ def test_transform(self): classes, y_ind = _classifier_get_classes(y) for cur_class in range(classes.size): - feature_transformer = RecursiveMaximaHunting().fit(X[y_ind == cur_class], y[y_ind == cur_class]) + feature_transformer = RecursiveMaximaHunting().fit( + X[y_ind == cur_class], + y[y_ind == cur_class], + ) a = feature_transformer.transform(X) np.testing.assert_array_equal(transformed[cur_class], a) - + def test_not_transformer_argument(self): - self.assertRaises(TypeError, PerClassFeatureTransformer, KNeighborsClassifier()) - + self.assertRaises( + TypeError, + PerClassFeatureTransformer, + KNeighborsClassifier(), + ) + def test_not_taget_required_fitting(self): self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) - - + if __name__ == '__main__': unittest.main() From 26b344e915838eff91461ec22fbae63f0fd88bc8 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 3 Nov 2021 19:34:04 +0100 Subject: [PATCH 077/117] Changes on fda feature Union --- .../feature_extraction/_fda_feature_union.py | 72 ++++++++----------- tests/test_fda_feature_union.py | 36 +++++----- 2 files changed, 48 insertions(+), 60 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 7867cdeb1..a802e3763 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -1,7 +1,10 @@ """Feature extraction union for dimensionality reduction.""" from __future__ import annotations -from pandas import DataFrame +from typing import Any, Union + +from numpy import ndarray +from pandas import DataFrame, concat from sklearn.pipeline import FeatureUnion from ....representation.basis import FDataBasis @@ -21,42 +24,41 @@ class FdaFeatureUnion(FeatureUnion): or removed by setting to 'drop'. Parameters: - transformer_list: + transformer_list: list of tuple List of tuple containing `(str, transformer)`. The first element of the tuple is name affected to the transformer while the second element is a scikit-learn transformer instance. The transformer instance can also be `"drop"` for it to be ignored. - n_jobs: + n_jobs: int Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. The default value is None - transformer_weights: + transformer_weights: dict Multiplicative weights for features per transformer. Keys are transformer names, values the weights. Raises ValueError if key not present in ``transformer_list``. - verbose: + verbose: bool If True, the time elapsed while fitting each transformer will be - printed as it is completed. - np_array_output: + printed as it is completed. By default the value is False + np_array_output: bool indicates if the transformed data is requested to be a NumPy array output. 
By default the value is False. Examples: Firstly we will import the Berkeley Growth Study data set >>> from skfda.datasets import fetch_growth - >>> X, y= fetch_growth(return_X_y=True, as_frame=True) - >>> X = X.iloc[:, 0].values + >>> X = fetch_growth(return_X_y=True)[0] Then we need to import the transformers we want to use >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA >>> from skfda.representation import EvaluationTransformer Finally we import the union and apply fit and transform - >>> from skfda.preprocessing.dim_reduction.feature_extraction. - ... _fda_feature_union import FdaFeatureUnion + >>> from skfda.preprocessing.dim_reduction.feature_extraction + ... import FdaFeatureUnion >>> union = FdaFeatureUnion([ ... ("Eval", EvaluationTransformer()), ... ("fpca", FPCA()), ], np_array_output=True) @@ -80,9 +82,9 @@ def __init__( verbose=verbose, ) - def _hstack(self, Xs): + def _hstack(self, Xs) -> Union[DataFrame, ndarray, Any]: - if (self.np_array_output): + if self.np_array_output: for i in Xs: if isinstance(i, FDataGrid or FDataBasis): raise TypeError( @@ -92,39 +94,23 @@ def _hstack(self, Xs): ) return super()._hstack(Xs) - first_grid = True - first_basis = True - for j in Xs: - if isinstance(j, FDataGrid): - if first_grid: - curves = j - first_grid = False - else: - curves = curves.concatenate(j) - elif isinstance(j, FDataBasis): - if first_basis: - target = j - first_basis = False - else: - target = target.concatenate(j) + if not isinstance(Xs[0], FDataGrid or FDataBasis): + raise TypeError( + "Transformed instance is not of type FDataGrid or" + " FDataBasis. It is " + type(Xs[0]), + ) + + frames = [DataFrame({Xs[0].dataset_name.lower(): Xs[0]})] + + for j in Xs[1:]: + if isinstance(j, FDataGrid or FDataBasis): + frames.append( + DataFrame({j.dataset_name.lower(): j}), + ) else: raise TypeError( "Transformed instance is not of type FDataGrid or" " FDataBasis. 
It is " + type(j), ) - feature_name = curves.dataset_name.lower() + " transformed" - target_name = "transformed target" - if first_grid: # There are only FDataBasis - return DataFrame({ - target_name: target, - }) - elif first_basis: # There are only FDataGrids - return DataFrame({ - feature_name: curves, - }) - - return DataFrame({ - feature_name: curves, - target_name: target, - }) + return concat(frames, axis=1) diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index 03ea49b30..1de476057 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -2,41 +2,42 @@ import unittest -from pandas.core.frame import DataFrame +from pandas import DataFrame, concat from skfda.datasets import fetch_growth from skfda.misc.operators import SRSF -from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda.preprocessing.smoothing.kernel_smoothers\ - import NadarayaWatsonSmoother -from skfda.preprocessing.dim_reduction.feature_extraction._fda_feature_union\ - import FdaFeatureUnion +from skfda.preprocessing.dim_reduction.feature_extraction import ( + FPCA, + FdaFeatureUnion, +) +from skfda.preprocessing.smoothing.kernel_smoothers import ( + NadarayaWatsonSmoother, +) from skfda.representation import EvaluationTransformer class TestFdaFeatureUnion(unittest.TestCase): def setUp(self) -> None: - X = fetch_growth(return_X_y=True, as_frame=True)[0] - self.X = X.iloc[:, 0].values + self.X = fetch_growth(return_X_y=True)[0] - def test_incompatible_array_output(self): + def test_incompatible_array_output(self) -> None: u = FdaFeatureUnion( - [("EvaluationT", EvaluationTransformer()), ("fpca", FPCA())], + [("EvaluationT", EvaluationTransformer(None)), ("fpca", FPCA())], np_array_output=False, ) self.assertRaises(TypeError, u.fit_transform, self.X) - def test_incompatible_fdatagrid_output(self): + def test_incompatible_fdatagrid_output(self) -> None: u = FdaFeatureUnion( - [("EvaluationT", EvaluationTransformer()), ("srsf", SRSF())], + [("EvaluationT", EvaluationTransformer(None)), ("srsf", SRSF())], np_array_output=True, ) self.assertRaises(TypeError, u.fit_transform, self.X) - def test_correct_transformation_concat(self): + def test_correct_transformation_concat(self) -> None: u = FdaFeatureUnion( [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], ) @@ -44,11 +45,12 @@ def test_correct_transformation_concat(self): t1 = SRSF().fit_transform(self.X) t2 = NadarayaWatsonSmoother().fit_transform(self.X) - t = t1.concatenate(t2) - true_frame = DataFrame({ - t.dataset_name.lower() + " transformed": t, - }) + frames = [ + DataFrame({t1.dataset_name.lower(): t1}), + DataFrame({t2.dataset_name.lower(): t2}), + ] + true_frame = concat(frames, axis=1) result = True self.assertEqual(result, true_frame.equals(created_frame)) From 3cc9ed54b04f22391f9b0fe36f6cea1fefe12bf7 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 3 Nov 2021 19:46:27 +0100 Subject: [PATCH 078/117] Fix branch with issue 377 --- .../feature_extraction/__init__.py | 1 - .../_per_class_feature_transformer.py | 162 ------------------ tests/test_per_class_feature_construction.py | 50 ------ 3 files changed, 213 deletions(-) delete mode 100644 skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py delete mode 100644 tests/test_per_class_feature_construction.py diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 1167a18a8..54589a3a6 100644 --- 
a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -2,4 +2,3 @@ from ._ddg_transformer import DDGTransformer from ._fda_feature_union import FdaFeatureUnion from ._fpca import FPCA -from ._per_class_feature_transformer import PerClassFeatureTransformer diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py deleted file mode 100644 index 55a9270be..000000000 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_per_class_feature_transformer.py +++ /dev/null @@ -1,162 +0,0 @@ -"""Feature extraction transformers for dimensionality reduction.""" -from __future__ import annotations - -from typing import TypeVar - -import numpy as np -from sklearn.base import TransformerMixin -from sklearn.utils.validation import check_is_fitted as sklearn_check_is_fitted - -from ...._utils import _fit_feature_transformer -from ....representation.grid import FData - -T = TypeVar("T", bound=FData) - - -class PerClassFeatureTransformer(TransformerMixin): - r"""Per class feature transformer for functional data. - - This class takes a transformer and performs the following map: - - .. math:: - \mathcal{X} &\rightarrow \mathbb{R}^G \\ - x &\rightarrow \textbf{t} = (T_1(x), T_2(x),...,T_k(x)) - - Where :math:`T_i(x)` is the transformation :math:`x` with respect to - the data in the :math:`i`-th group. - - Note that :math:`\mathcal{X}` is possibly multivariate, that is, - :math:`\mathcal{X} = \mathcal{X}_1 \times ... \times \mathcal{X}_p`. - - Parameters: - transformer: - The transformer that we want to apply to the given data. - It should use target data while fitting. - This is checked by looking at the 'stateless' and 'requires_y' tags - Examples: - Firstly, we will import and split the Berkeley Growth Study dataset - - >>> from skfda.datasets import fetch_growth - >>> from sklearn.model_selection import train_test_split - >>> X, y = fetch_growth(return_X_y=True, as_frame=True) - >>> X = X.iloc[:, 0].values - >>> y = y.values.codes - >>> X_train, X_test, y_train, y_test = train_test_split( - ... X, y, test_size=0.25, stratify=y, random_state=0) - - >>> from skfda.preprocessing.dim_reduction.feature_extraction - ... import PerClassFeatureTransformer - - Then we will need to select a fda transformer, and so we will - use RecursiveMaximaHunting - - >>> from skfda.preprocessing.dim_reduction.variable_selection - ... import RecursiveMaximaHunting - - Finally we need to fit the data and transform it - - >>> t.fit(X_train, y_train) - >>> x_transformed = t.transform(X_test) - - x_transformed will be a vector with the transformed data - """ - - def __init__( - self, - transformer: TransformerMixin, - ) -> None: - self.transformer = transformer - self._validate_transformer() - - def _validate_transformer( - self, - ) -> None: - """ - Check that the transformer passed is\ - scikit-learn-like and that uses target data in fit. - - Args: - None - - Returns: - None - """ - if not (hasattr(self.transformer, "fit") - and hasattr(self.transformer, "fit_transform") - and hasattr(self.transformer, "transform") - ): - - raise TypeError( - "Transformer should implement fit and " - "transform. 
" + str(self.transformer) - + " (type " + str(type(self.transformer)) + ")" - " doesn't", - ) - - tags = self.transformer._get_tags() - - if not (tags['stateless'] and tags['requires_y']): - raise TypeError( - "Transformer should use target data in fit." - + str(self.transformer) - + " (type " + str(type(self.transformer)) + ")" - " doesn't", - ) - - def fit( - self, - X: T, - y: np.ndarray, - ) -> PerClassFeatureTransformer: - """ - Fit the model on each class using X as\ - training data and y as target values. - - Args: - X: FDataGrid with the training data. - y: Target values of shape = (n_samples). - - Returns: - self - """ - classes, class_feature_transformers = _fit_feature_transformer( - X, - y, - self.transformer, - ) - - self._classes = classes - self._class_feature_transformers_ = class_feature_transformers - - return self - - def transform(self, X: T) -> np.ndarray: - """ - Transform the provided data using the already fitted transformer. - - Args: - X: FDataGrid with the test samples. - - Returns: - Array of shape (n_samples, G). - """ - sklearn_check_is_fitted(self) - - return [ - feature_transformer.transform(X) - for feature_transformer in self._class_feature_transformers_ - ] - - def fit_transform(self, X: T, y: np.ndarray) -> np.ndarray: - """ - Fits and transforms the provided data\ - using the transformer specified when initializing the class. - - Args: - X: FDataGrid with the samples. - y: Target values of shape = (n_samples) - - Returns: - Array of shape (n_samples, G). - """ - return self.fit(X, y).transform(X) diff --git a/tests/test_per_class_feature_construction.py b/tests/test_per_class_feature_construction.py deleted file mode 100644 index c51271602..000000000 --- a/tests/test_per_class_feature_construction.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Test to check the per class feature transformer module""" -from skfda.datasets import fetch_growth -from skfda.ml.classification import KNeighborsClassifier -from skfda.preprocessing.dim_reduction.feature_extraction.\ - _per_class_feature_transformer import PerClassFeatureTransformer -from skfda.preprocessing.dim_reduction.variable_selection \ - import RecursiveMaximaHunting -from skfda.preprocessing.dim_reduction.feature_extraction import FPCA -from skfda._utils import _classifier_get_classes - -import numpy as np -import pytest -import unittest - - -class TestPCFT(unittest.TestCase): - - # This test fails because the transformers do not have yet tags implemented - @pytest.mark.skip(reason="Tags are not yet implemented on transformers") - def test_transform(self): - - X, y = fetch_growth(return_X_y=True, as_frame=True) - X = X.iloc[:, 0].values - y = y.values.codes - t = PerClassFeatureTransformer(RecursiveMaximaHunting()) - t.fit_transform(X, y) - transformed = t.transform(X) - - classes, y_ind = _classifier_get_classes(y) - for cur_class in range(classes.size): - feature_transformer = RecursiveMaximaHunting().fit( - X[y_ind == cur_class], - y[y_ind == cur_class], - ) - a = feature_transformer.transform(X) - np.testing.assert_array_equal(transformed[cur_class], a) - - def test_not_transformer_argument(self): - self.assertRaises( - TypeError, - PerClassFeatureTransformer, - KNeighborsClassifier(), - ) - - def test_not_taget_required_fitting(self): - self.assertRaises(TypeError, PerClassFeatureTransformer, FPCA()) - - -if __name__ == '__main__': - unittest.main() From a2bdb13497e2a7349508902111e11a4b589dd721 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Wed, 3 Nov 2021 20:50:21 +0100 Subject: [PATCH 079/117] Style --- 
tests/test_classification.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/test_classification.py b/tests/test_classification.py index 6e426e4c5..f74a538b7 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -50,7 +50,7 @@ def test_dtm_independent_copy(self) -> None: clf1.fit(self._X_train, self._y_train) clf2.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf1.predict(self._X_test), clf2.predict(self._X_test), ) @@ -60,9 +60,9 @@ def test_dtm_classifier(self) -> None: clf: DTMClassifier[FData] = DTMClassifier(proportiontocut=0.25) clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ + [ # noqa: WPS317 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, ], @@ -73,9 +73,9 @@ def test_centroid_classifier(self) -> None: clf: NearestCentroid[FData] = NearestCentroid() clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ + [ # noqa: WPS317 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, ], @@ -91,7 +91,7 @@ def test_dtm_inheritance(self) -> None: clf1.fit(self._X_train, self._y_train) clf2.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf1.predict(self._X_test), clf2.predict(self._X_test), ) @@ -101,9 +101,9 @@ def test_maximumdepth_classifier(self) -> None: clf: MaximumDepthClassifier[FData] = MaximumDepthClassifier() clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ + [ # noqa: WPS317 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, ], @@ -114,9 +114,9 @@ def test_dd_classifier(self) -> None: clf: DDClassifier[FData] = DDClassifier(degree=2) clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ + [ # noqa: WPS317 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, ], @@ -127,9 +127,9 @@ def test_ddg_classifier(self) -> None: clf: DDGClassifier[FData] = DDGClassifier(_KNeighborsClassifier()) clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ + [ # noqa: WPS317 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, ], @@ -142,7 +142,7 @@ def test_maximumdepth_inheritance(self) -> None: clf1.fit(self._X_train, self._y_train) clf2.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf1.predict(self._X_test), clf2.predict(self._X_test), ) @@ -152,9 +152,9 @@ def test_kneighbors_classifier(self) -> None: clf = KNeighborsClassifier() clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ + [ # noqa: WPS317 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ], @@ -165,9 +165,9 @@ def test_radiusneighbors_classifier(self) -> None: clf = RadiusNeighborsClassifier(radius=15) clf.fit(self._X_train, self._y_train) - np.testing.assert_array_equal( # type: ignore + np.testing.assert_array_equal( clf.predict(self._X_test), - [ 
+ [ # noqa: WPS317 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, ], From b70a6c893875d86daff2cee921eef21e4764643b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Thu, 4 Nov 2021 13:59:46 +0100 Subject: [PATCH 080/117] add example of outl detection with fpca.inverse_transform --- ...t_fpca_inverse_transform_outl_detection.py | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 examples/plot_fpca_inverse_transform_outl_detection.py diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py new file mode 100644 index 000000000..186fa10ce --- /dev/null +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -0,0 +1,220 @@ +""" +Outlier detection with FPCA +=========================== + +Example of using the inverse_transform method +in the FPCA class to detect outlier(s) from +the reconstruction (truncation) error. + +In this example, we illustrate the utility of the inverse_transform method +of the FPCA class to perform functional outlier detection. +Roughly speaking, an outlier is a sample +which is not representative of the dataset +or different enough compared to a large part of the samples. +The intuition is the following: if the eigen basis, +i.e. the q>=1 first functional principal components (FPCs), is +sufficient to linearly approximate a clean set of +samples, then the error between an observed sample +and its approximation w.r.t to the first 'q' FPCs should be small. +Thus a sample with a high reconstruction error (RE) +is likely an outlier, in the sense that +it is underlied by a different covariance function +compared the training samples (nonoutliers). +""" + +# Author: Clément Lejeune +# License: MIT + +import matplotlib.pyplot as plt +import numpy as np +from scipy.stats import gaussian_kde +from skfda.preprocessing.dim_reduction.feature_extraction import FPCA +from skfda.misc.covariances import Exponential, Gaussian +from skfda.datasets import make_gaussian_process +from skfda.misc.metrics import lp_distance, lp_norm + +############################################################################## +# We proceed as follows: +# - We generate a clean training dataset (not supposed to contain outliers) +# and fit an FPCA with 'q' components on it. +# - We also generate a test set containing +# both nonoutliers samples and outliers. +# - Then, we fit an FPCA(n_components=q) +# and compute the vector of principal components scores +# of train and test samples. +# - We project back the vectors of principal components scores, +# with the inverse_transform method, to the input (training data space). +# This step can be seen as the reverse projection from the eigen space, +# spanned by the first FPCs, to the input (functional) space. +# - Finally, we compute the relative L2-norm error between +# the observed functions and their FPCs approximation. +# We flag as outlier the samples with a reconstruction error (RE) +# higher than a quantile-based threshold. +# Hence, an outlier is thus a sample that +# exhibits a different covariance function w.r.t the training samples. +# +# The train set is generated according to a Gaussian process +# with a Gaussian (i.e. squared-exponential) covariance function. +cov_clean = Gaussian(variance=2., length_scale=5.) 
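+# (The Gaussian/RBF covariance yields very smooth sample paths, whereas the
+# Exponential covariance used later for the outliers yields much rougher
+# paths, so the two groups differ in their covariance structure.)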
+grid_size = 5 * 10**3 + +n_train = 10**3 +train_set = make_gaussian_process( + n_samples=n_train, + n_features=grid_size, + start=0., stop=25., + cov=cov_clean, + random_state=20 +) +train_set.sample_names = ['train_' + str(i) for i in range(n_train)] +train_set.dataset_name = 'train set' + +############################################################## +# The test set is generated according to a Gaussian process +# with the same covariance function for nonoutliers (50%) and +# with an exponential covariance function for outliers (50%). +n_test = 50 +test_set_clean = make_gaussian_process( + n_samples=n_test // 2, + n_features=grid_size, + start=0., stop=25., + cov=cov_clean, + random_state=20 +) # clean test set +test_set_clean.sample_names = [ + 'test_clean_' + str(i) for i in range(test_set_clean.n_samples)] + +cov_outlier = Exponential() + +test_set_outlier = make_gaussian_process( + n_samples=n_test // 2, + n_features=grid_size, + start=0., stop=25., + cov=cov_outlier, + random_state=20 +) # test set with outliers +test_set_outlier.sample_names = [ + 'test_outl_' + str(i) for i in range(test_set_outlier.n_samples)] + +test_set = test_set_clean.concatenate(test_set_outlier) +test_set.dataset_name = 'test set' + +############################# +# We plot the whole dataset. +whole_data = train_set.concatenate(test_set) + +labels = [] +for i in whole_data.sample_names: + if 'train_' in i: + labels.append('train(nonoutliers)') + elif 'test_clean' in i: + labels.append('test(nonoutliers)') + elif 'test_outl' in i: + labels.append('test(outliers)') + +fig = whole_data.plot( + group=np.array(labels), + group_colors={ + 'train(nonoutliers)': 'grey', + 'test(nonoutliers)': 'C3', + 'test(outliers)': 'C1'}, + linewidth=0.95, + alpha=0.2, + legend=True +) +fig.show() + +##################################################################### +# We fit an FPCA with an arbitrary low number of components +# compared to the input dimension (grid size). +# We compute the relative RE +# of both the training and test samples, and plot the pdf estimates. +# Errors are normalized w.r.t L2-norms of each sample +# to remove (explained) variance from the scale error. + +q = 5 +fpca_clean = FPCA(n_components=q) +fpca_clean.fit(train_set) +train_set_hat = fpca_clean.inverse_transform( + fpca_clean.transform(train_set) +) + +err_train = lp_distance( + train_set, + train_set_hat, + p=2 +) / lp_norm(train_set, p=2) + +test_set_hat = fpca_clean.inverse_transform( + fpca_clean.transform(test_set) +) +err_test = lp_distance( + test_set, + test_set_hat, + p=2 +) / lp_norm(test_set, p=2) + +########################################################################### +# We plot the density of the REs, +# both unconditionnaly (grey and blue) and conditionnaly (orange and red), +# to the rule error >= threshold. 
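+# (scipy.stats.gaussian_kde is used below to obtain smooth kernel density
+# estimates of the reconstruction-error distributions.)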
+x_density = np.linspace(0., 1.6, num=10**3) +density_train_err = gaussian_kde(err_train) +density_test_err = gaussian_kde(err_test) +err_thresh = np.quantile(err_train, 0.99) + +density_test_err_outl = gaussian_kde(err_test[err_test >= err_thresh]) +density_test_err_inli = gaussian_kde(err_test[err_test < err_thresh]) + +# density estimate of train errors +plt.plot(x_density, density_train_err(x_density), + label='Error train', color='grey') + +# density estimate of test errors +plt.plot(x_density, density_test_err(x_density), + label='Error test (outliers+nonoutliers', color='C0') + +# outlyingness threshold +plt.vlines(err_thresh, + ymax=max(density_train_err(x_density)), ymin=0., + label='thresh=quantile(p=0.99)', + linestyles='dashed', color='black') + +# density estimate of the error of test samples flagged as outliers +plt.plot(x_density, density_test_err_outl(x_density), + label='Error test>= thresh (outliers)', color='C1') + +# density estimate of the error of test samples flagged as nonoutliers +plt.plot(x_density, density_test_err_inli(x_density), + abel='Error test< thresh (nonoutliers)', color='C3') + +plt.xlabel('Relative L2-norm reconstruction errors') +plt.ylabel('Density (unormalized)') +plt.title('Densities of reconstruction errors with {} components'.format(q)) +plt.legend() +plt.show() + +############################################################################## +# We observe that the distribution of the training samples (grey) REs +# is unimodal and quite concentrated toward 0. This means that +# the training samples are well recovered with 5 FPCs if we allow +# an error rate around 0.4. On the countrary, the distribution of the +# test samples (blue) is bimodal +# with equivalent an magnitude order for each mode, +# meaning that half of the test samples is consistently approximated w.r.t +# training samples and the other half is poorly approximated in the FPCs basis. +# +# The distribution underlying the left blue mode (red) is the one of +# test samples REs flagged as nonoutliers, i.e. having a RE_i Date: Thu, 4 Nov 2021 14:10:29 +0100 Subject: [PATCH 081/117] First version --- examples/plot_depth_classification.py | 169 +++++++++++++++++++ skfda/exploratory/visualization/_baseplot.py | 13 ++ skfda/exploratory/visualization/_ddplot.py | 34 ++-- 3 files changed, 202 insertions(+), 14 deletions(-) create mode 100644 examples/plot_depth_classification.py diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py new file mode 100644 index 000000000..2cfbb196d --- /dev/null +++ b/examples/plot_depth_classification.py @@ -0,0 +1,169 @@ +""" +Classification +============== + +This example shows the use of the depth based classifications methods +applied to the Berkeley Growth Study data. An attempt to show the +differences and similarities between MaximumDepthClassifier, +DDClassifier, and DDGClassifier is made. 
+""" + +# Author: Pedro Martín Rodríguez-Ponga Eyriès +# License: MIT + +# sphinx_gallery_thumbnail_number = 6 + +from scipy.interpolate import lagrange +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import ListedColormap +from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier + +from skfda import datasets +from skfda.exploratory.depth import ModifiedBandDepth +from skfda.exploratory.visualization import DDPlot +from skfda.ml.classification import ( + DDClassifier, + DDGClassifier, + MaximumDepthClassifier, +) +from skfda.preprocessing.dim_reduction.feature_extraction import DDGTransformer + +############################################################################## +# +# The Berkeley Growth Study data contains the heights of 39 boys and 54 +# girls from age 1 to 18 and the ages at which they were collected. Males +# are assigned the numeric value 0 while females are coded to a 1. In our +# comparison of the different methods, we will try to learn the sex of a +# person by using its growth curve. +X, y = datasets.fetch_growth(return_X_y=True, as_frame=True) +X = X.iloc[:, 0].values +categories = y.values.categories +y = y.values.codes + +############################################################################## +# +# As in many ML algorithms, we split the dataset into train and test. In +# this graph, we can see the training dataset. These growth curves will +# be used to train the model. Hence, the predictions will be data-driven. +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.5, stratify=y, random_state=0) + +# Plot samples grouped by sex +X_train.plot(group=y_train, group_names=categories) + +############################################################################## +# +# Below are the growth graphs of those individuals that we would like to +# classify. Some of them will be male and some female. +X_test.plot() + +############################################################################## +# +# As said above, we are trying to compare three different methods: +# MaximumDepthClassifier, DDClassifier, and DDGClassifier. +# Below are the classification predictions of these models as well as the +# score (obtained by comparing to the real known sex). For the three +# algorithms we will be using the depth +# :class:`~skfda.representation.depth.ModifiedBandDepth` for consistency. +# We will try polynomes of degrees one, two, and three for DDClassifier. +# DDClassifier will be used with +# :class:`~sklearn.neighbors.KNeighborsClassifier`. 
+clf = MaximumDepthClassifier(depth_method=ModifiedBandDepth()) +clf.fit(X_train, y_train) +print(clf.predict(X_test)) +print(clf.score(X_test, y_test)) + +clf1 = DDClassifier(degree=1, depth_method=ModifiedBandDepth()) +clf1.fit(X_train, y_train) +print(clf1.predict(X_test)) +print(clf1.score(X_test, y_test)) + +clf2 = DDClassifier(degree=2, depth_method=ModifiedBandDepth()) +clf2.fit(X_train, y_train) +print(clf2.predict(X_test)) +print(clf2.score(X_test, y_test)) + +clf3 = DDClassifier(degree=3, depth_method=ModifiedBandDepth()) +clf3.fit(X_train, y_train) +print(clf3.predict(X_test)) +print(clf3.score(X_test, y_test)) + + +clf = DDGClassifier( + KNeighborsClassifier(n_neighbors=5), + depth_method=ModifiedBandDepth(), +) +clf.fit(X_train, y_train) +print(clf.predict(X_test)) +clf.score(X_test, y_test) + +############################################################################## +# +# Finally, we plot all these classifiers in a DDPlot. There is a +# one-to-one correspondence between growth curves and data points. The +# coordinates of the points in the graph correspond to the respective +# depth to the class of all boys and the class of all girls. Note that +# the dots are blue if the true sex is female and red otherwise. The +# other elements of the graph are the decision boundaries: +# +# | Boundary | Classifier | +# | --------- | ------------------------------------ | +# | Gray line | MaximumDepthClassifier | +# | P1 | DDClassifier with degree 1 | +# | P2 | DDClassifier with degree 2 | +# | P3 | DDClassifier with degree 3 | +# | Colors | DDGClassifier with nearest neighbors | +ddg = DDGTransformer(depth_method=ModifiedBandDepth()) +X_train_trans = ddg.fit_transform(X_train, y_train) + +# from https://stackoverflow.com/questions/45075638/graph-k-nn-decision-boundaries-in-matplotlib +clf = KNeighborsClassifier(n_neighbors=5) +clf.fit(X_train_trans, y_train) + +h = .02 # step size in the mesh + +# Create color maps +cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF']) +cmap_bold = ListedColormap(['#FF0000', '#0000FF']) + + +# Plot the decision boundary. For that, we will assign a color to each +# point in the mesh [x_min, x_max]x[y_min, y_max]. 
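+# (The fitted k-NN model is evaluated on every mesh point and the predicted
+# classes are drawn with pcolormesh, so the coloured regions mirror the
+# decision rule of DDGClassifier in the DD plane.)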
+x_min, x_max = X_train_trans[:, 0].min() - 1, X_train_trans[:, 0].max() + 1 +y_min, y_max = X_train_trans[:, 1].min() - 1, X_train_trans[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h)) +Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) + +# Put the result into a color plot +Z = Z.reshape(xx.shape) + +fig, ax = plt.subplots() + +ts = np.linspace(0 - 0.025, 1 + 0.025, 100) +pol1, = ax.plot(ts, np.polyval(clf1.poly_, ts), 'c', + linewidth=1, label="Polynomial") +pol2, = ax.plot(ts, np.polyval(clf2.poly_, ts), 'm', + linewidth=1, label="Polynomial") +pol3, = ax.plot(ts, np.polyval(clf3.poly_, ts), 'g', + linewidth=1, label="Polynomial") +ax.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto') + +ax.legend([pol1, pol2, pol3], ['P1', 'P2', 'P3']) + + +index = y_train.astype(bool) +ddp = DDPlot( + fdata=X_test, + dist1=X_train[np.invert(index)], + dist2=X_train[index], + depth_method=ModifiedBandDepth(), + axes=ax, + c=y_test, + cmap_bold=cmap_bold, + x_label="Boy class depth", + y_label="Girl class depth", +) +ddp.plot() diff --git a/skfda/exploratory/visualization/_baseplot.py b/skfda/exploratory/visualization/_baseplot.py index 56294e206..c77d44728 100644 --- a/skfda/exploratory/visualization/_baseplot.py +++ b/skfda/exploratory/visualization/_baseplot.py @@ -18,6 +18,7 @@ from matplotlib.text import Annotation from ...representation import FData +from ...representation._typing import NDArrayInt from ._utils import _figure_to_svg, _get_figure_and_axes, _set_figure_layout @@ -42,6 +43,10 @@ def __init__( axes: Union[Axes, Sequence[Axes], None] = None, n_rows: Optional[int] = None, n_cols: Optional[int] = None, + c: NDArrayInt = None, + cmap_bold=None, + x_label: str = None, + y_label: str = None, ) -> None: self.artists: Optional[np.ndarray] = None self.chart = chart @@ -50,6 +55,10 @@ def __init__( self.n_rows = n_rows self.n_cols = n_cols self._tag = self._create_annotation() + self.c = c + self.cmap_bold = cmap_bold + self.x_label = x_label + self.y_label = y_label def _plot( self, @@ -77,6 +86,10 @@ def plot( fig=self.fig, axes=self.axes, ) + if self.x_label is not None: + axes[0].set_xlabel(self.x_label) + if self.y_label is not None: + axes[0].set_ylabel(self.y_label) self._plot(fig, axes) diff --git a/skfda/exploratory/visualization/_ddplot.py b/skfda/exploratory/visualization/_ddplot.py index bfa6707cd..b2086c603 100644 --- a/skfda/exploratory/visualization/_ddplot.py +++ b/skfda/exploratory/visualization/_ddplot.py @@ -14,6 +14,7 @@ from ...exploratory.depth.multivariate import Depth from ...representation._functional_data import FData +from ...representation._typing import NDArrayInt from ._baseplot import BasePlot T = TypeVar('T', bound=FData) @@ -59,16 +60,22 @@ def __init__( depth_method: Depth[T], fig: Optional[Figure] = None, axes: Optional[Axes] = None, + c: NDArrayInt = None, + cmap_bold=None, + x_label: str = "X depth", + y_label: str = "Y depth", ) -> None: - BasePlot.__init__( - self, + super().__init__( chart, fig=fig, axes=axes, + c=c, + cmap_bold=cmap_bold, + x_label=x_label, + y_label=y_label, ) self.fdata = fdata self.depth_method = depth_method - self.depth_method.fit(fdata) self.depth_dist1 = self.depth_method( self.fdata, distribution=dist1, @@ -91,7 +98,7 @@ def _plot( Plot DDPlot graph. Plot the depth of our fdata elements in the two different - distributions,one in each axis. It is useful to understand how + distributions, one in each axis. 
It is useful to understand how our data is more related with one subset of data / distribution than another one. Returns: @@ -103,24 +110,23 @@ def _plot( dtype=Artist, ) margin = 0.025 - width_aux_line = 0.35 + width_aux_line = 1 color_aux_line = "gray" ax = axes[0] - for i, (d1, d2) in enumerate(zip(self.depth_dist1, self.depth_dist2)): - self.artists[i, 0] = ax.scatter( - d1, - d2, - picker=True, - pickradius=2, - ) + self.artists[:, 0] = ax.scatter( + self.depth_dist1, + self.depth_dist2, + c=self.c, + cmap=self.cmap_bold, + picker=True, + pickradius=2, + ) # Set labels of graph if self.fdata.dataset_name is not None: ax.set_title(self.fdata.dataset_name) - ax.set_xlabel("X depth") - ax.set_ylabel("Y depth") ax.set_xlim( [ self.depth_method.min - margin, From e9f576f0df730bceb46676cdbb373d9bb1699dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Thu, 4 Nov 2021 14:39:30 +0100 Subject: [PATCH 082/117] typos in plot_fpca_inverse_transform_outl_detection --- examples/plot_fpca_inverse_transform_outl_detection.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index 186fa10ce..3b5dd1d65 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -38,9 +38,9 @@ # - We generate a clean training dataset (not supposed to contain outliers) # and fit an FPCA with 'q' components on it. # - We also generate a test set containing -# both nonoutliers samples and outliers. +# both nonoutliers and outliers samples. # - Then, we fit an FPCA(n_components=q) -# and compute the vector of principal components scores +# and compute the vectors of principal components scores # of train and test samples. # - We project back the vectors of principal components scores, # with the inverse_transform method, to the input (training data space). @@ -157,7 +157,11 @@ ########################################################################### # We plot the density of the REs, # both unconditionnaly (grey and blue) and conditionnaly (orange and red), -# to the rule error >= threshold. +# to the rule if error >= threshold then it is an outlier. +# The threshold is computed from RE of the training samples as +# the quantile of probability 0.99. +# In ohter words, a sample whose RE is higher than the threshold is unlikely +# approximated as a training sample with (low) probability 0.01. 
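+# (In code terms: err_thresh = np.quantile(err_train, 0.99), and a test sample
+# is flagged as an outlier when err_test >= err_thresh.)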
x_density = np.linspace(0., 1.6, num=10**3) density_train_err = gaussian_kde(err_train) density_test_err = gaussian_kde(err_test) From 6b4b1a33fb7a8d838540f423e090cff55a0d3c6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Thu, 4 Nov 2021 14:56:33 +0100 Subject: [PATCH 083/117] typos in plot_fpca_inverse_transform_outl_detection --- examples/plot_fpca_inverse_transform_outl_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index 3b5dd1d65..accde34e2 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -190,7 +190,7 @@ # density estimate of the error of test samples flagged as nonoutliers plt.plot(x_density, density_test_err_inli(x_density), - abel='Error test< thresh (nonoutliers)', color='C3') + label='Error test< thresh (nonoutliers)', color='C3') plt.xlabel('Relative L2-norm reconstruction errors') plt.ylabel('Density (unormalized)') From 407d0d8bc6a8bf0d9f5592a9df6cd861de4efb06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Thu, 4 Nov 2021 15:23:30 +0100 Subject: [PATCH 084/117] added detection results in plot_fpca_inverste_transform_outl_detection --- examples/plot_fpca_inverse_transform_outl_detection.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index accde34e2..6b4d3e0de 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -198,6 +198,14 @@ plt.legend() plt.show() +#################################################################### +# We can check that the outliers are all detected with this method, +# with no false positive (wrongly) in the test set. +print('Flagged outliers: \n', + test_set[err_test>=err_thresh].sample_names) +print('Flagged nonoutliers: \n', + test_set[err_test Date: Thu, 4 Nov 2021 15:28:48 +0100 Subject: [PATCH 085/117] added detection results in plot_fpca_inverste_transform_outl_detection --- examples/plot_fpca_inverse_transform_outl_detection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index 6b4d3e0de..9a5897153 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -102,6 +102,7 @@ ############################# # We plot the whole dataset. 
whole_data = train_set.concatenate(test_set) +whole_data.dataset_name = 'train and test samples' labels = [] for i in whole_data.sample_names: From 0a01bb629636acf453bf19db54d4a3570943e631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Thu, 4 Nov 2021 15:36:53 +0100 Subject: [PATCH 086/117] plot titles in plot_fpca_inverste_transform_outl_detection --- examples/plot_fpca_inverse_transform_outl_detection.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index 9a5897153..5b9f214c6 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -67,7 +67,6 @@ random_state=20 ) train_set.sample_names = ['train_' + str(i) for i in range(n_train)] -train_set.dataset_name = 'train set' ############################################################## # The test set is generated according to a Gaussian process @@ -97,12 +96,10 @@ 'test_outl_' + str(i) for i in range(test_set_outlier.n_samples)] test_set = test_set_clean.concatenate(test_set_outlier) -test_set.dataset_name = 'test set' ############################# # We plot the whole dataset. whole_data = train_set.concatenate(test_set) -whole_data.dataset_name = 'train and test samples' labels = [] for i in whole_data.sample_names: @@ -123,6 +120,7 @@ alpha=0.2, legend=True ) +plt.title('train and test samples') fig.show() ##################################################################### @@ -203,9 +201,9 @@ # We can check that the outliers are all detected with this method, # with no false positive (wrongly) in the test set. print('Flagged outliers: \n', - test_set[err_test>=err_thresh].sample_names) + test_set[err_test >= err_thresh].sample_names) print('Flagged nonoutliers: \n', - test_set[err_test Date: Thu, 4 Nov 2021 15:50:50 +0100 Subject: [PATCH 087/117] Style --- examples/plot_depth_classification.py | 56 ++++++++++++++------ skfda/exploratory/visualization/_baseplot.py | 9 ++-- skfda/exploratory/visualization/_ddplot.py | 5 +- 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 2cfbb196d..2b61e0928 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -11,9 +11,8 @@ # Author: Pedro Martín Rodríguez-Ponga Eyriès # License: MIT -# sphinx_gallery_thumbnail_number = 6 +# sphinx_gallery_thumbnail_number = 3 -from scipy.interpolate import lagrange import matplotlib.pyplot as plt import numpy as np from matplotlib.colors import ListedColormap @@ -29,6 +28,7 @@ MaximumDepthClassifier, ) from skfda.preprocessing.dim_reduction.feature_extraction import DDGTransformer +from skfda.representation.grid import FDataGrid ############################################################################## # @@ -48,7 +48,12 @@ # this graph, we can see the training dataset. These growth curves will # be used to train the model. Hence, the predictions will be data-driven. 
X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5, stratify=y, random_state=0) + X, + y, + test_size=0.5, + stratify=y, + random_state=0, +) # Plot samples grouped by sex X_train.plot(group=y_train, group_names=categories) @@ -115,14 +120,17 @@ # | P2 | DDClassifier with degree 2 | # | P3 | DDClassifier with degree 3 | # | Colors | DDGClassifier with nearest neighbors | -ddg = DDGTransformer(depth_method=ModifiedBandDepth()) +ddg: DDGTransformer[FDataGrid] = DDGTransformer( + depth_method=ModifiedBandDepth(), +) X_train_trans = ddg.fit_transform(X_train, y_train) -# from https://stackoverflow.com/questions/45075638/graph-k-nn-decision-boundaries-in-matplotlib +# Code adapted from: +# https://stackoverflow.com/questions/45075638/graph-k-nn-decision-boundaries-in-matplotlib clf = KNeighborsClassifier(n_neighbors=5) clf.fit(X_train_trans, y_train) -h = .02 # step size in the mesh +h = 0.02 # step size in the mesh # Create color maps cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF']) @@ -133,8 +141,10 @@ # point in the mesh [x_min, x_max]x[y_min, y_max]. x_min, x_max = X_train_trans[:, 0].min() - 1, X_train_trans[:, 0].max() + 1 y_min, y_max = X_train_trans[:, 1].min() - 1, X_train_trans[:, 1].max() + 1 -xx, yy = np.meshgrid(np.arange(x_min, x_max, h), - np.arange(y_min, y_max, h)) +xx, yy = np.meshgrid( + np.arange(x_min, x_max, h), + np.arange(y_min, y_max, h), +) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot @@ -142,13 +152,29 @@ fig, ax = plt.subplots() -ts = np.linspace(0 - 0.025, 1 + 0.025, 100) -pol1, = ax.plot(ts, np.polyval(clf1.poly_, ts), 'c', - linewidth=1, label="Polynomial") -pol2, = ax.plot(ts, np.polyval(clf2.poly_, ts), 'm', - linewidth=1, label="Polynomial") -pol3, = ax.plot(ts, np.polyval(clf3.poly_, ts), 'g', - linewidth=1, label="Polynomial") +margin = 0.025 +ts = np.linspace(- margin, 1 + margin, 100) +pol1, = ax.plot( + ts, + np.polyval(clf1.poly_, ts), + 'c', + linewidth=1, + label="Polynomial", +) +pol2, = ax.plot( + ts, + np.polyval(clf2.poly_, ts), + 'm', + linewidth=1, + label="Polynomial", +) +pol3, = ax.plot( + ts, + np.polyval(clf3.poly_, ts), + 'g', + linewidth=1, + label="Polynomial", +) ax.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto') ax.legend([pol1, pol2, pol3], ['P1', 'P2', 'P3']) diff --git a/skfda/exploratory/visualization/_baseplot.py b/skfda/exploratory/visualization/_baseplot.py index c77d44728..633802e18 100644 --- a/skfda/exploratory/visualization/_baseplot.py +++ b/skfda/exploratory/visualization/_baseplot.py @@ -14,6 +14,7 @@ from matplotlib.axes import Axes from matplotlib.backend_bases import LocationEvent, MouseEvent from matplotlib.collections import PathCollection +from matplotlib.colors import ListedColormap from matplotlib.figure import Figure from matplotlib.text import Annotation @@ -43,10 +44,10 @@ def __init__( axes: Union[Axes, Sequence[Axes], None] = None, n_rows: Optional[int] = None, n_cols: Optional[int] = None, - c: NDArrayInt = None, - cmap_bold=None, - x_label: str = None, - y_label: str = None, + c: Optional[NDArrayInt] = None, + cmap_bold: ListedColormap = None, + x_label: Optional[str] = None, + y_label: Optional[str] = None, ) -> None: self.artists: Optional[np.ndarray] = None self.chart = chart diff --git a/skfda/exploratory/visualization/_ddplot.py b/skfda/exploratory/visualization/_ddplot.py index b2086c603..2c71184a2 100644 --- a/skfda/exploratory/visualization/_ddplot.py +++ b/skfda/exploratory/visualization/_ddplot.py @@ -10,6 +10,7 @@ import numpy 
as np from matplotlib.artist import Artist from matplotlib.axes import Axes +from matplotlib.colors import ListedColormap from matplotlib.figure import Figure from ...exploratory.depth.multivariate import Depth @@ -60,8 +61,8 @@ def __init__( depth_method: Depth[T], fig: Optional[Figure] = None, axes: Optional[Axes] = None, - c: NDArrayInt = None, - cmap_bold=None, + c: Optional[NDArrayInt] = None, + cmap_bold: ListedColormap = None, x_label: str = "X depth", y_label: str = "Y depth", ) -> None: From 2ebed08b17810c5597bac15af36739706a4fcff0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 4 Nov 2021 16:01:05 +0100 Subject: [PATCH 088/117] Style --- examples/plot_depth_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 2b61e0928..802a0ef93 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -154,21 +154,21 @@ margin = 0.025 ts = np.linspace(- margin, 1 + margin, 100) -pol1, = ax.plot( +pol1 = ax.plot( ts, np.polyval(clf1.poly_, ts), 'c', linewidth=1, label="Polynomial", ) -pol2, = ax.plot( +pol2 = ax.plot( ts, np.polyval(clf2.poly_, ts), 'm', linewidth=1, label="Polynomial", ) -pol3, = ax.plot( +pol3 = ax.plot( ts, np.polyval(clf3.poly_, ts), 'g', From 7856dddee8e82dd7f0660fdbfe616d3773af26eb Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 4 Nov 2021 16:04:27 +0100 Subject: [PATCH 089/117] typo --- examples/plot_depth_classification.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 802a0ef93..3b95784d1 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -160,21 +160,21 @@ 'c', linewidth=1, label="Polynomial", -) +)[0] pol2 = ax.plot( ts, np.polyval(clf2.poly_, ts), 'm', linewidth=1, label="Polynomial", -) +)[0] pol3 = ax.plot( ts, np.polyval(clf3.poly_, ts), 'g', linewidth=1, label="Polynomial", -) +)[0] ax.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto') ax.legend([pol1, pol2, pol3], ['P1', 'P2', 'P3']) From ef595ae5ad6fe5536662b18c3b23cb6a0afbf4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Lejeune?= <54889281+Clej@users.noreply.github.com> Date: Sat, 6 Nov 2021 18:36:26 +0100 Subject: [PATCH 090/117] format issues, typos, remove sample names --- ...t_fpca_inverse_transform_outl_detection.py | 134 ++++++++++-------- 1 file changed, 76 insertions(+), 58 deletions(-) diff --git a/examples/plot_fpca_inverse_transform_outl_detection.py b/examples/plot_fpca_inverse_transform_outl_detection.py index 5b9f214c6..6a592734c 100644 --- a/examples/plot_fpca_inverse_transform_outl_detection.py +++ b/examples/plot_fpca_inverse_transform_outl_detection.py @@ -10,8 +10,8 @@ of the FPCA class to perform functional outlier detection. Roughly speaking, an outlier is a sample which is not representative of the dataset -or different enough compared to a large part of the samples. -The intuition is the following: if the eigen basis, +or different enough compared to a large part of the dataset. +The intuition is the following: if the eigenbasis, i.e. 
the q>=1 first functional principal components (FPCs), is sufficient to linearly approximate a clean set of samples, then the error between an observed sample @@ -31,7 +31,7 @@ from skfda.preprocessing.dim_reduction.feature_extraction import FPCA from skfda.misc.covariances import Exponential, Gaussian from skfda.datasets import make_gaussian_process -from skfda.misc.metrics import lp_distance, lp_norm +from skfda.misc.metrics import l2_distance, l2_norm ############################################################################## # We proceed as follows: @@ -40,11 +40,11 @@ # - We also generate a test set containing # both nonoutliers and outliers samples. # - Then, we fit an FPCA(n_components=q) -# and compute the vectors of principal components scores +# and compute the principal components scores # of train and test samples. -# - We project back the vectors of principal components scores, +# - We project back the principal components scores, # with the inverse_transform method, to the input (training data space). -# This step can be seen as the reverse projection from the eigen space, +# This step can be seen as the reverse projection from the eigenspace, # spanned by the first FPCs, to the input (functional) space. # - Finally, we compute the relative L2-norm error between # the observed functions and their FPCs approximation. @@ -55,18 +55,20 @@ # # The train set is generated according to a Gaussian process # with a Gaussian (i.e. squared-exponential) covariance function. -cov_clean = Gaussian(variance=2., length_scale=5.) grid_size = 5 * 10**3 +cov_clean = Gaussian(variance=2.0, length_scale=5.0) + n_train = 10**3 train_set = make_gaussian_process( n_samples=n_train, n_features=grid_size, - start=0., stop=25., + start=0.0, + stop=25.0, cov=cov_clean, random_state=20 ) -train_set.sample_names = ['train_' + str(i) for i in range(n_train)] +train_set_labels = np.array(['train(nonoutliers)'] * n_train) ############################################################## # The test set is generated according to a Gaussian process @@ -76,48 +78,45 @@ test_set_clean = make_gaussian_process( n_samples=n_test // 2, n_features=grid_size, - start=0., stop=25., + start=0.0, + stop=25.0, cov=cov_clean, random_state=20 ) # clean test set -test_set_clean.sample_names = [ - 'test_clean_' + str(i) for i in range(test_set_clean.n_samples)] +test_set_clean_labels = np.array(['test(nonoutliers)'] * (n_test // 2)) cov_outlier = Exponential() test_set_outlier = make_gaussian_process( n_samples=n_test // 2, n_features=grid_size, - start=0., stop=25., + start=0.0, + stop=25.0, cov=cov_outlier, random_state=20 ) # test set with outliers test_set_outlier.sample_names = [ 'test_outl_' + str(i) for i in range(test_set_outlier.n_samples)] +test_set_outlier_labels = np.array(['test(outliers)'] * (n_test // 2)) test_set = test_set_clean.concatenate(test_set_outlier) +test_set_labels = np.concatenate( + (test_set_clean_labels, test_set_outlier_labels) +) ############################# # We plot the whole dataset. 
 whole_data = train_set.concatenate(test_set)
-
-labels = []
-for i in whole_data.sample_names:
-    if 'train_' in i:
-        labels.append('train(nonoutliers)')
-    elif 'test_clean' in i:
-        labels.append('test(nonoutliers)')
-    elif 'test_outl' in i:
-        labels.append('test(outliers)')
+whole_data_labels = np.concatenate((train_set_labels, test_set_labels))

 fig = whole_data.plot(
-    group=np.array(labels),
+    group=whole_data_labels,
     group_colors={
         'train(nonoutliers)': 'grey',
-        'test(nonoutliers)': 'C3',
+        'test(nonoutliers)': 'red',
         'test(outliers)': 'C1'},
     linewidth=0.95,
-    alpha=0.2,
+    alpha=0.3,
     legend=True
 )
 plt.title('train and test samples')
@@ -138,29 +137,27 @@
     fpca_clean.transform(train_set)
 )

-err_train = lp_distance(
+err_train = l2_distance(
     train_set,
-    train_set_hat,
-    p=2
-) / lp_norm(train_set, p=2)
+    train_set_hat
+) / l2_norm(train_set)

 test_set_hat = fpca_clean.inverse_transform(
     fpca_clean.transform(test_set)
 )
-err_test = lp_distance(
+err_test = l2_distance(
     test_set,
-    test_set_hat,
-    p=2
-) / lp_norm(test_set, p=2)
+    test_set_hat
+) / l2_norm(test_set)

 ###########################################################################
 # We plot the density of the REs,
-# both unconditionnaly (grey and blue) and conditionnaly (orange and red),
+# both unconditionally (grey and blue) and conditionally (orange and red),
 # to the rule if error >= threshold then it is an outlier.
 # The threshold is computed from RE of the training samples as
 # the quantile of probability 0.99.
-# In ohter words, a sample whose RE is higher than the threshold is unlikely
-# approximated as a training sample with (low) probability 0.01.
+# In other words, a sample whose RE is higher than the threshold is unlikely
+# to be approximated as well as a training sample (probability 0.01).
 x_density = np.linspace(0., 1.6, num=10**3)
 density_train_err = gaussian_kde(err_train)
 density_test_err = gaussian_kde(err_test)
@@ -170,48 +167,69 @@
 density_test_err_inli = gaussian_kde(err_test[err_test < err_thresh])

 # density estimate of train errors
-plt.plot(x_density, density_train_err(x_density),
-         label='Error train', color='grey')
+plt.plot(
+    x_density,
+    density_train_err(x_density),
+    label='Error train',
+    color='grey'
+)

 # density estimate of test errors
-plt.plot(x_density, density_test_err(x_density),
-         label='Error test (outliers+nonoutliers', color='C0')
+plt.plot(
+    x_density,
+    density_test_err(x_density),
+    label='Error test (outliers+nonoutliers)',
+    color='C0'
+)

 # outlyingness threshold
-plt.vlines(err_thresh,
-           ymax=max(density_train_err(x_density)), ymin=0.,
-           label='thresh=quantile(p=0.99)',
-           linestyles='dashed', color='black')
+plt.vlines(
+    err_thresh,
+    ymax=max(density_train_err(x_density)),
+    ymin=0.0,
+    label='thresh=quantile(p=0.99)',
+    linestyles='dashed',
+    color='black'
+)

 # density estimate of the error of test samples flagged as outliers
-plt.plot(x_density, density_test_err_outl(x_density),
-         label='Error test>= thresh (outliers)', color='C1')
+plt.plot(
+    x_density,
+    density_test_err_outl(x_density),
+    label='Error test>= thresh (outliers)',
+    color='C1'
+)

 # density estimate of the error of test samples flagged as nonoutliers
-plt.plot(x_density, density_test_err_inli(x_density),
-         label='Error test< thresh (nonoutliers)', color='C3')
+plt.plot(
+    x_density,
+    density_test_err_inli(x_density),
+    label='Error test< thresh (nonoutliers)',
+    color='red'
+)

 plt.xlabel('Relative L2-norm reconstruction errors')
-plt.ylabel('Density (unormalized)')
-plt.title('Densities of reconstruction errors with {} components'.format(q))
+plt.ylabel('Density (unnormalized)')
+plt.title(f'Densities of reconstruction errors with {q} components')
 plt.legend()
 plt.show()

 ####################################################################
 # We can check that the outliers are all detected with this method,
 # with no false positive (wrongly) in the test set.
-print('Flagged outliers: \n',
-      test_set[err_test >= err_thresh].sample_names)
-print('Flagged nonoutliers: \n',
-      test_set[err_test < err_thresh].sample_names)
+print('Flagged outliers: ')
+print(test_set_labels[err_test >= err_thresh])
+print('Flagged nonoutliers: ')
+print(test_set_labels[err_test < err_thresh])

 ##############################################################################
 # We observe that the distribution of the training samples (grey) REs
-# is unimodal and quite concentrated toward 0. This means that
+# is unimodal and quite skewed toward 0. This means that
 # the training samples are well recovered with 5 FPCs if we allow
-# an error rate around 0.4. On the countrary, the distribution of the
-# test samples (blue) is bimodal
-# with equivalent an magnitude order for each mode,
+# a reconstruction error rate around 0.4.
+# On the contrary, the distribution of the
+# test samples (blue) REs is bimodal,
+# where the two modes seem to be similar,
 # meaning that half of the test samples is consistently approximated w.r.t
 # training samples and the other half is poorly approximated in the FPCs basis.
# From 083b06394e02957c4b9c46847126d2249a7072f0 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Tue, 9 Nov 2021 20:41:57 +0100 Subject: [PATCH 091/117] Issue 377 corrections --- skfda/_utils/__init__.py | 1 - skfda/_utils/_utils.py | 17 +---------- .../feature_extraction/_fda_feature_union.py | 30 ++++++------------- tests/test_fda_feature_union.py | 23 +++++++------- 4 files changed, 21 insertions(+), 50 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 3246c763f..4c52efada 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -11,7 +11,6 @@ _compute_dependence, _DependenceMeasure, _evaluate_grid, - _fit_feature_transformer, _int_to_real, _pairwise_symmetric, _reshape_eval_points, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index b4b3d381b..f718a55c5 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -23,7 +23,7 @@ import scipy.integrate from numpy import ndarray from pandas.api.indexers import check_array_indexer -from sklearn.base import TransformerMixin, clone +from sklearn.base import clone from sklearn.preprocessing import LabelEncoder from sklearn.utils.multiclass import check_classification_targets from typing_extensions import Literal, Protocol @@ -730,21 +730,6 @@ def _classifier_fit_depth_methods( return classes, class_depth_methods_ -def _fit_feature_transformer( - X: T, - y: ndarray, - transformer: TransformerMixin, -) -> Tuple[ndarray, Sequence[TransformerMixin]]: - classes, y_ind = _classifier_get_classes(y) - - class_feature_transformers = [ - clone(transformer).fit(X[y_ind == cur_class], y[y_ind == cur_class]) - for cur_class in range(classes.size) - ] - - return classes, class_feature_transformers - - _DependenceMeasure = Callable[[np.ndarray, np.ndarray], np.ndarray] diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index a802e3763..a8820d109 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -4,7 +4,7 @@ from typing import Any, Union from numpy import ndarray -from pandas import DataFrame, concat +from pandas import DataFrame from sklearn.pipeline import FeatureUnion from ....representation.basis import FDataBasis @@ -43,7 +43,7 @@ class FdaFeatureUnion(FeatureUnion): verbose: bool If True, the time elapsed while fitting each transformer will be printed as it is completed. By default the value is False - np_array_output: bool + array_output: bool indicates if the transformed data is requested to be a NumPy array output. By default the value is False. @@ -61,7 +61,7 @@ class FdaFeatureUnion(FeatureUnion): ... import FdaFeatureUnion >>> union = FdaFeatureUnion([ ... ("Eval", EvaluationTransformer()), - ... ("fpca", FPCA()), ], np_array_output=True) + ... 
("fpca", FPCA()), ], array_output=True) >>> union.fit_transform(X) """ @@ -72,9 +72,9 @@ def __init__( n_jobs=None, transformer_weights=None, verbose=False, - np_array_output=False, + array_output=False, ) -> None: - self.np_array_output = np_array_output + self.array_output = array_output super().__init__( transformer_list, n_jobs=n_jobs, @@ -84,7 +84,7 @@ def __init__( def _hstack(self, Xs) -> Union[DataFrame, ndarray, Any]: - if self.np_array_output: + if self.array_output: for i in Xs: if isinstance(i, FDataGrid or FDataBasis): raise TypeError( @@ -94,23 +94,11 @@ def _hstack(self, Xs) -> Union[DataFrame, ndarray, Any]: ) return super()._hstack(Xs) - if not isinstance(Xs[0], FDataGrid or FDataBasis): - raise TypeError( - "Transformed instance is not of type FDataGrid or" - " FDataBasis. It is " + type(Xs[0]), - ) - - frames = [DataFrame({Xs[0].dataset_name.lower(): Xs[0]})] - - for j in Xs[1:]: - if isinstance(j, FDataGrid or FDataBasis): - frames.append( - DataFrame({j.dataset_name.lower(): j}), - ) - else: + for j in Xs: + if not isinstance(j, FDataGrid or FDataBasis): raise TypeError( "Transformed instance is not of type FDataGrid or" " FDataBasis. It is " + type(j), ) - return concat(frames, axis=1) + return DataFrame({'Transformed data': Xs}) diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index 1de476057..4d7aa50db 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -2,7 +2,8 @@ import unittest -from pandas import DataFrame, concat +from pandas import DataFrame +from pandas.testing import assert_frame_equal from skfda.datasets import fetch_growth from skfda.misc.operators import SRSF @@ -17,27 +18,30 @@ class TestFdaFeatureUnion(unittest.TestCase): + """Check the Fda Feature Union module.""" def setUp(self) -> None: + """Fetch the Berkeley Growth Study dataset.""" self.X = fetch_growth(return_X_y=True)[0] def test_incompatible_array_output(self) -> None: - + """Check that the transformer returns a ndarray.""" u = FdaFeatureUnion( [("EvaluationT", EvaluationTransformer(None)), ("fpca", FPCA())], - np_array_output=False, + array_output=False, ) self.assertRaises(TypeError, u.fit_transform, self.X) def test_incompatible_fdatagrid_output(self) -> None: - + """Check that the transformer returns a fdatagrid.""" u = FdaFeatureUnion( [("EvaluationT", EvaluationTransformer(None)), ("srsf", SRSF())], - np_array_output=True, + array_output=True, ) self.assertRaises(TypeError, u.fit_transform, self.X) def test_correct_transformation_concat(self) -> None: + """Check that the transformation is done correctly.""" u = FdaFeatureUnion( [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], ) @@ -46,13 +50,8 @@ def test_correct_transformation_concat(self) -> None: t1 = SRSF().fit_transform(self.X) t2 = NadarayaWatsonSmoother().fit_transform(self.X) - frames = [ - DataFrame({t1.dataset_name.lower(): t1}), - DataFrame({t2.dataset_name.lower(): t2}), - ] - true_frame = concat(frames, axis=1) - result = True - self.assertEqual(result, true_frame.equals(created_frame)) + true_frame = DataFrame({"Transformed data": [t1, t2]}) + assert_frame_equal(true_frame, created_frame) if __name__ == '__main__': From cfe57775fee16c332ef9daa331a6da1bff18b683 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 11 Nov 2021 01:02:05 +0100 Subject: [PATCH 092/117] Makes operations between FData and ndarray commutative. Also improve take implementation. 
--- skfda/representation/_functional_data.py | 78 +++++++++++++++---- skfda/representation/basis/_fdatabasis.py | 29 ++++++- skfda/representation/grid.py | 28 ++++++- tests/test_pandas.py | 15 ++-- ...fdatagrid_numpy.py => test_ufunc_numpy.py} | 22 ++++++ 5 files changed, 144 insertions(+), 28 deletions(-) rename tests/{test_fdatagrid_numpy.py => test_ufunc_numpy.py} (70%) diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 22ac35a8a..9f25ac2cd 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -33,6 +33,7 @@ GridPointsLike, LabelTuple, LabelTupleLike, + NDArrayInt, ) from .evaluator import Evaluator from .extrapolation import ExtrapolationLike, _parse_extrapolation @@ -1049,6 +1050,24 @@ def __array__(self, *args: Any, **kwargs: Any) -> np.ndarray: return array + def __array_ufunc__( + self, + ufunc: Any, + method: str, + *inputs: Any, + **kwargs: Any, + ) -> Any: + """Prevent NumPy from converting to array just to do operations.""" + + # Make normal multiplication by scalar use the __mul__ method + if ufunc == np.multiply and method == "__call__" and len(inputs) == 2: + if isinstance(inputs[0], np.ndarray): + inputs = inputs[::-1] + + return inputs[0] * inputs[1] + + return NotImplemented + ##################################################################### # Pandas ExtensionArray methods ##################################################################### @@ -1100,9 +1119,17 @@ def _from_factorized(cls, values: Any, original: Any) -> NoReturn: "Factorization does not make sense for functional data", ) + @abstractmethod + def _take_allow_fill( + self: T, + indices: NDArrayInt, + fill_value: T, + ) -> T: + pass + def take( self: T, - indices: Sequence[int], + indices: Union[int, Sequence[int], NDArrayInt], allow_fill: bool = False, fill_value: Optional[T] = None, axis: int = 0, @@ -1148,28 +1175,44 @@ def take( numpy.take pandas.api.extensions.take """ - from pandas.core.algorithms import take - # The axis parameter must exist, because sklearn tries to use take # instead of __getitem__ if axis != 0: raise ValueError(f"Axis must be 0, not {axis}") - # If the ExtensionArray is backed by an ndarray, then - # just pass that here instead of coercing to object. - data = np.asarray(self) - if allow_fill and fill_value is None: + arr_indices = np.atleast_1d(indices) + + if fill_value is None: fill_value = self.dtype.na_value - # fill value should always be translated from the scalar - # type for the array, to the physical storage type for - # the data, before passing to take. 
- result = take( - data, - indices, - fill_value=fill_value, - allow_fill=allow_fill, - ) - return self._from_sequence(result, dtype=self.dtype) + + non_empty_take_msg = "cannot do a non-empty take from an empty axes" + + if allow_fill: + if (arr_indices < -1).any(): + raise ValueError("Invalid indexes") + + positive_mask = arr_indices >= 0 + if len(self) == 0 and positive_mask.any(): + raise IndexError(non_empty_take_msg) + + sample_names = np.zeros(len(arr_indices), dtype=object) + result = self._take_allow_fill(arr_indices, fill_value) + + sample_names[positive_mask] = np.array(self.sample_names)[ + arr_indices[positive_mask] + ] + + if fill_value is not self.dtype.na_value: + sample_names[~positive_mask] = fill_value.sample_names[0] + + result.sample_names = tuple(sample_names) + else: + if len(self) == 0 and len(arr_indices) != 0: + raise IndexError(non_empty_take_msg) + + result = self[arr_indices] + + return result @classmethod def _concat_same_type( @@ -1198,6 +1241,7 @@ def astype(self, dtype: Any, copy: bool = True) -> Any: if copy: new_obj = self.copy() return new_obj + return super().astype(dtype) def _reduce(self, name: str, skipna: bool = True, **kwargs: Any) -> Any: diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index a9071e7f4..fa28d94ea 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -23,7 +23,13 @@ from ..._utils import _check_array_key, _int_to_real, constants from .. import grid from .._functional_data import FData -from .._typing import ArrayLike, DomainRange, GridPointsLike, LabelTupleLike +from .._typing import ( + ArrayLike, + DomainRange, + GridPointsLike, + LabelTupleLike, + NDArrayInt, +) from ..extrapolation import ExtrapolationLike from . 
import Basis @@ -866,6 +872,27 @@ def __rtruediv__(self: T, other: Union[np.ndarray, float]) -> T: ##################################################################### # Pandas ExtensionArray methods ##################################################################### + def _take_allow_fill( + self: T, + indices: NDArrayInt, + fill_value: T, + ) -> T: + result = self.copy() + result.coefficients = np.full( + (len(indices),) + self.coefficients.shape[1:], + np.nan, + ) + + positive_mask = indices >= 0 + result.coefficients[positive_mask] = self.coefficients[ + indices[positive_mask] + ] + + if fill_value is not self.dtype.na_value: + result.coefficients[~positive_mask] = fill_value.coefficients[0] + + return result + @property def dtype(self) -> FDataBasisDType: """The dtype for this extension array, FDataGridDType""" diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index 9f3fb1bef..2a1ef1092 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -25,9 +25,8 @@ import findiff import numpy as np import pandas.api.extensions -from matplotlib.figure import Figure - import scipy.stats.mstats +from matplotlib.figure import Figure from .._utils import ( _check_array_key, @@ -44,6 +43,7 @@ GridPoints, GridPointsLike, LabelTupleLike, + NDArrayInt, ) from .basis import Basis from .evaluator import Evaluator @@ -1260,7 +1260,7 @@ def __array_ufunc__( new_inputs = [ i.data_matrix if isinstance(i, FDataGrid) - else i for i in inputs + else self._get_op_matrix(i) for i in inputs ] outputs = kwargs.pop('out', None) @@ -1292,6 +1292,28 @@ def __array_ufunc__( ##################################################################### # Pandas ExtensionArray methods ##################################################################### + + def _take_allow_fill( + self: T, + indices: NDArrayInt, + fill_value: T, + ) -> T: + result = self.copy() + result.data_matrix = np.full( + (len(indices),) + self.data_matrix.shape[1:], + np.nan, + ) + + positive_mask = indices >= 0 + result.data_matrix[positive_mask] = self.data_matrix[ + indices[positive_mask] + ] + + if fill_value is not self.dtype.na_value: + result.data_matrix[~positive_mask] = fill_value.data_matrix[0] + + return result + @property def dtype(self) -> FDataGridDType: """The dtype for this extension array, FDataGridDType""" diff --git a/tests/test_pandas.py b/tests/test_pandas.py index a05075eaa..320533189 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -1,45 +1,46 @@ -import skfda import unittest import pandas as pd +import skfda + class TestPandas(unittest.TestCase): - def setUp(self): + def setUp(self) -> None: self.fd = skfda.FDataGrid( [[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 7, 9]]) self.fd_basis = self.fd.to_basis(skfda.representation.basis.BSpline( n_basis=5)) - def test_fdatagrid_series(self): + def test_fdatagrid_series(self) -> None: series = pd.Series(self.fd) self.assertIsInstance( series.dtype, skfda.representation.grid.FDataGridDType) self.assertEqual(len(series), self.fd.n_samples) self.assertTrue(series[0].equals(self.fd[0])) - def test_fdatabasis_series(self): + def test_fdatabasis_series(self) -> None: series = pd.Series(self.fd_basis) self.assertIsInstance( series.dtype, skfda.representation.basis.FDataBasisDType) self.assertEqual(len(series), self.fd_basis.n_samples) self.assertTrue(series[0].equals(self.fd_basis[0])) - def test_fdatagrid_dataframe(self): + def test_fdatagrid_dataframe(self) -> None: df = pd.DataFrame({"function": self.fd}) self.assertIsInstance( 
df["function"].dtype, skfda.representation.grid.FDataGridDType) self.assertEqual(len(df["function"]), self.fd.n_samples) self.assertTrue(df["function"][0].equals(self.fd[0])) - def test_fdatabasis_dataframe(self): + def test_fdatabasis_dataframe(self) -> None: df = pd.DataFrame({"function": self.fd_basis}) self.assertIsInstance( df["function"].dtype, skfda.representation.basis.FDataBasisDType) self.assertEqual(len(df["function"]), self.fd_basis.n_samples) self.assertTrue(df["function"][0].equals(self.fd_basis[0])) - def test_take(self): + def test_take(self) -> None: self.assertTrue(self.fd.take(0).equals(self.fd[0])) self.assertTrue(self.fd.take(0, axis=0).equals(self.fd[0])) diff --git a/tests/test_fdatagrid_numpy.py b/tests/test_ufunc_numpy.py similarity index 70% rename from tests/test_fdatagrid_numpy.py rename to tests/test_ufunc_numpy.py index ef3455844..4c80f1d37 100644 --- a/tests/test_fdatagrid_numpy.py +++ b/tests/test_ufunc_numpy.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import skfda from skfda import FDataGrid @@ -71,3 +72,24 @@ def test_out_ufunc(monary: Callable[..., Any]) -> None: fd_monary_build = FDataGrid(monary(data_matrix_copy)) assert fd.equals(fd_monary_build) + + +class TestOperators(unittest.TestCase): + """Tests for operators.""" + + def test_commutativity(self) -> None: + """Test that operations with numpy arrays commute.""" + X = FDataGrid([[1, 2, 3], [4, 5, 6]]) + arr = np.array([1, 2]) + + self.assertTrue((arr + X).equals((X + arr))) + + def test_commutativity_basis(self) -> None: + """Test that operations with numpy arrays for basis commute.""" + X = FDataGrid([[1, 2, 3], [4, 5, 6]]) + arr = np.array([1, 2]) + basis = skfda.representation.basis.Fourier(n_basis=5) + + X_basis = X.to_basis(basis) + + self.assertTrue((arr * X_basis).equals((X_basis * arr))) From 02db845b9d146f54f8fa3b60b2a9b323a4735582 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Thu, 11 Nov 2021 22:31:49 +0100 Subject: [PATCH 093/117] Changes on issue 377 --- .../feature_extraction/_fda_feature_union.py | 55 +++++++++++++------ tests/test_fda_feature_union.py | 11 +--- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index a8820d109..811dac21c 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -53,16 +53,46 @@ class FdaFeatureUnion(FeatureUnion): >>> X = fetch_growth(return_X_y=True)[0] Then we need to import the transformers we want to use - >>> from skfda.preprocessing.dim_reduction.feature_extraction import FPCA + >>> from skfda.preprocessing.dim_reduction.feature_extraction import ( + ... FPCA, + ... FdaFeatureUnion, + ... ) >>> from skfda.representation import EvaluationTransformer - Finally we import the union and apply fit and transform - >>> from skfda.preprocessing.dim_reduction.feature_extraction - ... import FdaFeatureUnion - >>> union = FdaFeatureUnion([ - ... ("Eval", EvaluationTransformer()), - ... ("fpca", FPCA()), ], array_output=True) - >>> union.fit_transform(X) + Finally we apply fit and transform + >>> union = FdaFeatureUnion( + ... [ + ... ("eval", EvaluationTransformer()), + ... ("fpca", FPCA()), + ... ], + ... array_output=True, + ... 
) + >>> transformed_data = union.fit_transform(X) + >>> transformed_data + [[ 81.3 , 84.2 , 86.4 , ..., 105.84283261, + -34.60733887, -14.97276458], + [ 76.2 , 80.4 , 83.2 , ..., -11.42260839, + -17.01293819, 24.77047871], + [ 76.8 , 79.8 , 82.6 , ..., -33.81180503, + -23.312921 , 7.67421522], + ..., + [ 68.6 , 73.6 , 78.6 , ..., -19.49404628, + 12.76825883, 0.70188222], + [ 79.9 , 82.6 , 84.8 , ..., 19.28399897, + 31.49601648, 6.54012077], + [ 76.1 , 78.4 , 82.3 , ..., 17.71973789, + 27.7332045 , -1.70532625]] + + We can also concatenate the result with the + original data on a Pandas DataFrame. + >>> from pandas.core.frame import DataFrame + >>> DataFrame({ + ... "Data": [transformed_data, X.data_matrix] + ... }) + Data + 0 [[81.3, 84.2, 86.4, 88.9, 91.4, 101.1, 109.5, ... + 1 [[[81.3], [84.2], [86.4], [88.9], [91.4], [101... + """ def __init__( @@ -86,7 +116,7 @@ def _hstack(self, Xs) -> Union[DataFrame, ndarray, Any]: if self.array_output: for i in Xs: - if isinstance(i, FDataGrid or FDataBasis): + if isinstance(i, (FDataGrid, FDataBasis)): raise TypeError( "There are transformed instances of FDataGrid or " "FDataBasis that can't be concatenated on a NumPy " @@ -94,11 +124,4 @@ def _hstack(self, Xs) -> Union[DataFrame, ndarray, Any]: ) return super()._hstack(Xs) - for j in Xs: - if not isinstance(j, FDataGrid or FDataBasis): - raise TypeError( - "Transformed instance is not of type FDataGrid or" - " FDataBasis. It is " + type(j), - ) - return DataFrame({'Transformed data': Xs}) diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index 4d7aa50db..339790885 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -8,7 +8,6 @@ from skfda.datasets import fetch_growth from skfda.misc.operators import SRSF from skfda.preprocessing.dim_reduction.feature_extraction import ( - FPCA, FdaFeatureUnion, ) from skfda.preprocessing.smoothing.kernel_smoothers import ( @@ -24,18 +23,10 @@ def setUp(self) -> None: """Fetch the Berkeley Growth Study dataset.""" self.X = fetch_growth(return_X_y=True)[0] - def test_incompatible_array_output(self) -> None: - """Check that the transformer returns a ndarray.""" - u = FdaFeatureUnion( - [("EvaluationT", EvaluationTransformer(None)), ("fpca", FPCA())], - array_output=False, - ) - self.assertRaises(TypeError, u.fit_transform, self.X) - def test_incompatible_fdatagrid_output(self) -> None: """Check that the transformer returns a fdatagrid.""" u = FdaFeatureUnion( - [("EvaluationT", EvaluationTransformer(None)), ("srsf", SRSF())], + [("eval", EvaluationTransformer(None)), ("srsf", SRSF())], array_output=True, ) self.assertRaises(TypeError, u.fit_transform, self.X) From f8c11e25938792c1a5f27b711bd07ecca8693f64 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Mon, 15 Nov 2021 17:02:01 +0100 Subject: [PATCH 094/117] Fixes --- .../feature_extraction/_fda_feature_union.py | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 811dac21c..bc471629a 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -1,7 +1,7 @@ """Feature extraction union for dimensionality reduction.""" from __future__ import annotations -from typing import Any, Union +from typing import Union from numpy import ndarray from pandas import DataFrame 
@@ -50,49 +50,44 @@ class FdaFeatureUnion(FeatureUnion): Examples: Firstly we will import the Berkeley Growth Study data set >>> from skfda.datasets import fetch_growth - >>> X = fetch_growth(return_X_y=True)[0] + >>> X,y = fetch_growth(return_X_y=True) - Then we need to import the transformers we want to use + Then we need to import the transformers we want to use. In our case we + will use FPCA and Minimum Redundancy Maximum Relevance. + Evaluation Transformer returns the original curve, and as it is helpful, + we will concatenate it to the already metioned transformers. >>> from skfda.preprocessing.dim_reduction.feature_extraction import ( ... FPCA, ... FdaFeatureUnion, ... ) + >>> from skfda.preprocessing.dim_reduction.variable_selection import ( + ... MinimumRedundancyMaximumRelevance, + ... ) >>> from skfda.representation import EvaluationTransformer - Finally we apply fit and transform + Finally we apply fit and transform. >>> union = FdaFeatureUnion( ... [ - ... ("eval", EvaluationTransformer()), + ... ("mrmr", MinimumRedundancyMaximumRelevance()), ... ("fpca", FPCA()), + ... ("eval", EvaluationTransformer()), ... ], ... array_output=True, ... ) - >>> transformed_data = union.fit_transform(X) - >>> transformed_data - [[ 81.3 , 84.2 , 86.4 , ..., 105.84283261, - -34.60733887, -14.97276458], - [ 76.2 , 80.4 , 83.2 , ..., -11.42260839, - -17.01293819, 24.77047871], - [ 76.8 , 79.8 , 82.6 , ..., -33.81180503, - -23.312921 , 7.67421522], - ..., - [ 68.6 , 73.6 , 78.6 , ..., -19.49404628, - 12.76825883, 0.70188222], - [ 79.9 , 82.6 , 84.8 , ..., 19.28399897, - 31.49601648, 6.54012077], - [ 76.1 , 78.4 , 82.3 , ..., 17.71973789, - 27.7332045 , -1.70532625]] - - We can also concatenate the result with the - original data on a Pandas DataFrame. - >>> from pandas.core.frame import DataFrame - >>> DataFrame({ - ... "Data": [transformed_data, X.data_matrix] - ... }) - Data - 0 [[81.3, 84.2, 86.4, 88.9, 91.4, 101.1, 109.5, ... - 1 [[[81.3], [84.2], [86.4], [88.9], [91.4], [101... - + >>> union.fit_transform(X,y) + [[194.3 , 105.84, -34.61, ..., 193.8 , + 194.3 , 195.1 ], + [177.4 , -11.42, -17.01, ..., 176.1 , + 177.4 , 178.7 ], + [171.2 , -33.81, -23.31 , ..., 170.9 , + 171.2 , 171.5 ], + ..., + [166.3 , -19.49 12.77, ..., 166. , + 166.3 , 166.8 ], + [168.4 , 19.28, 31.5, ..., 168.3 , + 168.4 , 168.6 ], + [168.9 , 17.72, 27.73 , ..., 168.6 , + 168.9 , 169.2 ]] """ def __init__( @@ -112,7 +107,7 @@ def __init__( verbose=verbose, ) - def _hstack(self, Xs) -> Union[DataFrame, ndarray, Any]: + def _hstack(self, Xs) -> Union[DataFrame, ndarray]: if self.array_output: for i in Xs: From 1b3716b8e04149c0721615a712d641290ae5c982 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 18 Nov 2021 21:18:43 +0100 Subject: [PATCH 095/117] Fix pairwise alignment example. --- docs/refs.bib | 19 +++++- examples/plot_pairwise_alignment.py | 89 +++++++++++++++------------ skfda/datasets/_samples_generators.py | 8 ++- 3 files changed, 72 insertions(+), 44 deletions(-) diff --git a/docs/refs.bib b/docs/refs.bib index 50e64219d..86bcbaa32 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -113,6 +113,21 @@ @article{ghosh+chaudhuri_2005_depth doi = {10.1111/j.1467-9469.2005.00423.x} } +@article{marron++_2015_functional, + title = {Functional {{Data Analysis}} of {{Amplitude}} and {{Phase Variation}}}, + author = {Marron, J. S. and Ramsay, James O. and Sangalli, Laura M. 
and Srivastava, Anuj}, + year = {2015}, + journal = {Statistical Science}, + volume = {30}, + number = {4}, + pages = {468--484}, + publisher = {{Institute of Mathematical Statistics}}, + issn = {0883-4237}, + url = {https://www.jstor.org/stable/24780816}, + urldate = {2021-11-18}, + abstract = {The abundance of functional observations in scientific endeavors has led to a significant development in tools for functional data analysis (FDA). This kind of data comes with several challenges: infinite-dimensionality of function spaces, observation noise, and so on. However, there is another interesting phenomena that creates problems in FDA. The functional data often comes with lateral displacements/deformations in curves, a phenomenon which is different from the height or amplitude variability and is termed phase variation. The presence of phase variability artificially often inflates data variance, blurs underlying data structures, and distorts principal components. While the separation and/or removal of phase from amplitude data is desirable, this is a difficult problem. In particular, a commonly used alignment procedure, based on minimizing the 핃2 norm between functions, does not provide satisfactory results. In this paper we motivate the importance of dealing with the phase variability and summarize several current ideas for separating phase and amplitude components. These approaches differ in the following: (1) the definition and mathematical representation of phase variability, (2) the objective functions that are used in functional data alignment, and (3) the algorithmic tools for solving estimation/optimization problems. We use simple examples to illustrate various approaches and to provide useful contrast between them.} +} + @article{pini+stamm+vantini_2018_hotellings, title = {Hotelling's T2 in separable Hilbert spaces}, author = {Alessia Pini and Aymeric Stamm and Simone Vantini}, @@ -215,8 +230,8 @@ @inbook{ramsay+silverman_2005_functional_basis @inbook{srivastava+klassen_2016_analysis_elastic, author = {Srivastava, Anuj and Klassen, Eric}, title = {Functional and Shape Data Analysis}, - chapter = {Functional Data and Elastic Registration}, - pages = {73 -- 122}, + chapter = {Functional Data and Elastic Registration}, + pages = {73 -- 122}, publisher = {Springer-Verlag New York}, year = {2016}, isbn = {978-1-4939-4018-9}, diff --git a/examples/plot_pairwise_alignment.py b/examples/plot_pairwise_alignment.py index 33b60f3b8..6733f8287 100644 --- a/examples/plot_pairwise_alignment.py +++ b/examples/plot_pairwise_alignment.py @@ -26,14 +26,16 @@ # Given any two functions :math:`f` and :math:`g`, we define their # pairwise alignment or registration to be the problem of finding a warping # function :math:`\gamma^*` such that a certain energy term -# :math:`E[f, g \circ \gamma]` is minimized. +# :math:`E[f, g \circ \gamma]` is minimized +# :footcite:p:`marron++_2015_functional`. # # .. math:: -# \gamma^*= *{argmin}_{\gamma \in \Gamma} E[f \circ \gamma, g] +# \gamma^*= \arg \min_{\gamma \in \Gamma} E[f \circ \gamma, g] # # In the case of elastic registration it is taken as energy function the # Fisher-Rao distance with a penalisation term, due to the property of -# invariance to reparameterizations of warpings functions. +# invariance to reparameterizations of warpings functions +# :footcite:p:`srivastava+klassen_2016_analysis_elastic`. # # .. 
math:: # E[f \circ \gamma, g] = d_{FR} (f \circ \gamma, g) @@ -42,22 +44,25 @@ # defined in [0, 1] wich will be used to show the elastic registration. # Due to the similarity of these curves can be aligned almost perfectly # between them. -# # Samples with modes in 1/3 and 2/3 -fd = make_multimodal_samples(n_samples=2, modes_location=[1 / 3, 2 / 3], - random_state=1, start=0, mode_std=.01) +fd = make_multimodal_samples( + n_samples=2, + modes_location=[1 / 3, 2 / 3], + random_state=1, + start=0, + mode_std=0.01, +) fig = fd.plot() fig.axes[0].legend(['$f$', '$g$']) - +plt.show() ############################################################################## # In this example :math:`g` will be used as template and :math:`f` will be # aligned to it. In the following figure it is shown the result of the # registration process, wich can be computed using # :class:`~skfda.preprocessing.registration.FisherRaoElasticRegistration`. -# f, g = fd[0], fd[1] @@ -72,13 +77,12 @@ # Legend -fig.axes[0].legend(['$f$', '$g$', '$f \\circ \\gamma $']) - +fig.axes[0].legend(['$f$', '$g$', r'$f \circ \gamma $']) +plt.show() ############################################################################## # The non-linear transformation :math:`\gamma` applied to :math:`f` in # the alignment is stored in the attribute `warping_`. -# # Warping used in the last transformation warping = elastic_registration.warping_ @@ -89,16 +93,14 @@ fig.axes[0].plot(t, t, linestyle='--') # Legend -fig.axes[0].legend(['$\\gamma$', '$\\gamma_{id}$']) - -fig +fig.axes[0].legend([r'$\gamma$', r'$\gamma_{id}$']) +plt.show() ############################################################################## # The transformation necessary to align :math:`g` to :math:`f` will be the # inverse of the original warping function, :math:`\gamma^{-1}`. # This fact is a consequence of the use of the Fisher-Rao metric as energy # function. -# warping_inverse = invert_warping(warping) @@ -107,7 +109,8 @@ # Legend -fig.axes[0].legend(['$f$', '$g$', '$g \\circ \\gamma^{-1} $']) +fig.axes[0].legend(['$f$', '$g$', r'$g \circ \gamma^{-1} $']) +plt.show() ############################################################################## # The amount of deformation used in the registration can be controlled by @@ -117,14 +120,13 @@ # # The following figure shows the original curves and the result to the # alignment varying :math:`\lambda` from 0 to 0.2. -# # Values of lambda -penalties = np.linspace(0, .2, 20) +penalties = np.linspace(0, 0.2, 20) # Creation of a color gradient cmap = clr.LinearSegmentedColormap.from_list('custom cmap', ['C1', 'C0']) -color = cmap(.2 + 3 * penalties) +color = cmap(0.2 + 3 * penalties) fig = plt.figure() ax = fig.add_subplot(1, 1, 1) @@ -136,18 +138,17 @@ elastic_registration.transform(f).plot(fig, color=c) -f.plot(fig=fig, color='C0', linewidth=2., label='$f$') -g.plot(fig=fig, color='C1', linewidth=2., label='$g$') +f.plot(fig=fig, color='C0', linewidth=2, label='$f$') +g.plot(fig=fig, color='C1', linewidth=2, label='$g$') # Legend fig.axes[0].legend() - +plt.show() ############################################################################## # This phenomenon of loss of elasticity is clearly observed in # the warpings used, since as the term of penalty increases, the functions # are closer to :math:`\gamma_{id}`. 
-# fig = plt.figure() ax = fig.add_subplot(1, 1, 1) @@ -159,7 +160,7 @@ # Plots identity fig.axes[0].plot(t, t, color='C0', linestyle="--") - +plt.show() ############################################################################## # We can perform the pairwise of multiple curves at once. We can use a single @@ -171,18 +172,28 @@ # # We will build two sets with 3 curves each, :math:`\{f_i\}` and # :math:`\{g_i\}`. -# # Creation of the 2 sets of functions state = np.random.RandomState(0) -location1 = state.normal(loc=-.3, scale=.1, size=3) +location1 = state.normal(loc=-0.3, scale=0.1, size=3) fd = skfda.datasets.make_multimodal_samples( - n_samples=3, modes_location=location1, noise=.001, random_state=1) + n_samples=3, + modes_location=location1, + noise=0.001, + random_state=1, +) -location2 = state.normal(loc=.3, scale=.1, size=3) +location2 = state.normal( + loc=0.3, + scale=0.1, + size=3, +) g = skfda.datasets.make_multimodal_samples( - n_samples=3, modes_location=location2, random_state=2) + n_samples=3, + modes_location=location2, + random_state=2, +) # Plot of the sets fig = fd.plot(color="C0", label="$f_i$") @@ -190,12 +201,11 @@ labels = fig.axes[0].get_lines() fig.axes[0].legend(handles=[labels[0], labels[-1]]) - +plt.show() ############################################################################## # The following figure shows the result of the pairwise alignment of # :math:`\{f_i\}` to :math:`\{g_i\}`. -# # Registration of the sets elastic_registration = FisherRaoElasticRegistration(template=g) @@ -207,18 +217,17 @@ l1 = fig.axes[0].get_lines()[-1] g.plot(fig=fig, color="C1", label="$g_i$") l2 = fig.axes[0].get_lines()[-1] -fd_registered.plot(fig=fig, color="C0", linestyle="--", - label="$f_i \\circ \\gamma_i$") +fd_registered.plot( + fig=fig, + color="C0", + linestyle="--", + label=r"$f_i \circ \gamma_i$", +) l3 = fig.axes[0].get_lines()[-1] fig.axes[0].legend(handles=[l1, l2, l3]) - +plt.show() ############################################################################## -# * Srivastava, Anuj & Klassen, Eric P. (2016). Functional and shape data -# analysis. In *Functional Data and Elastic Registration* (pp. 73-122). -# Springer. # -# * J. S. Marron, James O. Ramsay, Laura M. Sangalli and Anuj Srivastava -# (2015). Functional Data Analysis of Amplitude and Phase Variation. -# Statistical Science 2015, Vol. 30, No. 4 +# .. 
footbibliography:: diff --git a/skfda/datasets/_samples_generators.py b/skfda/datasets/_samples_generators.py index 9d00bcdc6..e3d2f5518 100644 --- a/skfda/datasets/_samples_generators.py +++ b/skfda/datasets/_samples_generators.py @@ -14,7 +14,11 @@ normalize_warping, ) from ..misc import covariances -from ..representation._typing import DomainRangeLike, GridPointsLike +from ..representation._typing import ( + DomainRangeLike, + GridPointsLike, + NDArrayFloat, +) from ..representation.interpolation import SplineInterpolation MeanCallable = Callable[[np.ndarray], np.ndarray] @@ -285,7 +289,7 @@ def make_multimodal_samples( std: float = 0.05, mode_std: float = 0.02, noise: float = 0, - modes_location: Optional[Sequence[float]] = None, + modes_location: Optional[Union[Sequence[float], NDArrayFloat]] = None, random_state: RandomStateLike = None, ) -> FDataGrid: r""" From dab2d1519a8a8d7692b5c3162dc3de584a47e736 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Thu, 18 Nov 2021 22:37:33 +0100 Subject: [PATCH 096/117] Modified pytest --- .../feature_extraction/_fda_feature_union.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index bc471629a..9f73119e2 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -64,6 +64,7 @@ class FdaFeatureUnion(FeatureUnion): ... MinimumRedundancyMaximumRelevance, ... ) >>> from skfda.representation import EvaluationTransformer + >>> import numpy as np Finally we apply fit and transform. >>> union = FdaFeatureUnion( @@ -74,20 +75,14 @@ class FdaFeatureUnion(FeatureUnion): ... ], ... array_output=True, ... ) - >>> union.fit_transform(X,y) - [[194.3 , 105.84, -34.61, ..., 193.8 , - 194.3 , 195.1 ], - [177.4 , -11.42, -17.01, ..., 176.1 , - 177.4 , 178.7 ], - [171.2 , -33.81, -23.31 , ..., 170.9 , - 171.2 , 171.5 ], - ..., - [166.3 , -19.49 12.77, ..., 166. , - 166.3 , 166.8 ], - [168.4 , 19.28, 31.5, ..., 168.3 , - 168.4 , 168.6 ], - [168.9 , 17.72, 27.73 , ..., 168.6 , - 168.9 , 169.2 ]] + >>> np.around(union.fit_transform(X,y), decimals = 2) + array([[194.3 , 105.84, -34.61, ..., 193.8 , 194.3 , 195.1 ], + [177.4 , -11.42, -17.01, ..., 176.1 , 177.4 , 178.7 ], + [171.2 , -33.81, -23.31, ..., 170.9 , 171.2 , 171.5 ], + ..., + [166.3 , -19.49, 12.77, ..., 166. , 166.3 , 166.8 ], + [168.4 , 19.28, 31.5 , ..., 168.3 , 168.4 , 168.6 ], + [168.9 , 17.72, 27.73, ..., 168.6 , 168.9 , 169.2 ]]) """ def __init__( From a4e7b0499571dc21f06b735291381258b12bc62c Mon Sep 17 00:00:00 2001 From: vnmabus Date: Fri, 19 Nov 2021 14:34:35 +0100 Subject: [PATCH 097/117] Add check in Fisher Rao registration for non-compatible data/template. 
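
A minimal sketch of the new check, following the added test (import path
and sample sizes are illustrative assumptions):

    from skfda.datasets import make_multimodal_samples
    from skfda.preprocessing.registration import FisherRaoElasticRegistration

    template = make_multimodal_samples(n_samples=1, points_per_dim=50, random_state=0)
    X = make_multimodal_samples(n_samples=3, points_per_dim=10, random_state=1)

    # Template and data are now checked for compatible grid points, so
    # fitting with mismatched discretizations raises a ValueError.
    FisherRaoElasticRegistration(template=template).fit(X)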
--- skfda/_utils/__init__.py | 1 + skfda/_utils/_utils.py | 15 ++++++++++- .../preprocessing/registration/_fisher_rao.py | 20 +++++++++++--- tests/test_elastic.py | 27 +++++++++++++++++++ 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index fc9972af9..d69780d16 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -9,6 +9,7 @@ _cartesian_product, _check_array_key, _check_compatible_fdata, + _check_compatible_fdatagrid, _check_estimator, _classifier_fit_depth_methods, _classifier_get_classes, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index f718a55c5..a4ac3820e 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -76,7 +76,7 @@ def check_is_univariate(fd: FData) -> None: def _check_compatible_fdata(fdata1: FData, fdata2: FData) -> None: - """Check that fdata is compatible.""" + """Check that two FData are compatible.""" if (fdata1.dim_domain != fdata2.dim_domain): raise ValueError( f"Functional data has incompatible domain dimensions: " @@ -90,6 +90,19 @@ def _check_compatible_fdata(fdata1: FData, fdata2: FData) -> None: ) +def _check_compatible_fdatagrid(fdata1: FDataGrid, fdata2: FDataGrid) -> None: + """Check that two FDataGrid are compatible.""" + _check_compatible_fdata(fdata1, fdata2) + if not all( + np.array_equal(g1, g2) + for g1, g2 in zip(fdata1.grid_points, fdata2.grid_points) + ): + raise ValueError( + f"Incompatible grid points between template and " + f"data: {fdata1.grid_points} != {fdata2.grid_points}", + ) + + def _to_grid( X: FData, y: FData, diff --git a/skfda/preprocessing/registration/_fisher_rao.py b/skfda/preprocessing/registration/_fisher_rao.py index 6cdcd761f..6f836ce60 100644 --- a/skfda/preprocessing/registration/_fisher_rao.py +++ b/skfda/preprocessing/registration/_fisher_rao.py @@ -7,7 +7,12 @@ from sklearn.utils.validation import check_is_fitted from ... 
import FDataGrid -from ..._utils import check_is_univariate, invert_warping, normalize_scale +from ..._utils import ( + _check_compatible_fdatagrid, + check_is_univariate, + invert_warping, + normalize_scale, +) from ...exploratory.stats import fisher_rao_karcher_mean from ...exploratory.stats._fisher_rao import _elastic_alignment_array from ...misc.operators import SRSF @@ -120,21 +125,30 @@ def __init__( def fit(self: SelfType, X: FDataGrid, y: None = None) -> SelfType: + # Points of discretization + if self.output_points is None: + self._output_points = X.grid_points[0] + else: + self._output_points = self.output_points + if isinstance(self.template, FDataGrid): self.template_ = self.template # Template already constructed else: self.template_ = self.template(X) + _check_compatible_fdatagrid(X, self.template_) + # Constructs the SRSF of the template - srsf = SRSF(output_points=self.output_points, initial_value=0) + srsf = SRSF(output_points=self._output_points, initial_value=0) self._template_srsf = srsf.fit_transform(self.template_) return self def transform(self, X: FDataGrid, y: None = None) -> FDataGrid: - check_is_fitted(self, '_template_srsf') + check_is_fitted(self) check_is_univariate(X) + _check_compatible_fdatagrid(X, self.template_) if ( len(self._template_srsf) != 1 diff --git a/tests/test_elastic.py b/tests/test_elastic.py index 517441974..048d9cde1 100644 --- a/tests/test_elastic.py +++ b/tests/test_elastic.py @@ -44,6 +44,33 @@ def setUp(self) -> None: t = np.linspace(-3, 3, 9) self.dummy_sample = FDataGrid([np.sin(t)], t) + def test_fit_wrong_dimension(self) -> None: + """Checks that template and fit data is compatible.""" + reg = FisherRaoElasticRegistration(template=self.template) + + unimodal_samples = make_multimodal_samples( + n_samples=3, + points_per_dim=10, + random_state=1, + ) + + with self.assertRaises(ValueError): + reg.fit(unimodal_samples) + + def test_transform_wrong_dimension(self) -> None: + """Checks that template and transform data is compatible.""" + reg = FisherRaoElasticRegistration(template=self.template) + + unimodal_samples = make_multimodal_samples( + n_samples=3, + points_per_dim=10, + random_state=1, + ) + + reg.fit(self.unimodal_samples) + with self.assertRaises(ValueError): + reg.transform(unimodal_samples) + def test_to_srsf(self) -> None: """Test to srsf.""" # Checks SRSF conversion From 077bec586e0a282aba7ff6e2c5e34a054f785242 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Fri, 19 Nov 2021 20:09:43 +0100 Subject: [PATCH 098/117] Example without fpca --- .../feature_extraction/_fda_feature_union.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 9f73119e2..b17f6492b 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -53,11 +53,10 @@ class FdaFeatureUnion(FeatureUnion): >>> X,y = fetch_growth(return_X_y=True) Then we need to import the transformers we want to use. In our case we - will use FPCA and Minimum Redundancy Maximum Relevance. + will use Minimum Redundancy Maximum Relevance. Evaluation Transformer returns the original curve, and as it is helpful, - we will concatenate it to the already metioned transformers. + we will concatenate it to the already metioned transformer. 
>>> from skfda.preprocessing.dim_reduction.feature_extraction import ( - ... FPCA, ... FdaFeatureUnion, ... ) >>> from skfda.preprocessing.dim_reduction.variable_selection import ( @@ -70,19 +69,18 @@ class FdaFeatureUnion(FeatureUnion): >>> union = FdaFeatureUnion( ... [ ... ("mrmr", MinimumRedundancyMaximumRelevance()), - ... ("fpca", FPCA()), ... ("eval", EvaluationTransformer()), ... ], ... array_output=True, ... ) >>> np.around(union.fit_transform(X,y), decimals = 2) - array([[194.3 , 105.84, -34.61, ..., 193.8 , 194.3 , 195.1 ], - [177.4 , -11.42, -17.01, ..., 176.1 , 177.4 , 178.7 ], - [171.2 , -33.81, -23.31, ..., 170.9 , 171.2 , 171.5 ], + array([[ 194.3, 81.3, 84.2, ..., 193.8, 194.3, 195.1], + [ 177.4, 76.2, 80.4, ..., 176.1, 177.4, 178.7], + [ 171.2, 76.8, 79.8, ..., 170.9, 171.2, 171.5], ..., - [166.3 , -19.49, 12.77, ..., 166. , 166.3 , 166.8 ], - [168.4 , 19.28, 31.5 , ..., 168.3 , 168.4 , 168.6 ], - [168.9 , 17.72, 27.73, ..., 168.6 , 168.9 , 169.2 ]]) + [ 166.3, 68.6, 73.6, ..., 166. , 166.3, 166.8], + [ 168.4, 79.9, 82.6, ..., 168.3, 168.4, 168.6], + [ 168.9, 76.1, 78.4, ..., 168.6, 168.9, 169.2]]) """ def __init__( From dbdf6ab5026c7df3c22198068887cb5f62a37799 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sat, 20 Nov 2021 23:51:45 +0100 Subject: [PATCH 099/117] Closes #377 --- .../feature_extraction/_fda_feature_union.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index b17f6492b..006b3ae62 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -7,11 +7,10 @@ from pandas import DataFrame from sklearn.pipeline import FeatureUnion -from ....representation.basis import FDataBasis -from ....representation.grid import FDataGrid +from ....representation import FData -class FdaFeatureUnion(FeatureUnion): +class FdaFeatureUnion(FeatureUnion): # type: ignore """Concatenates results of multiple functional transformer objects. This estimator applies a list of transformer objects in parallel to the @@ -73,7 +72,7 @@ class FdaFeatureUnion(FeatureUnion): ... ], ... array_output=True, ... 
) - >>> np.around(union.fit_transform(X,y), decimals = 2) + >>> np.around(union.fit_transform(X,y), decimals=2) array([[ 194.3, 81.3, 84.2, ..., 193.8, 194.3, 195.1], [ 177.4, 76.2, 80.4, ..., 176.1, 177.4, 178.7], [ 171.2, 76.8, 79.8, ..., 170.9, 171.2, 171.5], @@ -104,7 +103,7 @@ def _hstack(self, Xs) -> Union[DataFrame, ndarray]: if self.array_output: for i in Xs: - if isinstance(i, (FDataGrid, FDataBasis)): + if isinstance(i, FData): raise TypeError( "There are transformed instances of FDataGrid or " "FDataBasis that can't be concatenated on a NumPy " From f11be0ffeea9f2f6731c423233de68c8574af12d Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sat, 20 Nov 2021 23:59:21 +0100 Subject: [PATCH 100/117] Capital letter name --- .../dim_reduction/feature_extraction/__init__.py | 2 +- .../feature_extraction/_fda_feature_union.py | 6 +++--- tests/test_fda_feature_union.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 54589a3a6..132cc8f96 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -1,4 +1,4 @@ """Feature extraction.""" from ._ddg_transformer import DDGTransformer -from ._fda_feature_union import FdaFeatureUnion +from ._fda_feature_union import FDAFeatureUnion from ._fpca import FPCA diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 006b3ae62..855df8497 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -10,7 +10,7 @@ from ....representation import FData -class FdaFeatureUnion(FeatureUnion): # type: ignore +class FDAFeatureUnion(FeatureUnion): # type: ignore """Concatenates results of multiple functional transformer objects. This estimator applies a list of transformer objects in parallel to the @@ -56,7 +56,7 @@ class FdaFeatureUnion(FeatureUnion): # type: ignore Evaluation Transformer returns the original curve, and as it is helpful, we will concatenate it to the already metioned transformer. >>> from skfda.preprocessing.dim_reduction.feature_extraction import ( - ... FdaFeatureUnion, + ... FDAFeatureUnion, ... ) >>> from skfda.preprocessing.dim_reduction.variable_selection import ( ... MinimumRedundancyMaximumRelevance, @@ -65,7 +65,7 @@ class FdaFeatureUnion(FeatureUnion): # type: ignore >>> import numpy as np Finally we apply fit and transform. - >>> union = FdaFeatureUnion( + >>> union = FDAFeatureUnion( ... [ ... ("mrmr", MinimumRedundancyMaximumRelevance()), ... 
("eval", EvaluationTransformer()), diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index 339790885..ad0358864 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -8,7 +8,7 @@ from skfda.datasets import fetch_growth from skfda.misc.operators import SRSF from skfda.preprocessing.dim_reduction.feature_extraction import ( - FdaFeatureUnion, + FDAFeatureUnion, ) from skfda.preprocessing.smoothing.kernel_smoothers import ( NadarayaWatsonSmoother, @@ -16,7 +16,7 @@ from skfda.representation import EvaluationTransformer -class TestFdaFeatureUnion(unittest.TestCase): +class TestFDAFeatureUnion(unittest.TestCase): """Check the Fda Feature Union module.""" def setUp(self) -> None: @@ -25,7 +25,7 @@ def setUp(self) -> None: def test_incompatible_fdatagrid_output(self) -> None: """Check that the transformer returns a fdatagrid.""" - u = FdaFeatureUnion( + u = FDAFeatureUnion( [("eval", EvaluationTransformer(None)), ("srsf", SRSF())], array_output=True, ) @@ -33,7 +33,7 @@ def test_incompatible_fdatagrid_output(self) -> None: def test_correct_transformation_concat(self) -> None: """Check that the transformation is done correctly.""" - u = FdaFeatureUnion( + u = FDAFeatureUnion( [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], ) created_frame = u.fit_transform(self.X) From 0a31ee0bf56a5a99b4fc40d9e95d9931a3aab1b4 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sun, 21 Nov 2021 00:27:44 +0100 Subject: [PATCH 101/117] Try to correct github pytest on macos --- .../feature_extraction/_fda_feature_union.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 855df8497..2eae8b6fa 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -67,7 +67,13 @@ class FDAFeatureUnion(FeatureUnion): # type: ignore Finally we apply fit and transform. >>> union = FDAFeatureUnion( ... [ - ... ("mrmr", MinimumRedundancyMaximumRelevance()), + ... ( + ... "mrmr", + ... MinimumRedundancyMaximumRelevance( + ... n_features_to_select=1, + ... method="MID", + ... ), + ... ), ... ("eval", EvaluationTransformer()), ... ], ... 
array_output=True, From 19c238664bffcfc9f31b1439731cd6ba02396bdb Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sun, 21 Nov 2021 00:39:15 +0100 Subject: [PATCH 102/117] More mypy corrections --- .../feature_extraction/_fda_feature_union.py | 10 +++++----- tests/test_fda_feature_union.py | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 2eae8b6fa..54764feb6 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -92,10 +92,10 @@ def __init__( self, transformer_list, *, - n_jobs=None, - transformer_weights=None, - verbose=False, - array_output=False, + n_jobs: int = None, + transformer_weights: dict = None, + verbose: bool = False, + array_output: bool = False, ) -> None: self.array_output = array_output super().__init__( @@ -105,7 +105,7 @@ def __init__( verbose=verbose, ) - def _hstack(self, Xs) -> Union[DataFrame, ndarray]: + def _hstack(self, Xs: ndarray) -> Union[DataFrame, ndarray]: if self.array_output: for i in Xs: diff --git a/tests/test_fda_feature_union.py b/tests/test_fda_feature_union.py index ad0358864..21a174067 100644 --- a/tests/test_fda_feature_union.py +++ b/tests/test_fda_feature_union.py @@ -34,12 +34,15 @@ def test_incompatible_fdatagrid_output(self) -> None: def test_correct_transformation_concat(self) -> None: """Check that the transformation is done correctly.""" u = FDAFeatureUnion( - [("srsf1", SRSF()), ("smooth", NadarayaWatsonSmoother())], + [ + ("srsf1", SRSF()), + ("smooth", NadarayaWatsonSmoother()), # type: ignore + ], ) created_frame = u.fit_transform(self.X) t1 = SRSF().fit_transform(self.X) - t2 = NadarayaWatsonSmoother().fit_transform(self.X) + t2 = NadarayaWatsonSmoother().fit_transform(self.X) # type: ignore true_frame = DataFrame({"Transformed data": [t1, t2]}) assert_frame_equal(true_frame, created_frame) From d8ff1b4141c552990cb1f781579435ca0ce9292f Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sun, 21 Nov 2021 00:50:33 +0100 Subject: [PATCH 103/117] Another corrections I can only check if mypy passes on github by commiting --- .../dim_reduction/feature_extraction/_fda_feature_union.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 54764feb6..9e1fea227 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -90,10 +90,10 @@ class FDAFeatureUnion(FeatureUnion): # type: ignore def __init__( self, - transformer_list, + transformer_list: list, *, - n_jobs: int = None, - transformer_weights: dict = None, + n_jobs: int = 1, + transformer_weights: dict = None, # type: ignore verbose: bool = False, array_output: bool = False, ) -> None: From ab58f644768d620fc2c1c215a42bc4cbbb7e9c83 Mon Sep 17 00:00:00 2001 From: Alvaro Date: Sun, 21 Nov 2021 00:53:25 +0100 Subject: [PATCH 104/117] Last mypy correction --- .../dim_reduction/feature_extraction/_fda_feature_union.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py 
index 9e1fea227..50a25797d 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -90,7 +90,7 @@ class FDAFeatureUnion(FeatureUnion): # type: ignore def __init__( self, - transformer_list: list, + transformer_list: list, # type: ignore *, n_jobs: int = 1, transformer_weights: dict = None, # type: ignore From d3fbb169c727f2db4bcfa496a6462d9c472c5db0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 23 Nov 2021 18:12:47 +0100 Subject: [PATCH 105/117] final_example --- examples/plot_depth_classification.py | 274 +++++++++++++++++++------- 1 file changed, 204 insertions(+), 70 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 3b95784d1..a2701adc7 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -11,13 +11,14 @@ # Author: Pedro Martín Rodríguez-Ponga Eyriès # License: MIT -# sphinx_gallery_thumbnail_number = 3 +# sphinx_gallery_thumbnail_number = 5 import matplotlib.pyplot as plt import numpy as np from matplotlib.colors import ListedColormap from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier +from sklearn.neural_network import MLPClassifier from skfda import datasets from skfda.exploratory.depth import ModifiedBandDepth @@ -32,11 +33,11 @@ ############################################################################## # -# The Berkeley Growth Study data contains the heights of 39 boys and 54 -# girls from age 1 to 18 and the ages at which they were collected. Males -# are assigned the numeric value 0 while females are coded to a 1. In our -# comparison of the different methods, we will try to learn the sex of a -# person by using its growth curve. +# The Berkeley Growth Study data contains the heights of 39 boys and 54 girls +# from age 1 to 18 and the ages at which they were collected. Males are +# assigned the numeric value 0 while females are coded to a 1. In our +# comparison of the different methods, we will try to learn the sex of a person +# by using its growth curve. X, y = datasets.fetch_growth(return_X_y=True, as_frame=True) X = X.iloc[:, 0].values categories = y.values.categories @@ -44,9 +45,9 @@ ############################################################################## # -# As in many ML algorithms, we split the dataset into train and test. In -# this graph, we can see the training dataset. These growth curves will -# be used to train the model. Hence, the predictions will be data-driven. +# As in many ML algorithms, we split the dataset into train and test. In this +# graph, we can see the training dataset. These growth curves will be used to +# train the model. Hence, the predictions will be data-driven. X_train, X_test, y_train, y_test = train_test_split( X, y, @@ -67,75 +68,158 @@ ############################################################################## # # As said above, we are trying to compare three different methods: -# MaximumDepthClassifier, DDClassifier, and DDGClassifier. -# Below are the classification predictions of these models as well as the -# score (obtained by comparing to the real known sex). For the three -# algorithms we will be using the depth -# :class:`~skfda.representation.depth.ModifiedBandDepth` for consistency. -# We will try polynomes of degrees one, two, and three for DDClassifier. 
-# DDClassifier will be used with -# :class:`~sklearn.neighbors.KNeighborsClassifier`. +# MaximumDepthClassifier, DDClassifier, and DDGClassifier. They all use a depth +# which in our example is :class:`~skfda.representation.depth. +# ModifiedBandDepth` for consistency. With this depth we can create a DDPlot. +# In a DDPlot, a growth curve is mapped to :math:`[0,1]x[0,1]` where the first +# coordinate corresponds to the depth in the class of all boys and the second +# to that of all girls. Note that the dots will be blue if the true sex is +# female and red otherwise. + +############################################################################## +# +# Below we can see how a DDPlot is used to classify with +# MaximumDepthClassifier. In this case it is quite straighforward, a person is +# classified to the class where it is deeper. This means that if a point is +# above the diagonal it is a girl and otherwise it is a boy. clf = MaximumDepthClassifier(depth_method=ModifiedBandDepth()) clf.fit(X_train, y_train) print(clf.predict(X_test)) -print(clf.score(X_test, y_test)) +print('The score is {0:2.2%}'.format(clf.score(X_test, y_test))) + +fig, ax = plt.subplots() + +cmap_bold = ListedColormap(['#FF0000', '#0000FF']) + +index = y_train.astype(bool) +DDPlot( + fdata=X_test, + dist1=X_train[np.invert(index)], + dist2=X_train[index], + depth_method=ModifiedBandDepth(), + axes=ax, + c=y_test, + cmap_bold=cmap_bold, + x_label="Boy class depth", + y_label="Girl class depth", +).plot() +############################################################################## +# +# We can see that we have used the classification predictions to compute the +# score (obtained by comparing to the real known sex). This will also be done +# for the rest of the classifiers. + +############################################################################## +# +# Next we use DDClassifier with polynomes of degrees one, two, and three. Here, +# if a point in the DDPlot is above the polynome, the classifier will predict +# that it is a girl and otherwise, a boy. 
clf1 = DDClassifier(degree=1, depth_method=ModifiedBandDepth()) clf1.fit(X_train, y_train) print(clf1.predict(X_test)) -print(clf1.score(X_test, y_test)) +print('The score is {0:2.2%}'.format(clf1.score(X_test, y_test))) +############################################################################## +# clf2 = DDClassifier(degree=2, depth_method=ModifiedBandDepth()) clf2.fit(X_train, y_train) print(clf2.predict(X_test)) -print(clf2.score(X_test, y_test)) +print('The score is {0:2.2%}'.format(clf2.score(X_test, y_test))) +############################################################################## +# clf3 = DDClassifier(degree=3, depth_method=ModifiedBandDepth()) clf3.fit(X_train, y_train) print(clf3.predict(X_test)) -print(clf3.score(X_test, y_test)) +print('The score is {0:2.2%}'.format(clf3.score(X_test, y_test))) + +############################################################################## +# +fig, ax = plt.subplots() + + +def _plot_boundaries(axis): + margin = 0.025 + ts = np.linspace(- margin, 1 + margin, 100) + pol1 = axis.plot( + ts, + np.polyval(clf1.poly_, ts), + 'c', + label="Polynomial", + )[0] + pol2 = axis.plot( + ts, + np.polyval(clf2.poly_, ts), + 'm', + label="Polynomial", + )[0] + pol3 = axis.plot( + ts, + np.polyval(clf3.poly_, ts), + 'g', + label="Polynomial", + )[0] + max_depth = axis.plot( + [0, 1], + color="gray", + )[0] + + axis.legend([pol1, pol2, pol3, max_depth], ['P1', 'P2', 'P3', 'MaxDepth']) + + +_plot_boundaries(ax) +DDPlot( + fdata=X_test, + dist1=X_train[np.invert(index)], + dist2=X_train[index], + depth_method=ModifiedBandDepth(), + axes=ax, + c=y_test, + cmap_bold=cmap_bold, + x_label="Boy class depth", + y_label="Girl class depth", +).plot() + +############################################################################## +# +# DDClassifier used with :class:`~sklearn.neighbors.KNeighborsClassifier`. +############################################################################## +# clf = DDGClassifier( KNeighborsClassifier(n_neighbors=5), depth_method=ModifiedBandDepth(), ) clf.fit(X_train, y_train) print(clf.predict(X_test)) -clf.score(X_test, y_test) +print('The score is {0:2.2%}'.format(clf.score(X_test, y_test))) + ############################################################################## # -# Finally, we plot all these classifiers in a DDPlot. There is a -# one-to-one correspondence between growth curves and data points. The -# coordinates of the points in the graph correspond to the respective -# depth to the class of all boys and the class of all girls. Note that -# the dots are blue if the true sex is female and red otherwise. 
The -# other elements of the graph are the decision boundaries: +# The other elements of the graph are the decision boundaries: # -# | Boundary | Classifier | -# | --------- | ------------------------------------ | -# | Gray line | MaximumDepthClassifier | -# | P1 | DDClassifier with degree 1 | -# | P2 | DDClassifier with degree 2 | -# | P3 | DDClassifier with degree 3 | -# | Colors | DDGClassifier with nearest neighbors | +# | Boundary | Classifier | +# | ------------ | ------------------------------------ | +# | MaxDepth | MaximumDepthClassifier | +# | P1 | DDClassifier with degree 1 | +# | P2 | DDClassifier with degree 2 | +# | P3 | DDClassifier with degree 3 | +# | NearestClass | DDGClassifier with nearest neighbors | ddg: DDGTransformer[FDataGrid] = DDGTransformer( depth_method=ModifiedBandDepth(), ) X_train_trans = ddg.fit_transform(X_train, y_train) -# Code adapted from: -# https://stackoverflow.com/questions/45075638/graph-k-nn-decision-boundaries-in-matplotlib clf = KNeighborsClassifier(n_neighbors=5) clf.fit(X_train_trans, y_train) -h = 0.02 # step size in the mesh +h = 0.01 # step size in the mesh # Create color maps cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF']) -cmap_bold = ListedColormap(['#FF0000', '#0000FF']) - # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]x[y_min, y_max]. @@ -151,37 +235,10 @@ Z = Z.reshape(xx.shape) fig, ax = plt.subplots() - -margin = 0.025 -ts = np.linspace(- margin, 1 + margin, 100) -pol1 = ax.plot( - ts, - np.polyval(clf1.poly_, ts), - 'c', - linewidth=1, - label="Polynomial", -)[0] -pol2 = ax.plot( - ts, - np.polyval(clf2.poly_, ts), - 'm', - linewidth=1, - label="Polynomial", -)[0] -pol3 = ax.plot( - ts, - np.polyval(clf3.poly_, ts), - 'g', - linewidth=1, - label="Polynomial", -)[0] ax.pcolormesh(xx, yy, Z, cmap=cmap_light, shading='auto') -ax.legend([pol1, pol2, pol3], ['P1', 'P2', 'P3']) - - -index = y_train.astype(bool) -ddp = DDPlot( +_plot_boundaries(ax) +DDPlot( fdata=X_test, dist1=X_train[np.invert(index)], dist2=X_train[index], @@ -191,5 +248,82 @@ cmap_bold=cmap_bold, x_label="Boy class depth", y_label="Girl class depth", +).plot() + +############################################################################## +# +# In the above graph, we can see the obtained classifiers from the train set. +# The dots are all part of the test set and have their real color so, for +# example, if they are blue it means that the true sex is female. One can see +# that none of the built classifiers is perfect. +# +# Next, we will use DDGClassifier together with a neural network: +# :class:`~sklearn.neural_network.MLPClassifier`. 
+clf = DDGClassifier( + MLPClassifier( + solver='lbfgs', + alpha=1e-5, + hidden_layer_sizes=(6, 2), + random_state=1, + ), + depth_method=ModifiedBandDepth(), +) +clf.fit(X_train, y_train) +print(clf.predict(X_test)) +print('The score is {0:2.2%}'.format(clf.score(X_test, y_test))) + +############################################################################## +# +clf1 = KNeighborsClassifier(n_neighbors=5) +clf2 = MLPClassifier( + solver='lbfgs', + alpha=1e-5, + hidden_layer_sizes=(6, 2), + random_state=1, ) -ddp.plot() +clf1.fit(X_train_trans, y_train) +clf2.fit(X_train_trans, y_train) + +Z1 = clf1.predict(np.c_[xx.ravel(), yy.ravel()]) +Z2 = clf2.predict(np.c_[xx.ravel(), yy.ravel()]) + +Z1 = Z1.reshape(xx.shape) +Z2 = Z2.reshape(xx.shape) + +fig, axs = plt.subplots(1, 2, sharex=True, sharey=True) + +axs[0].pcolormesh(xx, yy, Z1, cmap=cmap_light, shading='auto') +axs[1].pcolormesh(xx, yy, Z2, cmap=cmap_light, shading='auto') + +DDPlot( + fdata=X_test, + dist1=X_train[np.invert(index)], + dist2=X_train[index], + depth_method=ModifiedBandDepth(), + axes=axs[0], + c=y_test, + cmap_bold=cmap_bold, + x_label="Boy class depth", + y_label="Girl class depth", +).plot() +DDPlot( + fdata=X_test, + dist1=X_train[np.invert(index)], + dist2=X_train[index], + depth_method=ModifiedBandDepth(), + axes=axs[1], + c=y_test, + cmap_bold=cmap_bold, + x_label="Boy class depth", + y_label="Girl class depth", +).plot() + +for axis in axs: + axis.label_outer() + +############################################################################## +# +# We can compare the behavior of two `DDGClassifier` based classifiers. The one +# on the left corresponds to nearest neighbors and the one on the right to a +# neural network. Interestingly, the neural network almost coincides with +# `MaximumDepthClassifier`. From 1626903193189205e0ce01a3237ad01e3c048d4c Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 23 Nov 2021 18:34:36 +0100 Subject: [PATCH 106/117] Improving format --- examples/plot_depth_classification.py | 31 +++++++++++---------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index a2701adc7..2823fd557 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -4,8 +4,8 @@ This example shows the use of the depth based classifications methods applied to the Berkeley Growth Study data. An attempt to show the -differences and similarities between MaximumDepthClassifier, -DDClassifier, and DDGClassifier is made. +differences and similarities between `MaximumDepthClassifier`, +`DDClassifier`, and `DDGClassifier` is made. """ # Author: Pedro Martín Rodríguez-Ponga Eyriès @@ -68,17 +68,19 @@ ############################################################################## # # As said above, we are trying to compare three different methods: -# MaximumDepthClassifier, DDClassifier, and DDGClassifier. They all use a depth -# which in our example is :class:`~skfda.representation.depth. -# ModifiedBandDepth` for consistency. With this depth we can create a DDPlot. -# In a DDPlot, a growth curve is mapped to :math:`[0,1]x[0,1]` where the first +# MaximumDepthClassifier, DDClassifier, and `DDGClassifier`. They all use a depth +# which in our example is +# :class:`~skfda.representation.depth.ModifiedBandDepth` for consistency. With +# this depth we can create a `DDPlot`. 
+# +# In a `DDPlot`, a growth curve is mapped to :math:`[0,1]\times[0,1]` where the first # coordinate corresponds to the depth in the class of all boys and the second # to that of all girls. Note that the dots will be blue if the true sex is # female and red otherwise. ############################################################################## # -# Below we can see how a DDPlot is used to classify with +# Below we can see how a `DDPlot` is used to classify with # MaximumDepthClassifier. In this case it is quite straighforward, a person is # classified to the class where it is deeper. This means that if a point is # above the diagonal it is a girl and otherwise it is a boy. @@ -112,8 +114,8 @@ ############################################################################## # -# Next we use DDClassifier with polynomes of degrees one, two, and three. Here, -# if a point in the DDPlot is above the polynome, the classifier will predict +# Next we use `DDClassifier` with polynomes of degrees one, two, and three. Here, +# if a point in the `DDPlot` is above the polynome, the classifier will predict # that it is a girl and otherwise, a boy. clf1 = DDClassifier(degree=1, depth_method=ModifiedBandDepth()) clf1.fit(X_train, y_train) @@ -121,21 +123,18 @@ print('The score is {0:2.2%}'.format(clf1.score(X_test, y_test))) ############################################################################## -# clf2 = DDClassifier(degree=2, depth_method=ModifiedBandDepth()) clf2.fit(X_train, y_train) print(clf2.predict(X_test)) print('The score is {0:2.2%}'.format(clf2.score(X_test, y_test))) ############################################################################## -# clf3 = DDClassifier(degree=3, depth_method=ModifiedBandDepth()) clf3.fit(X_train, y_train) print(clf3.predict(X_test)) print('The score is {0:2.2%}'.format(clf3.score(X_test, y_test))) ############################################################################## -# fig, ax = plt.subplots() @@ -184,10 +183,7 @@ def _plot_boundaries(axis): ############################################################################## # -# DDClassifier used with :class:`~sklearn.neighbors.KNeighborsClassifier`. - -############################################################################## -# +# `DDClassifier` used with :class:`~sklearn.neighbors.KNeighborsClassifier`. clf = DDGClassifier( KNeighborsClassifier(n_neighbors=5), depth_method=ModifiedBandDepth(), @@ -257,7 +253,7 @@ def _plot_boundaries(axis): # example, if they are blue it means that the true sex is female. One can see # that none of the built classifiers is perfect. # -# Next, we will use DDGClassifier together with a neural network: +# Next, we will use `DDGClassifier` together with a neural network: # :class:`~sklearn.neural_network.MLPClassifier`. 
clf = DDGClassifier( MLPClassifier( @@ -273,7 +269,6 @@ def _plot_boundaries(axis): print('The score is {0:2.2%}'.format(clf.score(X_test, y_test))) ############################################################################## -# clf1 = KNeighborsClassifier(n_neighbors=5) clf2 = MLPClassifier( solver='lbfgs', From 836eed4bd224dddc253ba8c325ccf57554ed3bba Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Tue, 23 Nov 2021 18:40:19 +0100 Subject: [PATCH 107/117] More formatting --- examples/plot_depth_classification.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 2823fd557..24ea9f894 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -68,15 +68,15 @@ ############################################################################## # # As said above, we are trying to compare three different methods: -# MaximumDepthClassifier, DDClassifier, and `DDGClassifier`. They all use a depth -# which in our example is +# MaximumDepthClassifier, DDClassifier, and `DDGClassifier`. They all use a +# depth which in our example is # :class:`~skfda.representation.depth.ModifiedBandDepth` for consistency. With # this depth we can create a `DDPlot`. # -# In a `DDPlot`, a growth curve is mapped to :math:`[0,1]\times[0,1]` where the first -# coordinate corresponds to the depth in the class of all boys and the second -# to that of all girls. Note that the dots will be blue if the true sex is -# female and red otherwise. +# In a `DDPlot`, a growth curve is mapped to :math:`[0,1]\times[0,1]` where the +# first coordinate corresponds to the depth in the class of all boys and the +# second to that of all girls. Note that the dots will be blue if the true sex +# is female and red otherwise. ############################################################################## # @@ -114,9 +114,9 @@ ############################################################################## # -# Next we use `DDClassifier` with polynomes of degrees one, two, and three. Here, -# if a point in the `DDPlot` is above the polynome, the classifier will predict -# that it is a girl and otherwise, a boy. +# Next we use `DDClassifier` with polynomes of degrees one, two, and three. +# Here, if a point in the `DDPlot` is above the polynome, the classifier will +# predict that it is a girl and otherwise, a boy. clf1 = DDClassifier(degree=1, depth_method=ModifiedBandDepth()) clf1.fit(X_train, y_train) print(clf1.predict(X_test)) @@ -318,7 +318,7 @@ def _plot_boundaries(axis): ############################################################################## # -# We can compare the behavior of two `DDGClassifier` based classifiers. The one -# on the left corresponds to nearest neighbors and the one on the right to a -# neural network. Interestingly, the neural network almost coincides with +# We can compare the behavior of two `DDGClassifier` based classifiers. The +# one on the left corresponds to nearest neighbors and the one on the right to +# a neural network. Interestingly, the neural network almost coincides with # `MaximumDepthClassifier`. 
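For readers skimming the series, the depth-based classification example assembled in the three patches above reduces to the short workflow below. This is only an illustrative sketch, not part of the patches: the dataset loading, depth, and classifier calls are copied from the example itself (fetch_growth, ModifiedBandDepth, MaximumDepthClassifier, DDClassifier, DDGClassifier), while the train/test split keyword values are assumed here purely for demonstration.

# Condensed sketch of the workflow from plot_depth_classification.py.
# The split parameters (test_size, random_state) are illustrative only.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from skfda.datasets import fetch_growth
from skfda.exploratory.depth import ModifiedBandDepth
from skfda.ml.classification import (
    DDClassifier,
    DDGClassifier,
    MaximumDepthClassifier,
)

X, y = fetch_growth(return_X_y=True, as_frame=True)
X = X.iloc[:, 0].values  # FDataGrid with the growth curves
y = y.values.codes  # 0 = male, 1 = female

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

classifiers = {
    "maximum depth": MaximumDepthClassifier(depth_method=ModifiedBandDepth()),
    "DD (degree 2)": DDClassifier(degree=2, depth_method=ModifiedBandDepth()),
    "DDG + 5-NN": DDGClassifier(
        KNeighborsClassifier(n_neighbors=5),
        depth_method=ModifiedBandDepth(),
    ),
}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    print(name, clf.score(X_test, y_test))

All three classifiers operate on the same DD coordinates (depth with respect to the boys' and girls' training curves); they differ only in the decision rule drawn on the DD-plot, which is what the figures in the example visualize.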
From 3c4f55a54762dc72f168a56074f55759149befae Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 24 Nov 2021 18:58:41 +0100 Subject: [PATCH 108/117] Change example --- .../feature_extraction/_fda_feature_union.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index 50a25797d..da838c123 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -58,9 +58,10 @@ class FDAFeatureUnion(FeatureUnion): # type: ignore >>> from skfda.preprocessing.dim_reduction.feature_extraction import ( ... FDAFeatureUnion, ... ) - >>> from skfda.preprocessing.dim_reduction.variable_selection import ( - ... MinimumRedundancyMaximumRelevance, + >>> from skfda.preprocessing.dim_reduction.feature_extraction import ( + ... DDGTransformer, ... ) + >>> from skfda.exploratory.depth import ModifiedBandDepth >>> from skfda.representation import EvaluationTransformer >>> import numpy as np @@ -68,24 +69,27 @@ class FDAFeatureUnion(FeatureUnion): # type: ignore >>> union = FDAFeatureUnion( ... [ ... ( - ... "mrmr", - ... MinimumRedundancyMaximumRelevance( - ... n_features_to_select=1, - ... method="MID", - ... ), + ... 'ddgtransformer', + ... DDGTransformer(depth_method=[ModifiedBandDepth()]), ... ), ... ("eval", EvaluationTransformer()), ... ], ... array_output=True, ... ) >>> np.around(union.fit_transform(X,y), decimals=2) - array([[ 194.3, 81.3, 84.2, ..., 193.8, 194.3, 195.1], - [ 177.4, 76.2, 80.4, ..., 176.1, 177.4, 178.7], - [ 171.2, 76.8, 79.8, ..., 170.9, 171.2, 171.5], - ..., - [ 166.3, 68.6, 73.6, ..., 166. , 166.3, 166.8], - [ 168.4, 79.9, 82.6, ..., 168.3, 168.4, 168.6], - [ 168.9, 76.1, 78.4, ..., 168.6, 168.9, 169.2]]) + array([[ 2.100e-01, 9.000e-02, 8.130e+01, ..., 1.938e+02, 1.943e+02, + 1.951e+02], + [ 4.600e-01, 3.800e-01, 7.620e+01, ..., 1.761e+02, 1.774e+02, + 1.787e+02], + [ 2.000e-01, 3.300e-01, 7.680e+01, ..., 1.709e+02, 1.712e+02, + 1.715e+02], + ..., + [ 3.900e-01, 5.100e-01, 6.860e+01, ..., 1.660e+02, 1.663e+02, + 1.668e+02], + [ 2.600e-01, 2.700e-01, 7.990e+01, ..., 1.683e+02, 1.684e+02, + 1.686e+02], + [ 3.300e-01, 3.200e-01, 7.610e+01, ..., 1.686e+02, 1.689e+02, + 1.692e+02]]) """ def __init__( From a724cb65d4ab2e1f2ac33d535e9606b2d915054c Mon Sep 17 00:00:00 2001 From: Alvaro Date: Wed, 24 Nov 2021 19:17:14 +0100 Subject: [PATCH 109/117] Closes #377 --- .../dim_reduction/feature_extraction/_fda_feature_union.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py index da838c123..e772be898 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fda_feature_union.py @@ -52,7 +52,7 @@ class FDAFeatureUnion(FeatureUnion): # type: ignore >>> X,y = fetch_growth(return_X_y=True) Then we need to import the transformers we want to use. In our case we - will use Minimum Redundancy Maximum Relevance. + will use Generalized depth-versus-depth transformer. Evaluation Transformer returns the original curve, and as it is helpful, we will concatenate it to the already metioned transformer. 
>>> from skfda.preprocessing.dim_reduction.feature_extraction import ( From b4f5dcfbcb38e25bbb668def5878ab87a0bbef57 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Tue, 30 Nov 2021 19:22:16 +0100 Subject: [PATCH 110/117] Fix computation of eigenvalues in FPCA, for both basis and grid. --- setup.cfg | 4 +- .../dim_reduction/feature_extraction/_fpca.py | 8 +- tests/test_fpca.py | 198 +++++++++++++----- 3 files changed, 150 insertions(+), 60 deletions(-) diff --git a/setup.cfg b/setup.cfg index d272c11a4..6b04b4ec5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -93,8 +93,8 @@ per-file-ignores = # There are many datasets _real_datasets.py: WPS202 - # Tests benefit from magic numbers and fixtures - test_*.py: WPS432, WPS442 + # Tests benefit from meaningless zeros, magic numbers and fixtures + test_*.py: WPS339, WPS432, WPS442 # Examples are allowed to have imports in the middle, "commented code", call print and have magic numbers plot_*.py: E402, E800, WPS421, WPS432 diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py index 6e2f7cabe..8e6b9f2e4 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_fpca.py @@ -61,6 +61,8 @@ class FPCA( each of the selected components. explained_variance_ratio\_ (array_like): this contains the percentage of variance explained by each principal component. + singular_values\_: The singular values corresponding to each of the + selected components. mean\_ (FData): mean of the train data. @@ -224,7 +226,7 @@ def _fit_basis( # the final matrix, C(L-1Jt)t for svd or (L-1Jt)-1CtC(L-1Jt)t for PCA final_matrix = ( - X.coefficients @ np.transpose(l_inv_j_t) / np.sqrt(n_samples) + X.coefficients @ np.transpose(l_inv_j_t) ) # initialize the pca module provided by scikit-learn @@ -241,6 +243,7 @@ def _fit_basis( self.explained_variance_ratio_ = pca.explained_variance_ratio_ self.explained_variance_ = pca.explained_variance_ + self.singular_values_ = pca.singular_values_ self.components_ = X.copy( basis=components_basis, coefficients=component_coefficients.T, @@ -375,7 +378,7 @@ def _fit_grid( ).T # see docstring for more information - final_matrix = fd_data @ np.sqrt(weights_matrix) / np.sqrt(n_samples) + final_matrix = fd_data @ np.sqrt(weights_matrix) pca = PCA(n_components=self.n_components) pca.fit(final_matrix) @@ -391,6 +394,7 @@ def _fit_grid( self.explained_variance_ratio_ = pca.explained_variance_ratio_ self.explained_variance_ = pca.explained_variance_ + self.singular_values_ = pca.singular_values_ return self diff --git a/tests/test_fpca.py b/tests/test_fpca.py index fc9ccd29c..93dce7529 100644 --- a/tests/test_fpca.py +++ b/tests/test_fpca.py @@ -2,7 +2,6 @@ import unittest import numpy as np - import skfda from skfda import FDataBasis, FDataGrid from skfda.datasets import fetch_weather @@ -10,6 +9,7 @@ from skfda.misc.regularization import TikhonovRegularization from skfda.preprocessing.dim_reduction.feature_extraction import FPCA from skfda.representation.basis import Basis, BSpline, Fourier +from sklearn.decomposition import PCA class FPCATestCase(unittest.TestCase): @@ -170,7 +170,28 @@ def test_basis_fpca_transform_result(self) -> None: np.testing.assert_allclose(scores, results, atol=1e-7) def test_basis_fpca_noregularization_fit_result(self) -> None: - """Compare the components in basis against the fda package.""" + """ + Compare the components in basis against the fda package. 
+ + Replication code: + + # library(fda) + # + # data("CanadianWeather") + # temp = CanadianWeather$dailyAv[,,1] + # + # basis = create.fourier.basis(c(0,365), 9) + # fdata_temp = Data2fd(1:365, temp, basis) + # fpca = pca.fd(fdata_temp, nharm = 3) + # + # paste( + # round(fpca$harmonics$coefs[,1], 8), + # collapse=", " + # ) # first component, others are analogous + # + # fpca$varprop # explained variance ratio + + """ n_basis = 9 n_components = 3 @@ -186,19 +207,24 @@ def test_basis_fpca_noregularization_fit_result(self) -> None: # Results obtained using Ramsay's R package results = np.array([ [ # noqa: WPS317 - 0.9231551, 0.1364966, 0.3569451, 0.0092012, -0.0244525, - -0.02923873, -0.003566887, -0.009654571, -0.0100063, + 0.92315509, 0.1395638, 0.35575705, + 0.00877893, -0.02460726, -0.02932107, + -0.0028108, -0.00999328, -0.00966805, ], [ # noqa: WPS317 - -0.3315211, -0.0508643, 0.89218521, 0.1669182, 0.24539, - 0.03548997, 0.037938051, -0.025777507, 0.008416904, + -0.33152114, -0.04318338, 0.89258995, + 0.17111744, 0.24248046, 0.03645764, + 0.03700911, -0.02547251, 0.00929922, ], [ # noqa: WPS317 - -0.1379108, 0.9125089, 0.00142045, 0.2657423, -0.2146497, - 0.16833314, 0.031509179, -0.006768189, 0.047306718, + -0.13791076, 0.91248735, -0.00643356, + 0.26200806, -0.21919224, 0.16909055, + 0.02715258, -0.00513581, 0.04751166, ], ]) + explained_variance_ratio = [0.88958975, 0.08483036, 0.01844100] + # Compare results obtained using this library. There are slight # variations due to the fact that we are in two different packages # If the sign of the components is not the same the component is @@ -211,11 +237,62 @@ def test_basis_fpca_noregularization_fit_result(self) -> None: np.testing.assert_allclose( fpca.components_.coefficients, results, - atol=1e-7, + atol=0.008, + ) + + np.testing.assert_allclose( + fpca.explained_variance_ratio_, + explained_variance_ratio, + ) + + def test_grid_fpca_fit_sklearn(self) -> None: + """Compare the components in grid against the multivariate case.""" + n_components = 3 + + fd_data = fetch_weather()['data'].coordinates[0] + + fpca = FPCA(n_components=n_components, weights=[1] * 365) + fpca.fit(fd_data) + + pca = PCA(n_components=n_components) + pca.fit(fd_data.data_matrix[..., 0]) + + np.testing.assert_allclose( + fpca.components_.data_matrix[..., 0], + pca.components_, + ) + + np.testing.assert_allclose( + fpca.explained_variance_, + pca.explained_variance_, ) def test_grid_fpca_fit_result(self) -> None: - """Compare the components in grid against the fda.usc package.""" + """ + Compare the components in grid against the fda.usc package. 
+ + Replication code: + + # library(fda) + # library(fda.usc) + # + # data("CanadianWeather") + # temp = CanadianWeather$dailyAv[,,1] + # + # fdata_temp = fdata(t(temp)) + # fpca = fdata2pc(fdata_temp, ncomp = 1) + # + # paste( + # round(fpca$rotation$data[1,], 8), + # collapse=", " + # ) # components + # fpca$d[1] # singular value + # paste( + # round(fpca$x[,1], 8), + # collapse=", " + # ) # transform + + """ n_components = 1 fd_data = fetch_weather()['data'].coordinates[0] @@ -228,78 +305,80 @@ def test_grid_fpca_fit_result(self) -> None: -0.06958281, -0.07015412, -0.07095115, -0.07185632, -0.07128256, -0.07124209, -0.07364828, -0.07297663, -0.07235438, -0.07307498, -0.07293423, -0.07449293, -0.07647909, -0.07796823, -0.07582476, - -0.07263243, -0.07241871, -0.0718136, -0.07015477, -0.07132331, - -0.0711527, -0.07435933, -0.07602666, -0.0769783, -0.07707199, - -0.07503802, -0.0770302, -0.07705581, -0.07633515, -0.07624817, - -0.07631568, -0.07619913, -0.07568, -0.07595155, -0.07506939, + -0.07263243, -0.07241871, -0.07181360, -0.07015477, -0.07132331, + -0.07115270, -0.07435933, -0.07602666, -0.07697830, -0.07707199, + -0.07503802, -0.07703020, -0.07705581, -0.07633515, -0.07624817, + -0.07631568, -0.07619913, -0.07568000, -0.07595155, -0.07506939, -0.07181941, -0.06907624, -0.06735476, -0.06853985, -0.06902363, - -0.07098882, -0.07479412, -0.07425241, -0.07555835, -0.0765903, + -0.07098882, -0.07479412, -0.07425241, -0.07555835, -0.07659030, -0.07651853, -0.07682536, -0.07458996, -0.07631711, -0.07726509, - -0.07641246, -0.0744066, -0.07501397, -0.07302722, -0.07045571, + -0.07641246, -0.07440660, -0.07501397, -0.07302722, -0.07045571, -0.06912529, -0.06792186, -0.06830739, -0.06898433, -0.07000192, - -0.07014513, -0.06994886, -0.07115909, -0.073999, -0.07292669, + -0.07014513, -0.06994886, -0.07115909, -0.07399900, -0.07292669, -0.07139879, -0.07226865, -0.07187915, -0.07122995, -0.06975022, -0.06800613, -0.06900793, -0.07186378, -0.07114479, -0.07015252, - -0.06944782, -0.068291, -0.06905348, -0.06925773, -0.06834624, + -0.06944782, -0.06829100, -0.06905348, -0.06925773, -0.06834624, -0.06837319, -0.06824067, -0.06644614, -0.06637313, -0.06626312, - -0.06470209, -0.0645058, -0.06477729, -0.06411049, -0.06158499, + -0.06470209, -0.06450580, -0.06477729, -0.06411049, -0.06158499, -0.06305197, -0.06398006, -0.06277579, -0.06282124, -0.06317684, - -0.0614125, -0.05961922, -0.05875443, -0.05845781, -0.05828608, + -0.06141250, -0.05961922, -0.05875443, -0.05845781, -0.05828608, -0.05666474, -0.05495706, -0.05446301, -0.05468254, -0.05478609, -0.05440798, -0.05312339, -0.05102368, -0.05160285, -0.05077954, - -0.04979648, -0.04890853, -0.04745462, -0.04496763, -0.0448713, + -0.04979648, -0.04890853, -0.04745462, -0.04496763, -0.04487130, -0.04599596, -0.04688998, -0.04488872, -0.04404507, -0.04420729, - -0.04368153, -0.04254381, -0.0411764, -0.04022811, -0.03999746, - -0.03963634, -0.03832502, -0.0383956, -0.04015374, -0.0387544, + -0.04368153, -0.04254381, -0.04117640, -0.04022811, -0.03999746, + -0.03963634, -0.03832502, -0.03839560, -0.04015374, -0.03875440, -0.03777315, -0.03830728, -0.03768616, -0.03714081, -0.03781918, -0.03739374, -0.03659894, -0.03563342, -0.03658407, -0.03686991, - -0.03543746, -0.03518799, -0.03361226, -0.0321534, -0.03050438, + -0.03543746, -0.03518799, -0.03361226, -0.03215340, -0.03050438, -0.02958411, -0.02855023, -0.02913402, -0.02992464, -0.02899548, -0.02891629, -0.02809554, -0.02702642, -0.02672194, -0.02678648, -0.02698471, -0.02628085, -0.02674285, 
-0.02658515, -0.02604447, - -0.0245711, -0.02413174, -0.02342496, -0.022898, -0.02216152, + -0.02457110, -0.02413174, -0.02342496, -0.02289800, -0.02216152, -0.02272283, -0.02199741, -0.02305362, -0.02371371, -0.02320865, - -0.02234777, -0.0225018, -0.02104359, -0.02203346, -0.02052545, + -0.02234777, -0.02250180, -0.02104359, -0.02203346, -0.02052545, -0.01987457, -0.01947911, -0.01986949, -0.02012196, -0.01958515, - -0.01906753, -0.01857869, -0.01874101, -0.01827973, -0.017752, + -0.01906753, -0.01857869, -0.01874101, -0.01827973, -0.01775200, -0.01702056, -0.01759611, -0.01888485, -0.01988159, -0.01951675, - -0.01872967, -0.01866667, -0.0183576, -0.01909758, -0.018599, - -0.01910036, -0.01930315, -0.01958856, -0.02129936, -0.0216614, - -0.0204397, -0.02002368, -0.02058828, -0.02149915, -0.02167326, + -0.01872967, -0.01866667, -0.01835760, -0.01909758, -0.01859900, + -0.01910036, -0.01930315, -0.01958856, -0.02129936, -0.02166140, + -0.02043970, -0.02002368, -0.02058828, -0.02149915, -0.02167326, -0.02238569, -0.02211907, -0.02168336, -0.02124387, -0.02131655, - -0.02130508, -0.02181227, -0.02230632, -0.02223732, -0.0228216, + -0.02130508, -0.02181227, -0.02230632, -0.02223732, -0.02282160, -0.02355137, -0.02275145, -0.02286893, -0.02437776, -0.02523897, - -0.0248354, -0.02319174, -0.02335831, -0.02405789, -0.02483273, + -0.02483540, -0.02319174, -0.02335831, -0.02405789, -0.02483273, -0.02428119, -0.02395295, -0.02437185, -0.02476434, -0.02347973, -0.02385957, -0.02451257, -0.02414586, -0.02439035, -0.02357782, -0.02417295, -0.02504764, -0.02682569, -0.02807111, -0.02886335, -0.02943406, -0.02956806, -0.02893096, -0.02903812, -0.02999862, - -0.029421, -0.03016203, -0.03118823, -0.03076205, -0.03005985, + -0.02942100, -0.03016203, -0.03118823, -0.03076205, -0.03005985, -0.03079187, -0.03215188, -0.03271075, -0.03146124, -0.03040965, -0.03008436, -0.03085897, -0.03015341, -0.03014661, -0.03110255, - -0.03271278, -0.03217399, -0.0331721, -0.03459221, -0.03572073, - -0.03560707, -0.03531492, -0.03687657, -0.03800143, -0.0373808, + -0.03271278, -0.03217399, -0.03317210, -0.03459221, -0.03572073, + -0.03560707, -0.03531492, -0.03687657, -0.03800143, -0.03738080, -0.03729927, -0.03748666, -0.03754171, -0.03790408, -0.03963726, - -0.03992153, -0.03812243, -0.0373844, -0.0385394, -0.03849716, - -0.03826345, -0.03743958, -0.0380861, -0.03857622, -0.04099357, + -0.03992153, -0.03812243, -0.03738440, -0.03853940, -0.03849716, + -0.03826345, -0.03743958, -0.03808610, -0.03857622, -0.04099357, -0.04102509, -0.04170207, -0.04283573, -0.04320618, -0.04269438, -0.04467527, -0.04470603, -0.04496092, -0.04796417, -0.04796633, - -0.047863, -0.04883668, -0.0505939, -0.05112441, -0.04960962, - -0.05000041, -0.04962112, -0.05087008, -0.0521671, -0.05369792, + -0.04786300, -0.04883668, -0.05059390, -0.05112441, -0.04960962, + -0.05000041, -0.04962112, -0.05087008, -0.05216710, -0.05369792, -0.05478139, -0.05559221, -0.05669698, -0.05654505, -0.05731113, -0.05783543, -0.05766056, -0.05754354, -0.05724272, -0.05831026, -0.05847512, -0.05804533, -0.05875046, -0.06021703, -0.06147975, - -0.06213918, -0.0645805, -0.06500849, -0.06361716, -0.06315227, + -0.06213918, -0.06458050, -0.06500849, -0.06361716, -0.06315227, -0.06306436, -0.06425743, -0.06626847, -0.06615213, -0.06881004, - -0.06942296, -0.06889225, -0.06868663, -0.0678667, -0.06720133, + -0.06942296, -0.06889225, -0.06868663, -0.06786670, -0.06720133, -0.06771172, -0.06885042, -0.06896979, -0.06961627, -0.07211988, -0.07252956, -0.07265559, -0.07264195, 
-0.07306334, -0.07282035, -0.07196505, -0.07210595, -0.07203942, -0.07105821, -0.06920599, -0.06892264, -0.06699939, -0.06537829, -0.06543323, -0.06913186, -0.07210039, -0.07219987, -0.07124228, -0.07065497, -0.06996833, - -0.0674457, -0.06800847, -0.06784175, -0.06592871, -0.06723401, + -0.06744570, -0.06800847, -0.06784175, -0.06592871, -0.06723401, ]) + singular_value = 728.9945 + # Compare results obtained using this library. There are slight # variations due to the fact that we are in two different packages # If the sign of the components is not the same the component is @@ -315,8 +394,18 @@ def test_grid_fpca_fit_result(self) -> None: rtol=1e-6, ) + np.testing.assert_allclose( + fpca.singular_values_, + singular_value, + ) + def test_grid_fpca_transform_result(self) -> None: - """Compare the scores in grid against the fda.usc package.""" + """ + Compare the scores in grid against the fda.usc package. + + See test_grid_fpca_fit_result for the replication code. + + """ n_components = 1 fd_data = fetch_weather()['data'].coordinates[0] @@ -327,21 +416,18 @@ def test_grid_fpca_transform_result(self) -> None: # results obtained results = np.array([ # noqa: WPS317 - [-77.05020176], [-90.56072204], [-82.39565947], - [-114.45375934], [-69.99735931], [-64.44894047], - [135.58336775], [-14.93460852], [0.75024737], - [-36.4781038], [-42.35637749], [-73.98910492], - [-67.11253749], [-103.68269798], [-104.65948079], - [-7.42817782], [7.48125036], [56.29792942], - [181.00258791], [-3.53294736], [37.94673912], - [124.43819913], [-7.04274676], [-49.61134859], - [-136.86256785], [-184.03502398], [-181.72835749], - [-51.06323208], [-137.85606731], [50.10941466], - [151.68118097], [159.01360046], [217.17981302], - [234.40195237], [345.39374006], + -76.43217603, -90.02095494, -81.80476223, -113.69868192, + -69.54664059, -64.15532621, 134.93536815, -15.00125409, + 0.60569550, -36.37615052, -42.18300642, -73.71660038, + -66.88119544, -103.15419038, -104.12065321, -7.49806764, + 7.14456774, 55.76321474, 180.16351452, -3.76283358, + 37.49075282, 123.73187622, -7.05384351, -49.36562021, + -136.37428322, -183.00666524, -180.64875116, -50.94411798, + -136.95768454, 49.83695668, 150.67710532, 158.20189044, + 216.43002289, 233.53770292, 344.18479151, ]) - np.testing.assert_allclose(scores, results) + np.testing.assert_allclose(scores.ravel(), results, rtol=0.25) def test_grid_fpca_regularization_fit_result(self) -> None: """Compare the components in grid against the fda.usc package.""" From 634c682bebebd98add464f812d6f9d3dfb7fa42b Mon Sep 17 00:00:00 2001 From: Alvaro Date: Fri, 10 Dec 2021 22:43:51 +0100 Subject: [PATCH 111/117] Change predict to transform method --- skfda/exploratory/depth/_depth.py | 16 +++++++-------- skfda/exploratory/depth/multivariate.py | 20 +++++++++---------- skfda/ml/classification/_depth_classifiers.py | 6 +++--- .../feature_extraction/_ddg_transformer.py | 4 ++-- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index 62950b765..d474c0568 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -11,12 +11,10 @@ from typing import Optional import numpy as np - import scipy.integrate from ... import FDataGrid -from . 
import multivariate -from .multivariate import Depth, _UnivariateFraimanMuniz +from .multivariate import Depth, SimplicialDepth, _UnivariateFraimanMuniz class IntegratedDepth(Depth[FDataGrid]): @@ -75,9 +73,9 @@ def fit( # noqa: D102 self.multivariate_depth_.fit(X.data_matrix) return self - def predict(self, X: FDataGrid) -> np.ndarray: # noqa: D102 + def transform(self, X: FDataGrid) -> np.ndarray: # noqa: D102 - pointwise_depth = self.multivariate_depth_.predict(X.data_matrix) + pointwise_depth = self.multivariate_depth_.transform(X.data_matrix) interval_len = ( self._domain_range[0][1] @@ -113,7 +111,7 @@ def min(self) -> float: # noqa: WPS125 class ModifiedBandDepth(IntegratedDepth): - r""" + """ Implementation of Modified Band Depth for functional data. The band depth of each sample is obtained by computing the fraction of time @@ -144,11 +142,11 @@ class ModifiedBandDepth(IntegratedDepth): """ def __init__(self) -> None: - super().__init__(multivariate_depth=multivariate.SimplicialDepth()) + super().__init__(multivariate_depth=SimplicialDepth()) class BandDepth(Depth[FDataGrid]): - r""" + """ Implementation of Band Depth for functional data. The band depth of each sample is obtained by computing the fraction of the @@ -188,7 +186,7 @@ def fit(self, X: FDataGrid, y: None = None) -> BandDepth: # noqa: D102 self._distribution = X return self - def predict(self, X: FDataGrid) -> np.ndarray: # noqa: D102 + def transform(self, X: FDataGrid) -> np.ndarray: # noqa: D102 num_in = 0 n_total = 0 diff --git a/skfda/exploratory/depth/multivariate.py b/skfda/exploratory/depth/multivariate.py index f7c1d5081..62e90c9e1 100644 --- a/skfda/exploratory/depth/multivariate.py +++ b/skfda/exploratory/depth/multivariate.py @@ -40,7 +40,7 @@ def fit(self: SelfType, X: T, y: None = None) -> SelfType: return self @abc.abstractmethod - def predict(self, X: T) -> np.ndarray: + def transform(self, X: T) -> np.ndarray: """ Compute the depth or outlyingness inside the learned distribution. @@ -53,7 +53,7 @@ def predict(self, X: T) -> np.ndarray: """ pass - def fit_predict(self, X: T, y: None = None) -> np.ndarray: + def fit_transform(self, X: T, y: None = None) -> np.ndarray: """ Compute the depth or outlyingness of each observation. @@ -67,7 +67,7 @@ def fit_predict(self, X: T, y: None = None) -> np.ndarray: Depth of each observation. 
""" - return self.fit(X).predict(X) + return self.fit(X).transform(X) def __call__( self, @@ -90,9 +90,9 @@ def __call__( copy = sklearn.base.clone(self) if distribution is None: - return copy.fit_predict(X) + return copy.fit_transform(X) - return copy.fit(distribution).predict(X) + return copy.fit(distribution).transform(X) @property # noqa: WPS125 def max(self) -> float: # noqa: WPS125 @@ -189,7 +189,7 @@ def fit(self: SelfType, X: np.ndarray, y: None = None) -> SelfType: self._sorted_values = np.sort(X, axis=0) return self - def predict(self, X: np.ndarray) -> np.ndarray: + def transform(self, X: np.ndarray) -> np.ndarray: cum_dist = _searchsorted_ordered( np.moveaxis(self._sorted_values, 0, -1), np.moveaxis(X, 0, -1), @@ -236,7 +236,7 @@ def fit( # noqa: D102 return self - def predict(self, X: np.ndarray) -> np.ndarray: # noqa: D102 + def transform(self, X: np.ndarray) -> np.ndarray: # noqa: D102 assert self._dim == X.shape[-1] @@ -307,8 +307,8 @@ def fit( # noqa: D102 return self - def predict(self, X: np.ndarray) -> np.ndarray: # noqa: D102 - outlyingness_values = self.outlyingness.predict(X) + def transform(self, X: np.ndarray) -> np.ndarray: # noqa: D102 + outlyingness_values = self.outlyingness.transform(X) min_val = self.outlyingness.min max_val = self.outlyingness.max @@ -355,7 +355,7 @@ def fit( # noqa: D102 return self - def predict(self, X: np.ndarray) -> np.ndarray: # noqa: D102 + def transform(self, X: np.ndarray) -> np.ndarray: # noqa: D102 dim = X.shape[-1] diff --git a/skfda/ml/classification/_depth_classifiers.py b/skfda/ml/classification/_depth_classifiers.py index 74cadcb5f..fe202fcec 100644 --- a/skfda/ml/classification/_depth_classifiers.py +++ b/skfda/ml/classification/_depth_classifiers.py @@ -113,7 +113,7 @@ def fit(self, X: T, y: NDArrayInt) -> DDClassifier[T]: raise ValueError("DDClassifier only accepts two classes.") dd_coordinates = [ - depth_method.predict(X) + depth_method.transform(X) for depth_method in self.class_depth_methods_ ] @@ -160,7 +160,7 @@ def predict(self, X: T) -> NDArrayInt: sklearn_check_is_fitted(self) dd_coordinates = [ - depth_method.predict(X) + depth_method.transform(X) for depth_method in self.class_depth_methods_ ] @@ -251,7 +251,7 @@ class DDGClassifier( (2017) The DDG-classifier in the functional setting. TEST, 26. 119-142. """ - def __init__( + def __init__( # noqa: WPS234 self, multivariate_classifier: ClassifierMixin = None, depth_method: Union[Depth[T], Sequence[Depth[T]], None] = None, diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/_ddg_transformer.py b/skfda/preprocessing/dim_reduction/feature_extraction/_ddg_transformer.py index ffe150f2d..4e7ba706c 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/_ddg_transformer.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/_ddg_transformer.py @@ -84,7 +84,7 @@ class DDGTransformer( TEST, 26. 119-142. 
""" - def __init__( + def __init__( # noqa: WPS234 self, depth_method: Union[Depth[T], Sequence[Depth[T]], None] = None, ) -> None: @@ -127,6 +127,6 @@ def transform(self, X: T) -> ndarray: sklearn_check_is_fitted(self) return np.transpose([ - depth_method.predict(X) + depth_method.transform(X) for depth_method in self.class_depth_methods_ ]) From 4df63d1e67480dc4285f072eebcf482585e8dabf Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 16 Dec 2021 20:01:56 +0100 Subject: [PATCH 112/117] Fix --- skfda/exploratory/visualization/representation.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/skfda/exploratory/visualization/representation.py b/skfda/exploratory/visualization/representation.py index a56dcc309..c9c12052d 100644 --- a/skfda/exploratory/visualization/representation.py +++ b/skfda/exploratory/visualization/representation.py @@ -236,15 +236,11 @@ def __init__( else: self.max_grad = max_grad - aux_list = [ - grad_color - self.min_grad - for grad_color in self.gradient_criteria - ] - self.gradient_list: Optional[Sequence[float]] = ( [ - aux / (self.max_grad - self.min_grad) - for aux in aux_list + (grad_color - self.min_grad) + / (self.max_grad - self.min_grad) + for grad_color in self.gradient_criteria ] ) else: @@ -280,9 +276,7 @@ def __init__( else: colormap = matplotlib.cm.get_cmap(self.colormap) - sample_colors = [ - colormap(g for g in self.gradient_list), - ] + sample_colors = colormap(self.gradient_list) self.sample_colors = sample_colors self.patches = patches From c0dd3d8420a5da28b0f98a8f3cf5c0f477ef90f4 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 16 Dec 2021 20:40:28 +0100 Subject: [PATCH 113/117] Improvements --- examples/plot_depth_classification.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 24ea9f894..3756dae40 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -1,6 +1,6 @@ """ -Classification -============== +Depth based classification +========================== This example shows the use of the depth based classifications methods applied to the Berkeley Growth Study data. An attempt to show the @@ -63,7 +63,7 @@ # # Below are the growth graphs of those individuals that we would like to # classify. Some of them will be male and some female. 
-X_test.plot() +X_test.plot().show() ############################################################################## # @@ -104,7 +104,7 @@ cmap_bold=cmap_bold, x_label="Boy class depth", y_label="Girl class depth", -).plot() +).plot().show() ############################################################################## # @@ -179,7 +179,7 @@ def _plot_boundaries(axis): cmap_bold=cmap_bold, x_label="Boy class depth", y_label="Girl class depth", -).plot() +).plot().show() ############################################################################## # @@ -197,13 +197,20 @@ def _plot_boundaries(axis): # # The other elements of the graph are the decision boundaries: # +# +--------------+--------------------------------------+ # | Boundary | Classifier | -# | ------------ | ------------------------------------ | +# +==============+======================================+ # | MaxDepth | MaximumDepthClassifier | +# +--------------+--------------------------------------+ # | P1 | DDClassifier with degree 1 | +# +--------------+--------------------------------------+ # | P2 | DDClassifier with degree 2 | +# +--------------+--------------------------------------+ # | P3 | DDClassifier with degree 3 | +# +--------------+--------------------------------------+ # | NearestClass | DDGClassifier with nearest neighbors | +# +--------------+--------------------------------------+ + ddg: DDGTransformer[FDataGrid] = DDGTransformer( depth_method=ModifiedBandDepth(), ) @@ -244,7 +251,7 @@ def _plot_boundaries(axis): cmap_bold=cmap_bold, x_label="Boy class depth", y_label="Girl class depth", -).plot() +).plot().show() ############################################################################## # @@ -300,7 +307,7 @@ def _plot_boundaries(axis): cmap_bold=cmap_bold, x_label="Boy class depth", y_label="Girl class depth", -).plot() +).plot().show() DDPlot( fdata=X_test, dist1=X_train[np.invert(index)], @@ -311,7 +318,7 @@ def _plot_boundaries(axis): cmap_bold=cmap_bold, x_label="Boy class depth", y_label="Girl class depth", -).plot() +).plot().show() for axis in axs: axis.label_outer() From a7915c1128ae52935e58c440518dd46ef46ccdb0 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Thu, 16 Dec 2021 20:54:20 +0100 Subject: [PATCH 114/117] Final --- examples/plot_depth_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 3756dae40..56ff0ee69 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -57,7 +57,7 @@ ) # Plot samples grouped by sex -X_train.plot(group=y_train, group_names=categories) +X_train.plot(group=y_train, group_names=categories).show() ############################################################################## # From e74c3b8dfe00f4652949f72e68fa316bf90c6cfe Mon Sep 17 00:00:00 2001 From: pedrorponga <32200195+pedrorponga@users.noreply.github.com> Date: Thu, 16 Dec 2021 23:50:22 +0100 Subject: [PATCH 115/117] Update examples/plot_depth_classification.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Ramos Carreño --- examples/plot_depth_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 56ff0ee69..9d2b5edac 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -2,7 +2,7 @@ Depth based classification ========================== 
-This example shows the use of the depth based classifications methods +This example shows the use of the depth based classification methods applied to the Berkeley Growth Study data. An attempt to show the differences and similarities between `MaximumDepthClassifier`, `DDClassifier`, and `DDGClassifier` is made. From d83141ae2f68c8bc694fdf613ff9ee7624799878 Mon Sep 17 00:00:00 2001 From: pedrorponga Date: Fri, 17 Dec 2021 11:00:31 +0100 Subject: [PATCH 116/117] Final --- examples/plot_depth_classification.py | 62 +++++++++++++-------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/examples/plot_depth_classification.py b/examples/plot_depth_classification.py index 9d2b5edac..3fb395b47 100644 --- a/examples/plot_depth_classification.py +++ b/examples/plot_depth_classification.py @@ -4,8 +4,10 @@ This example shows the use of the depth based classification methods applied to the Berkeley Growth Study data. An attempt to show the -differences and similarities between `MaximumDepthClassifier`, -`DDClassifier`, and `DDGClassifier` is made. +differences and similarities between +:class:`~skfda.ml.classification.MaximumDepthClassifier`, +:class:`~skfda.ml.classification.DDClassifier`, +and :class:`~skfda.ml.classification.DDGClassifier` is made. """ # Author: Pedro Martín Rodríguez-Ponga Eyriès @@ -32,10 +34,9 @@ from skfda.representation.grid import FDataGrid ############################################################################## -# # The Berkeley Growth Study data contains the heights of 39 boys and 54 girls # from age 1 to 18 and the ages at which they were collected. Males are -# assigned the numeric value 0 while females are coded to a 1. In our +# assigned the numeric value 0 while females are assigned a 1. In our # comparison of the different methods, we will try to learn the sex of a person # by using its growth curve. X, y = datasets.fetch_growth(return_X_y=True, as_frame=True) @@ -44,7 +45,6 @@ y = y.values.codes ############################################################################## -# # As in many ML algorithms, we split the dataset into train and test. In this # graph, we can see the training dataset. These growth curves will be used to # train the model. Hence, the predictions will be data-driven. @@ -60,30 +60,31 @@ X_train.plot(group=y_train, group_names=categories).show() ############################################################################## -# # Below are the growth graphs of those individuals that we would like to # classify. Some of them will be male and some female. X_test.plot().show() ############################################################################## -# # As said above, we are trying to compare three different methods: -# MaximumDepthClassifier, DDClassifier, and `DDGClassifier`. They all use a +# :class:`~skfda.ml.classification.MaximumDepthClassifier`, +# :class:`~skfda.ml.classification.DDClassifier`, and +# :class:`~skfda.ml.classification.DDGClassifier`. They all use a # depth which in our example is # :class:`~skfda.representation.depth.ModifiedBandDepth` for consistency. With -# this depth we can create a `DDPlot`. +# this depth we can create a :class:`~skfda.exploratory.visualization.DDPlot`. # -# In a `DDPlot`, a growth curve is mapped to :math:`[0,1]\times[0,1]` where the -# first coordinate corresponds to the depth in the class of all boys and the -# second to that of all girls. Note that the dots will be blue if the true sex -# is female and red otherwise. 
+# In a :class:`~skfda.exploratory.visualization.DDPlot`, a growth curve is
+# mapped to :math:`[0,1]\times[0,1]` where the first coordinate corresponds
+# to the depth in the class of all boys and the second to that of all girls.
+# Note that the dots will be blue if the true sex is female and red otherwise.
 
 ##############################################################################
-#
-# Below we can see how a `DDPlot` is used to classify with
-# MaximumDepthClassifier. In this case it is quite straighforward, a person is
-# classified to the class where it is deeper. This means that if a point is
-# above the diagonal it is a girl and otherwise it is a boy.
+# Below we can see how a :class:`~skfda.exploratory.visualization.DDPlot` is
+# used to classify with
+# :class:`~skfda.ml.classification.MaximumDepthClassifier`. In this case it is
+# quite straightforward: a person is classified to the class where they are
+# deeper. This means that if a point is above the diagonal it is a girl and
+# otherwise it is a boy.
 clf = MaximumDepthClassifier(depth_method=ModifiedBandDepth())
 clf.fit(X_train, y_train)
 print(clf.predict(X_test))
@@ -107,16 +108,15 @@
 ).plot().show()
 
 ##############################################################################
-#
 # We can see that we have used the classification predictions to compute the
 # score (obtained by comparing to the real known sex). This will also be done
 # for the rest of the classifiers.
 
 ##############################################################################
-#
-# Next we use `DDClassifier` with polynomes of degrees one, two, and three.
-# Here, if a point in the `DDPlot` is above the polynome, the classifier will
-# predict that it is a girl and otherwise, a boy.
+# Next we use :class:`~skfda.ml.classification.DDClassifier` with
+# polynomials of degrees one, two, and three. Here, if a point in the
+# :class:`~skfda.exploratory.visualization.DDPlot` is above the polynomial,
+# the classifier will predict that it is a girl and otherwise, a boy.
 clf1 = DDClassifier(degree=1, depth_method=ModifiedBandDepth())
 clf1.fit(X_train, y_train)
 print(clf1.predict(X_test))
@@ -182,8 +182,8 @@ def _plot_boundaries(axis):
 ).plot().show()
 
 ##############################################################################
-#
-# `DDClassifier` used with :class:`~sklearn.neighbors.KNeighborsClassifier`.
+# :class:`~skfda.ml.classification.DDGClassifier` used with
+# :class:`~sklearn.neighbors.KNeighborsClassifier`.
 clf = DDGClassifier(
     KNeighborsClassifier(n_neighbors=5),
     depth_method=ModifiedBandDepth(),
@@ -194,7 +194,6 @@ def _plot_boundaries(axis):
 
 
 ##############################################################################
-#
 # The other elements of the graph are the decision boundaries:
 #
 # +--------------+--------------------------------------+
@@ -254,13 +253,13 @@ def _plot_boundaries(axis):
 ).plot().show()
 
 ##############################################################################
-#
 # In the above graph, we can see the obtained classifiers from the train set.
 # The dots are all part of the test set and have their real color so, for
 # example, if they are blue it means that the true sex is female. One can see
 # that none of the built classifiers is perfect.
 #
-# Next, we will use `DDGClassifier` together with a neural network:
-# :class:`~sklearn.neural_network.MLPClassifier`.
+# Next, we will use :class:`~skfda.ml.classification.DDGClassifier` together
+# with a neural network: :class:`~sklearn.neural_network.MLPClassifier`.
clf = DDGClassifier( MLPClassifier( solver='lbfgs', @@ -324,8 +322,8 @@ def _plot_boundaries(axis): axis.label_outer() ############################################################################## -# -# We can compare the behavior of two `DDGClassifier` based classifiers. The +# We can compare the behavior of two +# :class:`~skfda.ml.classification.DDGClassifier` based classifiers. The # one on the left corresponds to nearest neighbors and the one on the right to # a neural network. Interestingly, the neural network almost coincides with -# `MaximumDepthClassifier`. +# :class:`~skfda.ml.classification.MaximumDepthClassifier`. From e5adb5bfc95041bcf801c0ef2b3a80bf0191c2c3 Mon Sep 17 00:00:00 2001 From: vnmabus Date: Thu, 23 Dec 2021 15:58:19 +0100 Subject: [PATCH 117/117] Bump version. --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 5a2a5806d..ee6cdce3c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6 +0.6.1
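For reference, a minimal sketch of the end-to-end flow that the example patched above follows (fetch the growth curves, split, fit a depth classifier, score). It is not part of the patches: the import path of ModifiedBandDepth and the split parameters are assumptions for illustration and may differ between skfda versions.

# Illustrative sketch only; mirrors examples/plot_depth_classification.py.
from sklearn.model_selection import train_test_split

from skfda import datasets
from skfda.exploratory.depth import ModifiedBandDepth  # assumed import path
from skfda.ml.classification import MaximumDepthClassifier

# Berkeley Growth Study: curves X and sex labels y, as in the example.
X, y = datasets.fetch_growth(return_X_y=True, as_frame=True)
X = X.iloc[:, 0].values
y = y.values.codes

# Hypothetical split parameters, chosen only for illustration.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0,
)

# Assign each test curve to the class in which it is deepest.
clf = MaximumDepthClassifier(depth_method=ModifiedBandDepth())
clf.fit(X_train, y_train)
print(clf.predict(X_test))
print(clf.score(X_test, y_test))

The same fit/predict/score pattern applies to DDClassifier and DDGClassifier; only the classifier construction changes.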