diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 2f3c3c6c4..000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -name: Bug report -about: Create a report to help us improve -title: '' -labels: bug -assignees: '' - ---- - -**Describe the bug** -A clear and concise description of what the bug is. - -**To Reproduce** -Code to reproduce the behavior: - -```python -import skfda -... - -``` - -**Expected behavior** -A clear and concise description of what you expected to happen, results or figures. - - -**Version information** - - OS: [e.g. Linux, MacOs, Windows] - - Python version: [e.g. 3.6, 3.7] - - scikit-fda version: [e.g. develop, 0.23] - - Version of other packages involved [e.g. numpy, scipy, matplotlib, ... ] - -**Additional context** -Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 000000000..4a1c480ec --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,91 @@ +name: Bug report +description: Create a report to help us reproduce and fix a bug +labels: [bug] + +body: +- type: markdown + attributes: + value: > + #### Please check that the bug has not been previously notified before submitting, by searching through the [issues list](https://github.com/GAA-UAM/scikit-fda/issues). +- type: textarea + attributes: + label: Bug description summary + description: > + Please describe the bug in a brief paragraph(s). Be clear and concise. + validations: + required: true +- type: textarea + attributes: + label: Code to reproduce the bug + description: | + Please add a minimal code example that can reproduce the error. This will be automatically converted to a Python block. + placeholder: | + from skfda.datasets import fetch_growth + + X, y = fetch_growth(return_X_y=True) + + X^y + render: Python +- type: textarea + attributes: + label: Expected result + description: > + Paste or describe the result that you expected here. + validations: + required: true +- type: textarea + attributes: + label: Actual result + description: > + Paste or describe the result that you obtained here. If the code raises an error, you can past it in the next field. + validations: + required: true +- type: textarea + attributes: + label: Traceback (if an exception is raised) + description: | + If an exception is raised, copy and paste the traceback here. + placeholder: | + TypeError Traceback (most recent call last) + Cell In[5], line 1 + ----> 1 X ^y + + File .../skfda/representation/grid.py:1393, in FDataGrid.__array_ufunc__(self, ufunc, method, *inputs, **kwargs) + 1390 else: + 1391 new_outputs = (None,) * ufunc.nout + -> 1393 results = getattr(ufunc, method)(*new_inputs, **kwargs) + 1394 if results is NotImplemented: + 1395 return NotImplemented + + TypeError: ufunc 'bitwise_xor' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'' + + In [6]: X^y + --------------------------------------------------------------------------- + TypeError Traceback (most recent call last) + Cell In[6], line 1 + ----> 1 X^y + + File .../skfda/representation/grid.py:1393, in FDataGrid.__array_ufunc__(self, ufunc, method, *inputs, **kwargs) + 1390 else: + 1391 new_outputs = (None,) * ufunc.nout + -> 1393 results = getattr(ufunc, method)(*new_inputs, **kwargs) + 1394 if results is NotImplemented: + 1395 return NotImplemented + + TypeError: ufunc 'bitwise_xor' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'' + render: Python +- type: textarea + attributes: + label: Software versions + description: > + Include the version of the library used (obtained with `skfda.__version__`). If relevant, you can include here the OS version and versions of related software. + placeholder: | + scikit-fda version: 0.9 + OS: Windows 10 + validations: + required: true +- type: textarea + attributes: + label: Additional context + description: > + Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 11fc491ef..000000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for this project -title: '' -labels: enhancement -assignees: '' - ---- - -**Is your feature request related to a problem? Please describe.** -A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] - -**Describe the solution you'd like** -A clear and concise description of what you want to happen. - -**Describe alternatives you've considered** -A clear and concise description of any alternative solutions or features you've considered. - -**Additional context** -Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 000000000..40a23d9bf --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,35 @@ +name: Feature request +description: Suggest an idea for this project +labels: [enhancement] + +body: +- type: markdown + attributes: + value: > + #### Please check that this idea has not been proposed previously, by searching through the [issues list](https://github.com/GAA-UAM/scikit-fda/issues). +- type: textarea + attributes: + label: Motivation + description: > + A clear and concise description of what the problem is. Ex. I am always frustrated when [...] + validations: + required: true +- type: textarea + attributes: + label: Desired functionality + description: > + A clear and concise description of what you want to happen. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A clear and concise description of any alternative solutions or features you have considered. + validations: + required: false +- type: textarea + attributes: + label: Additional context + description: > + Add any other context about the problem here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..96e32e1f6 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,31 @@ + + +## References to issues or other PRs + + + +## Describe the proposed changes + + +## Additional information + + +## Checklist before requesting a review + +- [ ] I have performed a self-review of my code +- [ ] The code conforms to the style used in this package +- [ ] The code is fully documented and typed (type-checked with [Mypy](https://mypy-lang.org/)) +- [ ] I have added thorough tests for the new/changed functionality diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f08ec8d3f..7c8aaf615 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9'] + python-version: ['3.9', '3.10'] steps: - uses: actions/checkout@v2 @@ -28,7 +28,7 @@ jobs: - name: Run tests run: | pip3 debug --verbose . - pip3 install numba + pip3 install mkl-devel pip3 install ".[test]" pytest --cov=skfda/ --cov-report=xml diff --git a/.gitignore b/.gitignore index 8f00f745c..a1e517e35 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,4 @@ pip-wheel-metadata/ .DS_Store .gitignore .vscode +.ruff_cache diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..673cc9da3 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +vnmabus@gmail.com. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..c0e02fad7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,27 @@ +# How to contribute + +Thank you for considering contributing to scikit-fda! You can contribute to rdata in several ways. + +## Opening issues + +Open a new issue if you find some bug or you want to propose a new feature. Please first check that nobody has asked that already! + +Make sure that your proposal is clearly written, preferably in English. In case you are reporting a bug, please include all relevant information, such as the software version and machine information. + +## Discussing the project + +You can open a discussion for any topic related with this package. Do you have doubts about how to use the package? Open a discussion! Do you want to show related projects, recent research or some use case for this software? Open a discussion! + +You are also encouraged to answer the discussions of other users and participate actively in the discussions forum. + +## Improving the documentation + +Do you feel that the documentation could be more clear? Did you found a typo? You can easily edit the documentation and make a pull request clicking the "Edit this page" link in the documentation. + +Advanced users can also propose the addition of new pages and examples. In case you want to do that, please open an issue to discuss that first. + +## Contributing software + +You can improve this package by adding new functionality, solving pending bugs or implementing accepted feature requests. Please discuss that first to ensure that it will be accepted and to assign that to you and prevent duplicated efforts. + +In any case, make sure that you own the rights to the software and are ok with releasing it under a BSD 3-Clause license. diff --git a/README.rst b/README.rst index bbe8fa54f..d88418e95 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ scikit-fda: Functional Data Analysis in Python =================================================== -|python|_ |build-status| |docs| |Codecov|_ |PyPIBadge|_ |conda| |license|_ |doi| +|python|_ |build-status| |docs| |Codecov| |repostatus| |versions| |PyPIBadge| |conda| |license| |doi| Functional Data Analysis, or FDA, is the field of Statistics that analyses data that depend on a continuous parameter. @@ -122,6 +122,14 @@ license_ can be found along with the code. .. |Codecov| image:: https://codecov.io/gh/GAA-UAM/scikit-fda/branch/develop/graph/badge.svg .. _Codecov: https://app.codecov.io/gh/GAA-UAM/scikit-fda +.. |repostatus| image:: https://www.repostatus.org/badges/latest/active.svg + :alt: Project Status: Active – The project has reached a stable, usable state and is being actively developed. + :target: https://www.repostatus.org/#active + +.. |versions| image:: https://img.shields.io/pypi/pyversions/scikit-fda + :alt: PyPI - Python Version + :scale: 100% + .. |PyPIBadge| image:: https://badge.fury.io/py/scikit-fda.svg .. _PyPIBadge: https://badge.fury.io/py/scikit-fda diff --git a/binder/requirements.txt b/binder/requirements.txt index b913cb3e4..9ee80ec8d 100644 --- a/binder/requirements.txt +++ b/binder/requirements.txt @@ -1,4 +1,4 @@ --r ../readthedocs-requirements.txt +scikit-fda[docs] jupytext sphinx-gallery<=0.7.0 . \ No newline at end of file diff --git a/docs/_static/switcher.json b/docs/_static/switcher.json index 71b626e80..f4e362095 100644 --- a/docs/_static/switcher.json +++ b/docs/_static/switcher.json @@ -5,7 +5,7 @@ "url": "https://fda.readthedocs.io/en/latest/" }, { - "name": "0.8.1 (stable)", + "name": "0.9 (stable)", "version": "stable", "url": "https://fda.readthedocs.io/en/stable/", "preferred": true diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index fede4ca4e..d1f44b52e 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -10,12 +10,16 @@ .. autosummary:: {% for item in methods %} + {% if item != "__init__" %} ~{{ name }}.{{ item }} + {% endif %} {%- endfor %} {% endif %} {% for item in methods %} + {% if item != "__init__" %} .. automethod:: {{ item }} + {% endif %} {%- endfor %} {% endblock %} diff --git a/docs/modules/exploratory/stats.rst b/docs/modules/exploratory/stats.rst index 17642a732..b55b05fe6 100644 --- a/docs/modules/exploratory/stats.rst +++ b/docs/modules/exploratory/stats.rst @@ -31,4 +31,5 @@ statistics can be used. skfda.exploratory.stats.cov skfda.exploratory.stats.var + skfda.exploratory.stats.std diff --git a/docs/modules/ml/regression.rst b/docs/modules/ml/regression.rst index 539d38ec2..b3e224e78 100644 --- a/docs/modules/ml/regression.rst +++ b/docs/modules/ml/regression.rst @@ -57,4 +57,17 @@ regression is fitted using the coefficients of the functions in said basis. .. autosummary:: :toctree: autosummary - skfda.ml.regression.FPCARegression \ No newline at end of file + skfda.ml.regression.FPCARegression + +FPLS regression +----------------- +This module includes the implementation of FPLS (Functional Partial Least Squares) +regression. This implementation accepts either functional or multivariate data as the regressor and the response. +FPLS regression consists on performing the FPLS dimensionality reduction algorithm +but using a regression deflation strategy. + + +.. autosummary:: + :toctree: autosummary + + skfda.ml.regression.FPLSRegression \ No newline at end of file diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index e35e5675f..d78d8705c 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -36,9 +36,12 @@ Other dimensionality reduction methods construct new features from existing ones. For example, in functional principal component analysis, we project the data samples into a smaller sample of functions that preserve most of the original -variance. +variance. Similarly, in functional partial least squares, we project +the data samples into a smaller sample of functions that preserve most +of the covariance between the two data blocks. .. autosummary:: :toctree: autosummary - skfda.preprocessing.dim_reduction.FPCA \ No newline at end of file + skfda.preprocessing.dim_reduction.FPCA + skfda.preprocessing.dim_reduction.FPLS \ No newline at end of file diff --git a/docs/refs.bib b/docs/refs.bib index c0aedacb8..005bc0112 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -107,6 +107,21 @@ @article{berrendero++_2022_functional keywords = {62J12,62R10,Functional data,Kernel methods in statistics,Logistic regression,Mathematics - Statistics Theory,Reproducing kernel Hilbert spaces} } +@article{borggaard+thodberg_1992_optimal, + title = {Optimal Minimal Neural Interpretation of Spectra}, + author = {Borggaard, {\relax Claus}. and Thodberg, Hans Henrik.}, + year = {1992}, + month = mar, + journal = {Analytical Chemistry}, + volume = {64}, + number = {5}, + pages = {545--551}, + issn = {0003-2700}, + doi = {10.1021/ac00029a018}, + url = {https://doi.org/10.1021/ac00029a018}, + urldate = {2018-12-09} +} + @article{cuesta-albertos++_2017_ddgclassifier, title = {The {{DDᴳ}}-Classifier in the Functional Setting}, author = {{Cuesta-Albertos}, J. A. and {Febrero-Bande}, M. and {Oviedo~de~la~Fuente}, M.}, @@ -293,6 +308,26 @@ @article{ghosh+chaudhuri_2005_maximum keywords = {Bayes risk,cross-validation,data depth,elliptic symmetry,kernel density estimation,location shift model,Mahalanobis distance,misclassification rate,Vapnik Chervonenkis dimension} } +@article{hastie++_1995_penalized, + title = {Penalized Discriminant Analysis}, + author = {Hastie, Trevor and Buja, Andreas and Tibshirani, Robert}, + year = {1995}, + month = feb, + journal = {The Annals of Statistics}, + volume = {23}, + number = {1}, + pages = {73--102}, + issn = {0090-5364, 2168-8966}, + doi = {10.1214/aos/1176324456}, + url = {https://projecteuclid.org/euclid.aos/1176324456}, + urldate = {2018-12-09}, + abstract = {Fisher's linear discriminant analysis (LDA) is a popular data-analytic tool for studying the relationship between a set of predictors and a categorical response. In this paper we describe a penalized version of LDA. It is designed for situations in which there are many highly correlated predictors, such as those obtained by discretizing a function, or the grey-scale values of the pixels in a series of images. In cases such as these it is natural, efficient and sometimes essential to impose a spatial smoothness constraint on the coefficients, both for improved prediction performance and interpretability. We cast the classification problem into a regression framework via optimal scoring. Using this, our proposal facilitates the use of any penalized regression technique in the classification setting. The technique is illustrated with examples in speech recognition and handwritten character recognition.}, + langid = {english}, + mrnumber = {MR1331657}, + zmnumber = {0821.62031}, + keywords = {discrimination,regularization,Signal and image classification} +} + @article{li++_2012_ddclassifier, title = {{{DD-Classifier}}: {{Nonparametric}} Classification Procedure Based on {{DD-Plot}}}, shorttitle = {{{DD-Classifier}}}, @@ -347,6 +382,14 @@ @article{marron++_2015_functional keywords = {alignment,dynamic time warping,elastic metric,Fisher–Rao metric,Functional data analysis,registration,warping} } +@misc{meteorologicalstateagencyofspainaemet_2009_meteorological, + title = {Meteorological Data of {{Spanish}} Weather Stations between Years 1980 and 2009.}, + author = {{Meteorological State Agency of Spain (AEMET)}}, + year = {2009}, + url = {http://www.aemet.es/}, + abstract = {The data were obtained from the FTP of AEMET in 2009.} +} + @article{pini++_2018_hotelling, title = {Hotelling's {{T2}} in Separable {{Hilbert}} Spaces}, author = {Pini, Alessia and Stamm, Aymeric and Vantini, Simone}, @@ -379,6 +422,20 @@ @incollection{pintado+romo_2005_depthbased isbn = {978-0-8218-3596-8} } +@inproceedings{ramos-carreno++_2022_scikitfda, + title = {{{scikit-fda}}: {{Computational}} Tools for Machine Learning with Functional Data}, + shorttitle = {Scikit-Fda}, + booktitle = {2022 {{IEEE}} 34th {{International Conference}} on {{Tools}} with {{Artificial Intelligence}} ({{ICTAI}})}, + author = {{Ramos-Carre{\~n}o}, Carlos and Torrecilla, Jos{\'e} Luis and Hong, Yujian and Su{\'a}rez, Alberto}, + year = {2022}, + month = oct, + pages = {213--218}, + issn = {2375-0197}, + doi = {10.1109/ICTAI56018.2022.00038}, + abstract = {Machine learning from functional data poses particular challenges that require specific computational tools that take into account their structure. In this work, we present scikit-fda, a Python library for functional data analysis, visualization, preprocessing, and machine learning. The library is designed for smooth integration in the Python scientific ecosystem. In particular, it complements and can be used in combination with scikit-learn, the reference Python library for machine learning. The functionality of scikit-fda is illustrated in clustering, regression, and classification problems from different areas of application.}, + keywords = {Data analysis,data visualization,Data visualization,Documentation,Ecosystems,Extrapolation,functional data analysis,Interpolation,machine learning,Machine learning,Python toolbox} +} + @inbook{ramsay+silverman_2005_functionala, title = {From Functional Data to Smooth Functions}, booktitle = {Functional Data Analysis}, @@ -436,6 +493,24 @@ @inbook{ramsay+silverman_2005_registration keywords = {Multivariate analysis} } +@incollection{romeo+marzoljaen_2014_analisis, + title = {{An\'alisis del viento y la niebla en el aeropuerto de Los Rodeos (Tenerife). Cambios y tendencias}}, + booktitle = {{Cambio clim\'atico y cambio global.}}, + author = {Romeo, V{\'i}ctor M. and Marzol Ja{\'e}n, M{\textordfeminine} Victoria}, + editor = {Fern{\'a}ndez Montes, Sonia and S{\'a}nchez Rodrigo, Fernando}, + year = {2014}, + series = {{Publicaciones de la Asociaci\'on Espa\~nola de Climatolog\'ia. Serie A.}}, + number = {9}, + pages = {325--334}, + publisher = {{Asociaci\'on Espa\~nola de Climatolog\'ia}}, + url = {https://repositorio.aemet.es/handle/20.500.11765/8173}, + urldate = {2022-07-28}, + abstract = {[ES]La particular localizaci\'on del aeropuerto Tenerife Norte-Los Rodeos, en el nordeste de la isla de Tenerife, es la causa de la modificaci\'on de la direcci\'on habitual de los vientos alisios del NE a vientos del NO, acompa\~nados frecuentemente por nubosidad. Esto ocasiona que en un n\'umero significativo de d\'ias al a\~no este aeropuerto est\'e cubierto por la niebla, lo que afecta de una forma muy importante a su operatividad. El objetivo del trabajo es caracterizar su r\'egimen de vientos y de la niebla durante los \'ultimos trece a\~nos (2000-2012) y comprobar si se han producido cambios significativos en ambas variables clim\'aticas con respecto a los periodos normales de 1961-1990 y 1971-2000. Los resultados indican que las dos direcciones dominantes del viento, NO y SE, mantienen sus frecuencias no as\'i la niebla que ha aumentado su presencia en los \'ultimos a\~nos.}, + copyright = {Licencia CC: Reconocimiento CC BY}, + isbn = {978-84-16027-69-9}, + langid = {spanish} +} + @article{ruiz-meana++_2003_cariporide, title = {Cariporide Preserves Mitochondrial Proton Gradient and Delays {{ATP}} Depletion in Cardiomyocytes during Ischemic Conditions}, author = {{Ruiz-Meana}, Marisol and {Garcia-Dorado}, David and Pina, Pilar and Inserte, Javier and Agull{\'o}, Luis and {Soler-Soler}, Jordi}, diff --git a/examples/full_examples/README.txt b/examples/full_examples/README.txt deleted file mode 100644 index 658318787..000000000 --- a/examples/full_examples/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -ICTAI examples -============== - -Examples of complete analyses showcasing the main functionalities of the scikit-fda package, -presented at the 34th IEEE International Conference on Tools with Artificial Intelligence (ICTAI). diff --git a/examples/full_examples/plot_aemet_unsupervised.py b/examples/plot_aemet_unsupervised.py similarity index 53% rename from examples/full_examples/plot_aemet_unsupervised.py rename to examples/plot_aemet_unsupervised.py index 8f8886c30..5d68b8fd0 100644 --- a/examples/full_examples/plot_aemet_unsupervised.py +++ b/examples/plot_aemet_unsupervised.py @@ -14,11 +14,13 @@ from typing import Any, Mapping, Tuple +import cartopy.crs as ccrs import matplotlib.pyplot as plt import numpy as np import sklearn.cluster +from cartopy.io.img_tiles import GoogleTiles from matplotlib.axes import Axes -from mpl_toolkits.basemap import Basemap +from matplotlib.figure import Figure from skfda.datasets import fetch_aemet from skfda.exploratory.depth import ModifiedBandDepth @@ -28,31 +30,56 @@ from skfda.ml.clustering import KMeans from skfda.preprocessing.dim_reduction import FPCA -############################################################################## -# We will first load the AEMET dataset and plot it. +# %% +# In this example we explore the curves of daily temperatures at +# different weather stations from Spain, included in the AEMET dataset\ +# :footcite:p:`meteorologicalstateagencyofspainaemet_2009_meteorological`. +# +# We first show how the visualization tools of scikit-fda can be used to +# detect and interpret magnitude and shape outliers. +# We also explain how to employ a clustering method on these temperature +# curves using scikit-fda. +# Then, the location of each weather station is plotted into a map of Spain +# with a different color according to its cluster. +# This reveals a remarkable resemblance to a Spanish climate map. +# A posterior analysis using functional principal component analysis (FPCA) +# explains how the two first principal components are related with relevant +# meteorological concepts, and can be used to reconstruct and interpret +# the original clustering. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% +# We first load the AEMET dataset and plot it. X, _ = fetch_aemet(return_X_y=True) X = X.coordinates[0] X.plot() plt.show() -############################################################################## +# %% # A boxplot can show magnitude outliers, in this case Navacerrada. +# Here the temperatures are lower than in the other curves, as this +# weather station is at a high altitude, near a ski resort. Boxplot( X, depth_method=ModifiedBandDepth(), ).plot() plt.show() -############################################################################## +# %% # A magnitude-shape plot can be used to detect shape outliers, such as the # Canary islands, with a less steeper temperature curve. +# The Canary islands are at a lower latitude, closer to the equator. +# Thus, they have a subtropical climate which presents less temperature +# variation during the year. MagnitudeShapePlot( X, ).plot() plt.show() -############################################################################## +# %% # We now attempt to cluster the curves using functional k-means. n_clusters = 5 n_init = 10 @@ -65,11 +92,11 @@ ) fda_clusters = fda_kmeans.fit_predict(X) -############################################################################## +# %% # We want to plot the cluster of each station in the map of Spain. We need to # define first auxiliary variables and functions for plotting. -coords_spain = (-10, 34.98, 5, 44.8) -coords_canary = (-18.5, 27.5, -13, 29.5) +coords_spain = (-10, 5, 34.98, 44.8) +coords_canary = (-18.5, -13, 27.5, 29.5) # It is easier to obtain the longitudes and latitudes from the data in # a Pandas dataframe. @@ -81,24 +108,20 @@ def create_map( coords: Tuple[float, float, float, float], - ax: Axes, -) -> Basemap: + figsize: Tuple[float, float], +) -> Figure: """Create a map for a region of the world.""" - basemap = Basemap( - *coords, - projection='merc', - resolution="h", - epsg=4326, - ax=ax, - fix_aspect=False, - ) - basemap.arcgisimage( - service='World_Imagery', - xpixels=1000, - dpi=100, - ) + tiler = GoogleTiles(style="satellite") + mercator = tiler.crs + + fig = plt.figure(figsize=figsize) + ax = fig.add_axes([0, 0, 1, 1], projection=mercator) + ax.set_extent(coords, crs=ccrs.PlateCarree()) + + ax.add_image(tiler, 8) + ax.set_adjustable('datalim') - return basemap + return fig def plot_cluster_points( @@ -106,19 +129,18 @@ def plot_cluster_points( latitudes: np.typing.NDArray[np.floating[Any]], clusters: np.typing.NDArray[np.integer[Any]], color_map: Mapping[int, str], - basemap: Basemap, ax: Axes, ) -> None: """Plot the stations in a map with their cluster color.""" - x, y = basemap(longitudes, latitudes) for cluster in range(n_clusters): selection = (clusters == cluster) ax.scatter( - x[selection], - y[selection], + longitudes[selection], + latitudes[selection], s=64, color=color_map[cluster], edgecolors='white', + transform=ccrs.Geodetic(), ) @@ -133,44 +155,59 @@ def plot_cluster_points( # Names of each climate (for this particular seed) climate_names = { - 0: "Cold-mountain", + 0: "Cold/mountain", 1: "Mediterranean", 2: "Atlantic", 3: "Subtropical", 4: "Continental", } -############################################################################## +# %% # We now plot the obtained clustering in the maps. +# +# It is possible to notice that each cluster seems to roughly correspond with +# a particular climate: +# +# - Red points, only present in the Canary islands, correspond to a subtropical +# climate. +# - Yellow points, in the Mediterranean coast, correspond to a Mediterranean +# climate. +# - Green points, in the Atlantic coast of mainland Spain, correspond to an +# Atlantic or oceanic climate. +# - Orange points, in the center of mainland Spain, correspond to a continental +# climate. +# - Finally the purple points are located at the coldest regions of Spain, such +# as in high mountain ranges. +# Thus, it can be associated with a cold or mountain climate. +# +# Notice that a couple of points in the Canary islands are not red. +# The purple one is easy to explain, as it is in mount Teide, the highest +# mountain of Spain. +# The yellow one is in the airport of Los Rodeos, which has its own +# microclimate\ :footcite:p:`romeo+marzoljaen_2014_analisis`. # Mainland -fig_spain = plt.figure(figsize=(8, 6)) -ax_spain = fig_spain.add_axes([0, 0, 1, 1]) -map_spain = create_map(coords_spain, ax=ax_spain) +fig_spain = create_map(coords_spain, figsize=(8, 6)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=fda_clusters, color_map=fda_color_map, - basemap=map_spain, - ax=ax_spain, + ax=fig_spain.axes[0], ) # Canary Islands -fig_canary = plt.figure(figsize=(8, 3)) -ax_canary = fig_canary.add_axes([0, 0, 1, 1]) -map_canary = create_map(coords_canary, ax=ax_canary) +fig_canary = create_map(coords_canary, figsize=(8, 3)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=fda_clusters, color_map=fda_color_map, - basemap=map_canary, - ax=ax_canary, + ax=fig_canary.axes[0], ) plt.show() -############################################################################## +# %% # We now can compute the first two principal components for interpretability, # and project the data over these directions. fpca = FPCA(n_components=2) @@ -182,12 +219,17 @@ def plot_cluster_points( X_red = fpca.transform(X) -############################################################################## +# %% # We now plot the first two principal components as perturbations over the # mean. # # The ``factor`` parameters is a number that multiplies each component in # order to make their effect more noticeable. +# +# It is possible to observe that the first component measures a global +# increase/decrease in temperature. +# The second component instead has the effect of increasing/decreasing +# the variability of the temperatures during the year. fig = plt.figure(figsize=(8, 4)) FPCAPlot( fpca.mean_, @@ -197,8 +239,15 @@ def plot_cluster_points( ).plot() plt.show() -############################################################################## -# We also plot the projections over the first two principal components. +# %% +# We also plot the projections over the first two principal components, +# keeping the cluster colors. +# Here we can easily observe the corresponding characteristics of each +# climate in terms of global temperature and annual variability. +# The two outliers of the Mediterranean climate correspond to the +# aforementioned airport of Los Rodeos, and to Tarifa, which is +# located at the strait of Gibraltar and thus receives also the cold +# waters of the Atlantic, explaining its lower annual variability. fig, ax = plt.subplots(1, 1) for cluster in range(n_clusters): selection = fda_clusters == cluster @@ -214,7 +263,7 @@ def plot_cluster_points( ax.legend() plt.show() -############################################################################## +# %% # We now attempt a multivariate clustering using only these projections. mv_kmeans = sklearn.cluster.KMeans( n_clusters=n_clusters, @@ -223,41 +272,44 @@ def plot_cluster_points( ) mv_clusters = mv_kmeans.fit_predict(X_red) -############################################################################## +# %% # We now plot the multivariate clustering in the maps. We define a different # color map to match cluster colors with the previously obtained ones. +# As you can see, the clustering using only the two first principal components +# matches almost perfectly with the original one, that used the complete +# temperature curves. mv_color_map = { - 0: "red", - 1: "purple", - 2: "yellow", - 3: "green", - 4: "orange", + 0: "yellow", + 1: "orange", + 2: "red", + 3: "purple", + 4: "green", } # Mainland -fig_spain = plt.figure(figsize=(8, 6)) -ax_spain = fig_spain.add_axes([0, 0, 1, 1]) -map_spain = create_map(coords_spain, ax=ax_spain) +fig_spain = create_map(coords_spain, figsize=(8, 6)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=mv_clusters, color_map=mv_color_map, - basemap=map_spain, - ax=ax_spain, + ax=fig_spain.axes[0], ) # Canary Islands -fig_canary = plt.figure(figsize=(8, 3)) -ax_canary = fig_canary.add_axes([0, 0, 1, 1]) -map_canary = create_map(coords_canary, ax=ax_canary) +fig_canary = create_map(coords_canary, figsize=(8, 3)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=mv_clusters, color_map=mv_color_map, - basemap=map_canary, - ax=ax_canary, + ax=fig_canary.axes[0], ) plt.show() + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/plot_functional_regression.py b/examples/plot_functional_regression.py new file mode 100644 index 000000000..640b29e9f --- /dev/null +++ b/examples/plot_functional_regression.py @@ -0,0 +1,89 @@ +""" +Functional Linear Regression with multivariate covariates. +========================================================== + +This example explores the use of the linear regression with +multivariate (scalar) covariates and functional response. + +""" + +# Author: Rafael Hidalgo Alejo +# License: MIT + +import numpy as np +import pandas as pd +from sklearn.preprocessing import OneHotEncoder + +import skfda +from skfda.ml.regression import LinearRegression +from skfda.representation.basis import FDataBasis, FourierBasis + +# %% +# In this example, we will demonstrate the use of the Linear Regression with +# functional response and multivariate covariates using the +# :func:`weather ` dataset. +# It is possible to divide the weather stations into four groups: +# Atlantic, Pacific, Continental and Artic. +# There are a total of 35 stations in this dataset. + +X_weather, y_weather = skfda.datasets.fetch_weather( + return_X_y=True, as_frame=True, +) +fd = X_weather.iloc[:, 0].values + +# %% +# The main goal is knowing about the effect of stations' geographic location +# on the shape of the temperature curves. +# So we will have a model with a functional response, the temperature curve, +# and five covariates. The first one is the intercept (all entries equal to 1) +# and it shows the contribution of the Canadian mean temperature. The remaining +# covariates use one-hot encoding, with 1 if that weather station is in the +# corresponding climate zone and 0 otherwise. + +# We first create the one-hot encoding of the climates. + +enc = OneHotEncoder(handle_unknown='ignore') +enc.fit([['Atlantic'], ['Continental'], ['Pacific']]) +X = np.array(y_weather).reshape(-1, 1) +X = enc.transform(X).toarray() + +# %% +# Then, we construct a dataframe with each covariate in a different column and +# the temperature curves (responses). + +X_df = pd.DataFrame(X) + +y_basis = FourierBasis(n_basis=65) +y_fd = fd.coordinates[0].to_basis(y_basis) + +# %% +# An intercept term is incorporated. +# All functional coefficients will have the same basis as the response. + +funct_reg = LinearRegression(fit_intercept=True) +funct_reg.fit(X_df, y_fd) + +# %% +# The regression coefficients are shown below. The first one is the intercept +# coefficient, corresponding to Canadian mean temperature. + +funct_reg.intercept_.plot() +funct_reg.coef_[0].plot() +funct_reg.coef_[1].plot() +funct_reg.coef_[2].plot() + +# %% +# Finally, it is shown a panel with the prediction for all climate zones. + +predictions = [] + +predictions.append(funct_reg.predict([[0, 1, 0, 0]])[0]) +predictions.append(funct_reg.predict([[0, 0, 1, 0]])[0]) +predictions.append(funct_reg.predict([[0, 0, 0, 1]])[0]) + +predictions_conc = FDataBasis.concatenate(*predictions) + +predictions_conc.argument_names = ('day',) +predictions_conc.coordinate_names = ('temperature (ºC)',) + +predictions_conc.plot() diff --git a/examples/full_examples/plot_phonemes_classification.py b/examples/plot_phonemes_classification.py similarity index 66% rename from examples/full_examples/plot_phonemes_classification.py rename to examples/plot_phonemes_classification.py index 75d0fd8e7..4b9eafd1e 100644 --- a/examples/full_examples/plot_phonemes_classification.py +++ b/examples/plot_phonemes_classification.py @@ -24,9 +24,22 @@ from skfda.preprocessing.registration import FisherRaoElasticRegistration from skfda.preprocessing.smoothing import KernelSmoother -############################################################################## -# We will first load the (binary) Phoneme dataset, restricted to the first -# 150 variables, and plot the first 20 functions. +# %% +# This example uses the Phoneme dataset\ :footcite:`hastie++_1995_penalized` +# containing the frequency curves of some common phonemes as pronounced by +# different people. +# We illustrate with this data the preprocessing and +# classification techniques available in scikit-fda. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% +# We will first load the (binary) Phoneme dataset and plot the first 20 +# functions. +# We restrict the data to the first 150 variables, as done in +# :footcite:t:`ferraty+vieu_2006_computational`, because most of the +# useful information is in the lower frequencies. X, y = fetch_phoneme(return_X_y=True) X = X[(y == 0) | (y == 1)] @@ -47,8 +60,11 @@ X[:n_plot].plot(group=y) plt.show() -############################################################################## -# We now smooth and plot the data again, as well as the class means. +# %% +# As we just saw, the curves are very noisy. +# We can leverage the continuity of the trajectories by smoothing, using +# a Nadaraya-Watson estimator. +# We then plot the data again, as well as the class means. smoother = KernelSmoother( NadarayaWatsonHatMatrix( bandwidth=0.1, @@ -66,8 +82,23 @@ X_smooth_ao.mean().plot(fig=fig, color="red", linewidth=3) plt.show() -############################################################################## -# We now register the data per class. As Fisher-Rao elastic registration is +# %% +# The smoothed curves are easier to interpret. +# Now, it is possible to appreciate the characteristic landmarks of each class, +# such as maxima or minima. +# However, not all these landmarks appear at the same frequency for each +# trajectory. +# One way to solve it is by registering (aligning) the data. +# We use Fisher-Rao elastic registration, a state-of-the-art registration +# method to illustrate the effect of registration. +# Although this registration method achieves very good results, it +# attempts to align all the curves to a common template. +# Thus, in order to clearly view the specific landmarks of each class we +# have to register the data per class. +# This also means that if the we cannot use this method for a classification +# task if the landmarks of each class are very different, as it is not able +# to do per-class registration with unlabeled data. +# As Fisher-Rao elastic registration is # very slow, we only register the plotted curves as an approximation. reg = FisherRaoElasticRegistration( penalty=0.01, @@ -83,7 +114,7 @@ X_reg_ao.mean().plot(fig=fig, color="red", linewidth=3) plt.show() -############################################################################## +# %% # We now split the smoothed data in train and test datasets. # Note that there is no data leakage because no parameters are fitted in # the smoothing step, but normally you would want to do all preprocessing in @@ -97,7 +128,7 @@ stratify=y, ) -############################################################################## +# %% # We use a k-nn classifier with a functional analog to the Mahalanobis # distance and a fixed number of neighbors. n_neighbors = int(np.sqrt(X_smooth.n_samples)) @@ -115,7 +146,7 @@ score = accuracy_score(y_test, y_pred) print(score) -############################################################################## +# %% # If we wanted to optimize hyperparameters, we can use scikit-learn tools. pipeline = Pipeline([ ("smoother", smoother), @@ -138,3 +169,9 @@ # y_pred = grid_search.predict(X_test) # score = accuracy_score(y_test, y_pred) # print(score) + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/full_examples/plot_tecator_regression.py b/examples/plot_tecator_regression.py similarity index 68% rename from examples/full_examples/plot_tecator_regression.py rename to examples/plot_tecator_regression.py index 96333ce4a..3c19dfa3a 100644 --- a/examples/full_examples/plot_tecator_regression.py +++ b/examples/plot_tecator_regression.py @@ -25,7 +25,19 @@ ) from skfda.representation.basis import BSplineBasis -############################################################################## +# %% +# This example uses the Tecator dataset\ +# :footcite:`borggaard+thodberg_1992_optimal` +# in order to illustrate the problems of functional regression +# and functional variable selection. +# This dataset contains the spectra of absorbances of several pieces of +# finely chopped meat, as well as the percent of its content in water, +# fat and protein. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% # We will first load the Tecator data, keeping only the fat content target, # and plot it. X, y = fetch_tecator(return_X_y=True) @@ -34,13 +46,17 @@ X.plot(gradient_criteria=y) plt.show() -############################################################################## -# We compute numerically the second derivative and plot it. +# %% +# For spectrometric data, the relevant information of the curves can often +# be found in the derivatives\ :footcite:`ferraty+vieu_2006_computational`. +# Thus, we compute numerically the second derivative and plot it. X_der = X.derivative(order=2) X_der.plot(gradient_criteria=y) plt.show() -############################################################################## +# %% +# We first apply a simple linear regression model to compute a baseline +# for our regression predictions. # In order to compute functional linear regression we first convert the data # to a basis expansion. basis = BSplineBasis( @@ -48,7 +64,7 @@ ) X_der_basis = X_der.to_basis(basis) -############################################################################## +# %% # We split the data in train and test, and compute the regression score using # the linear regression model. X_train, X_test, y_train, y_test = train_test_split( @@ -63,8 +79,15 @@ score = r2_score(y_test, y_pred) print(score) -############################################################################## -# Another approach is to use variable selection using maxima hunting. +# %% +# We now will take a different approach. +# It is possible to note from the plot of the derivatives that most information +# necessary for regression can be found at some particular "impact" points. +# Thus, we now apply a functional variable selection method to detect those +# points and use them with a multivariate classifier. +# The variable selection method that we employ here is maxima hunting\ +# :footcite:`berrendero++_2016_variable`, a filter method that computes a +# relevance score for each point of the curve and selects all the local maxima. var_sel = MaximaHunting( local_maxima_selector=RelativeLocalMaximaSelector(max_points=2), ) @@ -72,21 +95,21 @@ print(var_sel.indexes_) -############################################################################## +# %% # We can visualize the relevance function and the selected points. var_sel.dependence_.plot() for p in var_sel.indexes_: plt.axvline(X_der.grid_points[0][p], color="black") plt.show() -############################################################################## +# %% # We also can visualize the selected points on the curves. X_der.plot(gradient_criteria=y) for p in var_sel.indexes_: plt.axvline(X_der.grid_points[0][p], color="black") plt.show() -############################################################################## +# %% # We split the data again (using the same seed), but this time without the # basis expansion. X_train, X_test, y_train, y_test = train_test_split( @@ -95,7 +118,7 @@ random_state=0, ) -############################################################################## +# %% # We now make a pipeline with the variable selection and a multivariate linear # regression method for comparison. pipeline = Pipeline([ @@ -107,7 +130,7 @@ score = r2_score(y_test, y_predicted) print(score) -############################################################################## +# %% # We can use a tree regressor instead to improve both the score and the # interpretability. pipeline = Pipeline([ @@ -119,8 +142,14 @@ score = r2_score(y_test, y_predicted) print(score) -############################################################################## +# %% # We can plot the final version of the tree to explain every prediction. fig, ax = plt.subplots(figsize=(10, 10)) plot_tree(pipeline.named_steps["classifier"], precision=6, filled=True, ax=ax) plt.show() + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/pyproject.toml b/pyproject.toml index dc5ea6708..48c8132f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scikit-fda" description = "Functional Data Analysis Python package." readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.9" license = {file = "LICENSE.txt"} keywords = [ "functional data", @@ -20,7 +20,7 @@ classifiers = [ "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", @@ -30,11 +30,11 @@ dynamic = ["version"] dependencies = [ "dcor", - "fdasrsf>=2.2.0", + "fdasrsf>=2.2.0, !=2.5.7", "findiff", "lazy_loader", "matplotlib", - "multimethod>=1.5", + "multimethod>=1.5, !=1.11, != 1.11.1", "numpy>=1.16", "pandas>=1.0", "rdata", @@ -45,6 +45,19 @@ dependencies = [ ] [project.optional-dependencies] +docs = [ + "cartopy", + "ipykernel", + "jupyter-sphinx", + "myst-parser", + "pillow", + "pydata-sphinx-theme", + "pytest", + "setuptools>=41.2", + "sphinx>=3", + "sphinx-gallery", + "sphinxcontrib-bibtex", +] test = [ "pytest", "pytest-env", diff --git a/readthedocs-requirements.txt b/readthedocs-requirements.txt deleted file mode 100644 index f646304da..000000000 --- a/readthedocs-requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ --r ./requirements.txt -basemap -basemap-data -basemap-data-hires -ipykernel -Sphinx>=3 -pydata-sphinx-theme -sphinx-gallery -pillow -setuptools>=41.2 -jupyter-sphinx -myst-parser -pytest -sphinxcontrib-bibtex \ No newline at end of file diff --git a/readthedocs.yml b/readthedocs.yml index 2c148156d..46689c30d 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -5,6 +5,11 @@ # Required version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.9" + # Build documentation in the docs/ directory with Sphinx sphinx: builder: html @@ -18,8 +23,8 @@ sphinx: # Optionally set the version of Python and requirements required to build your docs python: - version: "3.8" install: - - requirements: readthedocs-requirements.txt - method: pip - path: . \ No newline at end of file + path: . + extra_requirements: + - docs \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2a806336a..000000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -matplotlib -numpy -scipy -setuptools -scikit-learn -multimethod>=1.2 -findiff diff --git a/setup.cfg b/setup.cfg index bcbba2d86..f628b58b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,6 +84,8 @@ ignore = WPS507, # Comparison with not is not the same as with equality WPS520, + # Found bad magic module function: {0} + WPS413 per-file-ignores = __init__.py: diff --git a/skfda/__init__.py b/skfda/__init__.py index 47a67be43..35402d18d 100644 --- a/skfda/__init__.py +++ b/skfda/__init__.py @@ -30,4 +30,4 @@ concatenate as concatenate, ) -__version__ = "0.9" +__version__ = "0.9.1" diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 387f22b58..bb0636fdd 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -20,6 +20,7 @@ "_same_domain", "_to_grid", "_to_grid_points", + "function_to_fdatabasis", "nquad_vec", ], '_warping': [ @@ -44,9 +45,9 @@ _same_domain as _same_domain, _to_grid as _to_grid, _to_grid_points as _to_grid_points, + function_to_fdatabasis as function_to_fdatabasis, nquad_vec as nquad_vec, ) - from ._warping import ( invert_warping as invert_warping, normalize_scale as normalize_scale, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9a6382b57..aebb5189f 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -4,7 +4,6 @@ import functools import numbers -from functools import singledispatch from typing import ( TYPE_CHECKING, Any, @@ -36,7 +35,7 @@ ArrayDTypeT = TypeVar("ArrayDTypeT", bound="np.generic") if TYPE_CHECKING: - from ..representation import FData, FDataGrid + from ..representation import FData, FDataBasis, FDataGrid from ..representation.basis import Basis from ..representation.extrapolation import ExtrapolationLike @@ -605,3 +604,36 @@ def _classifier_get_classes( f'one; got {classes.size} class', ) return classes, y_ind + + +def function_to_fdatabasis( + f: Callable[[NDArrayFloat], NDArrayFloat], + new_basis: Basis, +) -> FDataBasis: + """Express a math function as a FDataBasis with a given basis. + + Args: + f: math function. + new_basis: the basis of the output. + + Returns: + FDataBasis: FDataBasis with calculated coefficients and the new + basis. + """ + from .. import FDataBasis # noqa: WPS442 + from ..misc._math import inner_product_matrix + + if isinstance(f, FDataBasis) and f.basis == new_basis: + return f.copy() + + inner_prod = inner_product_matrix( + new_basis, + f, + _domain_range=new_basis.domain_range, + ) + + gram_matrix = new_basis.gram_matrix() + + coefs = np.linalg.solve(gram_matrix, inner_prod) + + return FDataBasis(new_basis, coefs.T) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index d345ebdcf..833e398fe 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -5,11 +5,12 @@ import numpy as np import pandas as pd -import rdata from pandas import DataFrame, Series from sklearn.utils import Bunch from typing_extensions import Literal +import rdata + from ..representation import FDataGrid from ..typing._numpy import NDArrayFloat, NDArrayInt @@ -292,6 +293,7 @@ def _fetch_fda_usc(name: str) -> Any: Discriminant Analysis. Ann. Statist. 23 (1995), no. 1, 73--102. doi:10.1214/aos/1176324456. https://projecteuclid.org/euclid.aos/1176324456 + """ @@ -886,14 +888,15 @@ def fetch_weather( Series of daily summaries of 73 spanish weather stations selected for the period 1980-2009. The dataset contains the geographic information of each station and the average for the period 1980-2009 of daily temperature, - daily precipitation and daily wind speed. Meteorological State Agency of - Spain (AEMET), http://www.aemet.es/. Government of Spain. - - Authors: - Manuel Febrero Bande, Manuel Oviedo de la Fuente + daily precipitation and daily wind speed. Source: + Meteorological State Agency of Spain (AEMET). (2009). + Meteorological data of Spanish weather stations between years 1980 + and 2009. [dataset]. http://www.aemet.es/ + The data were obtained from the FTP of AEMET in 2009. + """ diff --git a/skfda/exploratory/stats/__init__.py b/skfda/exploratory/stats/__init__.py index 0a1f3a6de..345eb30ac 100644 --- a/skfda/exploratory/stats/__init__.py +++ b/skfda/exploratory/stats/__init__.py @@ -19,6 +19,7 @@ "gmean", "mean", "modified_epigraph_index", + "std", "trim_mean", "var", ], @@ -37,6 +38,7 @@ gmean as gmean, mean as mean, modified_epigraph_index as modified_epigraph_index, + std as std, trim_mean as trim_mean, var as var, ) diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 8d5b2478c..5bf81b3b9 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,6 +1,7 @@ """Functional data descriptive statistics.""" from __future__ import annotations +import functools from builtins import isinstance from typing import Callable, TypeVar, Union @@ -11,7 +12,7 @@ from skfda._utils.ndfunction import average_function_value from ...misc.metrics._lp_distances import l2_distance -from ...representation import FData, FDataGrid +from ...representation import FData, FDataBasis, FDataGrid from ...typing._metric import Metric from ...typing._numpy import NDArrayFloat from ..depth import Depth, ModifiedBandDepth @@ -101,6 +102,64 @@ def cov( return X.cov(correction=correction) +@functools.singledispatch +def std(X: F, correction: int = 1) -> F: + r""" + Compute the standard deviation of all the samples in a FData object. + + .. math:: + \text{std}_X(t) = \sqrt{\frac{1}{N-\text{correction}} + \sum_{n=1}^{N}{\left(X_n(t) - \overline{X}(t)\right)^2}} + + Args: + X: Object containing all the samples whose standard deviation is + wanted. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the number of + elements. Default: `0`. + + Returns: + Standard deviation of all the samples in the original object, as a + :term:`functional data object` with just one sample. + + """ + raise NotImplementedError("Not implemented for this type") + + +@std.register +def std_fdatagrid(X: FDataGrid, correction: int = 1) -> FDataGrid: + """Compute the standard deviation of a FDataGrid.""" + return X.copy( + data_matrix=np.std( + X.data_matrix, axis=0, ddof=correction, + )[np.newaxis, ...], + sample_names=(None,), + ) + + +@std.register +def std_fdatabasis(X: FDataBasis, correction: int = 1) -> FDataBasis: + """Compute the standard deviation of a FDataBasis.""" + from ..._utils import function_to_fdatabasis + + basis = X.basis + coeff_cov_matrix = np.cov( + X.coefficients, rowvar=False, ddof=correction, + ).reshape((basis.n_basis, basis.n_basis)) + + def std_function(t_points: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 + basis_evaluation = basis(t_points).reshape((basis.n_basis, -1)) + std_values = np.sqrt( + np.sum( + basis_evaluation * (coeff_cov_matrix @ basis_evaluation), + axis=0, + ), + ) + return np.reshape(std_values, (1, -1, X.dim_codomain)) + + return function_to_fdatabasis(f=std_function, new_basis=X.basis) + + def modified_epigraph_index(X: FDataGrid) -> NDArrayFloat: """ Calculate the Modified Epigraph Index of a FDataGrid. diff --git a/skfda/exploratory/visualization/fpca.py b/skfda/exploratory/visualization/fpca.py index 94f53f561..54d8914f9 100644 --- a/skfda/exploratory/visualization/fpca.py +++ b/skfda/exploratory/visualization/fpca.py @@ -105,8 +105,8 @@ def _get_component_perturbations(self, index: int = 0) -> FData: raise AttributeError("X must be a FData object") perturbations = self.mean.copy() perturbations = perturbations.concatenate( - perturbations[0] + self.multiple * self.components[index], + perturbations[0] + self.factor * self.components[index], ) return perturbations.concatenate( - perturbations[0] - self.multiple * self.components[index], + perturbations[0] - self.factor * self.components[index], ) diff --git a/skfda/inference/anova/_anova_oneway.py b/skfda/inference/anova/_anova_oneway.py index 9b4752c20..d95430b71 100644 --- a/skfda/inference/anova/_anova_oneway.py +++ b/skfda/inference/anova/_anova_oneway.py @@ -182,7 +182,13 @@ def v_asymptotic_stat( .. footbibliography:: """ - return float(_v_asymptotic_stat_with_reps(*fd, weights=weights, p=p)) + return float( + _v_asymptotic_stat_with_reps( + *fd, + weights=weights, + p=p, + ).reshape(()) + ) def _anova_bootstrap( diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index 94b440504..5d63f9100 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -12,7 +12,7 @@ import numpy as np import scipy.integrate -from .._utils import nquad_vec +from .._utils import _same_domain, nquad_vec from ..representation import FData, FDataBasis, FDataGrid from ..representation.basis import Basis from ..typing._base import DomainRange @@ -243,6 +243,7 @@ def inner_product( Args: arg1: First sample. arg2: Second sample. + kwargs: Additional parameters for the function. Returns: Vector with the inner products of each pair of samples. @@ -388,10 +389,17 @@ def _inner_product_fdatabasis( arg2: Union[FDataBasis, Basis], *, _matrix: bool = False, + _domain_range: Optional[DomainRange] = None, inner_product_matrix: Optional[NDArrayFloat] = None, force_numerical: bool = False, ) -> NDArrayFloat: + if not _same_domain(arg1, arg2): + raise ValueError("Both Objects should have the same domain_range") + + if _domain_range and not np.array_equal(arg1.domain_range, _domain_range): + raise ValueError("_domain_range should be the same as arg objects") + if isinstance(arg1, Basis): arg1 = arg1.to_basis() @@ -479,8 +487,15 @@ def _inner_product_integrate( def integrand(*args: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 f_args = np.asarray(args) - f1 = arg1(f_args)[:, 0, :] - f2 = arg2(f_args)[:, 0, :] + try: + f1 = arg1(f_args)[:, 0, :] + except Exception: + f1 = arg1(f_args) + + try: + f2 = arg2(f_args)[:, 0, :] + except Exception: + f2 = arg2(f_args) if _matrix: ret = np.einsum('n...,m...->nm...', f1, f2) diff --git a/skfda/misc/metrics/_utils.py b/skfda/misc/metrics/_utils.py index 9850057a0..30d1fae7e 100644 --- a/skfda/misc/metrics/_utils.py +++ b/skfda/misc/metrics/_utils.py @@ -120,8 +120,8 @@ class NormInducedMetric(Metric[VectorType]): >>> l2_distance = NormInducedMetric(l2_norm) >>> d = l2_distance(fd, fd2) - >>> float('%.3f'% d) - 0.289 + >>> float(d[0]) + 0.288... """ diff --git a/skfda/ml/classification/_logistic_regression.py b/skfda/ml/classification/_logistic_regression.py index ae95a1380..37ff0e4f6 100644 --- a/skfda/ml/classification/_logistic_regression.py +++ b/skfda/ml/classification/_logistic_regression.py @@ -62,8 +62,8 @@ class LogisticRegression( >>> from skfda.datasets import make_gaussian_process >>> from skfda.ml.classification import LogisticRegression >>> - >>> n_samples = 10000 - >>> n_features = 200 + >>> n_samples = 2000 + >>> n_features = 101 >>> >>> def mean_1(t): ... return (np.abs(t - 0.25) @@ -90,7 +90,7 @@ class LogisticRegression( >>> np.allclose(sorted(lr.points_), [0.25, 0.5, 0.75], rtol=1e-2) True >>> lr.score(X[1::2],y[1::2]) - 0.7498 + 0.768 References: .. footbibliography:: @@ -132,11 +132,9 @@ def fit( # noqa: D102, WPS210 (self.max_features, n_features), ) - penalty = 'none' if self.penalty is None else self.penalty - # multivariate logistic regression mvlr = mvLogisticRegression( - penalty=penalty, + penalty=self.penalty, C=self.C, solver=self.solver, max_iter=self.max_iter, diff --git a/skfda/ml/clustering/_hierarchical.py b/skfda/ml/clustering/_hierarchical.py index c8f63cf81..75e06e8f3 100644 --- a/skfda/ml/clustering/_hierarchical.py +++ b/skfda/ml/clustering/_hierarchical.py @@ -173,7 +173,7 @@ def _init_estimator(self) -> None: self._estimator = sklearn.cluster.AgglomerativeClustering( n_clusters=self.n_clusters, - affinity='precomputed', + metric='precomputed', memory=self.memory, connectivity=self.connectivity, compute_full_tree=self.compute_full_tree, diff --git a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index d01c3fdc4..3fb0a892c 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -10,6 +10,7 @@ "_kernel_regression": ["KernelRegression"], "_linear_regression": ["LinearRegression"], "_fpca_regression": ["FPCARegression"], + "_fpls_regression": ["FPLSRegression"], "_neighbors_regression": [ "KNeighborsRegressor", "RadiusNeighborsRegressor", @@ -19,6 +20,7 @@ if TYPE_CHECKING: from ._fpca_regression import FPCARegression + from ._fpls_regression import FPLSRegression as FPLSRegression from ._historical_linear_model import ( HistoricalLinearRegression as HistoricalLinearRegression, ) diff --git a/skfda/ml/regression/_coefficients.py b/skfda/ml/regression/_coefficients.py index e9cd9f72d..fe6abde11 100644 --- a/skfda/ml/regression/_coefficients.py +++ b/skfda/ml/regression/_coefficients.py @@ -157,8 +157,12 @@ def coefficient_info_from_covariate( def _coefficient_info_from_covariate_ndarray( # type: ignore[misc] X: NDArrayFloat, y: NDArrayFloat, + basis: Basis = None, **_: Any, ) -> CoefficientInfo[NDArrayFloat]: + if basis: + return CoefficientInfoNdarray(basis=basis) + return CoefficientInfoNdarray(basis=np.identity(X.shape[1], dtype=X.dtype)) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py new file mode 100644 index 000000000..ea4dd4eac --- /dev/null +++ b/skfda/ml/regression/_fpls_regression.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import Any, TypeVar, Union + +from sklearn.utils.validation import check_is_fitted + +from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin +from ...misc.regularization import L2Regularization +from ...preprocessing.dim_reduction import FPLS +from ...representation import FDataGrid +from ...representation.basis import Basis, FDataBasis +from ...typing._numpy import NDArrayFloat + +InputType = TypeVar( + "InputType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + +OutputType = TypeVar( + "OutputType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + + +class FPLSRegression( + BaseEstimator, + RegressorMixin[InputType, OutputType], +): + r""" + Regression using Functional Partial Least Squares. + + Parameters: + n_components: Number of components to keep. + By default all available components are utilized. + regularization_X: Regularization for the calculation of the X weights. + weight_basis_X: Basis to use for the X block. Only + applicable if X is a FDataBasis. Otherwise it must be None. + weight_basis_Y: Basis to use for the Y block. Only + applicable if Y is a FDataBasis. Otherwise it must be None. + + Attributes: + coef\_: Coefficients of the linear model. + fpls\_: FPLS object used to fit the model. + + Examples: + Fit a FPLS regression model with two components. + + >>> from skfda.ml.regression import FPLSRegression + >>> from skfda.datasets import fetch_tecator + >>> from skfda.representation import FDataGrid + >>> from skfda.typing._numpy import NDArrayFloat + + >>> X, y = fetch_tecator(return_X_y=True) + >>> fpls = FPLSRegression[FDataGrid, NDArrayFloat](n_components=2) + >>> fpls = fpls.fit(X, y) + + """ + + def __init__( + self, + n_components: int | None = None, + regularization_X: L2Regularization[Any] | None = None, + weight_basis_X: Basis | None = None, + weight_basis_Y: Basis | None = None, + _integration_weights_X: NDArrayFloat | None = None, + _integration_weights_Y: NDArrayFloat | None = None, + ) -> None: + self.n_components = n_components + self._integration_weights_X = _integration_weights_X + self._integration_weights_Y = _integration_weights_Y + self.regularization_X = regularization_X + self.weight_basis_X = weight_basis_X + self.weight_basis_Y = weight_basis_Y + + def fit( + self, + X: InputType, + y: OutputType, + ) -> FPLSRegression[InputType, OutputType]: + """ + Fit the model using the data for both blocks. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ + self.fpls_ = FPLS[InputType, OutputType]( + n_components=self.n_components, + regularization_X=self.regularization_X, + component_basis_X=self.weight_basis_X, + component_basis_Y=self.weight_basis_Y, + _integration_weights_X=self._integration_weights_X, + _integration_weights_Y=self._integration_weights_Y, + _deflation_mode="reg", + ) + + self.fpls_.fit(X, y) + + self.coef_ = ( + self.fpls_.x_rotations_matrix_ + @ self.fpls_.y_loadings_matrix_.T + ) + return self + + def predict(self, X: InputType) -> OutputType: + """Predict using the model. + + Args: + X: Data to predict. + + Returns: + Predicted values. + """ + check_is_fitted(self) + return self.fpls_.inverse_transform_y(self.fpls_.transform_x(X)) diff --git a/skfda/ml/regression/_linear_regression.py b/skfda/ml/regression/_linear_regression.py index 8b1ba5fee..f672a2923 100644 --- a/skfda/ml/regression/_linear_regression.py +++ b/skfda/ml/regression/_linear_regression.py @@ -2,17 +2,27 @@ import itertools import warnings -from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + Iterable, + List, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np import pandas as pd from sklearn.utils.validation import check_is_fitted +from ..._utils import function_to_fdatabasis, nquad_vec from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin from ...misc.lstsq import solve_regularized_weighted_lstsq from ...misc.regularization import L2Regularization, compute_penalty_matrix -from ...representation import FData -from ...representation.basis import Basis +from ...representation import FData, FDataBasis +from ...representation.basis import Basis, ConstantBasis from ...typing._numpy import NDArrayFloat from ._coefficients import CoefficientInfo, coefficient_info_from_covariate @@ -56,13 +66,17 @@ class LinearRegression( NDArrayFloat, ], ): - r"""Linear regression with multivariate response. + r"""Linear regression with multivariate and functional response. This is a regression algorithm equivalent to multivariate linear regression, but accepting also functional data expressed in a basis expansion. - The model assumed by this method is: + Functional linear regression model is subdivided into three broad + categories, depending on whether the responses or the covariates, + or both, are curves. + + Particulary, when the response is scalar, the model assumed is: .. math:: y = w_0 + w_1 x_1 + \ldots + w_p x_p + \int w_{p+1}(t) x_{p+1}(t) dt \ @@ -71,24 +85,47 @@ class LinearRegression( where the covariates can be either multivariate or functional and the response is multivariate. + When the response is functional, the model assumed is: + + .. math:: + y(t) = \boldsymbol{X} \boldsymbol{\beta}(t) + + where the covariates are multivariate and the response is functional; or: + + .. math:: + y(t) = \boldsymbol{X}(t) \boldsymbol{\beta}(t) + + if the model covariates are also functional (concurrent). + + .. deprecated:: 0.8. Usage of arguments of type sequence of FData, ndarray is deprecated in methods fit, predict. Use covariate parameters of type pandas.DataFrame instead. - .. warning:: - For now, only scalar responses are supported. + Warning: + For now, only multivariate and concurrent convariates are supported + when the response is functional. + + Warning: + This functionality is still experimental. Users should be aware that + the API will suffer heavy changes in the future. Args: coef_basis (iterable): Basis of the coefficient functions of the - functional covariates. If multivariate data is supplied, their - corresponding entries should be ``None``. If ``None`` is provided - for a functional covariate, the same basis is assumed. If this - parameter is ``None`` (the default), it is assumed that ``None`` - is provided for all covariates. + functional covariates. + When the response is scalar, if multivariate data is supplied, + their corresponding entries should be ``None``. If ``None`` is + provided for a functional covariate, the same basis is assumed. + If this parameter is ``None`` (the default), it is assumed that + ``None`` is provided for all covariates. + When the response is functional, if ``None`` is provided, the + response basis is used for all covariates. fit_intercept: Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). + When the response is functional, a coef_basis for intercept must be + supplied. regularization (int, iterable or :class:`Regularization`): If it is not a :class:`Regularization` object, linear differential operator regularization is assumed. If it @@ -104,118 +141,160 @@ class LinearRegression( Attributes: coef\_: A list containing the weight coefficient for each - covariate. For multivariate data, the covariate is a Numpy array. - For functional data, the covariate is a FDataBasis object. + covariate. intercept\_: Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. Examples: - >>> from skfda.ml.regression import LinearRegression - >>> from skfda.representation.basis import (FDataBasis, MonomialBasis, - ... ConstantBasis) - >>> import pandas as pd - Multivariate linear regression can be used with functions expressed in + Functional linear regression can be used with functions expressed in a basis. Also, a functional basis for the weights can be specified: + >>> from skfda.ml.regression import LinearRegression + >>> from skfda.representation.basis import ( + ... FDataBasis, + ... MonomialBasis, + ... ConstantBasis, + ... ) + >>> x_basis = MonomialBasis(n_basis=3) - >>> x_fd = FDataBasis(x_basis, [[0, 0, 1], - ... [0, 1, 0], - ... [0, 1, 1], - ... [1, 0, 1]]) - >>> y = [2, 3, 4, 5] - >>> linear = LinearRegression() - >>> _ = linear.fit(x_fd, y) - >>> linear.coef_[0] + >>> X_train = FDataBasis( + ... basis=x_basis, + ... coefficients=[ + ... [0, 0, 1], + ... [0, 1, 0], + ... [0, 1, 1], + ... [1, 0, 1], + ... ], + ... ) + >>> y_train = [2, 3, 4, 5] + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression() + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=3), coefficients=[[-15. 96. -90.]], ...) - >>> linear.intercept_ + >>> linear_reg.intercept_ array([ 1.]) - >>> linear.predict(x_fd) + >>> linear_reg.predict(X_test) array([ 2., 3., 4., 5.]) - Covariates can include also multivariate data: + Covariates can include also multivariate data. + In order to mix functional and multivariate data a dataframe can + be used: + >>> import pandas as pd >>> x_basis = MonomialBasis(n_basis=2) - >>> x_fd = FDataBasis(x_basis, [[0, 2], - ... [0, 4], - ... [1, 0], - ... [2, 0], - ... [1, 2], - ... [2, 2]]) - >>> x = [[1, 7], [2, 3], [4, 2], [1, 1], [3, 1], [2, 5]] - >>> y = [11, 10, 12, 6, 10, 13] - >>> linear = LinearRegression( - ... coef_basis=[None, ConstantBasis()]) - >>> _ = linear.fit([x, x_fd], y) - >>> linear.coef_[0] - array([ 2., 1.]) - >>> linear.coef_[1] + >>> X_train = pd.DataFrame({ + ... "functional_covariate": FDataBasis( + ... basis=x_basis, + ... coefficients=[ + ... [0, 2], + ... [0, 4], + ... [1, 0], + ... [2, 0], + ... [1, 2], + ... [2, 2], + ... ] + ... ), + ... "multivariate_covariate_1": [1, 2, 4, 1, 3, 2], + ... "multivariate_covariate_2": [7, 3, 2, 1, 1, 5], + ... }) + >>> y_train = [11, 10, 12, 6, 10, 13] + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression( + ... coef_basis=[ConstantBasis(), None, None], + ... ) + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( - basis=ConstantBasis(domain_range=((0.0, 1.0),), n_basis=1), - coefficients=[[ 1.]], - ...) - >>> linear.intercept_ + basis=ConstantBasis(domain_range=((0.0, 1.0),), n_basis=1), + coefficients=[[ 1.]], + ...) + >>> linear_reg.coef_[1] + array([ 2.]) + >>> linear_reg.coef_[2] + array([ 1.]) + >>> linear_reg.intercept_ array([ 1.]) - >>> linear.predict([x, x_fd]) + >>> linear_reg.predict(X_test) array([ 11., 10., 12., 6., 10., 13.]) - Funcionality with pandas Dataframe. - - First example: - - >>> x_basis = MonomialBasis(n_basis=3) - >>> x_fd = FDataBasis(x_basis, [[0, 0, 1], - ... [0, 1, 0], - ... [0, 1, 1], - ... [1, 0, 1]]) - >>> cov_dict = { "fd": x_fd } - >>> y = [2, 3, 4, 5] - >>> df = pd.DataFrame(cov_dict) - >>> linear = LinearRegression() - >>> _ = linear.fit(df, y) - >>> linear.coef_[0] + Response can be functional when covariates are multivariate: + + >>> y_basis = MonomialBasis(n_basis=3) + >>> X_train = pd.DataFrame({ + ... "covariate_1": [3, 5, 3], + ... "covariate_2": [4, 1, 2], + ... "covariate_3": [1, 6, 8], + ... }) + >>> y_train = FDataBasis( + ... basis=y_basis, + ... coefficients=[ + ... [47, 22, 24], + ... [43, 47, 39], + ... [40, 53, 51], + ... ], + ... ) + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression( + ... regularization=None, + ... fit_intercept=False, + ... ) + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=3), - coefficients=[[-15. 96. -90.]], + coefficients=[[ 6. 3. 1.]], ...) - >>> linear.intercept_ - array([ 1.]) - >>> linear.predict(df) - array([ 2., 3., 4., 5.]) + >>> linear_reg.predict(X_test) + FDataBasis( + basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=3), + coefficients=[[ 47. 22. 24.] + [ 43. 47. 39.] + [ 40. 53. 51.]], + ...) - Second example: + Concurrent function-on-function regression is also supported: - >>> x_basis = MonomialBasis(n_basis=2) - >>> x_fd = FDataBasis(x_basis, [[0, 2], - ... [0, 4], - ... [1, 0], - ... [2, 0], - ... [1, 2], - ... [2, 2]]) - >>> mult1 = np.asarray([1, 2, 4, 1, 3, 2]) - >>> mult2 = np.asarray([7, 3, 2, 1, 1, 5]) - >>> cov_dict = {"m1": mult1, "m2": mult2, "fd": x_fd} - >>> df = pd.DataFrame(cov_dict) - >>> y = [11, 10, 12, 6, 10, 13] - >>> linear = LinearRegression( - ... coef_basis=[None, ConstantBasis(), ConstantBasis()]) - >>> _ = linear.fit(df, y) - >>> linear.coef_[0] - array([ 2.]) - >>> linear.coef_[1] - array([ 1.]) - >>> linear.coef_[2] + >>> x_basis = MonomialBasis(n_basis=3) + >>> y_basis = MonomialBasis(n_basis=2) + >>> X_train = FDataBasis( + ... basis=x_basis, + ... coefficients=[ + ... [0, 1, 0], + ... [2, 1, 0], + ... [5, 0, 1], + ... ], + ... ) + >>> y_train = FDataBasis( + ... basis=y_basis, + ... coefficients=[ + ... [1, 1], + ... [2, 1], + ... [3, 1], + ... ], + ... ) + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression( + ... coef_basis=[y_basis], + ... fit_intercept=False, + ... ) + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( - basis=ConstantBasis(domain_range=((0.0, 1.0),), n_basis=1), - coefficients=[[ 1.]], + basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=2), + coefficients=[[ 0.68250377 0.09960705]], ...) - >>> linear.intercept_ - array([ 1.]) - >>> linear.predict(df) - array([ 11., 10., 12., 6., 10., 13.]) + >>> linear_reg.predict(X_test) + FDataBasis( + basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=2), + coefficients=[[-0.01660117 0.78211082] + [ 1.34840637 0.98132492] + [ 3.27884682 1.27018536]], + ...) """ @@ -233,7 +312,7 @@ def __init__( def fit( # noqa: D102 self, X: Union[AcceptedDataType, Sequence[AcceptedDataType], pd.DataFrame], - y: NDArrayFloat, + y: AcceptedDataType, sample_weight: Optional[NDArrayFloat] = None, ) -> LinearRegression: @@ -246,30 +325,33 @@ def fit( # noqa: D102 regularization: RegularizationIterableType = self.regularization + coef_basis = self._coef_basis + if self.fit_intercept: new_x = np.ones((len(y), 1)) X_new = [new_x] + list(X_new) + + intercept_basis = self._choose_beta_basis( + coef_basis=None, + x_basis=None, + y_basis=y.basis if isinstance(y, FDataBasis) else None, + ) + coef_basis = [intercept_basis] + coef_basis + new_coef_info_list: List[AcceptedDataCoefsType] = [ - coefficient_info_from_covariate(new_x, y), + coefficient_info_from_covariate( + new_x, + y, + basis=intercept_basis, + ), ] coef_info = new_coef_info_list + list(coef_info) - if isinstance(regularization, Iterable): - regularization = itertools.chain([None], regularization) - elif regularization is not None: - regularization = (None, regularization) - - inner_products_list = [ - c.regression_matrix(x, y) # type: ignore[arg-type] - for x, c in zip(X_new, coef_info) - ] - - # This is C @ J - inner_products = np.concatenate(inner_products_list, axis=1) - - if sample_weight is not None: - inner_products = inner_products * np.sqrt(sample_weight) - y = y * np.sqrt(sample_weight) + if not self._functional_response: + if isinstance(regularization, Iterable): + regularization = itertools.chain([None], regularization) + elif regularization is not None: + regularization = (None, regularization) penalty_matrix = compute_penalty_matrix( basis_iterable=(c.basis for c in coef_info), @@ -277,25 +359,80 @@ def fit( # noqa: D102 regularization=regularization, ) - if self.fit_intercept and penalty_matrix is not None: - # Intercept is not penalized - penalty_matrix[0, 0] = 0 + if self._functional_response: + coef_lengths = [] + left_inner_products_list = [] + right_inner_products_list = [] + + Xt = self._make_transpose(X_new) + + for i, basis_i in enumerate(coef_basis): + coef_lengths.append(basis_i.n_basis) + row = [] + right_inner_products_list.append( + self._weighted_inner_product_integrate( + basis_i, + ConstantBasis(), + Xt[i], + y, + ), + ) + for j, basis_j in enumerate(coef_basis): + row.append( + self._weighted_inner_product_integrate( + basis_i, + basis_j, + Xt[i], + Xt[j], + ), + ) + left_inner_products_list.append(row) + + coef_lengths.pop() + left_inner_products = np.block(left_inner_products_list) + right_inner_products = np.concatenate(right_inner_products_list) + + if penalty_matrix is not None: + left_inner_products += penalty_matrix + + basiscoefs = np.linalg.solve( + left_inner_products, + right_inner_products, + ) + else: + if self.fit_intercept and penalty_matrix is not None: + # Intercept is not penalized + penalty_matrix[0, 0] = 0 - basiscoefs = solve_regularized_weighted_lstsq( - coefs=inner_products, - result=y, - penalty_matrix=penalty_matrix, - ) + inner_products_list = [ + c.regression_matrix(x, y) # type: ignore[arg-type] + for x, c in zip(X_new, coef_info) + ] + + # This is C @ J + inner_products = np.concatenate(inner_products_list, axis=1) + + if sample_weight is not None: + inner_products = inner_products * np.sqrt(sample_weight) + y = y * np.sqrt(sample_weight) + + basiscoefs = solve_regularized_weighted_lstsq( + coefs=inner_products, + result=y, + penalty_matrix=penalty_matrix, + ) + + coef_lengths = np.array([k.shape[1] for k in inner_products_list]) - coef_lengths = np.array([i.shape[1] for i in inner_products_list]) coef_start = np.cumsum(coef_lengths) basiscoef_list = np.split(basiscoefs, coef_start) # Express the coefficients in functional form - coefs = [ - c.convert_from_constant_coefs(bcoefs) - for c, bcoefs in zip(coef_info, basiscoef_list) - ] + coefs = self._convert_from_constant_coefs( + basiscoef_list, + coef_info, + coef_basis, + ) if self.fit_intercept: self.intercept_ = coefs[0] @@ -304,6 +441,7 @@ def fit( # noqa: D102 self.intercept_ = np.zeros(1) self.coef_ = coefs + self.basis_coefs = basiscoef_list self._coef_info = coef_info self._target_ndim = y.ndim @@ -311,27 +449,44 @@ def fit( # noqa: D102 def predict( # noqa: D102 self, - X: Union[AcceptedDataType, Sequence[AcceptedDataType], pd.DataFrame], + X: Union[Sequence[AcceptedDataType], pd.DataFrame], ) -> NDArrayFloat: check_is_fitted(self) X = self._argcheck_X(X) + result = [] - result = np.sum( - [ - coef_info.inner_product(coef, x) # type: ignore[arg-type] - for coef, x, coef_info - in zip(self.coef_, X, self._coef_info) - ], - axis=0, - ) + for coef, x, coef_info in zip(self.coef_, X, self._coef_info): + if self._functional_response: + def prediction(arg, x_eval=x, coef_eval=coef): # noqa: WPS430 + if isinstance(x_eval, Callable): + x_eval = x_eval(arg) # noqa: WPS220 + + # TODO: MIRAR ESTO BIEN + x_eval = x_eval.reshape((-1, 1, 1)) - result += self.intercept_ + return coef_eval(arg) * x_eval + + result.append( + function_to_fdatabasis(prediction, self._y_basis), + ) + else: + result.append(coef_info.inner_product(coef, x)) + + result = sum(result[1:], start=result[0]) + + if self.fit_intercept: + if self._functional_response: + result = result + function_to_fdatabasis( + self.intercept_, self._y_basis, + ) + else: + result += self.intercept_ if self._target_ndim == 1: result = result.ravel() - return result # type: ignore[no-any-return] + return result def _argcheck_X( # noqa: N802 self, @@ -352,15 +507,92 @@ def _argcheck_X( # noqa: N802 elif isinstance(X, pd.DataFrame): X = self._dataframe_conversion(X) - X = [ + return ([ x if isinstance(x, FData) else self._check_and_convert(x) for x in X - ] + ]) + + def _weighted_inner_product_integrate( + self, + basis: Union[FDataBasis, Basis], + transposed_basis: Union[FDataBasis, Basis], + transposed_weight: Union[FDataBasis, NDArrayFloat], + weight: Union[FDataBasis, NDArrayFloat], + ) -> NDArrayFloat: + r""" + Return the weighted inner product matrix between its arguments. + + For two basis (:math:\theta_1) and (:math:\theta_2) the weighted + inner product is defined as: + + . math:: + \int \boldsymbol{\theta}_1 (t) \boldsymbol{\theta}^T_2 (t) w(t) dt + + where w(t) is defined by a functional vectors product: + + . math:: + \boldsymbol{w}^T_1 (t) \boldsymbol{w}_2 (t) + + Args: + basis: Vertical basis vector. + transposed_basis: Horizontal basis vector. + transposed_weight: First weight, horizontal functional vector. + weight: Second weight, vertical functional vector. + + Returns: + Inner product matrix between basis and weight. + + """ + if isinstance(basis, Basis): + basis = basis.to_basis() + + if isinstance(transposed_basis, Basis): + transposed_basis = transposed_basis.to_basis() + + if isinstance(transposed_weight, FData) and not np.array_equal( + basis.domain_range, + transposed_weight.domain_range, + ): + raise ValueError("Domain range for weight and basis must be equal") - if all(not isinstance(i, FData) for i in X): - warnings.warn("All the covariates are scalar.") + domain_range = basis.domain_range - return X + def integrand(args): # noqa: WPS430 + eval_basis = basis(args)[:, 0, :] + eval_transposed_basis = transposed_basis(args)[:, 0, :] + + if isinstance(transposed_weight, FData): + eval_transposed_weight = transposed_weight(args)[:, 0, :] + else: + eval_transposed_weight = transposed_weight.T + + if isinstance(weight, FData): + eval_weight = weight(args)[:, 0, :] + else: + eval_weight = weight.T + + return eval_basis * eval_transposed_basis.T * np.dot( + eval_transposed_weight.T, eval_weight, + ) + + return nquad_vec( + integrand, + domain_range, + ) + + def _make_transpose( + self, + X: Sequence[AcceptedDataType], + ) -> Sequence[AcceptedDataType]: + Xt: list[AcceptedDataType] = [] + for x in X: + if isinstance(x, FData): + Xt.append(x) + else: + x_new = self._check_and_convert(x) + Xt.append(x_new.T) + + return Xt def _check_and_convert( self, @@ -379,37 +611,70 @@ def _check_and_convert( new_X = new_X[:, np.newaxis] return new_X + def _convert_from_constant_coefs( + self, + coef_list: Sequence[NDArrayFloat], + coef_info: Sequence[AcceptedDataCoefsType], + coef_basis: Sequence[Basis], + ) -> Sequence[FDataBasis]: + if self._functional_response: + return [ + FDataBasis(basis=basis, coefficients=coef.T) + for basis, coef in zip(coef_basis, coef_list) + ] + return [ + c.convert_from_constant_coefs(bcoefs) + for c, bcoefs in zip(coef_info, coef_list) + ] + def _argcheck_X_y( # noqa: N802 self, X: Union[AcceptedDataType, Sequence[AcceptedDataType], pd.DataFrame], - y: NDArrayFloat, + y: Union[AcceptedDataType, Sequence[AcceptedDataType]], sample_weight: Optional[NDArrayFloat] = None, coef_basis: Optional[BasisCoefsType] = None, ) -> ArgcheckResultType: """Do some checks to types and shapes.""" new_X = self._argcheck_X(X) - if any(isinstance(i, FData) for i in y): - raise ValueError( - "Some of the response variables are not scalar", - ) + len_new_X = len(new_X) - y = np.asarray(y) + self._y_basis = y.basis if isinstance(y, FDataBasis) else None + + if isinstance(y, FDataBasis): + self._functional_response = True + else: + if any(len(y) != len(x) for x in new_X): + raise ValueError( + "The number of samples on independent and " + "dependent variables should be the same", + ) + self._functional_response = False + y = np.asarray(y) if coef_basis is None: - coef_basis = [None] * len(new_X) + coef_basis = [None] * len_new_X + + if len(coef_basis) == 1 and len_new_X > 1: + # we assume basis objects are inmmutable + coef_basis = [coef_basis[0]] * len_new_X - if len(coef_basis) != len(new_X): + if len_new_X != len(coef_basis): raise ValueError( - "Number of regression coefficients does " - "not match number of independent variables.", + "The number of covariates and coefficients " + "basis should be the same", ) - if any(len(y) != len(x) for x in new_X): - raise ValueError( - "The number of samples on independent and " - "dependent variables should be the same", + coef_basis = [ + self._choose_beta_basis( + coef_basis=c_basis, + x_basis=x.basis if isinstance(x, FDataBasis) else None, + y_basis=self._y_basis, ) + for c_basis, x in zip(coef_basis, new_X) + ] + + self._coef_basis = coef_basis coef_info = [ coefficient_info_from_covariate(x, y, basis=b) @@ -421,6 +686,19 @@ def _argcheck_X_y( # noqa: N802 return new_X, y, sample_weight, coef_info + def _choose_beta_basis( + self, + coef_basis: Basis | None = None, + x_basis: Basis | None = None, + y_basis: Basis | None = None, + ) -> Basis | None: + if coef_basis is not None: + return coef_basis + elif y_basis is None: + return x_basis + else: + return y_basis + def _sample_weight_check( self, sample_weight: Optional[NDArrayFloat], diff --git a/skfda/preprocessing/dim_reduction/__init__.py b/skfda/preprocessing/dim_reduction/__init__.py index b0e01058d..5c8ae8405 100644 --- a/skfda/preprocessing/dim_reduction/__init__.py +++ b/skfda/preprocessing/dim_reduction/__init__.py @@ -13,13 +13,17 @@ ], submod_attrs={ "_fpca": ["FPCA"], - "_neighbor_transforms": ["KNeighborsTransformer"] + "_fpls": ["FPLS"], + "_neighbor_transforms": ["KNeighborsTransformer"], }, ) if TYPE_CHECKING: from ._fpca import FPCA as FPCA - from ._neighbor_transforms import KNeighborsTransformer as KNeighborsTransformer + from ._fpls import FPLS as FPLS + from ._neighbor_transforms import ( + KNeighborsTransformer as KNeighborsTransformer, + ) def __getattr__(name: str) -> Any: diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py new file mode 100644 index 000000000..8641f037a --- /dev/null +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -0,0 +1,886 @@ +from __future__ import annotations + +import warnings +from abc import abstractmethod +from typing import Any, Generic, Literal, Optional, Tuple, TypeVar, Union, cast + +import numpy as np +import scipy +from sklearn.utils.validation import check_is_fitted + +from ..._utils._sklearn_adapter import BaseEstimator +from ...misc.regularization import L2Regularization, compute_penalty_matrix +from ...representation import FDataGrid +from ...representation.basis import Basis, FDataBasis, _GridBasis +from ...typing._numpy import NDArrayFloat + + +def _power_solver( + X: NDArrayFloat, + tol: float, + max_iter: int, +) -> NDArrayFloat: + """Return the dominant eigenvector of a matrix using the power method.""" + t = X[:, 0] + t_prev = np.ones(t.shape) * max(np.max(t), 1) * 2 + iter_count = 0 + while np.linalg.norm(t - t_prev) > tol: + t_prev = t + t = X @ t + t /= np.linalg.norm(t) + iter_count += 1 + if iter_count > max_iter: + break + return t + + +def _calculate_weights( + X: NDArrayFloat, + Y: NDArrayFloat, + G_ww: NDArrayFloat, + G_xw: NDArrayFloat, + G_cc: NDArrayFloat, + G_yc: NDArrayFloat, + L_X_inv: NDArrayFloat, + L_Y_inv: NDArrayFloat, + tol: float, + max_iter: int, +) -> Tuple[NDArrayFloat, NDArrayFloat]: + """ + Calculate the weights for the PLS algorithm. + + Parameters: + - X: (n_samples, n_features) + The X block data matrix. + - Y: (n_samples, n_targets) + The Y block data matrix. + + - G_ww: (n_features, n_features) + The inner product matrix for the X block weights + (The discretization weights matrix in the case of FDataGrid). + - G_xw: (n_features, n_features) + The inner product matrix for the X block data and weights + (The discretization weights matrix in the case of FDataGrid). + + - G_cc: (n_targets, n_targets) + The inner product matrix for the Y block weights + (The discretization weights matrix in the case of FDataGrid). + - G_yc: (n_targets, n_targets) + The inner product matrix for the Y block data and weights + (The discretization weights matrix in the case of FDataGrid). + + - L_X_inv: (n_features, n_features) + The inverse of the Cholesky decomposition: + L_X @ L_X.T = G_ww + P_x, + where P_x is a the penalty matrix for the X block. + - L_Y_inv: (n_targets, n_targets) + The inverse of the Cholesky decomposition: + L_Y @ L_Y.T = G_cc + P_y, + where P_y is a the penalty matrix for the Y block. + + - tol: The tolerance for the power method. + - max_iter: The maximum number of iterations for the power method. + Returns: + - w: (n_features, 1) + The X block weights. + - c: (n_targets, 1) + The Y block weights. + """ + X = X @ G_xw @ L_X_inv.T + Y = Y @ G_yc @ L_Y_inv.T + S = X.T @ Y + w = _power_solver( + S @ S.T, + tol=tol, + max_iter=max_iter, + ) + + # Calculate the other weight + c = np.dot(Y.T, np.dot(X, w)) + + # Undo the transformation + w = L_X_inv.T @ w + + # Normalize + w /= np.sqrt(np.dot(w.T, G_ww @ w)) + + # Undo the transformation + c = L_Y_inv.T @ c + + # Normalize the other weight + c /= np.sqrt(np.dot(c.T, G_cc @ c)) + + return w, c + + +BlockType = TypeVar( + "BlockType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + + +# Ignore too many public instance attributes +class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 + """ + Class to store the data of a block of a FPLS model. + + This is an internal class, intended to be used only by the FPLS class. + It provides a common interface for the different types of blocks, + simplifying the implementation of the FPLS algorithm. + + There are three types of blocks (depending on BlockType): + mutltivariate (NDArrayFloat), basis (FDataBasis) and grid (FDataGrid). + + In the following, n_samples is the number of samples of the block. + n_features is: + - The number of features of the block in the case of multivariate + block. + - The number of basis functions in the case of a FDataBasis block. + - The number of points in the case of a FDataGrid block. + + Parameters: + - data: The data of the block. + - label: Label of the block (X or Y). + - integration_weights: Array with shape (n_features,). + The integration weights of the block. It must be None for + multivariate or FDataBasis blocks. + - regularization: The regularization to apply to the block. + It must be None for multivariate blocks. + - weights_basis: The basis of the weights. It must be None for + multivariate or grid blocks. + + Attributes: + - label: Label of the block (X or Y). + - data: The data of the block. + - data_matrix: (n_samples, n_features) matrix. The data + matrix of the block. + - mean: The mean of the data. + - weights_basis: The basis of the weights. + - regularization_matrix: (n_features, n_features) matrix. + Inner product matrix of the regularization operator applied + to the basis or the grid. + - integration_matrix: (n_features, n_features) matrix. + Diagonal matrix of the integration weights. + - G_data_weights: (n_samples, n_samples) matrix. The inner + product matrix for the data and weights + (The discretization matrix in grid blocks). + - G_weights: (n_samples, n_samples) matrix. The inner product + matrix for the weights (The discretization matrix in grid blocks). + + - rotations: (n_features, n_components) matrix. The + rotations of the block. + - loadings_matrix: (n_features, n_components) matrix. The + loadings of the block. + - loadings: The loadings of the block (same type as the data). + + """ + + # Attributes that must be defined in the subclasses + mean: BlockType + data_matrix: NDArrayFloat + G_weights: NDArrayFloat + G_data_weights: NDArrayFloat + regularization_matrix: NDArrayFloat + + def set_nipals_results( + self, + rotations: NDArrayFloat, + loadings: NDArrayFloat, + ) -> None: + """Set the results of NIPALS.""" + self.rotations_matrix = rotations + self.loadings_matrix = loadings + self.rotations = self._to_block_type(self.rotations_matrix, "rotation") + self.loadings = self._to_block_type(self.loadings_matrix, "loading") + + @abstractmethod + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> BlockType: + pass + + @abstractmethod + def transform( + self, + data: BlockType, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + pass + + @abstractmethod + def inverse_transform( + self, + components: NDArrayFloat, + ) -> BlockType: + """Transform from the component space to the data space.""" + pass + + def get_penalty_matrix(self) -> NDArrayFloat: + """Return the penalty matrix.""" + return self.G_weights + self.regularization_matrix + + def get_cholesky_inv_penalty_matrix(self) -> NDArrayFloat: + """Return the Cholesky decomposition of the penalty matrix.""" + return np.linalg.inv(np.linalg.cholesky(self.get_penalty_matrix())) + + +class _FPLSBlockMultivariate(_FPLSBlock[NDArrayFloat]): + def __init__( + self, + data: NDArrayFloat, + label: str, + ) -> None: + self.label = label + if len(data.shape) == 1: + data = data[:, np.newaxis] + + self.G_data_weights = np.identity(data.shape[1]) + self.G_weights = np.identity(data.shape[1]) + + self.mean = np.mean(data, axis=0) + self.data_matrix = data - self.mean + self.data = data - self.mean + + self.regularization_matrix = np.zeros( + (data.shape[1], data.shape[1]), + ) + + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> NDArrayFloat: + return nipals_matrix.T + + def transform( + self, + data: NDArrayFloat, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_array = data + if len(data_array.shape) == 1: + data_array = data_array[:, np.newaxis] + data_array = data_array - self.mean + return data_array @ self.rotations_matrix + + def inverse_transform( + self, + components: NDArrayFloat, + ) -> NDArrayFloat: + """Transform from the component space to the data space.""" + reconstructed = components @ self.loadings_matrix.T + reconstructed += self.mean + return reconstructed + + +class _FPLSBlockBasis(_FPLSBlock[FDataBasis]): + def __init__( + self, + data: FDataBasis, + label: str, + regularization: Optional[L2Regularization[Any]], + weights_basis: Optional[Basis], + ) -> None: + """Initialize the data of a basis block.""" + self.label = label + self.mean = data.mean() + self.data = data - self.mean + self.data_matrix = self.data.coefficients + + # By default, use the basis of the input data + # for the weights + if weights_basis is None: + self.weights_basis = data.basis + else: + self.weights_basis = weights_basis + + # Compute the regularization matrix + # By default, all zeros (no regularization) + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=(self.weights_basis,), + regularization_parameter=1, + regularization=regularization, + ) + if regularization_matrix is None: + regularization_matrix = np.zeros( + (self.data_matrix.shape[1], self.data_matrix.shape[1]), + ) + + self.regularization_matrix = regularization_matrix + self.G_weights = self.weights_basis.gram_matrix() + self.G_data_weights = self.data.basis.inner_product_matrix( + self.weights_basis, + ) + + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> FDataBasis: + # Each column of the matrix generated by NIPALS corresponds to + # an obsevation or direction. Therefore, they must be transposed + # so that each row corresponds ot an observation or direction + basis = self.weights_basis if title == "rotation" else self.data.basis + return FDataBasis( + coefficients=nipals_matrix.T, + basis=basis, + sample_names=[ + f"{title.capitalize()} {i}" + for i in range(nipals_matrix.shape[1]) + ], + coordinate_names=(f"FPLS {self.label} {title} value",), + dataset_name=f"FPLS {self.label} {title}s", + ) + + def transform( + self, + data: FDataBasis, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_basis = data - self.mean + return ( + data_basis.coefficients + @ self.G_data_weights + @ self.rotations_matrix + ) + + def inverse_transform( + self, + components: NDArrayFloat, + ) -> FDataBasis: + """Transform from the component space to the data space.""" + reconstructed_basis = self.data.copy( + coefficients=components @ self.loadings_matrix.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} inverse transformed", + ) + return reconstructed_basis + self.mean + + +class _FPLSBlockGrid(_FPLSBlock[FDataGrid]): + def __init__( + self, + data: FDataGrid, + label: str, + integration_weights: Optional[np.ndarray], # type: ignore[type-arg] + regularization: Optional[L2Regularization[Any]], + ) -> None: + """Initialize the data of a grid block.""" + self.label = label + + self.mean = data.mean() + self.data = data - self.mean + self.data_matrix = data.data_matrix[..., 0] + + # Arrange the integration weights in a diagonal matrix + # By default, use Simpson's rule + if integration_weights is None: + identity = np.identity(self.data_matrix.shape[1]) + integration_weights = scipy.integrate.simps( + identity, + self.data.grid_points[0], + ) + self.integration_weights = np.diag(integration_weights) + + # Compute the regularization matrix + # By default, all zeros (no regularization) + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=( + _GridBasis(grid_points=self.data.grid_points), + ), + regularization_parameter=1, + regularization=regularization, + ) + if regularization_matrix is None: + regularization_matrix = np.zeros( + (self.data_matrix.shape[1], self.data_matrix.shape[1]), + ) + + self.regularization_matrix = regularization_matrix + self.G_data_weights = self.integration_weights + self.G_weights = self.integration_weights + + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> FDataGrid: + # Each column of the matrix generated by NIPALS corresponds to + # an obsevation or direction. Therefore, they must be transposed + # so that each row corresponds ot an observation or direction + return FDataGrid( + data_matrix=nipals_matrix.T, + sample_names=[ + f"{title.capitalize()} {i}" + for i in range(nipals_matrix.shape[1]) + ], + coordinate_names=(f"FPLS {self.label} {title} value",), + dataset_name=f"FPLS {self.label} {title}s", + grid_points=self.data.grid_points[0], + ) + + def transform( + self, + data: FDataGrid, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_grid = data - self.mean + return ( + data_grid.data_matrix[..., 0] + @ self.G_data_weights + @ self.rotations_matrix + ) + + def inverse_transform( + self, + components: NDArrayFloat, + ) -> FDataGrid: + """Transform from the component space to the data space.""" + reconstructed_grid = self.data.copy( + data_matrix=components @ self.loadings_matrix.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} inverse transformed", + ) + return reconstructed_grid + self.mean + + +def _fpls_block_factory( + data: BlockType, + label: str, + integration_weights: NDArrayFloat | None, + regularization: L2Regularization[Any] | None, + weights_basis: Basis | None, +) -> _FPLSBlock[BlockType]: + if isinstance(data, np.ndarray): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockMultivariate( + data=data, + label=label, + ), + ) + elif isinstance(data, FDataBasis): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockBasis( + data=data, + label=label, + regularization=regularization, + weights_basis=weights_basis, + ), + ) + elif isinstance(data, FDataGrid): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockGrid( + data=data, + label=label, + integration_weights=integration_weights, + regularization=regularization, + ), + ) + + raise TypeError("Invalid type for data") + + +InputTypeX = TypeVar( + "InputTypeX", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) +InputTypeY = TypeVar( + "InputTypeY", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + +DeflationMode = Literal["reg", "can"] + + +# Ignore too many public instance attributes +class FPLS( # noqa: WPS230 + BaseEstimator, + Generic[InputTypeX, InputTypeY], +): + r""" + Functional Partial Least Squares Regression. + + This is a generic class. When instantiated, the type of the + data in each block can be specified. The possiblities are: + NDArrayFloat, FDataGrid and FDataBasis. + + Parameters: + n_components: Number of components to extract. By default, the + maximum number of components is extracted. + regularization_X: Regularization to apply to the X block. + regularization_Y: Regularization to apply to the Y block. + component_basis_X: Basis to use for the X block. Only + applicable if X is a FDataBasis. Otherwise it must be None. + component_basis_Y: Basis to use for the Y block. Only + applicable if Y is a FDataBasis. Otherwise it must be None. + _deflation_mode: Mode to use for deflation. Can be "can" + (dimensionality reduction) or "reg" (regression). + + Attributes: + x_weights\_: (n_features_X, n_components) array with the X weights + extracted by NIPALS. + y_weights\_: (n_features_Y, n_components) array with the Y weights + extracted by NIPALS. + x_scores\_: (n_samples, n_components) array with the X scores + extracted by NIPALS. + y_scores\_: (n_samples, n_components) array with the Y scores + extracted by NIPALS. + x_rotations_matrix\_: (n_features_X, n_components) array with the + X rotations. + y_rotations_matrix\_: (n_features_Y, n_components) array with the + Y rotations. + x_loadings_matrix\_: (n_features_X, n_components) array with the + X loadings. + y_loadings_matrix\_: (n_features_Y, n_components) array with the + Y loadings. + x_rotations\_: Projection directions for the X block (same type as X). + y_rotations\_: Projection directions for the Y block (same type as Y). + x_loadings\_: Loadings for the X block (same type as X). + y_loadings\_: Loadings for the Y block (same type as Y). + + """ + + # Ignore too many arguments + def __init__( # noqa: WPS211 + self, + n_components: int | None = None, + *, + regularization_X: L2Regularization[InputTypeX] | None = None, + regularization_Y: L2Regularization[InputTypeY] | None = None, + component_basis_X: Basis | None = None, + component_basis_Y: Basis | None = None, + tol: float = 1e-6, + max_iter: int = 500, + _deflation_mode: DeflationMode = "can", + _integration_weights_X: NDArrayFloat | None = None, + _integration_weights_Y: NDArrayFloat | None = None, + ) -> None: + self.n_components = n_components + self._integration_weights_X = _integration_weights_X + self._integration_weights_Y = _integration_weights_Y + self.regularization_X = regularization_X + self.regularization_Y = regularization_Y + self.component_basis_X = component_basis_X + self.component_basis_Y = component_basis_Y + self._deflation_mode = _deflation_mode + self.tol = tol + self.max_iter = max_iter + + def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: + self._x_block = _fpls_block_factory( + data=X, + label="X", + integration_weights=self._integration_weights_X, + regularization=self.regularization_X, + weights_basis=self.component_basis_X, + ) + self._y_block = _fpls_block_factory( + data=Y, + label="Y", + integration_weights=self._integration_weights_Y, + regularization=self.regularization_Y, + weights_basis=self.component_basis_Y, + ) + + # Ignore too many local variables + def _perform_nipals(self) -> None: # noqa: WPS210 + X = self._x_block.data_matrix + Y = self._y_block.data_matrix + X = X - np.mean(X, axis=0) + Y = Y - np.mean(Y, axis=0) + + if len(Y.shape) == 1: + Y = Y[:, np.newaxis] + + # Store the matrices as list of columns + W, C = [], [] + T, U = [], [] + P, Q = [], [] + + # Calculate the penalty matrices in advance + L_X_inv = self._x_block.get_cholesky_inv_penalty_matrix() + L_Y_inv = self._y_block.get_cholesky_inv_penalty_matrix() + + # Determine some tolerances to stop the algorithm + x_epsilon = ( + 10 + * np.finfo(X.dtype).eps + * np.abs(self._x_block.data_matrix).mean() + ) + y_epsilon = ( + 10 + * np.finfo(Y.dtype).eps + * np.abs(self._y_block.data_matrix).mean() + ) + + n_comp = 0 + while self.n_components is None or n_comp < self.n_components: + # Stop if either matrix is all zeros + if np.all(X == 0) or np.all(Y == 0): + # If the number of components was specified, warn the user + if self.n_components is not None: + warnings.warn( + f"After extracting {n_comp} components, " + "one of the matrices is completely deflated. " + f"The algorithm will return {n_comp} components," + f"instead of {self.n_components}.", + stacklevel=3, + ) + break + + # Increase number of components + n_comp += 1 + + w, c = _calculate_weights( + X, + Y, + G_ww=self._x_block.G_weights, + G_xw=self._x_block.G_data_weights, + G_cc=self._y_block.G_weights, + G_yc=self._y_block.G_data_weights, + L_X_inv=L_X_inv, + L_Y_inv=L_Y_inv, + tol=self.tol, + max_iter=self.max_iter, + ) + + t = np.dot(X @ self._x_block.G_data_weights, w) + u = np.dot(Y @ self._y_block.G_data_weights, c) + + p = np.dot(X.T, t) / (np.dot(t.T, t)) + + y_proyection = t if self._deflation_mode == "reg" else u + + q = np.dot(Y.T, y_proyection) / ( + np.dot(y_proyection, y_proyection) + ) + + X = X - np.outer(t, p) + Y = Y - np.outer(y_proyection, q) + + W.append(w) + C.append(c) + T.append(t) + U.append(u) + P.append(p) + Q.append(q) + + # Set to zero the values that are close to zero + X[abs(X) < x_epsilon] = 0 + Y[abs(Y) < y_epsilon] = 0 + + # Convert each list of columns to a matrix + self.x_weights_ = np.array(W).T + self.y_weights_ = np.array(C).T + self.x_scores_ = np.array(T).T + self.y_scores_ = np.array(U).T + self.x_loadings_matrix_ = np.array(P).T + self.y_loadings_matrix_ = np.array(Q).T + + def fit( + self, + X: InputTypeX, + y: InputTypeY, + ) -> FPLS[InputTypeX, InputTypeY]: + """ + Fit the model using the data for both blocks. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ + self._initialize_blocks(X, y) + + if self.n_components is not None: + # In regression mode, the number of components is limited + # only by the rank of the X data matrix since components are + # only extracted from the X block. + if self._deflation_mode == "reg": + range_upper_bound = min(*self._x_block.data_matrix.shape) + else: + range_upper_bound = min( + *self._x_block.data_matrix.shape, + *self._y_block.data_matrix.shape, + ) + + if self.n_components > range_upper_bound: + raise ValueError( + f"n_components must be less or equal " + f"than {range_upper_bound}", + ) + + self._perform_nipals() + + self.x_rotations_matrix_ = self.x_weights_ @ np.linalg.pinv( + self.x_loadings_matrix_.T + @ self._x_block.G_data_weights + @ self.x_weights_, + ) + + self.y_rotation_matrix_ = self.y_weights_ @ np.linalg.pinv( + self.y_loadings_matrix_.T + @ self._y_block.G_data_weights + @ self.y_weights_, + ) + + self._x_block.set_nipals_results( + rotations=self.x_rotations_matrix_, + loadings=self.x_loadings_matrix_, + ) + self._y_block.set_nipals_results( + rotations=self.y_rotation_matrix_, + loadings=self.y_loadings_matrix_, + ) + + self.x_rotations_ = self._x_block.rotations + self.y_rotations_ = self._y_block.rotations + + self.x_loadings_ = self._x_block.loadings + self.y_loadings_ = self._y_block.loadings + + return self + + def transform( + self, + X: InputTypeX, + y: InputTypeY | None = None, + ) -> NDArrayFloat | Tuple[NDArrayFloat, NDArrayFloat]: + """ + Apply the dimension reduction learned on the train data. + + Args: + X: Data to transform. Must have the same number of features and + type as the data used to train the model. + y : Data to transform. Must have the same number of features and + type as the data used to train the model. + If None, only X is transformed. + + Returns: + - x_scores: Data transformed. + - y_scores: Data transformed (if y is not None) + """ + check_is_fitted(self) + + x_scores = self._x_block.transform(X) + + if y is not None: + y_scores = self._y_block.transform(y) + return x_scores, y_scores + + return x_scores + + def transform_x( + self, + X: InputTypeX, + ) -> NDArrayFloat: + """ + Apply the dimension reduction learned on the train data. + + Args: + X: Data to transform. Must have the same number of features and + type as the data used to train the model. + + + Returns: + - Data transformed. + """ + check_is_fitted(self) + + return self._x_block.transform(X) + + def transform_y( + self, + Y: InputTypeY, + ) -> NDArrayFloat: + """ + Apply the dimension reduction learned on the train data. + + Args: + Y: Data to transform. Must have the same number of features and + type as the data used to train the model. + + + Returns: + - Data transformed. + """ + check_is_fitted(self) + + return self._y_block.transform(Y) + + def inverse_transform( + self, + X: NDArrayFloat, + Y: NDArrayFloat | None = None, + ) -> InputTypeX | Tuple[InputTypeX, InputTypeY]: + """ + Transform data back to its original space. + + Args: + X: Data to transform back. Must have the same number of columns + as the number of components of the model. + Y: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - X: Data reconstructed from the transformed data. + - Y: Data reconstructed from the transformed data + (if Y is not None) + + """ + check_is_fitted(self) + + x_reconstructed = self._x_block.inverse_transform(X) + + if Y is not None: + y_reconstructed = self._y_block.inverse_transform(Y) + return x_reconstructed, y_reconstructed + + return x_reconstructed + + def inverse_transform_x( + self, + X: NDArrayFloat, + ) -> InputTypeX: + """ + Transform X data back to its original space. + + Args: + X: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - Data reconstructed from the transformed data. + """ + check_is_fitted(self) + + return self._x_block.inverse_transform(X) + + def inverse_transform_y( + self, + Y: NDArrayFloat, + ) -> InputTypeY: + """ + Transform Y data back to its original space. + + Args: + Y: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - Data reconstructed from the transformed data. + """ + check_is_fitted(self) + + return self._y_block.inverse_transform(Y) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py index 4da5810b7..298a68fea 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py @@ -83,7 +83,7 @@ def _rkhs_vs( [indexes[j]], ]) - new_means = np.atleast_2d(means[new_selection]) + new_means = means[new_selection] lstsq_solution = linalg.lstsq( variances[new_selection[:, np.newaxis], new_selection], diff --git a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py index 638d93bf3..036663ccf 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py @@ -160,7 +160,7 @@ def _mrmr( redundancies[last_selected, j] = redundancy_dependence_measure( X[:, last_selected, np.newaxis], X[:, j, np.newaxis], - ) + ).item() redundancies[j, last_selected] = redundancies[last_selected, j] W = np.mean( diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index ca7b377b8..432b65bb8 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -17,6 +17,7 @@ overload, ) +import dcor import numpy as np import numpy.linalg as linalg import numpy.ma as ma @@ -25,7 +26,6 @@ from sklearn.base import clone from typing_extensions import Literal -import dcor from skfda.exploratory.stats.covariance import ( CovarianceEstimator, EmpiricalCovariance, @@ -41,7 +41,6 @@ if TYPE_CHECKING: from ....misc.covariances import CovarianceLike - import GPy def _transform_to_2d(t: ArrayLike) -> NDArrayFloat: @@ -56,48 +55,6 @@ def _transform_to_2d(t: ArrayLike) -> NDArrayFloat: return t -class _PicklableKernel(): - """Class used to pickle GPy kernels.""" - - def __init__(self, kernel: GPy.kern.Kern) -> None: - super().__setattr__('_PicklableKernel__kernel', kernel) - - def __getattr__(self, name: str) -> Any: - if name != '__deepcopy__': - return getattr(self.__kernel, name) - - def __setattr__(self, name: str, value: Any) -> None: - setattr(self.__kernel, name, value) - - def __getstate__(self) -> Mapping[str, Any]: - return { - 'class': self.__kernel.__class__, - 'input_dim': self.__kernel.input_dim, - 'values': self.__kernel.param_array, - } - - def __setstate__(self, state: Mapping[str, Any]) -> None: - super().__setattr__('_PicklableKernel__kernel', state['class']( - input_dim=state['input_dim']), - ) - self.__kernel.param_array[...] = state['values'] - - def __call__(self, *args: Any, **kwargs: Any) -> NDArrayFloat: - return self.__kernel.K(*args, **kwargs) # type: ignore[no-any-return] - - -def make_kernel(k: CovarianceLike) -> CovarianceLike: - try: - import GPy - except ImportError: - return k - - if isinstance(k, GPy.kern.Kern): - return _PicklableKernel(k) - - return k - - def _absolute_argmax( function: FDataGrid, *, @@ -255,7 +212,7 @@ def __init__( super().__init__() self.mean = mean - self.cov = make_kernel(cov) + self.cov = cov def _evaluate_mean(self, t: NDArrayFloat) -> NDArrayFloat: @@ -625,7 +582,7 @@ def __call__( **kwargs: Any, ) -> bool: - score = float(dependences.data_matrix[0, selected_index, 0]) + score = float(dependences.data_matrix[(0,) + selected_index + (0,)]) return score < self.threshold diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py new file mode 100644 index 000000000..af322959f --- /dev/null +++ b/skfda/tests/test_fpls.py @@ -0,0 +1,311 @@ +"""Test the FPLS class.""" + +from typing import Tuple + +import numpy as np +import pytest +from sklearn.cross_decomposition import PLSCanonical + +from skfda.datasets import fetch_tecator +from skfda.preprocessing.dim_reduction import FPLS +from skfda.representation import FData, FDataGrid +from skfda.representation.basis import BSplineBasis, FDataBasis +from skfda.typing._numpy import NDArrayFloat + + +class LatentVariablesModel: + """Simulate model driven by latent variables.""" + + def create_latent_variables(self, n_latent: int, n_samples: int) -> None: + """Create latent variables for testing.""" + self.rng = np.random.RandomState(0) + self.n_latent = n_latent + self.n_samples = n_samples + self.grid_points = np.linspace(0, 1, 100) + + # Get the means of the latent variables + latent_means = self.rng.uniform(low=1, high=10, size=(n_latent)) + + # Sample the variables + self.latent_samples = np.array( + [ + self.rng.normal(loc=mean, scale=1, size=n_samples) + for mean in latent_means + ], + ).T + + def create_observed_multivariate_variable( + self, + n_features: int, + noise: float = 0, + ) -> Tuple[NDArrayFloat, NDArrayFloat]: + """Create observed multivariate variable for testing.""" + rotations = self.rng.uniform( + low=0, + high=10, + size=(n_features, self.n_latent), + ) + + observed_values = self.latent_samples @ rotations.T + observed_noise = noise * self.rng.normal( + size=(self.n_samples, n_features), + ) + observed_values += observed_noise + + return observed_values, rotations + + def create_observed_functional_variable( + self, + noise: float = 0, + discretized: bool = False, + ) -> Tuple[FData, FData]: + """Create observed functional variable for testing.""" + n_basis = 20 + basis = BSplineBasis(n_basis=n_basis) + + rotation_coef = self.rng.uniform( + low=0, + high=10, + size=(self.n_latent, n_basis), + ) + rotation = FDataBasis(basis, rotation_coef) + + observed_coef = self.latent_samples @ rotation_coef + observed_func = FDataBasis(basis, observed_coef) + + if discretized: + observed_func = observed_func.to_grid( + grid_points=self.grid_points, + ) + + func_noise = noise * self.rng.normal(size=(self.n_samples, 100)) + observed_func.data_matrix[..., 0] += func_noise + + else: + observed_func.coefficients += noise * self.rng.normal( + size=(self.n_samples, n_basis), + ) + + return observed_func, rotation + + +class TestFPLS(LatentVariablesModel): + """Test the FPLS class.""" + + def test_sklearn(self) -> None: + """Compare results with sklearn.""" + # Load the data + X, y = fetch_tecator( + return_X_y=True, + ) + + integration_weights = np.ones(len(X.grid_points[0])) + + # Fit the model + fpls = FPLS[FDataGrid, NDArrayFloat]( + n_components=3, + _integration_weights_X=integration_weights, + ) + fpls.fit(X, y) + + sklearnpls = PLSCanonical(n_components=3, scale=False) + sklearnpls.fit(X.data_matrix[..., 0], y) + + rtol = 2e-4 + atol = 1e-6 + + # Check that the rotations are correct + np.testing.assert_allclose( + np.abs(fpls.x_rotations_matrix_).flatten(), + np.abs(sklearnpls.x_rotations_).flatten(), + rtol=rtol, + atol=atol, + ) + + # Check the transformation of X + np.testing.assert_allclose( + fpls.transform(X, y), + sklearnpls.transform(X.data_matrix[..., 0], y), + rtol=rtol, + atol=1e-5, + ) + + comp_x, comp_y = fpls.transform(X, y) + + fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) + sklearnpls_inv_x, sklearnpls_inv_y = sklearnpls.inverse_transform( + comp_x, + comp_y, + ) + # Check the inverse transformation of X + np.testing.assert_allclose( + fpls_inv_x.data_matrix.flatten(), + sklearnpls_inv_x.flatten(), + rtol=rtol, + atol=atol, + ) + + # Check the inverse transformation of y + np.testing.assert_allclose( + fpls_inv_y.flatten(), + sklearnpls_inv_y.flatten(), + rtol=rtol, + atol=atol, + ) + + # Ignoring WPS210: Found too many local variables + def test_basis_vs_grid( # noqa: WPS210 + self, + ) -> None: + """Test that the results are the same in basis and grid.""" + n_components = 5 + self.create_latent_variables(n_latent=n_components, n_samples=100) + + # Create the observed variable as basis variables + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=False, + noise=0, + ) + y_observed, y_rotations = self.create_observed_functional_variable( + discretized=False, + noise=0, + ) + + # Fit the model + fpls = FPLS[FData, FData](n_components=n_components) + fpls.fit(X_observed, y_observed) + + # Convert the observed variables to grid + grid_points = np.linspace(0, 1, 2000) + X_observed_grid = X_observed.to_grid( + grid_points=grid_points, + ) + + y_observed_grid = y_observed.to_grid( + grid_points=grid_points, + ) + + # Fit the model with the grid variables + fpls_grid = FPLS[FData, FData](n_components=n_components) + fpls_grid.fit(X_observed_grid, y_observed_grid) + + # Check that the results are the same + np.testing.assert_allclose( + np.abs(fpls.x_rotations_(self.grid_points)), + np.abs(fpls_grid.x_rotations_(self.grid_points)), + rtol=5e-3, + ) + + np.testing.assert_allclose( + np.abs(fpls.y_rotations_(self.grid_points)), + np.abs(fpls_grid.y_rotations_(self.grid_points)), + rtol=5e-3, + ) + + # Check that the transform is the same + np.testing.assert_allclose( + np.abs(fpls.transform(X_observed, y_observed)), + np.abs(fpls_grid.transform(X_observed_grid, y_observed_grid)), + rtol=5e-3, + ) + # Check the inverse transform + comp_x, comp_y = fpls.transform(X_observed, y_observed) + + fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) + fpls_grid_x, fpsl_grid_y = fpls_grid.inverse_transform( + comp_x, + comp_y, + ) + # Check the inverse transformation of X + np.testing.assert_allclose( + fpls_inv_x(grid_points), + fpls_grid_x(grid_points), + rtol=7e-2, + ) + + # Check the inverse transformation of y + np.testing.assert_allclose( + fpls_inv_y(grid_points), + fpsl_grid_y(grid_points), + rtol=0.13, + ) + + def _generate_random_matrix_by_rank(self, n_samples, n_features, rank): + + random_data = np.random.random(n_samples * rank).reshape( + n_samples, + rank, + ) + if rank == n_features: + return random_data + repeated_data = np.array([random_data[:, 0]] * (n_features - rank)).T + + return np.concatenate( + [random_data, repeated_data], + axis=1, + ) + + @pytest.mark.parametrize("rank", [1, 2, 5]) + def test_collinear_matrix( + self, + rank: int, + ) -> None: + """Check that the behaviour is correct with collinear matrices.""" + n_samples = 100 + n_features = 10 + np.random.seed(0) + + X = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=rank, + ) + y = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=rank, + ) + + fpls = FPLS(n_components=rank) + fpls.fit(X, y) + + fpls = FPLS(n_components=5) + + # Check that a warning is raised when the rank is lower than the + # number of components + if rank < 5: + with pytest.warns(UserWarning): + fpls.fit(X, y) + else: + fpls.fit(X, y) + + # Check that only as many components as rank are returned + assert fpls.x_weights_.shape == (n_features, rank) + + # Check that the waring is not raised when the number of components + # is not set + fpls = FPLS().fit(X, y) + # Check that only as many components as rank are returned + assert fpls.x_weights_.shape == (n_features, rank) + + def test_number_of_components( + self, + ) -> None: + """Check error when number of components is too large.""" + n_samples = 100 + n_features = 10 + np.random.seed(0) + + X = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=n_features, + ) + y = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=n_features, + ) + + with pytest.raises(ValueError): + FPLS(n_components=n_features + 1).fit(X, y) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py new file mode 100644 index 000000000..b658e8863 --- /dev/null +++ b/skfda/tests/test_fpls_regression.py @@ -0,0 +1,292 @@ +"""Test the FPLSRegression class.""" +import os + +import numpy as np +import pytest +import scipy +from sklearn.cross_decomposition import PLSRegression + +from skfda.datasets import fetch_tecator +from skfda.misc.operators import LinearDifferentialOperator +from skfda.misc.regularization import L2Regularization +from skfda.ml.regression import FPLSRegression +from skfda.representation import FData +from skfda.representation.basis import BSplineBasis +from skfda.tests.test_fpls import LatentVariablesModel +from skfda.typing._numpy import NDArrayFloat + + +class TestFPLSRegression(LatentVariablesModel): + """Test the FPLSRegression class.""" + + def test_sklearn(self) -> None: + """Compare results with sklearn.""" + # Load the data + X, y = fetch_tecator( + return_X_y=True, + ) + + # Fit the model + fplsr = FPLSRegression[NDArrayFloat, NDArrayFloat]( + n_components=5, + _integration_weights_X=np.ones(len(X.grid_points[0])), + ) + fplsr.fit(X, y) + + sklearnpls = PLSRegression(n_components=5, scale=False) + sklearnpls.fit(X.data_matrix[..., 0], y) + + rtol = 3e-5 + atol = 1e-6 + # Compare the results + np.testing.assert_allclose( + fplsr.coef_.flatten(), + sklearnpls.coef_.T.flatten(), + rtol=rtol, + atol=atol, + ) + + # Compare predictions + np.testing.assert_allclose( + fplsr.predict(X).flatten(), + sklearnpls.predict(X.data_matrix[..., 0]).flatten(), + rtol=rtol, + atol=atol, + ) + + # Check that the rotations are correc + np.testing.assert_allclose( + np.abs(fplsr.fpls_.x_rotations_matrix_).flatten(), + np.abs(sklearnpls.x_rotations_).flatten(), + rtol=rtol, + atol=atol, + ) + + def test_fda_usc_no_reg(self) -> None: + """ + Test a multivariate regression with no regularization. + + Replication Code: + data(tecator) + x <- tecator$absorp.fdata + y <- tecator$y + res2 <- fdata2pls(x, y, ncomp=5) + for (i in res2$l) { + c <- res2$loading.weigths$data[i, ] + c <- c / c(sqrt(crossprod(c))) + print( + paste( + round(c, 8), + collapse = ", " + ) # components + ) + } + """ + # Results from fda.usc: + path = os.path.join( + f"{__file__[:-3]}_data", # Trim .py ending + "test_fda_usc_no_reg_data.npy", + ) + with open(path, "rb") as f: + r_results = np.load(f, allow_pickle=False) + + signs = np.array([1, -1, 1, -1, 1]) + + r_results *= signs + + X, y = fetch_tecator( + return_X_y=True, + ) + + plsReg = FPLSRegression[FData, NDArrayFloat]( + n_components=5, + _integration_weights_X=np.ones(len(X.grid_points[0])), + ) + plsReg.fit(X, y) + + W = plsReg.fpls_.x_weights_ + np.testing.assert_allclose(W, r_results, atol=1e-8) + + def test_fda_usc_reg(self) -> None: + """ + Test the FPLSRegression with regularization against fda.usc. + + Replication Code: + data(tecator) + x=tecator$absorp.fdata + y=tecator$y + res2=fdata2pls(x,y,ncomp=5,P=c(0,0,1),lambda = 1000) + for(i in res2$l){ + c = res2$loading.weigths$data[i,] + c = c/ c(sqrt(crossprod(c))) + print( + paste( + round(c, 8), + collapse=", " + ) # components + ) + } + """ + X, y = fetch_tecator( + return_X_y=True, + ) + + pen_order = 2 + reg_param = 1000 + # This factor compensates for the fact that the difference + # matrices in fda.usc are scaled by the mean of the deltas + # between grid points. This diference is introduced in + # the function D.penalty (fdata2pc.R:479) in fda.usc. + + grid_points = X[0].grid_points[0] + grid_step = np.mean(np.diff(grid_points)) + factor1 = grid_step ** (2 * pen_order - 1) + + reg_param *= factor1 + + regularization = L2Regularization( + LinearDifferentialOperator(pen_order), + regularization_parameter=reg_param, + ) + + # Fit the model + fplsr = FPLSRegression[FData, NDArrayFloat]( + n_components=5, + regularization_X=regularization, + _integration_weights_X=np.ones(len(X.grid_points[0])), + ) + fplsr.fit(X, y) + + path = os.path.join( + f"{__file__[:-3]}_data", # Trim .py ending + "test_fda_usc_reg_data.npy", + ) + with open(path, "rb") as f: + r_results = np.load(f, allow_pickle=False) + + signs = np.array([1, -1, 1, -1, 1]) + + w_mat = fplsr.fpls_.x_weights_ @ np.diag(signs) + np.testing.assert_allclose(w_mat, r_results, atol=6e-6, rtol=6e-4) + + def test_basis_vs_grid(self) -> None: + """Test that the results are the same in basis and grid.""" + X, y = fetch_tecator( + return_X_y=True, + ) + + original_grid_points = X.grid_points[0] + # Express the data in a FDataBasis + X = X.to_basis(BSplineBasis(n_basis=20)) + + # Fit the model + fplsr = FPLSRegression[FData, NDArrayFloat](n_components=5) + fplsr.fit(X, y) + basis_components = fplsr.fpls_.x_rotations_(original_grid_points) + + # Go back to grid + new_grid_points = np.linspace( + original_grid_points[0], + original_grid_points[-1], + 1000, + ) + X = X.to_grid(grid_points=new_grid_points) + + # Get the weights for the Simpson's rule + identity = np.eye(len(X.grid_points[0])) + ss_weights = scipy.integrate.simps(identity, X.grid_points[0]) + + # Fit the model with weights + fplsr = FPLSRegression( + n_components=5, + _integration_weights_X=ss_weights, + ) + fplsr.fit(X, y) + + grid_components = fplsr.fpls_.x_rotations_(original_grid_points) + + np.testing.assert_allclose( + basis_components, + grid_components, + rtol=3e-3, + ) + + @pytest.mark.parametrize("y_features", [1, 5]) + def test_multivariate_regression(self, y_features: int) -> None: + """Test the multivariate regression. + + Consider both scalar and multivariate responses. + """ + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables + # if it has enough components + y_observed, y_rotations = self.create_observed_multivariate_variable( + n_features=y_features, + ) + X_observed, X_rotations = self.create_observed_multivariate_variable( + n_features=10, + ) + + plsreg = FPLSRegression[NDArrayFloat, NDArrayFloat](n_components=5) + plsreg.fit(X_observed, y_observed) + + minimum_score = 0.99 + assert plsreg.score(X_observed, y_observed) > minimum_score + + @pytest.mark.parametrize("discretized", [True, False]) + @pytest.mark.parametrize("n_features", [1, 5]) + @pytest.mark.parametrize("noise_std", [0, 1]) + def test_simple_regresion( + self, + discretized: bool, + n_features: int, + noise_std: float, + ) -> None: + """Test multivariate regressor and functional response.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables if + # it has enough components + y_observed, y_rotations = self.create_observed_multivariate_variable( + n_features=n_features, + noise=noise_std, + ) + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=discretized, + noise=noise_std, + ) + + plsreg = FPLSRegression[FData, NDArrayFloat](n_components=5) + plsreg.fit(X_observed, y_observed) + + minimum_score = 0.99 if noise_std == 0 else 0.98 + assert plsreg.score(X_observed, y_observed) > minimum_score + + @pytest.mark.parametrize("discretized_observed", [True, False]) + @pytest.mark.parametrize("discretized_response", [True, False]) + @pytest.mark.parametrize("noise_std", [0, 1]) + def test_simple_regresion_dataset_functional( + self, + discretized_observed: bool, + discretized_response: bool, + noise_std: float, + ) -> None: + """Test multivariate regressor and functional response.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables if + # it has enough components + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=discretized_observed, + noise=noise_std, + ) + y_observed, y_rotations = self.create_observed_functional_variable( + discretized=discretized_response, + noise=noise_std, + ) + + plsreg = FPLSRegression[FData, FData](n_components=5) + plsreg.fit(X_observed, y_observed) + minimum_score = 0.99 if noise_std == 0 else 0.98 + assert plsreg.score(X_observed, y_observed) > minimum_score diff --git a/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy b/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy new file mode 100644 index 000000000..36b54860f Binary files /dev/null and b/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy differ diff --git a/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy b/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy new file mode 100644 index 000000000..d1606e5e7 Binary files /dev/null and b/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy differ diff --git a/skfda/tests/test_recursive_maxima_hunting.py b/skfda/tests/test_recursive_maxima_hunting.py index 389e095ac..d781a69b9 100644 --- a/skfda/tests/test_recursive_maxima_hunting.py +++ b/skfda/tests/test_recursive_maxima_hunting.py @@ -25,8 +25,8 @@ def test_gaussian_homoscedastic(self) -> None: join. """ - n_samples = 10000 - n_features = 100 + n_samples = 1000 + n_features = 101 def mean_1( # noqa: WPS430 t: np.typing.NDArray[np.float_], @@ -66,7 +66,7 @@ def mean_1( # noqa: WPS430 rmh.fit(X, y) point_mask = rmh.get_support() points = X.grid_points[0][point_mask] - np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) + np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) @pytest.mark.filterwarnings( 'ignore::sklearn.exceptions.ConvergenceWarning' @@ -83,7 +83,7 @@ def test_fit_exponential(self) -> None: """ n_samples = 1000 - n_features = 100 + n_features = 101 def mean_1( # noqa: WPS430 t: np.typing.NDArray[np.float_], @@ -129,7 +129,7 @@ def mean_1( # noqa: WPS430 rmh.fit(X, y) point_mask = rmh.get_support() points = X.grid_points[0][point_mask] - np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) + np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) if __name__ == '__main__': diff --git a/skfda/tests/test_regression.py b/skfda/tests/test_regression.py index 6bab0950f..1bb4fc0c4 100644 --- a/skfda/tests/test_regression.py +++ b/skfda/tests/test_regression.py @@ -1,3 +1,4 @@ +"""Tests for scalar and functional response linear regression.""" from __future__ import annotations import unittest @@ -6,8 +7,9 @@ import numpy as np import pandas as pd from scipy.integrate import cumtrapz +from sklearn.preprocessing import OneHotEncoder -from skfda.datasets import make_gaussian, make_gaussian_process +from skfda.datasets import fetch_weather, make_gaussian, make_gaussian_process from skfda.misc.covariances import Gaussian from skfda.misc.operators import LinearDifferentialOperator from skfda.misc.regularization import L2Regularization @@ -22,16 +24,56 @@ from skfda.representation.grid import FDataGrid +def _test_linear_regression_common( + X_train, + y_train, + expected_coefs, + X_test, + y_test, + coef_basis=None, + regularization=None, +) -> None: + """Execute a test of linear regression, given the parameters.""" + linear_regression = LinearRegression( + fit_intercept=False, + coef_basis=coef_basis, + regularization=regularization, + ) + linear_regression.fit(X_train, y_train) + + for coef, expected in zip(linear_regression.coef_, expected_coefs): + assert isinstance(coef, FDataBasis) + assert coef.basis == expected.basis + np.testing.assert_allclose( + coef.coefficients, + expected.coefficients, + atol=1e-6, + ) + + y_pred = linear_regression.predict(X_test) + assert isinstance(y_pred, FDataBasis) + assert y_pred.basis == y_test.basis + np.testing.assert_allclose( + y_pred.coefficients, + y_test.coefficients, + rtol=1e-5, + ) + + class TestScalarLinearRegression(unittest.TestCase): + """Tests for linear regression with scalar response.""" - def test_regression_single_explanatory(self) -> None: + def test_single_explanatory(self) -> None: + """Test a basic example of functional regression. + Scalar response with functional covariates. + """ x_basis = MonomialBasis(n_basis=7) x_fd = FDataBasis(x_basis, np.identity(7)) beta_basis = FourierBasis(n_basis=5) beta_fd = FDataBasis(beta_basis, [1, 1, 1, 1, 1]) - y = np.array([ + y = [ 0.9999999999999993, 0.162381381441085, 0.08527083481359901, @@ -39,7 +81,7 @@ def test_regression_single_explanatory(self) -> None: 0.09532291032042489, 0.10550022969639987, 0.11382675064746171, - ]) + ] scalar = LinearRegression(coef_basis=[beta_basis]) scalar.fit(x_fd, y) @@ -49,9 +91,7 @@ def test_regression_single_explanatory(self) -> None: beta_fd.coefficients, ) np.testing.assert_allclose( - scalar.intercept_, - 0.0, - atol=1e-6, + scalar.intercept_, 0.0, atol=1e-6, ) y_pred = scalar.predict(x_fd) @@ -68,15 +108,18 @@ def test_regression_single_explanatory(self) -> None: beta_fd.coefficients, ) np.testing.assert_equal( - scalar.intercept_, - 0.0, + scalar.intercept_, 0.0, ) y_pred = scalar.predict(x_fd) np.testing.assert_allclose(y_pred, y) - def test_regression_multiple_explanatory(self) -> None: - y = np.array([1, 2, 3, 4, 5, 6, 7]) + def test_multiple_explanatory(self) -> None: + """Test a example of functional regression. + + Scalar response with functional covariates. + """ + y = [1, 2, 3, 4, 5, 6, 7] X = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) @@ -87,9 +130,7 @@ def test_regression_multiple_explanatory(self) -> None: scalar.fit(X, y) np.testing.assert_allclose( - scalar.intercept_.round(4), - np.array([32.65]), - rtol=1e-3, + scalar.intercept_.round(4), np.array([32.65]), rtol=1e-3, ) assert isinstance(scalar.coef_[0], FDataBasis) @@ -101,39 +142,38 @@ def test_regression_multiple_explanatory(self) -> None: -188.587, 236.5832, -481.3449, - ]]), - rtol=1e-3, + ]]), rtol=1e-3, ) y_pred = scalar.predict(X) np.testing.assert_allclose(y_pred, y, atol=0.01) - def test_regression_mixed(self) -> None: + def test_mixed(self) -> None: + """Test a example of functional regression. - multivariate = np.array([ - [0, 0], [2, 7], [1, 7], [3, 9], - [4, 16], [2, 14], [3, 5], + Scalar response with multivariate and functional covariates. + """ + multivariate = np.array( + [[0, 0], [2, 7], [1, 7], [3, 9], [4, 16], [2, 14], [3, 5]], + ) + + x_fd = FDataBasis(MonomialBasis(n_basis=3), [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], ]) - X: Sequence[ - np.typing.NDArray[np.float_] | FDataBasis, - ] = [ - multivariate, - FDataBasis( - MonomialBasis(n_basis=3), - [ - [1, 0, 0], [0, 1, 0], [0, 0, 1], - [1, 0, 1], [1, 0, 0], [0, 1, 0], - [0, 0, 1], - ], - ), - ] + X = [multivariate, x_fd] + # y = 2 + sum([3, 1] * array) + int(3 * function) # noqa: E800 intercept = 2 coefs_multivariate = np.array([3, 1]) coefs_functions = FDataBasis( - MonomialBasis(n_basis=3), - [[3, 0, 0]], + MonomialBasis(n_basis=3), [[3, 0, 0]], ) y_integral = np.array([3, 3 / 2, 1, 4, 3, 3 / 2, 1]) y_sum = multivariate @ coefs_multivariate @@ -143,15 +183,11 @@ def test_regression_mixed(self) -> None: scalar.fit(X, y) np.testing.assert_allclose( - scalar.intercept_, - intercept, - atol=0.01, + scalar.intercept_, intercept, atol=0.01, ) np.testing.assert_allclose( - scalar.coef_[0], - coefs_multivariate, - atol=0.01, + scalar.coef_[0], coefs_multivariate, atol=0.01, ) assert isinstance(scalar.coef_[1], FDataBasis) @@ -203,16 +239,25 @@ def test_same_result_1d_2d_multivariate_arrays(self) -> None: linear2.coef_[2], ) - def test_regression_df_multivariate(self) -> None: # noqa: D102 + def test_df_multivariate(self) -> None: + """Test a example of functional regression with Dataframe input. + Scalar response with multivariate and functional covariates. + """ multivariate1 = [0, 2, 1, 3, 4, 2, 3] multivariate2 = [0, 7, 7, 9, 16, 14, 5] multivariate = [list(obs) for obs in zip(multivariate1, multivariate2)] x_basis = MonomialBasis(n_basis=3) - x_fd = FDataBasis(x_basis, [[1, 0, 0], [0, 1, 0], [0, 0, 1], - [1, 0, 1], [1, 0, 0], [0, 1, 0], - [0, 0, 1]]) + x_fd = FDataBasis(x_basis, [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ]) cov_dict = {"fd": x_fd, "mult1": multivariate1, "mult2": multivariate2} @@ -252,26 +297,31 @@ def test_regression_df_multivariate(self) -> None: # noqa: D102 y_pred = scalar.predict(df) np.testing.assert_allclose(y_pred, y, atol=0.01) - def test_regression_mixed_regularization(self) -> None: + def test_mixed_regularization(self) -> None: + """Test a example of functional regression. + Scalar response with multivariate and functional covariates + using regularization. + """ multivariate = np.array([ - [0, 0], [2, 7], [1, 7], [3, 9], - [4, 16], [2, 14], [3, 5], + [0, 0], [2, 7], [1, 7], [3, 9], [4, 16], [2, 14], [3, 5], + ]) + + x_fd = FDataBasis(MonomialBasis(n_basis=3), [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], ]) X: Sequence[ np.typing.NDArray[np.float_] | FDataBasis, - ] = [ - multivariate, - FDataBasis( - MonomialBasis(n_basis=3), - [ - [1, 0, 0], [0, 1, 0], [0, 0, 1], - [1, 0, 1], [1, 0, 0], [0, 1, 0], - [0, 0, 1], - ]), - ] + ] = [multivariate, x_fd] + # y = 2 + sum([3, 1] * array) + int(3 * function) # noqa: E800 intercept = 2 coefs_multivariate = np.array([3, 1]) y_integral = np.array([3, 3 / 2, 1, 4, 3, 3 / 2, 1]) @@ -289,9 +339,7 @@ def test_regression_mixed_regularization(self) -> None: scalar.fit(X, y) np.testing.assert_allclose( - scalar.intercept_, - intercept, - atol=0.01, + scalar.intercept_, intercept, atol=0.01, ) np.testing.assert_allclose( @@ -311,23 +359,28 @@ def test_regression_mixed_regularization(self) -> None: np.testing.assert_allclose( y_pred, [ - 5.349035, 16.456464, 13.361185, 23.930295, - 32.650965, 23.961766, 16.29029, + 5.349035, + 16.456464, + 13.361185, + 23.930295, + 32.650965, + 23.961766, + 16.29029, ], atol=0.01, ) - def test_regression_regularization(self) -> None: + def test_regularization(self) -> None: + """Test a example of functional regression. + Scalar response with functional covariates using regularization. + """ x_basis = MonomialBasis(n_basis=7) x_fd = FDataBasis(x_basis, np.identity(7)) beta_basis = FourierBasis(n_basis=5) - beta_fd = FDataBasis( - beta_basis, - [1.0403, 0, 0, 0, 0], - ) - y = np.array([ + beta_fd = FDataBasis(beta_basis, [1.0403, 0, 0, 0, 0]) + y = [ 1.0000684777229512, 0.1623672257830915, 0.08521053851548224, @@ -335,7 +388,7 @@ def test_regression_regularization(self) -> None: 0.09529138749665378, 0.10549625973303875, 0.11384314859153018, - ]) + ] y_pred_compare = [ 0.890341, @@ -361,23 +414,19 @@ def test_regression_regularization(self) -> None: atol=1e-3, ) np.testing.assert_allclose( - scalar.intercept_, - -0.15, - atol=1e-4, + scalar.intercept_, -0.15, atol=1e-4, ) y_pred = scalar.predict(x_fd) np.testing.assert_allclose(y_pred, y_pred_compare, atol=1e-4) x_basis = MonomialBasis(n_basis=3) - x_fd = FDataBasis( - x_basis, - [ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1], - [2, 0, 1], - ]) + x_fd = FDataBasis(x_basis, [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [2, 0, 1], + ]) beta_fd = FDataBasis(x_basis, [3, 2, 1]) y = np.array([1 + 13 / 3, 1 + 29 / 12, 1 + 17 / 10, 1 + 311 / 30]) @@ -385,10 +434,8 @@ def test_regression_regularization(self) -> None: # Non regularized scalar = LinearRegression() scalar.fit(x_fd, y) - assert isinstance(scalar.coef_[0], FDataBasis) np.testing.assert_allclose( - scalar.coef_[0].coefficients, - beta_fd.coefficients, + scalar.coef_[0].coefficients, beta_fd.coefficients, ) np.testing.assert_allclose(scalar.intercept_, 1) @@ -412,63 +459,19 @@ def test_regression_regularization(self) -> None: atol=0.001, ) np.testing.assert_allclose( - scalar_reg.intercept_, - 0.998, - atol=0.001, + scalar_reg.intercept_, 0.998, atol=0.001, ) y_pred = scalar_reg.predict(x_fd) np.testing.assert_allclose(y_pred, y_reg, atol=0.001) - def test_error_X_not_FData(self) -> None: - """Tests that at least one variable is an FData object.""" - x_fd = np.identity(7) - y = np.zeros(7) - - scalar = LinearRegression(coef_basis=[FourierBasis(n_basis=5)]) - - with np.testing.assert_warns(UserWarning): - scalar.fit([x_fd], y) - - def test_error_y_is_FData(self) -> None: - """Tests that none of the explained variables is an FData object.""" - x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) - y = list(FDataBasis(MonomialBasis(n_basis=7), np.identity(7))) - - scalar = LinearRegression(coef_basis=[FourierBasis(n_basis=5)]) - - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd], y) # type: ignore[arg-type] - - def test_error_X_beta_len_distinct(self) -> None: - """Test that the number of beta bases and explanatory variables - are not different """ - x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) - y = np.array([1 for _ in range(7)]) - beta = FourierBasis(n_basis=5) - - scalar = LinearRegression(coef_basis=[beta]) - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd, x_fd], y) - - scalar = LinearRegression(coef_basis=[beta, beta]) - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd], y) - - def test_error_y_X_samples_different(self) -> None: - """Test that the number of response samples and explanatory samples - are not different """ + def test_error_y_X_samples_different(self) -> None: # noqa: N802 + """Number of response samples and explanatory samples are not different. + Raises ValueError when response is scalar. + """ x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) - y = np.array([1 for _ in range(8)]) - beta = FourierBasis(n_basis=5) - - scalar = LinearRegression(coef_basis=[beta]) - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd], y) - - x_fd = FDataBasis(MonomialBasis(n_basis=8), np.identity(8)) - y = np.array([1 for _ in range(7)]) + y = [1 for _ in range(8)] beta = FourierBasis(n_basis=5) scalar = LinearRegression(coef_basis=[beta]) @@ -476,7 +479,7 @@ def test_error_y_X_samples_different(self) -> None: scalar.fit([x_fd], y) def test_error_beta_not_basis(self) -> None: - """Test that all beta are Basis objects. """ + """Test that all beta are Basis objects.""" x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) y = np.array([1 for _ in range(7)]) beta = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) @@ -486,7 +489,7 @@ def test_error_beta_not_basis(self) -> None: scalar.fit([x_fd], y) def test_error_weights_lenght(self) -> None: - """Test that the number of weights is equal to n_samples.""" + """Number of weights is equal to the number of samples.""" x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) y = np.array([1 for _ in range(7)]) weights = np.array([1 for _ in range(8)]) @@ -508,6 +511,446 @@ def test_error_weights_negative(self) -> None: scalar.fit([x_fd], y, weights) +class TestFunctionalLinearRegression(unittest.TestCase): + """Tests for linear regression with functional response.""" + + def test_multivariate_covariates_constant_basic(self) -> None: + """ + Univariate with functional response and constant coefficient one. + """ + y_basis = ConstantBasis() + + X_train = pd.DataFrame({ + "covariate": [1, 3, 5], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [1], + [3], + [5], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[1]], + ) + ] + + X_test = pd.DataFrame({ + "covariate": [2, 4, 6], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[ + [2], + [4], + [6], + ], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + ) + + # Check also without dataframes + _test_linear_regression_common( + X_train=X_train.to_numpy(), + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test.to_numpy(), + y_test=y_test, + ) + + def test_multivariate_covariates_monomial_basic(self) -> None: + """ + Multivariate with functional response and identity coefficients. + """ + y_basis = MonomialBasis(n_basis=2) + + X_train = pd.DataFrame({ + "covariate_1": [1, 3, 5], + "covariate_2": [2, 4, 6], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [1, 2], + [3, 4], + [5, 6], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[1, 0]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[0, 1]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": [2, 4, 6], + "covariate_2": [1, 3, 5], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[ + [2, 1], + [4, 3], + [6, 5], + ], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + ) + + # Check also without dataframes + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[ + [1, 0], + [0, 1], + ], + ), + ] + + # Currently broken. + + # _test_linear_regression_common( + # X_train=X_train.to_numpy(), + # y_train=y_train, + # expected_coefs=expected_coefs, + # X_test=X_test.to_numpy(), + # y_test=y_test, + # ) + + def test_multivariate_3_covariates(self) -> None: + """Test a more complex example involving 3 covariates.""" + y_basis = MonomialBasis(n_basis=3) + + X_train = pd.DataFrame({ + "covariate_1": [3, 5, 3], + "covariate_2": [4, 1, 2], + "covariate_3": [1, 6, 8], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [47, 22, 24], + [43, 47, 39], + [40, 53, 51], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[6, 3, 1]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[7, 2, 4]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[1, 5, 5]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": [3], + "covariate_2": [2], + "covariate_3": [1], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[[33, 18, 16]], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + ) + + def test_multivariate_covariates_regularization(self) -> None: + """Test a example of functional regression. + + Functional response with multivariate covariates and + beta regularization. + """ + y_basis = MonomialBasis(n_basis=3) + + X_train = pd.DataFrame({ + "covariate_1": [3, 5, 3], + "covariate_2": [4, 1, 2], + "covariate_3": [1, 6, 8], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [47, 22, 24], + [43, 47, 39], + [40, 53, 51], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[5.769441, 3.025921, 1.440655]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[6.688267, 1.938523, 3.579894]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[1.198499, 4.952166, 4.811818]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": [3], + "covariate_2": [2], + "covariate_3": [1], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[[31.883356, 17.906975, 16.293571]], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + regularization=[L2Regularization()] * 3, + ) + + def test_multivariate_covariates_R_fda(self) -> None: # noqa: N802 + """Test a example with Canadian Weather comparing with R fda package. + + Code used in R: + daybasis65 <- create.fourier.basis( + rangeval=c(0, 365), nbasis=65, axes=list('axesIntervals')) + Temp.fd <- with(CanadianWeather, smooth.basisPar(day.5, + dailyAv[,,'Temperature.C'], daybasis65)$fd) + TempRgn.f <- fRegress(Temp.fd ~ region, CanadianWeather) + write.table( + t(round( + TempRgn.f$betaestlist$const$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE + ) + write.table( + t(round( + TempRgn.f$betaestlist$region.Atlantic$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE + ) + write.table( + t(round( + TempRgn.f$betaestlist$region.Continental$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE) + write.table( + t(round( + TempRgn.f$betaestlist$region.Pacific$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE) + """ + X_weather, y_weather = fetch_weather( + return_X_y=True, as_frame=True, + ) + fd = X_weather.iloc[:, 0].values + + y_basis = FourierBasis(n_basis=65) + y_fd = fd.coordinates[0].to_basis(y_basis) + + enc = OneHotEncoder(handle_unknown='ignore') + enc.fit([['Atlantic'], ['Continental'], ['Pacific']]) + X = np.array(y_weather).reshape(-1, 1) + X = enc.transform(X).toarray() + + cov_dict = {"mult1": X[:, 0], "mult2": X[:, 1], "mult3": X[:, 2]} + df = pd.DataFrame(cov_dict) + + beta_const_R = [ # noqa: WPS317 + -225.5085, -110.817, -243.4708, 4.6815, 21.4488, 10.3784, 2.6317, + 1.7571, 2.4812, -1.5179, 1.4451, -0.6581, 2.8287, 0.4106, 1.5839, + -1.711, 0.5587, -2.2268, 2.4745, -0.5179, -0.8376, -3.1504, + -0.1357, -0.1186, 1.1512, 0.7343, 1.842, -0.5258, 1.2263, -0.576, + -0.6754, -0.6952, -0.416, -1.0292, 1.6742, 0.4276, 0.5185, -0.2135, + 0.3239, 1.6598, 1.0682, 2.2478, 0.2692, 1.8589, -0.5416, 0.5256, + -1.6838, -1.1174, 0.1842, -0.3521, 0.1809, -1.6302, 0.6676, + -0.3356, 1.036, -0.6297, 0.4227, -0.3096, 1.1373, 0.6317, 0.3608, + -0.9949, -0.709, -0.4588, -0.5694, + ] + + beta_atlantic_R = [ # noqa: WPS317 + 312.966, 35.9273, 67.7156, -12.9111, -27.3945, -18.3422, + -6.6074, -0.0203, -4.5716, 3.3142, -1.8419, 2.2008, -3.1554, + -0.8167, -1.6248, 1.4791, -0.8676, 2.9854, -2.5819, -0.239, 0.6418, + 2.2211, 1.4992, -2.2746, 0.6767, -2.8692, 1.478, 0.5988, -0.3434, + -0.2574, 2.3693, -0.016, 1.4911, 3.2798, -0.6508, 1.3326, -0.6729, + 1.0736, -0.7874, -1.2653, -1.8837, -3.1971, 0.0166, -1.298, 0.1403, + -1.2479, 0.593, 0.715, 0.1659, 0.8047, -1.2938, 0.7217, -1.1323, + -0.9719, -1.256, 0.8089, -0.1986, 0.7974, -0.4129, -0.6855, + -0.6397, 3.2471, 0.4686, 1.3593, 0.9434, + ] + + beta_continental_R = [ # noqa: WPS317 + 214.8319, 41.1702, 6.2763, -11.5837, -40.6003, -10.9865, -6.6548, + 4.2589, -3.5174, 0.9494, 1.5624, -3.1435, -1.3242, -1.6431, + -1.0234, 2.0606, -1.1042, -0.1723, -4.2717, -0.9321, 1.2331, + 2.0911, -1.0444, -1.757, -1.9564, -2.3117, -3.0405, -1.3801, + -1.7431, -2.0031, 0.7171, -0.6877, 0.7969, -1.01, -0.1761, -2.7614, + 0.8308, -0.7232, 1.671, 0.0118, 1.8239, 0.5399, 1.8575, 0.9313, + 1.6813, 0.834, 2.1028, 1.8707, -0.147, -0.6401, -0.165, 1.5439, + -0.4666, 0.2153, -0.8795, 0.4695, 0.0417, 0.7045, -1.1045, 0.0166, + -0.7447, 1.4645, 1.5654, -0.3106, 0.7647, + ] + + beta_pacific_R = [ # noqa: WPS317 + 375.1732, 78.6384, 127.8782, 6.0014, -29.3124, -11.4446, -5.3623, + -1.1054, -5.4936, 0.5137, 0.0086, -0.7174, -5.2713, -1.2635, + -1.6654, -0.5359, -2.4626, 1.8152, -4.0212, 0.8431, -1.7737, + 3.7342, -2.0556, 0.0382, -2.4436, -1.9431, -3.6757, -0.6956, + -2.8307, -0.7396, 1.6465, -0.3534, 0.903, 0.0484, -1.6763, -1.6237, + -0.9657, -1.6763, -0.2481, -1.3371, -0.6295, -2.4142, 0.9318, + -1.1531, 0.8854, -0.966, 1.6884, 1.6327, -0.1843, 0.1531, -0.7279, + 0.8348, -0.4336, -0.1253, -1.0069, 0.2815, -0.3406, 1.4044, + -1.6412, 0.4354, -1.2269, 0.9194, 1.0373, 0.7552, 1.088, + ] + + linear_regression = LinearRegression() + linear_regression.fit(df, y_fd) + + np.testing.assert_allclose( + linear_regression.basis_coefs[0].ravel(), beta_const_R, atol=0.001, + ) + np.testing.assert_allclose( + linear_regression.basis_coefs[1].ravel(), beta_atlantic_R, atol=0.001, + ) + np.testing.assert_allclose( + linear_regression.basis_coefs[2].ravel(), beta_continental_R, atol=0.001, + ) + np.testing.assert_allclose( + linear_regression.basis_coefs[3].ravel(), beta_pacific_R, atol=0.001, + ) + np.testing.assert_equal(linear_regression.coef_[0].basis, y_fd.basis) + + def test_functional_covariates_concurrent(self) -> None: # noqa: N802 + """ + Test a example of concurrent functional regression. + + Functional response with functional and multivariate covariates. + Concurrent model. + """ + y_basis = MonomialBasis(n_basis=2) + x_basis = MonomialBasis(n_basis=3) + + X_train = pd.DataFrame({ + "covariate_1": FDataBasis( + basis=x_basis, + coefficients=[ + [0, 1, 0], + [0, 1, 0], + [0, 0, 1], + ], + ), + "covariate_2": [3, 1, 4], + "covariate_3": [9, 2, 7], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [1, 1], + [2, 1], + [3, 1], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[5.608253, -2.866976]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[1.842478, -0.507984]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[-0.55036, -0.032797]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": FDataBasis( + basis=x_basis, + coefficients=[ + [0, 0, 1], + [0, 0, 1], + [1, 1, 0], + ], + ), + "covariate_2": [2, 1, 1], + "covariate_3": [0, 2, 1], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[ + [3.323643, 2.012006], + [0.380445, 2.454396], + [7.378201, -0.666481], + ], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + coef_basis=[y_basis, y_basis, y_basis], + ) + + def test_error_y_X_samples_different(self) -> None: # noqa: N802 + """Number of response samples and explanatory samples are not different. + + Raises ValueError when response is functional. + """ + y_basis = MonomialBasis(n_basis=2) + X = [[1, 2], [3, 4], [5, 6], [1, 0]] + + y_fd = FDataBasis(y_basis, [[1, 2], [3, 4], [5, 6]]) + + funct_reg = LinearRegression() + with np.testing.assert_raises(ValueError): + funct_reg.fit(X, y_fd) + + class TestHistoricalLinearRegression(unittest.TestCase): """Tests for historical linear regression.""" @@ -620,7 +1063,7 @@ def test_historical(self) -> None: np.testing.assert_allclose( regression.coef_.data_matrix[0, ..., 0], np.triu(self.coefficients.data_matrix[0, ..., 0]), - atol=0.3, + atol=0.35, rtol=0, ) diff --git a/skfda/tests/test_stats_std.py b/skfda/tests/test_stats_std.py new file mode 100644 index 000000000..24f6687af --- /dev/null +++ b/skfda/tests/test_stats_std.py @@ -0,0 +1,188 @@ +"""Test stats functions.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +from skfda import FDataBasis, FDataGrid +from skfda.exploratory.stats import std +from skfda.representation.basis import ( + Basis, + FourierBasis, + MonomialBasis, + TensorBasis, + VectorValuedBasis, +) + +# Fixtures for test_std_fdatabasis_vector_valued_basis + + +@pytest.fixture(params=[3, 5]) +def vv_n_basis1(request: Any) -> int: + """n_basis for 1st coordinate of vector valued basis.""" + return request.param # type: ignore[no-any-return] + + +@pytest.fixture +def vv_basis1(vv_n_basis1: int) -> Basis: + """1-dimensional basis to test for vector valued basis.""" + # First element of the basis is assumed to be the 1 function + return MonomialBasis( + n_basis=vv_n_basis1, domain_range=(0, 1), + ) + + +@pytest.fixture(params=[FourierBasis, MonomialBasis]) +def vv_basis2(request: Any, vv_n_basis2: int = 3) -> Basis: + """1-dimensional basis to test for vector valued basis.""" + # First element of the basis is assumed to be the 1 function + return request.param( # type: ignore[no-any-return] + domain_range=(0, 1), n_basis=vv_n_basis2, + ) + + +# Fixtures for test_std_fdatabasis_tensor_basis + +@pytest.fixture(params=[FourierBasis]) +def t_basis1(request: Any, t_n_basis1: int = 3) -> Basis: + """1-dimensional basis to test for tensor basis.""" + # First element of the basis is assumed to be the 1 function + return request.param( # type: ignore[no-any-return] + domain_range=(0, 1), n_basis=t_n_basis1, + ) + + +@pytest.fixture(params=[MonomialBasis]) +def t_basis2(request: Any, t_n_basis2: int = 5) -> Basis: + """1-dimensional basis to test for tensor basis.""" + # First element of the basis is assumed to be the 1 function + return request.param( # type: ignore[no-any-return] + domain_range=(0, 1), n_basis=t_n_basis2, + ) + + +# Tests + +def test_std_fdatagrid_1d_to_2d() -> None: + """Test std_fdatagrid with R to R^2 functions.""" + fd = FDataGrid( + data_matrix=[ + [[0, 1, 2, 3, 4, 5], [0, -1, -2, -3, -4, -5]], + [[2, 3, 4, 5, 6, 7], [-2, -3, -4, -5, -6, -7]], + ], + grid_points=[ + [-2, -1], + [0, 1, 2, 3, 4, 5], + ], + ) + expected_std_data_matrix = np.full((1, 2, 6, 1), np.sqrt(2)) + np.testing.assert_allclose( + std(fd).data_matrix, + expected_std_data_matrix, + ) + + +def test_std_fdatagrid_2d_to_2d() -> None: + """Test std_fdatagrid with R to R^2 functions.""" + fd = FDataGrid( + data_matrix=[ + [ + [[10, 11], [10, 12], [11, 14]], + [[15, 16], [12, 15], [20, 13]], + ], + [ + [[11, 12], [11, 13], [12, 13]], + [[14, 15], [11, 16], [21, 12]], + ], + ], + grid_points=[ + [0, 1], + [0, 1, 2], + ], + ) + expected_std_data_matrix = np.full((1, 2, 3, 2), np.sqrt(1 / 2)) + np.testing.assert_allclose( + std(fd).data_matrix, + expected_std_data_matrix, + ) + + +def test_std_fdatabasis_vector_valued_basis( + vv_basis1: Basis, + vv_basis2: Basis, +) -> None: + """Test std_fdatabasis with a vector valued basis.""" + basis = VectorValuedBasis([vv_basis1, vv_basis2]) + + # coefficients of the function===(1, 1) + one_coefficients = np.concatenate(( + np.pad([1], (0, vv_basis1.n_basis - 1)), + np.pad([1], (0, vv_basis2.n_basis - 1)), + )) + + fd = FDataBasis( + basis=basis, + coefficients=[np.zeros(basis.n_basis), one_coefficients], + ) + + np.testing.assert_allclose( + std(fd).coefficients, + np.array([np.sqrt(1 / 2) * one_coefficients]), + rtol=1e-7, + atol=1e-7, + ) + + +def test_std_fdatabasis_tensor_basis( + t_basis1: Basis, + t_basis2: Basis, +) -> None: + """Test std_fdatabasis with a vector valued basis.""" + basis = TensorBasis([t_basis1, t_basis2]) + + # coefficients of the function===1 + one_coefficients = np.pad([1], (0, basis.n_basis - 1)) + + fd = FDataBasis( + basis=basis, + coefficients=[np.zeros(basis.n_basis), one_coefficients], + ) + + np.testing.assert_allclose( + std(fd).coefficients, + np.array([np.sqrt(1 / 2) * one_coefficients]), + rtol=1e-7, + atol=1e-7, + ) + + +def test_std_fdatabasis_2d_to_2d() -> None: + """Test std_fdatabasis with R^2 to R^2 basis.""" + basis = VectorValuedBasis([ + TensorBasis([ + MonomialBasis(domain_range=(0, 1), n_basis=2), + MonomialBasis(domain_range=(0, 1), n_basis=2), + ]), + TensorBasis([ + MonomialBasis(domain_range=(0, 1), n_basis=2), + MonomialBasis(domain_range=(0, 1), n_basis=2), + ]), + ]) + fd = FDataBasis( + basis=basis, + coefficients=[ + [0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 1, 0, 0, 0], + ], + ) + expected_coefficients = np.array([[np.sqrt(1 / 2), 0, 0, 0] * 2]) + + np.testing.assert_allclose( + std(fd).coefficients, + expected_coefficients, + rtol=1e-7, + atol=1e-7, + )