From 81b7a1329408ce579a6c2b143fc7bb576fb9309f Mon Sep 17 00:00:00 2001 From: Max Halford Date: Sat, 7 Sep 2024 15:43:24 +0200 Subject: [PATCH] Add get_feature_names_out --- docs/config.toml | 2 +- docs/content/faq.ipynb | 136 +++++++++++++++++++++++++++++++++++++++++ prince/mca.py | 3 + prince/pca.py | 3 + tests/test_mca.py | 2 + 5 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 docs/content/faq.ipynb diff --git a/docs/config.toml b/docs/config.toml index cab2e968..fdf03b34 100644 --- a/docs/config.toml +++ b/docs/config.toml @@ -8,7 +8,7 @@ theme = 'hugo-bearblog' # Basic metadata configuration for your blog. title = "Prince" author = "Max Halford" -copyright = "Copyright © 2023, Max Halford." +copyright = "Copyright © 2024, Max Halford." languageCode = "en-US" # Generate a nice robots.txt for SEO diff --git a/docs/content/faq.ipynb b/docs/content/faq.ipynb new file mode 100644 index 00000000..c5641611 --- /dev/null +++ b/docs/content/faq.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "+++\n", + "title = \"Frequently Asked Questions\"\n", + "menu = \"main\"\n", + "weight = 7\n", + "toc = true\n", + "aliases = [\"faq\"]\n", + "+++" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**How to use Prince with sklearn pipelines?**\n", + "\n", + "Prince estimators consume and produce pandas DataFrames. If you want to use them in a sklearn pipeline, you can use [sklearn's `set_output` API](https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_set_output.html). This way, you can tell sklearn that the pipeline should exchange DataFrames instead of numpy arrays between the steps." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
component01
0-2.2647030.480027
1-2.080961-0.674134
2-2.364229-0.341908
3-2.299384-0.597395
4-2.3898420.646835
\n", + "
" + ], + "text/plain": [ + "component 0 1\n", + "0 -2.264703 0.480027\n", + "1 -2.080961 -0.674134\n", + "2 -2.364229 -0.341908\n", + "3 -2.299384 -0.597395\n", + "4 -2.389842 0.646835" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import prince\n", + "from sklearn import datasets\n", + "from sklearn import impute\n", + "from sklearn import pipeline\n", + "\n", + "pipe = pipeline.make_pipeline(\n", + " impute.SimpleImputer(),\n", + " prince.PCA()\n", + ")\n", + "pipe.set_output(transform='pandas')\n", + "dataset = datasets.load_iris()\n", + "pipe.fit_transform(dataset.data).head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "prince-NQ1O93Uh-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/prince/mca.py b/prince/mca.py index 67924c3c..e49305e3 100644 --- a/prince/mca.py +++ b/prince/mca.py @@ -37,6 +37,9 @@ def _prepare(self, X): X = pd.get_dummies(X, columns=X.columns) return X + def get_feature_names_out(self, input_features=None): + return np.arange(self.n_components_) + @utils.check_is_dataframe_input def fit(self, X, y=None): """Fit the MCA for the dataframe X. 
diff --git a/prince/pca.py b/prince/pca.py index 41458212..e7603122 100755 --- a/prince/pca.py +++ b/prince/pca.py @@ -67,6 +67,9 @@ def _check_input(self, X): if self.check_input: sklearn.utils.check_array(X) + def get_feature_names_out(self, input_features=None): + return np.arange(self.n_components_) + @utils.check_is_dataframe_input def fit(self, X, y=None, supplementary_columns=None): self._check_input(X) diff --git a/tests/test_mca.py b/tests/test_mca.py index 87fb377b..60518b8c 100644 --- a/tests/test_mca.py +++ b/tests/test_mca.py @@ -138,6 +138,8 @@ def test_issue_131(): def test_issue_171(): """ + https://github.com/MaxHalford/prince/issues/171 + >>> from sklearn import impute >>> from sklearn import pipeline