From 62bd4ec7b863c3eb095f8023f00cecec160b4415 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 3 Oct 2021 23:05:49 +1300 Subject: [PATCH] Improve alignment between notebook and book section headers --- 01_the_machine_learning_landscape.ipynb | 2 +- 05_support_vector_machines.ipynb | 145 +++++- 06_decision_trees.ipynb | 148 +++++-- 07_ensemble_learning_and_random_forests.ipynb | 415 +++++++++++------- 08_dimensionality_reduction.ipynb | 178 ++++++-- 09_unsupervised_learning.ipynb | 37 +- 6 files changed, 667 insertions(+), 258 deletions(-) diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index a227f60fd..3fca6da12 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -93,7 +93,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The code in the book expects the data files to be located in the current directory. I just tweaked it here to fetch the files in datasets/lifesat." + "The code in the book expects the data files to be located in the current directory. I just tweaked it here to fetch the files in `datasets/lifesat`." ] }, { diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index bd27bc252..4195322b5 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -84,14 +84,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Large margin classification" + "# Linear SVM Classification" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The next few code cells generate the first figures in chapter 5. The first actual code sample comes after:" + "The next few code cells generate the first figures in chapter 5. The first actual code sample comes after.\n", + "\n", + "**Code to generate Figure 5–1. Large margin classification**" ] }, { @@ -206,7 +208,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Sensitivity to feature scales" + "**Code to generate Figure 5–2. 
Sensitivity to feature scales**" ] }, { @@ -271,7 +273,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Sensitivity to outliers" + "## Soft Margin Classification\n", + "**Code to generate Figure 5–3. Hard margin sensitivity to outliers**" ] }, { @@ -349,14 +352,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Large margin *vs* margin violations" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is the first code example in chapter 5:" + "**This is the first code example in chapter 5:**" ] }, { @@ -419,7 +415,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's generate the graph comparing different regularization settings:" + "**Code to generate Figure 5–4. Large margin versus fewer margin violations**" ] }, { @@ -543,7 +539,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Non-linear classification" + "# Nonlinear SVM Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–5. Adding features to make a dataset linearly separable**" ] }, { @@ -639,6 +642,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Here is the second code example in the chapter:**" + ] + }, { "cell_type": "code", "execution_count": 13, @@ -679,6 +689,13 @@ "polynomial_svm_clf.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–6. 
Linear SVM classifier using polynomial features**" + ] + }, { "cell_type": "code", "execution_count": 14, @@ -722,6 +739,20 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Polynomial Kernel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Next code example:**" + ] + }, { "cell_type": "code", "execution_count": 15, @@ -749,6 +780,13 @@ "poly_kernel_svm_clf.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–7. SVM classifiers with a polynomial kernel**" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -817,6 +855,20 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Similarity Features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–8. Similarity features using the Gaussian RBF**" + ] + }, { "cell_type": "code", "execution_count": 18, @@ -926,6 +978,20 @@ " print(\"Phi({}, {}) = {}\".format(x1_example, landmark, k))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Gaussian RBF Kernel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Next code example:**" + ] + }, { "cell_type": "code", "execution_count": 20, @@ -951,6 +1017,13 @@ "rbf_kernel_svm_clf.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–9. 
SVM classifiers using an RBF kernel**" + ] + }, { "cell_type": "code", "execution_count": 21, @@ -1015,7 +1088,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Regression\n" + "# SVM Regression" ] }, { @@ -1030,6 +1103,13 @@ "y = (4 + 3 * X + np.random.randn(m, 1)).ravel()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Next code example:**" + ] + }, { "cell_type": "code", "execution_count": 23, @@ -1053,6 +1133,13 @@ "svm_reg.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–10. SVM Regression**" + ] + }, { "cell_type": "code", "execution_count": 24, @@ -1152,6 +1239,13 @@ "**Note**: to be future-proof, we set `gamma=\"scale\"`, as this will be the default value in Scikit-Learn 0.22." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Next code example:**" + ] + }, { "cell_type": "code", "execution_count": 27, @@ -1175,6 +1269,13 @@ "svm_poly_reg.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–11. SVM Regression using a second-degree polynomial kernel**" + ] + }, { "cell_type": "code", "execution_count": 28, @@ -1242,7 +1343,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Under the hood" + "# Under the Hood\n", + "## Decision Function and Predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 5–12. Decision function for the iris dataset**" ] }, { @@ -1324,7 +1433,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Small weight vector results in a large margin" + "**Code to generate Figure 5–13. 
A smaller weight vector results in a larger margin**" ] }, { @@ -1414,7 +1523,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Hinge loss" + "**Code to generate the Hinge Loss figure:**" ] }, { diff --git a/06_decision_trees.ipynb b/06_decision_trees.ipynb index 06829a9b9..50f15e6b7 100644 --- a/06_decision_trees.ipynb +++ b/06_decision_trees.ipynb @@ -89,7 +89,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Training and visualizing" + "# Training and Visualizing a Decision Tree" ] }, { @@ -120,6 +120,13 @@ "tree_clf.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**This code example generates Figure 6–1. Iris Decision Tree:**" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -131,7 +138,7 @@ "\n", "\n", - "\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -240,6 +247,20 @@ "Source.from_file(os.path.join(IMAGES_PATH, \"iris_tree.dot\"))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making Predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6–2. Decision Tree decision boundaries**" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -311,7 +332,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Predicting classes and class probabilities" + "# Estimating Class Probabilities" ] }, { @@ -358,7 +379,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# High Variance" + "## Regularization Hyperparameters" ] }, { @@ -390,6 +411,13 @@ "tree_clf_tweaked.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6–8. Sensitivity to training set details:**" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -427,9 +455,16 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6–3. 
Regularization using min_samples_leaf:**" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -474,9 +509,16 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rotating the dataset also leads to completely different decision boundaries:" + ] + }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -506,9 +548,16 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6–7. Sensitivity to training set rotation**" + ] + }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -560,12 +609,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Regression trees" + "# Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's prepare a simple linear dataset:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -577,9 +633,16 @@ "y = y + np.random.randn(m, 1) / 10" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code example:**" + ] + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -588,7 +651,7 @@ "DecisionTreeRegressor(max_depth=2, random_state=42)" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -600,9 +663,16 @@ "tree_reg.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6–5. Predictions of two Decision Tree regression models:**" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -667,9 +737,16 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6-4. 
A Decision Tree for regression:**" + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -684,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -805,7 +882,7 @@ "" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -814,9 +891,16 @@ "Source.from_file(os.path.join(IMAGES_PATH, \"regression_tree.dot\"))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 6–6. Regularizing a Decision Tree regressor:**" + ] + }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -922,7 +1006,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -940,7 +1024,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -958,7 +1042,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -988,7 +1072,7 @@ " verbose=1)" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1004,7 +1088,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1013,7 +1097,7 @@ "DecisionTreeClassifier(max_leaf_nodes=17, random_state=42)" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1038,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1047,7 +1131,7 @@ "0.8695" ] }, - "execution_count": 23, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1082,7 +1166,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, 
"metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1193,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1118,7 +1202,7 @@ "0.8054499999999999" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1148,7 +1232,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -1160,7 +1244,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1178,7 +1262,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1187,7 +1271,7 @@ "0.872" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } diff --git a/07_ensemble_learning_and_random_forests.ipynb b/07_ensemble_learning_and_random_forests.ipynb index 73724f0c9..8638fd8c1 100644 --- a/07_ensemble_learning_and_random_forests.ipynb +++ b/07_ensemble_learning_and_random_forests.ipynb @@ -89,7 +89,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Voting classifiers" + "# Voting Classifiers" ] }, { @@ -103,6 +103,13 @@ "cumulative_heads_ratio = np.cumsum(coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–3. The law of large numbers:**" + ] + }, { "cell_type": "code", "execution_count": 3, @@ -141,6 +148,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use the moons dataset:" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -161,6 +175,13 @@ "**Note**: to be future-proof, we set `solver=\"lbfgs\"`, `n_estimators=100`, and `gamma=\"scale\"` since these will be the default values in upcoming Scikit-Learn versions." 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code examples:**" + ] + }, { "cell_type": "code", "execution_count": 5, @@ -301,7 +322,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Bagging ensembles" + "# Bagging and Pasting\n", + "## Bagging and Pasting in Scikit-Learn" ] }, { @@ -358,6 +380,13 @@ "print(accuracy_score(y_test, y_pred_tree))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–5. A single Decision Tree (left) versus a bagging ensemble of 500 trees (right):**" + ] + }, { "cell_type": "code", "execution_count": 13, @@ -387,7 +416,9 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "name": "stdout", @@ -422,143 +453,6 @@ "plt.show()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Random Forests" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "bag_clf = BaggingClassifier(\n", - " DecisionTreeClassifier(max_features=\"sqrt\", max_leaf_nodes=16),\n", - " n_estimators=500, random_state=42)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "bag_clf.fit(X_train, y_train)\n", - "y_pred = bag_clf.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestClassifier\n", - "\n", - "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)\n", - "rnd_clf.fit(X_train, y_train)\n", - "\n", - "y_pred_rf = rnd_clf.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.sum(y_pred == y_pred_rf) / len(y_pred) # 
very similar predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sepal length (cm) 0.11249225099876375\n", - "sepal width (cm) 0.02311928828251033\n", - "petal length (cm) 0.4410304643639577\n", - "petal width (cm) 0.4233579963547682\n" - ] - } - ], - "source": [ - "from sklearn.datasets import load_iris\n", - "iris = load_iris()\n", - "rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)\n", - "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n", - "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n", - " print(name, score)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0.11249225, 0.02311929, 0.44103046, 0.423358 ])" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rnd_clf.feature_importances_" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(6, 4))\n", - "\n", - "for i in range(15):\n", - " tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)\n", - " indices_with_replacement = np.random.randint(0, len(X_train), len(X_train))\n", - " tree_clf.fit(X[indices_with_replacement], y[indices_with_replacement])\n", - " plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.02, contour=False)\n", - "\n", - "plt.show()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -568,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -577,7 +471,7 @@ "0.8986666666666666" ] }, - "execution_count": 22, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -592,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -975,7 +869,7 @@ " [0.58854167, 0.41145833]])" ] }, - "execution_count": 23, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -986,8 +880,10 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, + "execution_count": 17, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -995,7 +891,7 @@ "0.912" ] }, - "execution_count": 24, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1010,7 +906,165 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Feature importance" + "# Random Forests" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)\n", + "rnd_clf.fit(X_train, y_train)\n", + "\n", + "y_pred_rf = rnd_clf.predict(X_test)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "A Random Forest is equivalent to a bag of decision trees:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "bag_clf = BaggingClassifier(\n", + " DecisionTreeClassifier(max_features=\"sqrt\", max_leaf_nodes=16),\n", + " n_estimators=500, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "bag_clf.fit(X_train, y_train)\n", + "y_pred = bag_clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.sum(y_pred == y_pred_rf) / len(y_pred) # very similar predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Importance" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sepal length (cm) 0.11249225099876375\n", + "sepal width (cm) 0.02311928828251033\n", + "petal length (cm) 0.4410304643639577\n", + "petal width (cm) 0.4233579963547682\n" + ] + } + ], + "source": [ + "from sklearn.datasets import load_iris\n", + "iris = load_iris()\n", + "rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)\n", + "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n", + "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n", + " print(name, score)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.11249225, 0.02311929, 0.44103046, 0.423358 ])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rnd_clf.feature_importances_" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "The following figure overlays the decision boundaries of 15 decision trees. As you can see, even though each decision tree is imperfect, the ensemble defines a pretty good decision boundary:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(6, 4))\n", + "\n", + "for i in range(15):\n", + " tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)\n", + " indices_with_replacement = np.random.randint(0, len(X_train), len(X_train))\n", + " tree_clf.fit(X[indices_with_replacement], y[indices_with_replacement])\n", + " plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.45, -1, 1.5], alpha=0.02, contour=False)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–6. MNIST pixel importance (according to a Random Forest classifier):**" ] }, { @@ -1105,7 +1159,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# AdaBoost" + "# Boosting\n", + "## AdaBoost" ] }, { @@ -1156,6 +1211,13 @@ "plot_decision_boundary(ada_clf, X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–8. Decision boundaries of consecutive predictors:**" + ] + }, { "cell_type": "code", "execution_count": 31, @@ -1217,7 +1279,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Gradient Boosting" + "## Gradient Boosting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a simple quadratic dataset:" ] }, { @@ -1231,6 +1300,13 @@ "y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's train a decision tree regressor on this dataset:" + ] + }, { "cell_type": "code", "execution_count": 33, @@ -1336,6 +1412,13 @@ "y_pred" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–9. 
In this depiction of Gradient Boosting, the first predictor (top left) is trained normally, then each consecutive predictor (middle left and lower left) is trained on the previous predictor’s residuals; the right column shows the resulting ensemble’s predictions:**" + ] + }, { "cell_type": "code", "execution_count": 39, @@ -1412,6 +1495,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's try a gradient boosting regressor:" + ] + }, { "cell_type": "code", "execution_count": 41, @@ -1436,6 +1526,13 @@ "gbrt.fit(X, y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–10. GBRT ensembles with not enough predictors (left) and too many (right):**" + ] + }, { "cell_type": "code", "execution_count": 42, @@ -1504,7 +1601,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Gradient Boosting with Early stopping" + "**Gradient Boosting with Early stopping:**" ] }, { @@ -1541,6 +1638,13 @@ "gbrt_best.fit(X_train, y_train)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 7–11. 
Tuning the number of trees using early stopping:**" + ] + }, { "cell_type": "code", "execution_count": 45, @@ -1599,6 +1703,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping with some patience (interrupts training only after there's no improvement for 5 epochs):" + ] + }, { "cell_type": "code", "execution_count": 47, @@ -1661,7 +1772,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Using XGBoost" + "**Using XGBoost:**" ] }, { diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index 47a836462..d0f6cac6a 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -84,8 +84,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Projection methods\n", - "Build 3D dataset:" + "# PCA\n", + "Let's build a simple 3D dataset:" ] }, { @@ -110,7 +110,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## PCA using SVD decomposition" + "## Principal Components" ] }, { @@ -157,6 +157,13 @@ "np.allclose(X_centered, U.dot(S).dot(Vt))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Projecting Down to d Dimensions" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -180,7 +187,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## PCA using Scikit-Learn" + "## Using Scikit-Learn" ] }, { @@ -453,6 +460,13 @@ "Notice how the axes are flipped." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explained Variance Ratio" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -548,6 +562,13 @@ "Next, let's generate some nice figures! :)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–2. A 3D dataset lying close to a 2D subspace:**" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -677,6 +698,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–3. 
The new 2D dataset after projection:**" + ] + }, { "cell_type": "code", "execution_count": 25, @@ -722,8 +750,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Manifold learning\n", - "Swiss roll:" + "**Code to generate Figure 8–4. Swiss roll dataset:**" ] }, { @@ -733,6 +760,7 @@ "outputs": [], "source": [ "from sklearn.datasets import make_swiss_roll\n", + "\n", "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" ] }, @@ -780,6 +808,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–5. Squashing by projecting onto a plane (left) versus unrolling the Swiss roll (right):**" + ] + }, { "cell_type": "code", "execution_count": 28, @@ -825,6 +860,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–6. The decision boundary may not always be simpler with lower dimensions:**" + ] + }, { "cell_type": "code", "execution_count": 29, @@ -987,7 +1029,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# PCA" + "**Code to generate Figure 8–7. Selecting the subspace to project on:**" ] }, { @@ -1080,7 +1122,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# MNIST compression" + "## Choosing the Right Number of Dimensions" ] }, { @@ -1148,6 +1190,13 @@ "d" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–8. Explained variance as a function of the number of dimensions:**" + ] + }, { "cell_type": "code", "execution_count": 35, @@ -1239,6 +1288,13 @@ "np.sum(pca.explained_variance_ratio_)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PCA for Compression" + ] + }, { "cell_type": "code", "execution_count": 39, @@ -1250,6 +1306,13 @@ "X_recovered = pca.inverse_transform(X_reduced)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–9. 
MNIST compression that preserves 95% of the variance:**" + ] + }, { "cell_type": "code", "execution_count": 40, @@ -1322,13 +1385,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Incremental PCA" + "## Randomized PCA" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, + "outputs": [], + "source": [ + "rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n", + "X_reduced = rnd_pca.fit_transform(X_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Incremental PCA" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1352,16 +1432,23 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that compression still works well:" + ] + }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -1388,7 +1475,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1404,7 +1491,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -1413,7 +1500,7 @@ "True" ] }, - "execution_count": 47, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1431,7 +1518,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -1440,7 +1527,7 @@ "False" ] }, - "execution_count": 48, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -1453,7 +1540,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Using `memmap()`" + "**Using `memmap()`:**" ] }, { @@ -1465,7 +1552,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 
50, "metadata": {}, "outputs": [], "source": [ @@ -1485,7 +1572,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -1501,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -1510,7 +1597,7 @@ "IncrementalPCA(batch_size=525, n_components=154)" ] }, - "execution_count": 51, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1523,21 +1610,11 @@ "inc_pca.fit(X_mm)" ] }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n", - "X_reduced = rnd_pca.fit_transform(X_train)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Time complexity" + "**Time complexity:**" ] }, { @@ -1737,6 +1814,13 @@ "X_reduced = rbf_pca.fit_transform(X)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–10. Swiss roll reduced to 2D using kPCA with various kernels:**" + ] + }, { "cell_type": "code", "execution_count": 58, @@ -1791,6 +1875,13 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–11. Kernel PCA and the reconstruction pre-image error:**" + ] + }, { "cell_type": "code", "execution_count": 59, @@ -1864,6 +1955,13 @@ "plt.grid(True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selecting a Kernel and Tuning Hyperparameters" + ] + }, { "cell_type": "code", "execution_count": 61, @@ -1983,6 +2081,13 @@ "X_reduced = lle.fit_transform(X)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–12. 
Unrolled Swiss roll using LLE:**" + ] + }, { "cell_type": "code", "execution_count": 67, @@ -2024,7 +2129,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# MDS, Isomap and t-SNE" + "## Other Dimensionality Reduction Techniques" ] }, { @@ -2078,6 +2183,13 @@ "X_reduced_lda = lda.transform(X_mnist)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Code to generate Figure 8–13. Using various techniques to reduce the Swiss roll to 2D:**" + ] + }, { "cell_type": "code", "execution_count": 72, diff --git a/09_unsupervised_learning.ipynb b/09_unsupervised_learning.ipynb index 37bec482f..84d5135fd 100644 --- a/09_unsupervised_learning.ipynb +++ b/09_unsupervised_learning.ipynb @@ -91,7 +91,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Introduction – Classification _vs_ Clustering" + "**Introduction – Classification _vs_ Clustering**" ] }, { @@ -417,7 +417,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Fit and Predict" + "**Fit and predict**" ] }, { @@ -584,7 +584,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Decision Boundaries" + "**Decision Boundaries**" ] }, { @@ -683,7 +683,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Hard Clustering _vs_ Soft Clustering" + "**Hard Clustering _vs_ Soft Clustering**" ] }, { @@ -750,7 +750,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### K-Means Algorithm" + "### The K-Means Algorithm" ] }, { @@ -875,7 +875,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### K-Means Variability" + "**K-Means Variability**" ] }, { @@ -1162,7 +1162,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### K-Means++" + "### Centroid initialization methods" ] }, { @@ -2019,7 +2019,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Limits of K-Means" + "## Limits of K-Means" ] }, { @@ -2125,7 +2125,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Using clustering for image 
segmentation" + "## Using Clustering for Image Segmentation" ] }, { @@ -2259,7 +2259,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Using Clustering for Preprocessing" + "## Using Clustering for Preprocessing" ] }, { @@ -2659,7 +2659,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Clustering for Semi-supervised Learning" + "## Using Clustering for Semi-Supervised Learning" ] }, { @@ -4199,7 +4199,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Anomaly Detection using Gaussian Mixtures" + "## Anomaly Detection Using Gaussian Mixtures" ] }, { @@ -4260,7 +4260,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Model selection" + "## Selecting the Number of Clusters" ] }, { @@ -4532,7 +4532,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Variational Bayesian Gaussian Mixtures" + "## Bayesian Gaussian Mixture Models" ] }, { @@ -4828,7 +4828,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Likelihood Function" + "**Likelihood Function**" ] }, { @@ -4939,13 +4939,6 @@ "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {},