diff --git a/06_ensemble_learning_and_random_forests.ipynb b/06_ensemble_learning_and_random_forests.ipynb index c0a5c41..25b63e4 100644 --- a/06_ensemble_learning_and_random_forests.ipynb +++ b/06_ensemble_learning_and_random_forests.ipynb @@ -125,20 +125,13 @@ "# Voting Classifiers" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 6–3. The law of large numbers:**" - ] - }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this cell generates and saves Figure 6–3\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", @@ -272,20 +265,13 @@ "bag_clf.fit(X_train, y_train)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 6–5. A single Decision Tree (left) versus a bagging ensemble of 500 trees (right):**" - ] - }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this cell generates and saves Figure 6–5\n", "\n", "def plot_decision_boundary(clf, X, y, alpha=1.0):\n", " axes=[-1.5, 2.4, -1, 1.5]\n", @@ -303,15 +289,8 @@ " color=colors[idx], marker=markers[idx], linestyle=\"none\")\n", " plt.axis(axes)\n", " plt.xlabel(r\"$x_1$\")\n", - " plt.ylabel(r\"$x_2$\", rotation=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ + " plt.ylabel(r\"$x_2$\", rotation=0)\n", + "\n", "tree_clf = DecisionTreeClassifier(random_state=42)\n", "tree_clf.fit(X_train, y_train)\n", "\n", @@ -336,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -348,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -357,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": { "scrolled": true }, @@ -378,11 +357,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this code shows how to compute the 63% proba\n", "print(1 - (1 - 1 / 1000) ** 1000)\n", "print(1 - np.exp(-1))" ] @@ -396,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -428,11 +407,11 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this code verifies that the predictions are identical\n", "bag_clf.fit(X_train, y_train)\n", "y_pred_bag = bag_clf.predict(X_test)\n", "np.all(y_pred_bag == y_pred_rf) # same predictions" @@ -447,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -460,20 +439,13 @@ " print(round(score, 2), name)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 6–6. MNIST pixel importance (according to a Random Forest classifier):**" - ] - }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this cell generates and saves Figure 6–6\n", "\n", "from sklearn.datasets import fetch_openml\n", "\n", @@ -500,20 +472,13 @@ "## AdaBoost" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 6–8. Decision boundaries of consecutive predictors:**" - ] - }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this cell generates and saves Figure 6–8\n", "\n", "m = len(X_train)\n", "\n", @@ -549,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -563,11 +528,13 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "plot_decision_boundary(ada_clf, X_train, y_train) # not in the book" + "# not in the book – in case you're curious to see what the decision boundary\n", + "# looks like for the AdaBoost classifier\n", + "plot_decision_boundary(ada_clf, X_train, y_train)" ] }, { @@ -586,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -610,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -621,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -632,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -640,20 +607,13 @@ "sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 6–9. In this depiction of Gradient Boosting, the first predictor (top left) is trained normally, then each consecutive predictor (middle left and lower left) is trained on the previous predictor’s residuals; the right column shows the resulting ensemble’s predictions:**" - ] - }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this cell generates and saves Figure 6–9\n", "\n", "def plot_predictions(regressors, X, y, axes, style,\n", " label=None, data_style=\"b.\", data_label=None):\n", @@ -715,7 +675,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -728,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -740,27 +700,20 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "gbrt_best.n_estimators_" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 6–10. GBRT ensembles with not enough predictors (left) and too many (right):**" - ] - }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – this cell generates and saves Figure 6–10\n", "\n", "fix, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)\n", "\n", @@ -784,11 +737,11 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "# not in the book (at least, not in this chapter: it's presented in chapter 2)\n", + "# not in the book – at least not in this chapter, it's presented in chapter 2\n", "\n", "import tarfile\n", "import urllib.request\n", @@ -817,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -836,11 +789,11 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "# not in the book\n", + "# not in the book – evaluate the RMSE stats for the hgb_reg model\n", "\n", "from sklearn.model_selection import cross_val_score\n", "\n", @@ -858,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -878,7 +831,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -929,7 +882,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -947,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -958,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -970,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -982,7 +935,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1005,7 +958,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1014,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -1028,7 +981,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1037,7 +990,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -1046,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1062,7 +1015,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -1081,7 +1034,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -1097,7 +1050,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -1114,7 +1067,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -1130,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -1146,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1155,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1171,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -1188,7 +1141,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1204,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1213,7 +1166,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1236,7 +1189,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1246,7 +1199,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1277,7 +1230,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1289,7 +1242,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1298,7 +1251,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1309,7 +1262,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1332,7 +1285,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1344,7 +1297,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1353,7 +1306,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1383,7 +1336,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1406,7 +1359,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1417,7 +1370,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [