Clarify the 'not in the book' comments

main
Aurélien Geron 2021-11-21 17:55:56 +13:00
parent dc64daaf65
commit 7f0c64b0f4
1 changed files with 74 additions and 121 deletions

View File

@ -125,20 +125,13 @@
"# Voting Classifiers" "# Voting Classifiers"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Code to generate Figure 63. The law of large numbers:**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this cell generates and saves Figure 63\n",
"\n", "\n",
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\n",
"import numpy as np\n", "import numpy as np\n",
@ -272,20 +265,13 @@
"bag_clf.fit(X_train, y_train)" "bag_clf.fit(X_train, y_train)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Code to generate Figure 65. A single Decision Tree (left) versus a bagging ensemble of 500 trees (right):**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this cell generates and saves Figure 65\n",
"\n", "\n",
"def plot_decision_boundary(clf, X, y, alpha=1.0):\n", "def plot_decision_boundary(clf, X, y, alpha=1.0):\n",
" axes=[-1.5, 2.4, -1, 1.5]\n", " axes=[-1.5, 2.4, -1, 1.5]\n",
@ -303,15 +289,8 @@
" color=colors[idx], marker=markers[idx], linestyle=\"none\")\n", " color=colors[idx], marker=markers[idx], linestyle=\"none\")\n",
" plt.axis(axes)\n", " plt.axis(axes)\n",
" plt.xlabel(r\"$x_1$\")\n", " plt.xlabel(r\"$x_1$\")\n",
" plt.ylabel(r\"$x_2$\", rotation=0)" " plt.ylabel(r\"$x_2$\", rotation=0)\n",
] "\n",
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tree_clf = DecisionTreeClassifier(random_state=42)\n", "tree_clf = DecisionTreeClassifier(random_state=42)\n",
"tree_clf.fit(X_train, y_train)\n", "tree_clf.fit(X_train, y_train)\n",
"\n", "\n",
@ -336,7 +315,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -348,7 +327,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -357,7 +336,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 16,
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
@ -378,11 +357,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this code shows how to compute the 63% proba\n",
"print(1 - (1 - 1 / 1000) ** 1000)\n", "print(1 - (1 - 1 / 1000) ** 1000)\n",
"print(1 - np.exp(-1))" "print(1 - np.exp(-1))"
] ]
@ -396,7 +375,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -417,7 +396,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -428,11 +407,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 20,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this code verifies that the predictions are identical\n",
"bag_clf.fit(X_train, y_train)\n", "bag_clf.fit(X_train, y_train)\n",
"y_pred_bag = bag_clf.predict(X_test)\n", "y_pred_bag = bag_clf.predict(X_test)\n",
"np.all(y_pred_bag == y_pred_rf) # same predictions" "np.all(y_pred_bag == y_pred_rf) # same predictions"
@ -447,7 +426,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 22, "execution_count": 21,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -460,20 +439,13 @@
" print(round(score, 2), name)" " print(round(score, 2), name)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Code to generate Figure 66. MNIST pixel importance (according to a Random Forest classifier):**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 23, "execution_count": 22,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this cell generates and saves Figure 66\n",
"\n", "\n",
"from sklearn.datasets import fetch_openml\n", "from sklearn.datasets import fetch_openml\n",
"\n", "\n",
@ -500,20 +472,13 @@
"## AdaBoost" "## AdaBoost"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Code to generate Figure 68. Decision boundaries of consecutive predictors:**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 23,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this cell generates and saves Figure 68\n",
"\n", "\n",
"m = len(X_train)\n", "m = len(X_train)\n",
"\n", "\n",
@ -549,7 +514,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 24,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -563,11 +528,13 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 25,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"plot_decision_boundary(ada_clf, X_train, y_train) # not in the book" "# not in the book in case you're curious to see what the decision boundary\n",
"# looks like for the AdaBoost classifier\n",
"plot_decision_boundary(ada_clf, X_train, y_train)"
] ]
}, },
{ {
@ -586,7 +553,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 26,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -610,7 +577,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 27,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -621,7 +588,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 28,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -632,7 +599,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 30, "execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -640,20 +607,13 @@
"sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))" "sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Code to generate Figure 69. In this depiction of Gradient Boosting, the first predictor (top left) is trained normally, then each consecutive predictor (middle left and lower left) is trained on the previous predictors residuals; the right column shows the resulting ensembles predictions:**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 31, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this cell generates and saves Figure 69\n",
"\n", "\n",
"def plot_predictions(regressors, X, y, axes, style,\n", "def plot_predictions(regressors, X, y, axes, style,\n",
" label=None, data_style=\"b.\", data_label=None):\n", " label=None, data_style=\"b.\", data_label=None):\n",
@ -715,7 +675,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 31,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -728,7 +688,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 32,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -740,27 +700,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 33,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"gbrt_best.n_estimators_" "gbrt_best.n_estimators_"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Code to generate Figure 610. GBRT ensembles with not enough predictors (left) and too many (right):**"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 34,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book this cell generates and saves Figure 610\n",
"\n", "\n",
"fix, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)\n", "fix, axes = plt.subplots(ncols=2, figsize=(10,4), sharey=True)\n",
"\n", "\n",
@ -784,11 +737,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 35,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book (at least, not in this chapter: it's presented in chapter 2)\n", "# not in the book at least not in this chapter, it's presented in chapter 2\n",
"\n", "\n",
"import tarfile\n", "import tarfile\n",
"import urllib.request\n", "import urllib.request\n",
@ -817,7 +770,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 36,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -836,11 +789,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 38, "execution_count": 37,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# not in the book\n", "# not in the book evaluate the RMSE stats for the hgb_reg model\n",
"\n", "\n",
"from sklearn.model_selection import cross_val_score\n", "from sklearn.model_selection import cross_val_score\n",
"\n", "\n",
@ -858,7 +811,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 39, "execution_count": 38,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -878,7 +831,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 40, "execution_count": 39,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -929,7 +882,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 41, "execution_count": 40,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -947,7 +900,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 42, "execution_count": 41,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -958,7 +911,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": 42,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -970,7 +923,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": 43,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -982,7 +935,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 45, "execution_count": 44,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1005,7 +958,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 46, "execution_count": 45,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1014,7 +967,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 47, "execution_count": 46,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1028,7 +981,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 48, "execution_count": 47,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1037,7 +990,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 49, "execution_count": 48,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1046,7 +999,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 50, "execution_count": 49,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1062,7 +1015,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 51, "execution_count": 50,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1081,7 +1034,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 52, "execution_count": 51,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1097,7 +1050,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 53, "execution_count": 52,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1114,7 +1067,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 54, "execution_count": 53,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1130,7 +1083,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 55, "execution_count": 54,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1146,7 +1099,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 56, "execution_count": 55,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1155,7 +1108,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 57, "execution_count": 56,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1171,7 +1124,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 58, "execution_count": 57,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1188,7 +1141,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 59, "execution_count": 58,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1204,7 +1157,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 60, "execution_count": 59,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1213,7 +1166,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 61, "execution_count": 60,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1236,7 +1189,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 62, "execution_count": 61,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1246,7 +1199,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 63, "execution_count": 62,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1277,7 +1230,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 64, "execution_count": 63,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1289,7 +1242,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 65, "execution_count": 64,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1298,7 +1251,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 66, "execution_count": 65,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1309,7 +1262,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 67, "execution_count": 66,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1332,7 +1285,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 68, "execution_count": 67,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1344,7 +1297,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 69, "execution_count": 68,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1353,7 +1306,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 70, "execution_count": 69,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1383,7 +1336,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 71, "execution_count": 70,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1406,7 +1359,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 72, "execution_count": 71,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -1417,7 +1370,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 73, "execution_count": 72,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [