From c46123155daae9c64420be791e523be6a249a999 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Sun, 21 Nov 2021 16:40:36 +1300
Subject: [PATCH] Clarify the 'not in the book' comments

---
 02_end_to_end_machine_learning_project.ipynb | 433 +++++++++----------
 1 file changed, 209 insertions(+), 224 deletions(-)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index 29de8c3..5dde5da 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -177,7 +177,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – code to make figures prettier\n",
     "import matplotlib as mpl\n",
     "\n",
     "mpl.rc('font', size=12)\n",
@@ -198,7 +198,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – code to save the figures as high-res PNGs for the book\n",
     "\n",
     "IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n",
     "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
@@ -365,7 +365,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – computes the 10.7% probability of getting a bad sample\n",
     "\n",
     "from scipy.stats import binom\n",
     "\n",
@@ -389,7 +389,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – another way to estimate the probability of a bad sample\n",
     "\n",
     "np.random.seed(42)\n",
     "\n",
@@ -478,16 +478,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "housing[\"income_cat\"].value_counts() / len(housing)  # not in the book"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Not in the book\n",
+    "# not in the book – this code computes the data for Figure 2–10\n",
     "\n",
     "def income_cat_proportions(data):\n",
     "    return data[\"income_cat\"].value_counts() / len(data)\n",
@@ -503,21 +494,13 @@
     "compare_props[\"Strat. Error %\"] = (compare_props[\"Stratified %\"] /\n",
     "                                   compare_props[\"Overall %\"] - 1)\n",
     "compare_props[\"Rand. Error %\"] = (compare_props[\"Random %\"] /\n",
-    "                                  compare_props[\"Overall %\"] - 1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "                                  compare_props[\"Overall %\"] - 1)\n",
     "(compare_props * 100).round(2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -534,7 +517,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -550,7 +533,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -561,7 +544,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -572,7 +555,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -595,16 +578,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The next couple of cells generate the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid."
+    "The next cell generates the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code generates the first figure in the chapter\n",
     "\n",
     "# Download the California image\n",
     "filename = \"california.png\"\n",
@@ -612,16 +595,7 @@
     "    root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
     "    url = root + \"images/end_to_end_project/\" + filename\n",
     "    print(\"Downloading\", filename)\n",
-    "    urllib.request.urlretrieve(url, IMAGES_PATH / filename)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# not in the book\n",
+    "    urllib.request.urlretrieve(url, IMAGES_PATH / filename)\n",
     "\n",
     "housing_renamed = housing.rename(columns={\n",
     "    \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
@@ -651,7 +625,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -660,7 +634,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -669,7 +643,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -684,7 +658,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -703,7 +677,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -714,7 +688,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -738,7 +712,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -773,7 +747,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -783,7 +757,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -796,7 +770,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -809,7 +783,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -823,7 +797,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -841,7 +815,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -850,7 +824,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 49,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -859,7 +833,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -875,7 +849,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -891,7 +865,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 52,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -900,13 +874,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 53,
    "metadata": {},
    "outputs": [],
    "source": [
     "imputer.feature_names_in_"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
+    "                          index=housing_num.index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "housing_tr.loc[null_rows_idx].head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "imputer.strategy"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 57,
@@ -922,41 +924,13 @@
    "execution_count": 58,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "housing_tr.loc[null_rows_idx].head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "imputer.strategy"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 60,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
-    "                          index=housing_num.index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 61,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "housing_tr.loc[null_rows_idx].head()  # not shown in the book"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 59,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -974,7 +948,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -986,7 +960,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -995,7 +969,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1019,7 +993,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1029,7 +1003,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 64,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1041,7 +1015,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1050,7 +1024,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 66,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1059,7 +1033,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 67,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1071,7 +1045,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 68,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1087,7 +1061,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 69,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1103,7 +1077,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 70,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1114,7 +1088,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 71,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1123,7 +1097,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 72,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1133,7 +1107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 73,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1142,7 +1116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 74,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1152,7 +1126,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 75,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1162,7 +1136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 76,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1171,7 +1145,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 77,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1180,7 +1154,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 78,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1191,7 +1165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": 79,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1207,7 +1181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 80,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1219,7 +1193,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 81,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1231,11 +1205,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 82,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code generates Figure 2–17\n",
     "fig, axs = plt.subplots(1, 2, figsize=(8,3), sharey=True)\n",
     "housing[\"population\"].hist(ax=axs[0], bins=50)\n",
     "housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n",
@@ -1255,11 +1229,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": 83,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code just shows that we get a uniform distribution\n",
     "percentiles = [np.percentile(housing[\"median_income\"], p)\n",
     "               for p in range(1, 100)]\n",
     "flattened_median_income = pd.cut(housing[\"median_income\"],\n",
@@ -1276,7 +1250,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 84,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1287,11 +1261,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": 85,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code generates Figure 2–18\n",
     "\n",
     "ages = np.linspace(housing[\"housing_median_age\"].min(),\n",
     "                   housing[\"housing_median_age\"].max(),\n",
@@ -1321,7 +1295,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": 86,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1340,7 +1314,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": 87,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1349,7 +1323,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 88,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1363,7 +1337,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 89,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1386,7 +1360,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 93,
+   "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1398,7 +1372,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 94,
+   "execution_count": 91,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1409,7 +1383,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 95,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1418,7 +1392,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": 93,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1430,7 +1404,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 97,
+   "execution_count": 94,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1439,7 +1413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 98,
+   "execution_count": 95,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1449,7 +1423,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 99,
+   "execution_count": 96,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1478,7 +1452,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 97,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1504,7 +1478,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 98,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1515,7 +1489,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 99,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1524,11 +1498,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 100,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code generates Figure 2–19\n",
     "\n",
     "housing_renamed = housing.rename(columns={\n",
     "    \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
@@ -1566,7 +1540,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 101,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1580,7 +1554,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 102,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1591,7 +1565,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 103,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1604,7 +1578,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 104,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1614,7 +1588,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 105,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1663,19 +1637,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 106,
    "metadata": {},
    "outputs": [],
    "source": [
     "df_housing_num_prepared = pd.DataFrame(\n",
     "    housing_num_prepared, columns=num_pipeline.get_feature_names_out(),\n",
-    "    index=housing_num.index)\n",
+    "    index=housing_num.index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 107,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "df_housing_num_prepared.head(2)  # not in the book"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 108,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1684,7 +1666,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 109,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1693,7 +1675,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1702,7 +1684,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 111,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1711,7 +1693,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 112,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1720,7 +1702,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 113,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1742,7 +1724,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 114,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1756,7 +1738,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 115,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1765,20 +1747,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 116,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
-    "housing_prepared_fr = pd.DataFrame(housing_prepared,\n",
-    "                                columns=preprocessing.get_feature_names_out(),\n",
-    "                                index=housing.index)\n",
+    "# not in the book – this code shows that we can get a DataFrame out if we want\n",
+    "housing_prepared_fr = pd.DataFrame(\n",
+    "    housing_prepared,\n",
+    "    columns=preprocessing.get_feature_names_out(),\n",
+    "    index=housing.index)\n",
     "housing_prepared_fr.head(2)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 117,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1815,7 +1798,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 118,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1825,7 +1808,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 119,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1848,7 +1831,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 120,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1867,7 +1850,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 121,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1884,7 +1867,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 122,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1893,18 +1876,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 123,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code computes the error ratios discussed in the book\n",
     "error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1\n",
     "print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 124,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1917,7 +1900,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 125,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1929,7 +1912,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 126,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1948,7 +1931,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 127,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1960,7 +1943,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 128,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1969,11 +1952,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 129,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code computes the error stats for the linear model\n",
     "lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n",
     "                              scoring=\"neg_root_mean_squared_error\", cv=10)\n",
     "pd.Series(lin_rmses).describe()"
@@ -1988,7 +1971,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 130,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2002,7 +1985,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 133,
+   "execution_count": 131,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2018,7 +2001,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2059,7 +2042,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 133,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2089,11 +2072,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 134,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code shows part of the output of get_params().keys()\n",
     "print(str(full_pipeline.get_params().keys())[:1000] + \"...\")"
    ]
   },
@@ -2106,7 +2089,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 135,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2115,7 +2098,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 136,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2131,14 +2114,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 137,
    "metadata": {},
    "outputs": [],
    "source": [
     "cv_res = pd.DataFrame(grid_search.cv_results_)\n",
     "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
     "\n",
-    "# not in the book\n",
+    "# not in the book – these few lines of code just make the DataFrame look nicer\n",
     "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
     "                 \"param_random_forest__max_features\", \"split0_test_score\",\n",
     "                 \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n",
@@ -2165,7 +2148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 140,
+   "execution_count": 138,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2182,7 +2165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 141,
+   "execution_count": 139,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2201,11 +2184,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 142,
+   "execution_count": 140,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – this code displays the random search results\n",
     "cv_res = pd.DataFrame(rnd_search.cv_results_)\n",
     "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
     "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
@@ -2238,13 +2221,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 143,
+   "execution_count": 141,
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – plots a few distributions you can use in randomized search\n",
+    "\n",
     "from scipy.stats import randint, uniform, geom, expon\n",
     "\n",
     "xs1 = np.arange(0, 7 + 1)\n",
@@ -2299,13 +2283,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
+   "execution_count": 142,
    "metadata": {
     "tags": []
    },
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – shows the difference between expon and reciprocal\n",
+    "\n",
     "from scipy.stats import reciprocal\n",
     "\n",
     "xs1 = np.linspace(0, 7, 500)\n",
@@ -2362,7 +2347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
+   "execution_count": 143,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2373,7 +2358,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": 144,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2391,7 +2376,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 147,
+   "execution_count": 145,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2413,7 +2398,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 148,
+   "execution_count": 146,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2435,11 +2420,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 149,
+   "execution_count": 147,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – shows how to compute a confidence interval for the RMSE\n",
     "m = len(squared_errors)\n",
     "mean = squared_errors.mean()\n",
     "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n",
@@ -2456,11 +2441,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 150,
+   "execution_count": 148,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Not in the book\n",
+    "# not in the book – computes a confidence interval again using a z-score\n",
     "zscore = stats.norm.ppf((1 + confidence) / 2)\n",
     "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
     "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)"
@@ -2482,7 +2467,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
+   "execution_count": 149,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2495,18 +2480,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now you can deploy this model to production, load it in your scripts and use is to make predictions:"
+    "Now you can deploy this model to production. For example, the following code could be a script that would run in production:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 152,
+   "execution_count": 150,
    "metadata": {},
    "outputs": [],
    "source": [
     "import joblib\n",
     "\n",
-    "# not in the book\n",
+    "# not in the book – excluded for conciseness\n",
     "from sklearn.cluster import KMeans\n",
     "from sklearn.base import BaseEstimator, TransformerMixin\n",
     "from sklearn.metrics.pairwise import rbf_kernel\n",
@@ -2525,7 +2510,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
+   "execution_count": 151,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2562,7 +2547,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 154,
+   "execution_count": 152,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2592,7 +2577,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 155,
+   "execution_count": 153,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2609,7 +2594,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 156,
+   "execution_count": 154,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2646,7 +2631,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 157,
+   "execution_count": 155,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2680,7 +2665,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 158,
+   "execution_count": 156,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2697,7 +2682,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 159,
+   "execution_count": 157,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2720,7 +2705,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 160,
+   "execution_count": 158,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2760,7 +2745,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 161,
+   "execution_count": 159,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2778,7 +2763,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 162,
+   "execution_count": 160,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2820,7 +2805,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 163,
+   "execution_count": 161,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2865,7 +2850,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 164,
+   "execution_count": 162,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2883,7 +2868,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
+   "execution_count": 163,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2902,7 +2887,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 166,
+   "execution_count": 164,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2918,7 +2903,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 167,
+   "execution_count": 165,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2934,7 +2919,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 168,
+   "execution_count": 166,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2948,7 +2933,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 169,
+   "execution_count": 167,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2983,7 +2968,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
+   "execution_count": 168,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3005,7 +2990,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 171,
+   "execution_count": 169,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3036,7 +3021,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 172,
+   "execution_count": 170,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3097,7 +3082,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 173,
+   "execution_count": 171,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3122,7 +3107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 174,
+   "execution_count": 172,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3144,7 +3129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 175,
+   "execution_count": 173,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3163,7 +3148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 176,
+   "execution_count": 174,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3181,7 +3166,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 177,
+   "execution_count": 175,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3198,7 +3183,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 178,
+   "execution_count": 176,
    "metadata": {},
    "outputs": [],
    "source": [