From c46123155daae9c64420be791e523be6a249a999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sun, 21 Nov 2021 16:40:36 +1300 Subject: [PATCH] Clarify the 'not in the book' comments --- 02_end_to_end_machine_learning_project.ipynb | 433 +++++++++---------- 1 file changed, 209 insertions(+), 224 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 29de8c3..5dde5da 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -177,7 +177,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – code to make figures prettier\n", "import matplotlib as mpl\n", "\n", "mpl.rc('font', size=12)\n", @@ -198,7 +198,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – code to save the figures as high-res PNGs for the book\n", "\n", "IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n", "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", @@ -365,7 +365,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – shows how to compute the 10.7% proba of getting a bad sample\n", "\n", "from scipy.stats import binom\n", "\n", @@ -389,7 +389,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – shows another way to estimate the probability of bad sample\n", "\n", "np.random.seed(42)\n", "\n", @@ -478,16 +478,7 @@ "metadata": {}, "outputs": [], "source": [ - "housing[\"income_cat\"].value_counts() / len(housing) # not in the book" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "# Not in the book\n", + "# not in the book – this code computes the data for Figure 2–10\n", "\n", "def income_cat_proportions(data):\n", " return data[\"income_cat\"].value_counts() / len(data)\n", @@ -503,21 +494,13 @@ "compare_props[\"Strat. Error %\"] = (compare_props[\"Stratified %\"] /\n", " compare_props[\"Overall %\"] - 1)\n", "compare_props[\"Rand. Error %\"] = (compare_props[\"Random %\"] /\n", - " compare_props[\"Overall %\"] - 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ + " compare_props[\"Overall %\"] - 1)\n", "(compare_props * 100).round(2)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -534,7 +517,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -550,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -561,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -572,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -595,16 +578,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next couple of cells generate the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid." + "The next cell generates the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid." ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code generates the first figure in the chapter\n", "\n", "# Download the California image\n", "filename = \"california.png\"\n", @@ -612,16 +595,7 @@ " root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n", " url = root + \"images/end_to_end_project/\" + filename\n", " print(\"Downloading\", filename)\n", - " urllib.request.urlretrieve(url, IMAGES_PATH / filename)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "# not in the book\n", + " urllib.request.urlretrieve(url, IMAGES_PATH / filename)\n", "\n", "housing_renamed = housing.rename(columns={\n", " \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n", @@ -651,7 +625,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -660,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -669,7 +643,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -684,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -703,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -714,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -738,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -773,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -783,7 +757,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -796,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -809,7 +783,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -823,7 +797,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -841,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -850,7 +824,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -859,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -875,7 +849,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -891,7 +865,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -900,13 +874,41 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "imputer.feature_names_in_" ] }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", + " index=housing_num.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "housing_tr.loc[null_rows_idx].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "imputer.strategy" + ] + }, { "cell_type": "code", "execution_count": 57, @@ -922,41 +924,13 @@ "execution_count": 58, "metadata": {}, "outputs": [], - "source": [ - "housing_tr.loc[null_rows_idx].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "imputer.strategy" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", - " index=housing_num.index)" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], "source": [ "housing_tr.loc[null_rows_idx].head() # not shown in the book" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -974,7 +948,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -986,7 +960,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -995,7 +969,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1019,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1029,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1041,7 +1015,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1050,7 +1024,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1059,7 +1033,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1071,7 +1045,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1087,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1103,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1114,7 +1088,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1123,7 +1097,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1133,7 +1107,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1142,7 +1116,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1152,7 +1126,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1162,7 +1136,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1171,7 +1145,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1180,7 +1154,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1191,7 +1165,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1207,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1219,7 +1193,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1231,11 +1205,11 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code generates Figure 2–17\n", "fig, axs = plt.subplots(1, 2, figsize=(8,3), sharey=True)\n", "housing[\"population\"].hist(ax=axs[0], bins=50)\n", "housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n", @@ -1255,11 +1229,11 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code just shows that we get a uniform distribution\n", "percentiles = [np.percentile(housing[\"median_income\"], p)\n", " for p in range(1, 100)]\n", "flattened_median_income = pd.cut(housing[\"median_income\"],\n", @@ -1276,7 +1250,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1287,11 +1261,11 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code generates Figure 2–18\n", "\n", "ages = np.linspace(housing[\"housing_median_age\"].min(),\n", " housing[\"housing_median_age\"].max(),\n", @@ -1321,7 +1295,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1340,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1349,7 +1323,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1363,7 +1337,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1386,7 +1360,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1398,7 +1372,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1409,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1418,7 +1392,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1430,7 +1404,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1439,7 +1413,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1449,7 +1423,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1478,7 +1452,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1504,7 +1478,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1515,7 +1489,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1524,11 +1498,11 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code generates Figure 2–19\n", "\n", "housing_renamed = housing.rename(columns={\n", " \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n", @@ -1566,7 +1540,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1580,7 +1554,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1591,7 +1565,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1604,7 +1578,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1614,7 +1588,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1663,19 +1637,27 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "df_housing_num_prepared = pd.DataFrame(\n", " housing_num_prepared, columns=num_pipeline.get_feature_names_out(),\n", - " index=housing_num.index)\n", + " index=housing_num.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ "df_housing_num_prepared.head(2) # not in the book" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1684,7 +1666,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1693,7 +1675,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1702,7 +1684,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1711,7 +1693,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -1720,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1742,7 +1724,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1756,7 +1738,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1765,20 +1747,21 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", - "housing_prepared_fr = pd.DataFrame(housing_prepared,\n", - " columns=preprocessing.get_feature_names_out(),\n", - " index=housing.index)\n", + "# not in the book – this code shows that we can get a DataFrame out if we want\n", + "housing_prepared_fr = pd.DataFrame(\n", + " housing_prepared,\n", + " columns=preprocessing.get_feature_names_out(),\n", + " index=housing.index)\n", "housing_prepared_fr.head(2)" ] }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -1815,7 +1798,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -1825,7 +1808,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -1848,7 +1831,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -1867,7 +1850,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -1884,7 +1867,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -1893,18 +1876,18 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code computes the error ratios discussed in the book\n", "error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1\n", "print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -1917,7 +1900,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -1929,7 +1912,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -1948,7 +1931,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -1960,7 +1943,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -1969,11 +1952,11 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code computes the error stats for the linear model\n", "lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n", " scoring=\"neg_root_mean_squared_error\", cv=10)\n", "pd.Series(lin_rmses).describe()" @@ -1988,7 +1971,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -2002,7 +1985,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -2018,7 +2001,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -2059,7 +2042,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -2089,11 +2072,11 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code shows part of the output of get_params().keys()\n", "print(str(full_pipeline.get_params().keys())[:1000] + \"...\")" ] }, @@ -2106,7 +2089,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -2115,7 +2098,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -2131,14 +2114,14 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "cv_res = pd.DataFrame(grid_search.cv_results_)\n", "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n", "\n", - "# not in the book\n", + "# not in the book – these few lines of code just make the DataFrame look nicer\n", "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n", " \"param_random_forest__max_features\", \"split0_test_score\",\n", " \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n", @@ -2165,7 +2148,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2182,7 +2165,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2201,11 +2184,11 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 140, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – this code displays the random search results\n", "cv_res = pd.DataFrame(rnd_search.cv_results_)\n", "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n", "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n", @@ -2238,13 +2221,14 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 141, "metadata": { "tags": [] }, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – plots a few distributions you can use in randomized search\n", + "\n", "from scipy.stats import randint, uniform, geom, expon\n", "\n", "xs1 = np.arange(0, 7 + 1)\n", @@ -2299,13 +2283,14 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 142, "metadata": { "tags": [] }, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – shows the difference between expon and reciprocal\n", + "\n", "from scipy.stats import reciprocal\n", "\n", "xs1 = np.linspace(0, 7, 500)\n", @@ -2362,7 +2347,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -2373,7 +2358,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -2391,7 +2376,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -2413,7 +2398,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -2435,11 +2420,11 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – shows how to compute a confidence interval for the RMSE\n", "m = len(squared_errors)\n", "mean = squared_errors.mean()\n", "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n", @@ -2456,11 +2441,11 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ - "# Not in the book\n", + "# not in the book – computes a confidence interval again using z-score\n", "zscore = stats.norm.ppf((1 + confidence) / 2)\n", "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n", "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)" @@ -2482,7 +2467,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ @@ -2495,18 +2480,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now you can deploy this model to production, load it in your scripts and use is to make predictions:" + "Now you can deploy this model to production. For example, the following code could be a script that would run in production:" ] }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "import joblib\n", "\n", - "# not in the book\n", + "# not in the book – excluded for conciseness\n", "from sklearn.cluster import KMeans\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.metrics.pairwise import rbf_kernel\n", @@ -2525,7 +2510,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 151, "metadata": {}, "outputs": [], "source": [ @@ -2562,7 +2547,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ @@ -2592,7 +2577,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ @@ -2609,7 +2594,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ @@ -2646,7 +2631,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -2680,7 +2665,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -2697,7 +2682,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 157, "metadata": {}, "outputs": [], "source": [ @@ -2720,7 +2705,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ @@ -2760,7 +2745,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 159, "metadata": {}, "outputs": [], "source": [ @@ -2778,7 +2763,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 160, "metadata": {}, "outputs": [], "source": [ @@ -2820,7 +2805,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ @@ -2865,7 +2850,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ @@ -2883,7 +2868,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ @@ -2902,7 +2887,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ @@ -2918,7 +2903,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 165, "metadata": {}, "outputs": [], "source": [ @@ -2934,7 +2919,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 166, "metadata": {}, "outputs": [], "source": [ @@ -2948,7 +2933,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 167, "metadata": {}, "outputs": [], "source": [ @@ -2983,7 +2968,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 168, "metadata": {}, "outputs": [], "source": [ @@ -3005,7 +2990,7 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ @@ -3036,7 +3021,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 170, "metadata": {}, "outputs": [], "source": [ @@ -3097,7 +3082,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 171, "metadata": {}, "outputs": [], "source": [ @@ -3122,7 +3107,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 172, "metadata": {}, "outputs": [], "source": [ @@ -3144,7 +3129,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 173, "metadata": {}, "outputs": [], "source": [ @@ -3163,7 +3148,7 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 174, "metadata": {}, "outputs": [], "source": [ @@ -3181,7 +3166,7 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 175, "metadata": {}, "outputs": [], "source": [ @@ -3198,7 +3183,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 176, "metadata": {}, "outputs": [], "source": [