diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index dae1e4a..895a484 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -16,7 +16,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "**Note**: You may find little differences between the code outputs in the book and in these Jupyter notebooks: these slight differences are mostly due to the random nature of many training algorithms: although I have tried to make these notebooks' outputs as constant as possible, it is impossible to guarantee that they will produce the exact same output on every platform. Also, some data structures (such as dictionaries) do not preserve the item order. Finally, I fixed a few minor bugs (I added notes next to the concerned cells) which lead to slightly different results, without changing the ideas presented in the book." ] @@ -245,6 +248,7 @@ "source": [ "import numpy as np\n", "\n", + "# For illustration only. Sklearn has train_test_split()\n", "def split_train_test(data, test_ratio):\n", " shuffled_indices = np.random.permutation(len(data))\n", " test_set_size = int(len(data) * test_ratio)\n", @@ -395,7 +399,9 @@ }, "outputs": [], "source": [ + "# Divide by 1.5 to limit the number of income categories\n", "housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n", + "# Label those above 5 as 5\n", "housing[\"income_cat\"].where(housing[\"income_cat\"] < 5, 5.0, inplace=True)" ] }, @@ -416,7 +422,9 @@ "cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -558,7 +566,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The argument `sharex=False` fixes a display bug (the x-axis values and legend were not displayed). This is a temporary fix (see: https://github.com/pandas-dev/pandas/issues/10611). Thanks to Wilmer Arellano for pointing it out." ] @@ -649,10 +660,12 @@ }, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n", - " alpha=0.1)\n", - "plt.axis([0, 16, 0, 550000])\n", - "save_fig(\"income_vs_house_value_scatterplot\")" + "from pandas.tools.plotting import scatter_matrix\n", + "\n", + "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n", + " \"housing_median_age\"]\n", + "scatter_matrix(housing[attributes], figsize=(12, 8))\n", + "save_fig(\"scatter_matrix_plot\")" ] }, { @@ -665,12 +678,10 @@ }, "outputs": [], "source": [ - "from pandas.tools.plotting import scatter_matrix\n", - "\n", - "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n", - " \"housing_median_age\"]\n", - "scatter_matrix(housing[attributes], figsize=(12, 8))\n", - "save_fig(\"scatter_matrix_plot\")" + "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n", + " alpha=0.1)\n", + "plt.axis([0, 16, 0, 550000])\n", + "save_fig(\"income_vs_house_value_scatterplot\")" ] }, { @@ -690,7 +701,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Note: there was a bug in the previous cell, in the definition of the `rooms_per_household` attribute. This explains why the correlation value below differs slightly from the value in the book (unless you are reading the latest version)." ] @@ -758,7 +772,7 @@ }, "outputs": [], "source": [ - "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n", + "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n", "housing_labels = strat_train_set[\"median_house_value\"].copy()" ] }, @@ -772,7 +786,8 @@ }, "outputs": [], "source": [ - "housing.iloc[21:24]" + "sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()\n", + "sample_incomplete_rows" ] }, { @@ -785,8 +800,7 @@ }, "outputs": [], "source": [ - "housing_copy = housing.copy().iloc[21:24]\n", - "housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1" + "sample_incomplete_rows.dropna(subset=[\"total_bedrooms\"]) # option 1" ] }, { @@ -799,8 +813,7 @@ }, "outputs": [], "source": [ - "housing_copy = housing.copy().iloc[21:24]\n", - "housing_copy.drop(\"total_bedrooms\", axis=1) # option 2" + "sample_incomplete_rows.drop(\"total_bedrooms\", axis=1) # option 2" ] }, { @@ -813,10 +826,9 @@ }, "outputs": [], "source": [ - "housing_copy = housing.copy().iloc[21:24]\n", - "median = housing_copy[\"total_bedrooms\"].median()\n", - "housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n", - "housing_copy" + "median = housing[\"total_bedrooms\"].median()\n", + "sample_incomplete_rows[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n", + "sample_incomplete_rows" ] }, { @@ -828,65 +840,22 @@ "editable": true }, "outputs": [], - "source": [ - "housing_copy.drop(\"total_bedrooms\", axis=1) # option 2" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "median = housing_copy[\"total_bedrooms\"].median()\n", - "housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n", - "housing_copy" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "# Summary...\n", - "housing_copy = housing.copy().iloc[21:24]\n", - "housing_copy.dropna(subset=[\"total_bedrooms\"]) # option 1\n", - "\n", - "housing_copy = housing.copy().iloc[21:24]\n", - "housing_copy.drop(\"total_bedrooms\", axis=1) # option 2\n", - "\n", - "housing_copy = housing.copy().iloc[21:24]\n", - "median = housing_copy[\"total_bedrooms\"].median()\n", - "housing_copy[\"total_bedrooms\"].fillna(median, inplace=True) # option 3" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], "source": [ "from sklearn.preprocessing import Imputer\n", "\n", "imputer = Imputer(strategy=\"median\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove the text attribute because median can only be calculated on numerical attributes:" + ] + }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 48, "metadata": { "collapsed": false, "deletable": true, @@ -899,7 +868,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 49, "metadata": { "collapsed": false, "deletable": true, @@ -912,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 50, "metadata": { "collapsed": false, "deletable": true, @@ -923,9 +892,16 @@ "imputer.statistics_" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check that this is the same as manually computing the median of each attribute:" + ] + }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 51, "metadata": { "collapsed": false, "deletable": true, @@ -936,9 +912,16 @@ "housing_num.median().values" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform the training set:" + ] + }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 52, "metadata": { "collapsed": true, "deletable": true, @@ -951,20 +934,7 @@ }, { "cell_type": "code", - "execution_count": 56, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "housing_tr = pd.DataFrame(X, columns=housing_num.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, + "execution_count": 53, "metadata": { "collapsed": false, "deletable": true, @@ -972,12 +942,26 @@ }, "outputs": [], "source": [ - "housing_tr.iloc[21:24]" + "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", + " index = list(housing.index.values))" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 54, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "housing_tr.loc[sample_incomplete_rows.index.values]" + ] + }, + { + "cell_type": "code", + "execution_count": 55, "metadata": { "collapsed": false, "deletable": true, @@ -990,7 +974,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 56, "metadata": { "collapsed": false, "deletable": true, @@ -1004,7 +988,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 57, "metadata": { "collapsed": false, "deletable": true, @@ -1022,7 +1006,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 58, "metadata": { "collapsed": false, "deletable": true, @@ -1035,7 +1019,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 59, "metadata": { "collapsed": false, "deletable": true, @@ -1052,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 60, "metadata": { "collapsed": false, "deletable": true, @@ -1065,7 +1049,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 61, "metadata": { "collapsed": false, "deletable": true, @@ -1082,7 +1066,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 62, "metadata": { "collapsed": false, "deletable": true, @@ -1092,6 +1076,7 @@ "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", + "# column index\n", "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n", "\n", "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n", @@ -1115,7 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 63, "metadata": { "collapsed": false, "deletable": true, @@ -1129,7 +1114,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 64, "metadata": { "collapsed": false, "deletable": true, @@ -1151,7 +1136,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 65, "metadata": { "collapsed": false, "deletable": true, @@ -1164,7 +1149,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 66, "metadata": { "collapsed": true, "deletable": true, @@ -1174,6 +1159,8 @@ "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", + "# Create a class to select numerical or categorical columns \n", + "# since Scikit-Learn doesn't handle DataFrames yet\n", "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", " def __init__(self, attribute_names):\n", " self.attribute_names = attribute_names\n", @@ -1185,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 67, "metadata": { "collapsed": true, "deletable": true, @@ -1211,7 +1198,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 68, "metadata": { "collapsed": false, "deletable": true, @@ -1229,7 +1216,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 69, "metadata": { "collapsed": false, "deletable": true, @@ -1243,7 +1230,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 70, "metadata": { "collapsed": false, "deletable": true, @@ -1266,7 +1253,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 71, "metadata": { "collapsed": false, "deletable": true, @@ -1282,7 +1269,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 72, "metadata": { "collapsed": false, "deletable": true, @@ -1298,9 +1285,16 @@ "print(\"Predictions:\", lin_reg.predict(some_data_prepared))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare against the actual values:" + ] + }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 73, "metadata": { "collapsed": false, "deletable": true, @@ -1313,7 +1307,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 74, "metadata": { "collapsed": false, "deletable": true, @@ -1326,7 +1320,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 75, "metadata": { "collapsed": false, "deletable": true, @@ -1344,7 +1338,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 76, "metadata": { "collapsed": false, "deletable": true, @@ -1360,7 +1354,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 77, "metadata": { "collapsed": false, "deletable": true, @@ -1376,7 +1370,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 78, "metadata": { "collapsed": false, "deletable": true, @@ -1402,7 +1396,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 79, "metadata": { "collapsed": false, "deletable": true, @@ -1419,7 +1413,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 80, "metadata": { "collapsed": false, "deletable": true, @@ -1437,7 +1431,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 81, "metadata": { "collapsed": false, "deletable": true, @@ -1453,7 +1447,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 82, "metadata": { "collapsed": false, "deletable": true, @@ -1469,7 +1463,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 83, "metadata": { "collapsed": false, "deletable": true, @@ -1485,7 +1479,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 84, "metadata": { "collapsed": false, "deletable": true, @@ -1503,7 +1497,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 85, "metadata": { "collapsed": false, "deletable": true, @@ -1517,7 +1511,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 86, "metadata": { "collapsed": false, "deletable": true, @@ -1537,7 +1531,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 87, "metadata": { "collapsed": false, "deletable": true, @@ -1548,19 +1542,29 @@ "from sklearn.model_selection import GridSearchCV\n", "\n", "param_grid = [\n", + " # try 12 (3×4) combinations of hyperparameters\n", " {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},\n", + " # then try 6 (2×3) combinations with bootstrap set as False\n", " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n", " ]\n", "\n", "forest_reg = RandomForestRegressor(random_state=42)\n", + "# train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n", "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", " scoring='neg_mean_squared_error')\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The best hyperparameter combination found:" + ] + }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 88, "metadata": { "collapsed": false, "deletable": true, @@ -1573,7 +1577,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 89, "metadata": { "collapsed": false, "deletable": true, @@ -1584,9 +1588,16 @@ "grid_search.best_estimator_" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the score of each hyperparameter combination tested during the grid search:" + ] + }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 90, "metadata": { "collapsed": false, "deletable": true, @@ -1601,7 +1612,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 91, "metadata": { "collapsed": false, "deletable": true, @@ -1614,7 +1625,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 92, "metadata": { "collapsed": false, "deletable": true, @@ -1638,7 +1649,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 93, "metadata": { "collapsed": false, "deletable": true, @@ -1653,7 +1664,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 94, "metadata": { "collapsed": false, "deletable": true, @@ -1667,7 +1678,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 95, "metadata": { "collapsed": false, "deletable": true, @@ -1683,7 +1694,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 96, "metadata": { "collapsed": true, "deletable": true, @@ -1705,7 +1716,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 97, "metadata": { "collapsed": false, "deletable": true, @@ -1741,7 +1752,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 98, "metadata": { "collapsed": false, "deletable": true, @@ -1778,7 +1789,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 99, "metadata": { "collapsed": true, "deletable": true, @@ -1791,7 +1802,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 100, "metadata": { "collapsed": true, "deletable": true, @@ -1817,7 +1828,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 101, "metadata": { "collapsed": false, "deletable": true, @@ -1867,7 +1878,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 102, "metadata": { "collapsed": false, "deletable": true, @@ -1900,7 +1911,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 103, "metadata": { "collapsed": false, "deletable": true, @@ -1925,7 +1936,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 104, "metadata": { "collapsed": false, "deletable": true, @@ -1968,7 +1979,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 105, "metadata": { "collapsed": false, "deletable": true, @@ -2008,7 +2019,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 106, "metadata": { "collapsed": false, "deletable": true, @@ -2033,7 +2044,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 107, "metadata": { "collapsed": false, "deletable": true, @@ -2066,7 +2077,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 108, "metadata": { "collapsed": false, "deletable": true, @@ -2098,7 +2109,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 109, "metadata": { "collapsed": false, "deletable": true, @@ -2150,7 +2161,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 110, "metadata": { "collapsed": true, "deletable": true, @@ -2196,7 +2207,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 111, "metadata": { "collapsed": true, "deletable": true, @@ -2219,7 +2230,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 112, "metadata": { "collapsed": false, "deletable": true, @@ -2233,7 +2244,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 113, "metadata": { "collapsed": false, "deletable": true, @@ -2256,7 +2267,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 114, "metadata": { "collapsed": false, "deletable": true, @@ -2279,7 +2290,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 115, "metadata": { "collapsed": false, "deletable": true, @@ -2295,7 +2306,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 116, "metadata": { "collapsed": true, "deletable": true, @@ -2318,7 +2329,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 117, "metadata": { "collapsed": false, "deletable": true, @@ -2341,7 +2352,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 118, "metadata": { "collapsed": false, "deletable": true, @@ -2384,7 +2395,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 119, "metadata": { "collapsed": false, "deletable": true, @@ -2401,7 +2412,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 120, "metadata": { "collapsed": false, "deletable": true, @@ -2424,7 +2435,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 121, "metadata": { "collapsed": false, "deletable": true, @@ -2471,7 +2482,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 122, "metadata": { "collapsed": false, "deletable": true, @@ -2491,7 +2502,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 123, "metadata": { "collapsed": false, "deletable": true, @@ -2514,7 +2525,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 124, "metadata": { "collapsed": false, "deletable": true,