From c8b7f045eeb0d755e5020105f93c275923318add Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Sun, 14 Jan 2018 09:11:47 +0100
Subject: [PATCH] Fix hyperparameter search and comment at the end of the
 solution of exercise 5, chapter 2

---
 02_end_to_end_machine_learning_project.ipynb | 103 +++++--------------
 1 file changed, 25 insertions(+), 78 deletions(-)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index 1cadabb..1e51f9a 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -406,9 +406,7 @@
   {
    "cell_type": "code",
    "execution_count": 30,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "housing = strat_train_set.copy()"
@@ -486,9 +484,7 @@
   {
    "cell_type": "code",
    "execution_count": 35,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "corr_matrix = housing.corr()"
@@ -533,9 +529,7 @@
   {
    "cell_type": "code",
    "execution_count": 39,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n",
@@ -591,9 +585,7 @@
   {
    "cell_type": "code",
    "execution_count": 43,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n",
@@ -642,9 +634,7 @@
   {
    "cell_type": "code",
    "execution_count": 48,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.preprocessing import Imputer\n",
@@ -662,9 +652,7 @@
   {
    "cell_type": "code",
    "execution_count": 49,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "housing_num = housing.drop('ocean_proximity', axis=1)\n",
@@ -715,9 +703,7 @@
   {
    "cell_type": "code",
    "execution_count": 53,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "X = imputer.transform(housing_num)"
@@ -726,9 +712,7 @@
   {
    "cell_type": "code",
    "execution_count": 54,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
@@ -859,9 +843,7 @@
   {
    "cell_type": "code",
    "execution_count": 63,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Definition of the CategoricalEncoder class, copied from PR #9151.\n",
@@ -1126,9 +1108,7 @@
   {
    "cell_type": "code",
    "execution_count": 68,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -1175,9 +1155,7 @@
   {
    "cell_type": "code",
    "execution_count": 70,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.pipeline import Pipeline\n",
@@ -1211,9 +1189,7 @@
   {
    "cell_type": "code",
    "execution_count": 72,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -1261,9 +1237,7 @@
   {
    "cell_type": "code",
    "execution_count": 74,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.pipeline import FeatureUnion\n",
@@ -1411,9 +1385,7 @@
   {
    "cell_type": "code",
    "execution_count": 85,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.model_selection import cross_val_score\n",
@@ -1644,9 +1616,7 @@
   {
    "cell_type": "code",
    "execution_count": 102,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "final_model = grid_search.best_estimator_\n",
@@ -1709,9 +1679,7 @@
   {
    "cell_type": "code",
    "execution_count": 105,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "my_model = full_pipeline_with_predictor"
@@ -1720,9 +1688,7 @@
   {
    "cell_type": "code",
    "execution_count": 106,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.externals import joblib\n",
@@ -1991,9 +1957,7 @@
   {
    "cell_type": "code",
    "execution_count": 116,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -2029,9 +1993,7 @@
   {
    "cell_type": "code",
    "execution_count": 117,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "k = 5"
@@ -2089,9 +2051,7 @@
   {
    "cell_type": "code",
    "execution_count": 121,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "preparation_and_feature_selection_pipeline = Pipeline([\n",
@@ -2103,9 +2063,7 @@
   {
    "cell_type": "code",
    "execution_count": 122,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)"
@@ -2167,9 +2125,7 @@
   {
    "cell_type": "code",
    "execution_count": 125,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "prepare_select_and_predict_pipeline = Pipeline([\n",
@@ -2237,7 +2193,7 @@
    "source": [
     "param_grid = [\n",
     "        {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
-    "         'feature_selection__k': [3, 4, 5, 6, 7]}\n",
+    "         'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n",
     "]\n",
     "\n",
     "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n",
@@ -2258,16 +2214,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Great! It seems that we had the right imputer strategy (median), and apparently only the top 7 features are useful (out of 9), the last 2 seem to just add some noise."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 130,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "housing.shape"
+    "The best imputer strategy is `most_frequent`, and apparently almost all features are useful (15 out of 16). Only the lowest-ranked feature (`ISLAND`) seems to just add some noise."
    ]
   },
   {
@@ -2294,7 +2241,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.2"
+   "version": "3.6.3"
   },
   "nav_menu": {
    "height": "279px",