From d192f3de7a07e2e83185e6e2f6ef7636ed7ceb01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Sun, 30 Apr 2017 17:32:46 +0200
Subject: [PATCH] Add exercise solutions

---
 02_end_to_end_machine_learning_project.ipynb | 666 ++++++++++++++++++-
 1 file changed, 662 insertions(+), 4 deletions(-)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index 8bf4fa7..d08c7fc 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -1523,19 +1523,677 @@
     "editable": true
    },
    "source": [
-    "**Coming soon**"
+    "## 1."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Question: Try a Support Vector Machine regressor (`sklearn.svm.SVR`), with various hyperparameters such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best `SVR` predictor perform?"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 81,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV\n",
+    "\n",
+    "param_grid = [\n",
+    "        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},\n",
+    "        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],\n",
+    "         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},\n",
+    "    ]\n",
+    "\n",
+    "svm_reg = SVR()\n",
+    "grid_search = GridSearchCV(svm_reg,param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
+    "grid_search.fit(housing_prepared, housing_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "The best model achieves the following score (evaluated using 5-fold cross validation):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "negative_mse = grid_search.best_score_\n",
+    "rmse = np.sqrt(-negative_mse)\n",
+    "rmse"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "That's much worse than the `RandomForestRegressor`. Let's check the best hyperparameters found:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "grid_search.best_params_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "The linear kernel seems better than the RBF kernel. Notice that the value of `C` is the maximum tested value. When this happens, you definitely want to launch the grid search again with higher values for `C` (removing the smallest values), because it is likely that higher values of `C` will be better."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "## 2."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Question: Try replacing `GridSearchCV` with `RandomizedSearchCV`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from scipy.stats import expon\n",
+    "\n",
+    "# see https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html\n",
+    "# for `expon()` documentation and more probability distribution functions.\n",
+    "\n",
+    "# Note: gamma is ignored when kernel is \"linear\"\n",
+    "param_distribs = {\n",
+    "        'kernel': ['linear', 'rbf'],\n",
+    "        'C': np.exp(9 * rnd.rand(1000) + 3), # from exp(3) to exp(12) (i.e., ~20.1 to ~162,755)\n",
+    "        'gamma': expon(scale=1.0),\n",
+    "    }\n",
+    "\n",
+    "svm_reg = SVR()\n",
+    "rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,\n",
+    "                                n_iter=50, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
+    "rnd_search.fit(housing_prepared, housing_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "The best model achieves the following score (evaluated using 5-fold cross validation):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "negative_mse = rnd_search.best_score_\n",
+    "rmse = np.sqrt(-negative_mse)\n",
+    "rmse"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Now this is much closer to the performance of the `RandomForestRegressor` (but not quite there yet). Let's check the best hyperparameters found:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "rnd_search.best_params_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "This time the search found a good set of hyperparameters for the RBF kernel. Randomized search tends to find better hyperparameters than grid search in the same amount of time."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Let's look at the exponential distribution we used, with `scale=1.0`. Note that some samples are much larger or smaller than 1.0, but when you look at the log of the distribution, you can see that most values are actually concentrated roughly in the range of exp(-2) to exp(+2), which is about 0.1 to 7.4."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "expon_distrib = expon(scale=1.)\n",
+    "samples = expon_distrib.rvs(10000)\n",
+    "plt.figure(figsize=(10, 4))\n",
+    "plt.subplot(121)\n",
+    "plt.title(\"Exponential distribution (scale=1.0)\")\n",
+    "plt.hist(samples, bins=50)\n",
+    "plt.subplot(122)\n",
+    "plt.title(\"Log of this distribution\")\n",
+    "plt.hist(np.log(samples), bins=50)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
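+    "The distribution we used for `C` looks quite different: the scale (i.e., the exponent) of each sample is picked from a uniform distribution within a given range, which is why the right graph, which shows the histogram of the log of the samples, looks roughly flat. Note that the plot below draws exponents from [2, 11] rather than the [3, 12] range used for `C` in the search, but the shape of the distribution is the same. This kind of distribution is useful when you have no idea what the target scale is:"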
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "samples = np.exp(9 * rnd.rand(10000) + 2)\n",
+    "plt.figure(figsize=(10, 4))\n",
+    "plt.subplot(121)\n",
+    "plt.title(\"Home made distribution\\nexp(x) with x uniformly sampled in [2,11]\")\n",
+    "plt.hist(samples, bins=50)\n",
+    "plt.subplot(122)\n",
+    "plt.title(\"Log of this distribution\")\n",
+    "plt.hist(np.log(samples), bins=50)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "## 3."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Question: Try adding a transformer in the preparation pipeline to select only the most important attributes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
    "metadata": {
     "collapsed": true,
     "deletable": true,
     "editable": true
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "from sklearn.base import BaseEstimator, TransformerMixin\n",
+    "\n",
+    "def indices_of_top_k(arr, k):\n",
+    "    return np.sort(np.argpartition(np.array(arr), -k)[-k:])\n",
+    "\n",
+    "class TopFeatureSelector(BaseEstimator, TransformerMixin):\n",
+    "    def __init__(self, feature_importances, k):\n",
+    "        self.feature_importances = feature_importances\n",
+    "        self.k = k\n",
+    "    def fit(self, X, y=None):\n",
+    "        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)\n",
+    "        return self\n",
+    "    def transform(self, X):\n",
+    "        return X[:, self.feature_indices_]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Note: this feature selector assumes that you have already computed the feature importances somehow (for example, using a `RandomForestRegressor`). You may be tempted to compute them directly in the `TopFeatureSelector`'s `fit()` method; however, this would likely slow down grid/randomized search, since the feature importances would have to be computed for every hyperparameter combination (unless you implement some sort of cache)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Let's define the number of top features we want to keep:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "k = 5"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Now let's look for the indices of the top k features:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "top_k_feature_indices = indices_of_top_k(feature_importances, k)\n",
+    "top_k_feature_indices"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "np.array(attributes)[top_k_feature_indices]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Let's double check that these are indeed the top k features:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "sorted(zip(feature_importances, attributes), reverse=True)[:k]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Looking good... Now let's create a new pipeline that runs the previously defined preparation pipeline, and adds top k feature selection:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "preparation_and_feature_selection_pipeline = Pipeline([\n",
+    "    ('preparation', preparation_pipeline),\n",
+    "    ('feature_selection', TopFeatureSelector(feature_importances, k))\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Let's look at the features of the first 3 instances:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "housing_prepared_top_k_features[0:3]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Now let's double check that these are indeed the top k features:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "housing_prepared[0:3, top_k_feature_indices]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Works great!  :)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "## 4."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Question: Try creating a single pipeline that does the full data preparation plus the final prediction."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "prepare_select_and_predict_pipeline = Pipeline([\n",
+    "    ('preparation', preparation_pipeline),\n",
+    "    ('feature_selection', TopFeatureSelector(feature_importances, k)),\n",
+    "    ('svm_reg', SVR(**rnd_search.best_params_))\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "prepare_select_and_predict_pipeline.fit(housing, housing_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Let's try the full pipeline on a few instances:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "some_data = housing.iloc[:4]\n",
+    "some_labels = housing_labels.iloc[:4]\n",
+    "\n",
+    "print(\"Predictions:\\t\", prepare_select_and_predict_pipeline.predict(some_data))\n",
+    "print(\"Labels:\\t\\t\", list(some_labels))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Well, the full pipeline seems to work fine. Of course, the predictions are not fantastic: they would be better if we used the best `RandomForestRegressor` that we found earlier, rather than the best `SVR`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "## 5."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Question: Automatically explore some preparation options using `GridSearchCV`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "param_grid = [\n",
+    "        {'preparation__num_pipeline__imputer__strategy': ['mean', 'median', 'most_frequent'],\n",
+    "         'feature_selection__k': [3, 4, 5, 6, 7]}\n",
+    "]\n",
+    "\n",
+    "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n",
+    "                                scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n",
+    "grid_search_prep.fit(housing, housing_labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "grid_search_prep.best_params_"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Great! It seems that we had the right imputer strategy (mean), and apparently only the top 7 features are useful (out of 9); the last 2 seem to just add some noise."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "housing.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Congratulations! You already know quite a lot about Machine Learning. :)"
+   ]
   }
  ],
  "metadata": {
@@ -1554,7 +2212,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.2+"
+   "version": "3.5.3"
   },
   "nav_menu": {
    "height": "279px",