From b59d2016d6ef43a445345f3c7e49c312552077a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Fri, 15 Sep 2017 14:40:13 +0200
Subject: [PATCH] Provide workaround and explanations about the breakage of
 LabelBinarizer by Scikit-Learn 0.19.0

---
 02_end_to_end_machine_learning_project.ipynb | 238 +++++++++++--------
 1 file changed, 145 insertions(+), 93 deletions(-)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index d6c133e..cfd004b 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -35,7 +35,9 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# To support both python 2 and python 3\n",
@@ -78,7 +80,9 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import os\n",
@@ -102,7 +106,9 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "fetch_housing_data()"
@@ -188,7 +194,9 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -215,7 +223,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import hashlib\n",
@@ -245,7 +255,9 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "housing_with_id = housing.reset_index()   # adds an `index` column\n",
@@ -255,7 +267,9 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
@@ -274,7 +288,9 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.model_selection import train_test_split\n",
@@ -303,7 +319,9 @@
   {
    "cell_type": "code",
    "execution_count": 21,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# Divide by 1.5 to limit the number of income categories\n",
@@ -333,7 +351,9 @@
   {
    "cell_type": "code",
    "execution_count": 24,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.model_selection import StratifiedShuffleSplit\n",
@@ -356,7 +376,9 @@
   {
    "cell_type": "code",
    "execution_count": 26,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def income_cat_proportions(data):\n",
@@ -385,7 +407,9 @@
   {
    "cell_type": "code",
    "execution_count": 28,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "for set_ in (strat_train_set, strat_test_set):\n",
@@ -505,7 +529,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from pandas.tools.plotting import scatter_matrix\n",
+    "# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas\n",
+    "from pandas.plotting import scatter_matrix\n",
     "\n",
     "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
     "              \"housing_median_age\"]\n",
@@ -637,7 +662,9 @@
   {
    "cell_type": "code",
    "execution_count": 47,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.preprocessing import Imputer\n",
@@ -655,7 +682,9 @@
   {
    "cell_type": "code",
    "execution_count": 48,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "housing_num = housing.drop(\"ocean_proximity\", axis=1)"
@@ -716,7 +745,9 @@
   {
    "cell_type": "code",
    "execution_count": 53,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
@@ -812,7 +843,9 @@
   {
    "cell_type": "code",
    "execution_count": 62,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.base import BaseEstimator, TransformerMixin\n",
@@ -852,7 +885,9 @@
   {
    "cell_type": "code",
    "execution_count": 64,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.pipeline import Pipeline\n",
@@ -897,6 +932,13 @@
     "        return X[self.attribute_names].values"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Important note**: the `LabelEncoder` and `LabelBinarizer` classes were designed for preprocessing labels, not input features, so their `fit()` and `fit_transform()` methods accept only one parameter `y` instead of the two parameters `X` and `y`. The proper way to convert categorical input features to one-hot vectors would be to use the `OneHotEncoder` class, but unfortunately it only works with integer categories, not string categories (people are working on it, see [Pull Request 7327](https://github.com/scikit-learn/scikit-learn/pull/7327)). In the meantime, one workaround was to use the `LabelBinarizer` class, as shown in the book. However, since Scikit-Learn 0.19.0, pipelines expect each estimator to have a `fit()` or `fit_transform()` method that accepts two parameters `X` and `y`, so the code shown in the book won't work if you are using Scikit-Learn 0.19.0 (and possibly later versions as well). A temporary workaround (until PR 7327 is finished and you can use a `OneHotEncoder`) is to create a small wrapper class around the `LabelBinarizer` class that makes its `fit_transform()` method accept both `X` and `y`, like this:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 67,
@@ -904,6 +946,19 @@
     "collapsed": true
    },
    "outputs": [],
+   "source": [
+    "class PipelineFriendlyLabelBinarizer(LabelBinarizer):\n",
+    "    def fit_transform(self, X, y=None):\n",
+    "        return super(PipelineFriendlyLabelBinarizer, self).fit_transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
    "source": [
     "num_attribs = list(housing_num)\n",
     "cat_attribs = [\"ocean_proximity\"]\n",
@@ -917,14 +972,16 @@
     "\n",
     "cat_pipeline = Pipeline([\n",
     "        ('selector', DataFrameSelector(cat_attribs)),\n",
-    "        ('label_binarizer', LabelBinarizer()),\n",
+    "        ('label_binarizer', PipelineFriendlyLabelBinarizer()),\n",
     "    ])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
-   "metadata": {},
+   "execution_count": 69,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.pipeline import FeatureUnion\n",
@@ -937,7 +994,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 70,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -947,7 +1004,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 71,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -963,7 +1020,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 72,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -975,7 +1032,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 73,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -996,7 +1053,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 74,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1005,7 +1062,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 75,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1014,7 +1071,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 76,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1028,7 +1085,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 77,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1040,7 +1097,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 78,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1052,7 +1109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 79,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1071,8 +1128,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
+   "execution_count": 80,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.model_selection import cross_val_score\n",
@@ -1084,7 +1143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 81,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1098,7 +1157,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 82,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1110,7 +1169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": 83,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1122,7 +1181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 84,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1134,7 +1193,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 85,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1148,7 +1207,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 86,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1158,7 +1217,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": 87,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1174,7 +1233,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 88,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1203,7 +1262,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": 89,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1212,7 +1271,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1228,7 +1287,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": 91,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1239,7 +1298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1248,7 +1307,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 93,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1268,7 +1327,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 93,
+   "execution_count": 94,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1279,7 +1338,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 94,
+   "execution_count": 95,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1289,7 +1348,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 95,
+   "execution_count": 96,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1301,7 +1360,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": 97,
    "metadata": {
     "collapsed": true
    },
@@ -1321,7 +1380,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 97,
+   "execution_count": 98,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1339,26 +1398,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Label Binarizer hack\n",
-    "`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n",
-    "\n",
-    "This hack creates a supervision-friendly `LabelBinarizer`."
+    "## A full pipeline with both preparation and prediction"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 98,
+   "execution_count": 99,
    "metadata": {},
    "outputs": [],
    "source": [
-    "class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
-    "    def fit_transform(self, X, y=None):\n",
-    "        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
-    "\n",
-    "# Replace the Labelbinarizer with a SupervisionFriendlyLabelBinarizer\n",
-    "cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
-    "\n",
-    "# Now you can create a full pipeline with a supervised predictor at the end.\n",
     "full_pipeline_with_predictor = Pipeline([\n",
     "        (\"preparation\", full_pipeline),\n",
     "        (\"linear\", LinearRegression())\n",
@@ -1377,7 +1425,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 99,
+   "execution_count": 100,
    "metadata": {
     "collapsed": true
    },
@@ -1388,7 +1436,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 101,
    "metadata": {
     "collapsed": true
    },
@@ -1409,7 +1457,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 102,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1447,7 +1495,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 103,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1473,7 +1521,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 104,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1491,7 +1539,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 105,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1521,7 +1569,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 106,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1554,7 +1602,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 107,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1572,7 +1620,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 108,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1595,7 +1643,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 109,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1620,7 +1668,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1659,7 +1707,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 111,
    "metadata": {
     "collapsed": true
    },
@@ -1697,7 +1745,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 112,
    "metadata": {
     "collapsed": true
    },
@@ -1715,7 +1763,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 113,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1725,7 +1773,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 114,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1741,7 +1789,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 115,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1757,8 +1805,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
-   "metadata": {},
+   "execution_count": 116,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "preparation_and_feature_selection_pipeline = Pipeline([\n",
@@ -1769,7 +1819,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 117,
    "metadata": {
     "collapsed": true
    },
@@ -1787,7 +1837,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 118,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1803,7 +1853,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 119,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1833,8 +1883,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
-   "metadata": {},
+   "execution_count": 120,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "prepare_select_and_predict_pipeline = Pipeline([\n",
@@ -1846,7 +1898,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 121,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1862,7 +1914,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 122,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1896,7 +1948,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 123,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1912,7 +1964,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 124,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1928,7 +1980,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 125,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1959,7 +2011,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.3"
+   "version": "3.5.2"
   },
   "nav_menu": {
    "height": "279px",