Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

2017-09-15 14:40:13 +02:00 · 2017-09-15 14:40:13 +02:00 · b59d2016d6
parent e5505eeef6
commit b59d2016d6
1 changed files with 145 additions and 93 deletions
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@ -35,7 +35,9 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "# To support both python 2 and python 3\n",
@ -78,7 +80,9 @@
  {
   "cell_type": "code",
   "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "import os\n",
@ -102,7 +106,9 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "fetch_housing_data()"
@ -188,7 +194,9 @@
  {
   "cell_type": "code",
   "execution_count": 11,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
@ -215,7 +223,9 @@
  {
   "cell_type": "code",
   "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "import hashlib\n",
@ -245,7 +255,9 @@
  {
   "cell_type": "code",
   "execution_count": 15,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "housing_with_id = housing.reset_index()   # adds an `index` column\n",
@ -255,7 +267,9 @@
  {
   "cell_type": "code",
   "execution_count": 16,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
@ -274,7 +288,9 @@
  {
   "cell_type": "code",
   "execution_count": 18,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
@ -303,7 +319,9 @@
  {
   "cell_type": "code",
   "execution_count": 21,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "# Divide by 1.5 to limit the number of income categories\n",
@ -333,7 +351,9 @@
  {
   "cell_type": "code",
   "execution_count": 24,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedShuffleSplit\n",
@ -356,7 +376,9 @@
  {
   "cell_type": "code",
   "execution_count": 26,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "def income_cat_proportions(data):\n",
@ -385,7 +407,9 @@
  {
   "cell_type": "code",
   "execution_count": 28,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "for set_ in (strat_train_set, strat_test_set):\n",
@ -505,7 +529,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from pandas.tools.plotting import scatter_matrix\n",
+    "# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas\n",
+    "from pandas.plotting import scatter_matrix\n",
    "\n",
    "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
    "              \"housing_median_age\"]\n",
@ -637,7 +662,9 @@
  {
   "cell_type": "code",
   "execution_count": 47,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import Imputer\n",
@ -655,7 +682,9 @@
  {
   "cell_type": "code",
   "execution_count": 48,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "housing_num = housing.drop(\"ocean_proximity\", axis=1)"
@ -716,7 +745,9 @@
  {
   "cell_type": "code",
   "execution_count": 53,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
@ -812,7 +843,9 @@
  {
   "cell_type": "code",
   "execution_count": 62,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -852,7 +885,9 @@
  {
   "cell_type": "code",
   "execution_count": 64,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
@ -897,6 +932,13 @@
    "        return X[self.attribute_names].values"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Important note**: the `LabelEncoder` and `LabelBinarizer` classes were designed for preprocessing labels, not input features, so their `fit()` and `fit_transform()` methods accept only a single parameter `y` instead of the two parameters `X` and `y`. The proper way to convert categorical input features to one-hot vectors would be to use the `OneHotEncoder` class, but unfortunately it does not work with string categories, only with integer categories (people are working on it, see [Pull Request 7327](https://github.com/scikit-learn/scikit-learn/pull/7327)). In the meantime, one workaround was to use the `LabelBinarizer` class, as shown in the book. Unfortunately, since Scikit-Learn 0.19.0, pipelines expect each estimator to have a `fit()` or `fit_transform()` method that takes two parameters `X` and `y`, so the code shown in the book won't work if you are using Scikit-Learn 0.19.0 (and possibly later versions as well). A temporary workaround (until PR 7327 is finished and you can use a `OneHotEncoder`) is to create a small wrapper class around the `LabelBinarizer` class that fixes its `fit_transform()` method, like this:"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 67,
@ -904,6 +946,19 @@
    "collapsed": true
   },
   "outputs": [],
+   "source": [
+    "class PipelineFriendlyLabelBinarizer(LabelBinarizer):\n",
+    "    \"\"\"LabelBinarizer whose fitting methods accept the (X, y) call signature.\n",
+    "\n",
+    "    LabelBinarizer's fitting methods take a single argument (the values to\n",
+    "    binarize), while pipelines call fit() / fit_transform() with both X and y.\n",
+    "    Here X holds the values to binarize; y is accepted and ignored.\n",
+    "    \"\"\"\n",
+    "    def fit(self, X, y=None):\n",
+    "        # Needed when the pipeline is fitted with fit() rather than fit_transform():\n",
+    "        # the final step then receives fit(X, y).\n",
+    "        return super(PipelineFriendlyLabelBinarizer, self).fit(X)\n",
+    "\n",
+    "    def fit_transform(self, X, y=None):\n",
+    "        # Drop y before delegating: the parent method only takes one argument.\n",
+    "        return super(PipelineFriendlyLabelBinarizer, self).fit_transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
   "source": [
    "num_attribs = list(housing_num)\n",
    "cat_attribs = [\"ocean_proximity\"]\n",
@ -917,14 +972,16 @@
    "\n",
    "cat_pipeline = Pipeline([\n",
    "        ('selector', DataFrameSelector(cat_attribs)),\n",
-    "        ('label_binarizer', LabelBinarizer()),\n",
+    "        ('label_binarizer', PipelineFriendlyLabelBinarizer()),\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 68,
-   "metadata": {},
+   "execution_count": 69,
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.pipeline import FeatureUnion\n",
@ -937,7 +994,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
@ -947,7 +1004,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
@ -963,7 +1020,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
@ -975,7 +1032,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
@ -996,7 +1053,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1005,7 +1062,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1014,7 +1071,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1028,7 +1085,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1040,7 +1097,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1052,7 +1109,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1071,8 +1128,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
+   "execution_count": 80,
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
@ -1084,7 +1143,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1098,7 +1157,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1110,7 +1169,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1122,7 +1181,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1134,7 +1193,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1148,7 +1207,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1158,7 +1217,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1174,7 +1233,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1203,7 +1262,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1212,7 +1271,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 89,
+   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1228,7 +1287,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 90,
+   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1239,7 +1298,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1248,7 +1307,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1268,7 +1327,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 93,
+   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1279,7 +1338,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 94,
+   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1289,7 +1348,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 95,
+   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1301,7 +1360,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": 97,
   "metadata": {
    "collapsed": true
   },
@ -1321,7 +1380,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 97,
+   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1339,26 +1398,15 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Label Binarizer hack\n",
-    "`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n",
-    "\n",
-    "This hack creates a supervision-friendly `LabelBinarizer`."
+    "## A full pipeline with both preparation and prediction"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 98,
+   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
-    "class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
-    "    def fit_transform(self, X, y=None):\n",
-    "        return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
-    "\n",
-    "# Replace the Labelbinarizer with a SupervisionFriendlyLabelBinarizer\n",
-    "cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
-    "\n",
-    "# Now you can create a full pipeline with a supervised predictor at the end.\n",
    "full_pipeline_with_predictor = Pipeline([\n",
    "        (\"preparation\", full_pipeline),\n",
    "        (\"linear\", LinearRegression())\n",
@ -1377,7 +1425,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 99,
+   "execution_count": 100,
   "metadata": {
    "collapsed": true
   },
@ -1388,7 +1436,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
@ -1409,7 +1457,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1447,7 +1495,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1473,7 +1521,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1491,7 +1539,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1521,7 +1569,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1554,7 +1602,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1572,7 +1620,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1595,7 +1643,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1620,7 +1668,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1659,7 +1707,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 111,
   "metadata": {
    "collapsed": true
   },
@ -1697,7 +1745,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 112,
   "metadata": {
    "collapsed": true
   },
@ -1715,7 +1763,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1725,7 +1773,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1741,7 +1789,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1757,8 +1805,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 115,
-   "metadata": {},
+   "execution_count": 116,
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "preparation_and_feature_selection_pipeline = Pipeline([\n",
@ -1769,7 +1819,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 117,
   "metadata": {
    "collapsed": true
   },
@ -1787,7 +1837,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1803,7 +1853,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1833,8 +1883,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 119,
-   "metadata": {},
+   "execution_count": 120,
+   "metadata": {
+    "collapsed": true
+   },
   "outputs": [],
   "source": [
    "prepare_select_and_predict_pipeline = Pipeline([\n",
@ -1846,7 +1898,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1862,7 +1914,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1896,7 +1948,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1912,7 +1964,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1928,7 +1980,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1959,7 +2011,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.5.3"
+   "version": "3.5.2"
  },
  "nav_menu": {
   "height": "279px",