From b59d2016d6ef43a445345f3c7e49c312552077a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 15 Sep 2017 14:40:13 +0200 Subject: [PATCH] Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0 --- 02_end_to_end_machine_learning_project.ipynb | 238 +++++++++++-------- 1 file changed, 145 insertions(+), 93 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index d6c133e..cfd004b 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -35,7 +35,9 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -78,7 +80,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import os\n", @@ -102,7 +106,9 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "fetch_housing_data()" @@ -188,7 +194,9 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -215,7 +223,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "import hashlib\n", @@ -245,7 +255,9 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "housing_with_id = housing.reset_index() # adds an `index` column\n", @@ -255,7 +267,9 @@ { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n", @@ -274,7 +288,9 @@ { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", @@ -303,7 +319,9 @@ { "cell_type": "code", "execution_count": 21, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "# Divide by 1.5 to limit the number of income categories\n", @@ -333,7 +351,9 @@ { "cell_type": "code", "execution_count": 24, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.model_selection import StratifiedShuffleSplit\n", @@ -356,7 +376,9 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def income_cat_proportions(data):\n", @@ -385,7 +407,9 @@ { "cell_type": "code", "execution_count": 28, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "for set_ in (strat_train_set, strat_test_set):\n", @@ -505,7 +529,8 @@ "metadata": {}, "outputs": [], "source": [ - "from pandas.tools.plotting import scatter_matrix\n", + "# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas\n", + "from pandas.plotting import scatter_matrix\n", "\n", "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n", " \"housing_median_age\"]\n", @@ -637,7 +662,9 @@ { "cell_type": "code", "execution_count": 47, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.preprocessing import Imputer\n", @@ -655,7 +682,9 @@ { "cell_type": "code", "execution_count": 48, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "housing_num = housing.drop(\"ocean_proximity\", axis=1)" @@ -716,7 +745,9 @@ { "cell_type": "code", "execution_count": 53, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", @@ -812,7 +843,9 @@ { "cell_type": "code", "execution_count": 62, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -852,7 +885,9 @@ { "cell_type": "code", "execution_count": 64, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -897,6 +932,13 @@ " return X[self.attribute_names].values" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Important note**: the `LabelEncoder` and `LabelBinarizer` classes were designed for preprocessing labels, not input features, so their `fit()` and `fit_transform()` methods only accept one parameter `y` instead of two parameters `X` and `y`. The proper way to convert categorical input features to one-hot vectors should be to use the `OneHotEncoder` class, but unfortunately it does not work with string categories, only integer categories (people are working on it, see [Pull Request 7327](https://github.com/scikit-learn/scikit-learn/pull/7327)). In the meantime, one workaround was to use the `LabelBinarizer` class, as shown in the book. Unfortunately, since Scikit-Learn 0.19.0, pipelines now expect each estimator to have a `fit()` or `fit_transform()` method with two parameters `X` and `y`, so the code shown in the book won't work if you are using Scikit-Learn 0.19.0 (and possibly later as well). A temporary workaround (until PR 7327 is finished and you can use a `OneHotEncoder`) is to create a small wrapper class around the `LabelBinarizer` class, to fix its `fit_transform()` method, like this:" + ] + }, { "cell_type": "code", "execution_count": 67, @@ -904,6 +946,19 @@ "collapsed": true }, "outputs": [], + "source": [ + "class PipelineFriendlyLabelBinarizer(LabelBinarizer):\n", + " def fit_transform(self, X, y=None):\n", + " return super(PipelineFriendlyLabelBinarizer, self).fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ "num_attribs = list(housing_num)\n", "cat_attribs = [\"ocean_proximity\"]\n", @@ -917,14 +972,16 @@ "\n", "cat_pipeline = Pipeline([\n", " ('selector', DataFrameSelector(cat_attribs)),\n", - " ('label_binarizer', LabelBinarizer()),\n", + " ('label_binarizer', PipelineFriendlyLabelBinarizer()),\n", " ])" ] }, { "cell_type": "code", - "execution_count": 68, - "metadata": {}, + "execution_count": 69, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.pipeline import FeatureUnion\n", @@ -937,7 +994,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -947,7 +1004,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -963,7 +1020,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -975,7 +1032,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -996,7 +1053,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1005,7 +1062,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1014,7 +1071,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1028,7 +1085,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1040,7 +1097,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1052,7 +1109,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1071,8 +1128,10 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": {}, + "execution_count": 80, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", @@ -1084,7 +1143,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1098,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1110,7 +1169,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1122,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1134,7 +1193,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1148,7 +1207,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1158,7 +1217,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1174,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1203,7 +1262,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1212,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1228,7 +1287,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1239,7 +1298,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1248,7 +1307,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1268,7 +1327,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1279,7 +1338,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1289,7 +1348,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1301,7 +1360,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 97, "metadata": { "collapsed": true }, @@ -1321,7 +1380,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1339,26 +1398,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Label Binarizer hack\n", - "`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator because in this case its `fit()` method takes two parameters `X` and `y`.\n", - "\n", - "This hack creates a supervision-friendly `LabelBinarizer`." + "## A full pipeline with both preparation and prediction" ] }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ - "class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n", - " def fit_transform(self, X, y=None):\n", - " return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n", - "\n", - "# Replace the Labelbinarizer with a SupervisionFriendlyLabelBinarizer\n", - "cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n", - "\n", - "# Now you can create a full pipeline with a supervised predictor at the end.\n", "full_pipeline_with_predictor = Pipeline([\n", " (\"preparation\", full_pipeline),\n", " (\"linear\", LinearRegression())\n", @@ -1377,7 +1425,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 100, "metadata": { "collapsed": true }, @@ -1388,7 +1436,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 101, "metadata": { "collapsed": true }, @@ -1409,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1447,7 +1495,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1473,7 +1521,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1491,7 +1539,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1521,7 +1569,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1554,7 +1602,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1572,7 +1620,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1595,7 +1643,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1620,7 +1668,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1659,7 +1707,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 111, "metadata": { "collapsed": true }, @@ -1697,7 +1745,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 112, "metadata": { "collapsed": true }, @@ -1715,7 +1763,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1725,7 +1773,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1741,7 +1789,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1757,8 +1805,10 @@ }, { "cell_type": "code", - "execution_count": 115, - "metadata": {}, + "execution_count": 116, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "preparation_and_feature_selection_pipeline = Pipeline([\n", @@ -1769,7 +1819,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 117, "metadata": { "collapsed": true }, @@ -1787,7 +1837,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -1803,7 +1853,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -1833,8 +1883,10 @@ }, { "cell_type": "code", - "execution_count": 119, - "metadata": {}, + "execution_count": 120, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "prepare_select_and_predict_pipeline = Pipeline([\n", @@ -1846,7 +1898,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -1862,7 +1914,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -1896,7 +1948,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -1912,7 +1964,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -1928,7 +1980,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -1959,7 +2011,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.5.2" }, "nav_menu": { "height": "279px",