Provide workaround and explanations about the breakage of LabelBinarizer by Scikit-Learn 0.19.0

main
Aurélien Geron 2017-09-15 14:40:13 +02:00
parent e5505eeef6
commit b59d2016d6
1 changed files with 145 additions and 93 deletions

View File

@ -35,7 +35,9 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
@ -78,7 +80,9 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
@ -102,7 +106,9 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fetch_housing_data()"
@ -188,7 +194,9 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
@ -215,7 +223,9 @@
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import hashlib\n",
@ -245,7 +255,9 @@
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"housing_with_id = housing.reset_index() # adds an `index` column\n",
@ -255,7 +267,9 @@
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n",
@ -274,7 +288,9 @@
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
@ -303,7 +319,9 @@
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Divide by 1.5 to limit the number of income categories\n",
@ -333,7 +351,9 @@
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import StratifiedShuffleSplit\n",
@ -356,7 +376,9 @@
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def income_cat_proportions(data):\n",
@ -385,7 +407,9 @@
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for set_ in (strat_train_set, strat_test_set):\n",
@ -505,7 +529,8 @@
"metadata": {},
"outputs": [],
"source": [
"from pandas.tools.plotting import scatter_matrix\n",
"# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas\n",
"from pandas.plotting import scatter_matrix\n",
"\n",
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
" \"housing_median_age\"]\n",
@ -637,7 +662,9 @@
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import Imputer\n",
@ -655,7 +682,9 @@
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"housing_num = housing.drop(\"ocean_proximity\", axis=1)"
@ -716,7 +745,9 @@
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n",
@ -812,7 +843,9 @@
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
@ -852,7 +885,9 @@
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.pipeline import Pipeline\n",
@ -897,6 +932,13 @@
" return X[self.attribute_names].values"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Important note**: the `LabelEncoder` and `LabelBinarizer` classes were designed for preprocessing labels, not input features, so their `fit()` and `fit_transform()` methods only accept one parameter `y` instead of two parameters `X` and `y`. The proper way to convert categorical input features to one-hot vectors would be to use the `OneHotEncoder` class, but unfortunately it does not work with string categories, only integer categories (people are working on it, see [Pull Request 7327](https://github.com/scikit-learn/scikit-learn/pull/7327)). In the meantime, one workaround was to use the `LabelBinarizer` class, as shown in the book. Unfortunately, since Scikit-Learn 0.19.0, pipelines now expect each estimator to have a `fit()` or `fit_transform()` method with two parameters `X` and `y`, so the code shown in the book won't work if you are using Scikit-Learn 0.19.0 (and possibly later as well). A temporary workaround (until PR 7327 is finished and you can use a `OneHotEncoder`) is to create a small wrapper class around the `LabelBinarizer` class, to fix its `fit_transform()` method, like this:"
]
},
{
"cell_type": "code",
"execution_count": 67,
@ -904,6 +946,19 @@
"collapsed": true
},
"outputs": [],
"source": [
"class PipelineFriendlyLabelBinarizer(LabelBinarizer):\n",
" def fit_transform(self, X, y=None):\n",
" return super(PipelineFriendlyLabelBinarizer, self).fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"num_attribs = list(housing_num)\n",
"cat_attribs = [\"ocean_proximity\"]\n",
@ -917,14 +972,16 @@
"\n",
"cat_pipeline = Pipeline([\n",
" ('selector', DataFrameSelector(cat_attribs)),\n",
" ('label_binarizer', LabelBinarizer()),\n",
" ('label_binarizer', PipelineFriendlyLabelBinarizer()),\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"execution_count": 69,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.pipeline import FeatureUnion\n",
@ -937,7 +994,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@ -947,7 +1004,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@ -963,7 +1020,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@ -975,7 +1032,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@ -996,7 +1053,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@ -1005,7 +1062,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@ -1014,7 +1071,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@ -1028,7 +1085,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@ -1040,7 +1097,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
@ -1052,7 +1109,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@ -1071,8 +1128,10 @@
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"execution_count": 80,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import cross_val_score\n",
@ -1084,7 +1143,7 @@
},
{
"cell_type": "code",
"execution_count": 80,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@ -1098,7 +1157,7 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
@ -1110,7 +1169,7 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
@ -1122,7 +1181,7 @@
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
@ -1134,7 +1193,7 @@
},
{
"cell_type": "code",
"execution_count": 84,
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
@ -1148,7 +1207,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
@ -1158,7 +1217,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
@ -1174,7 +1233,7 @@
},
{
"cell_type": "code",
"execution_count": 87,
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
@ -1203,7 +1262,7 @@
},
{
"cell_type": "code",
"execution_count": 88,
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
@ -1212,7 +1271,7 @@
},
{
"cell_type": "code",
"execution_count": 89,
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
@ -1228,7 +1287,7 @@
},
{
"cell_type": "code",
"execution_count": 90,
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
@ -1239,7 +1298,7 @@
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
@ -1248,7 +1307,7 @@
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
@ -1268,7 +1327,7 @@
},
{
"cell_type": "code",
"execution_count": 93,
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
@ -1279,7 +1338,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
@ -1289,7 +1348,7 @@
},
{
"cell_type": "code",
"execution_count": 95,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
@ -1301,7 +1360,7 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 97,
"metadata": {
"collapsed": true
},
@ -1321,7 +1380,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
@ -1339,26 +1398,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Label Binarizer hack\n",
"`LabelBinarizer`'s `fit_transform()` method only accepts one parameter `y` (because it was meant for labels, not predictors), so it does not work in a pipeline where the final estimator is a supervised estimator, because in this case that estimator's `fit()` method takes two parameters `X` and `y`.\n",
"\n",
"This hack creates a supervision-friendly `LabelBinarizer`."
"## A full pipeline with both preparation and prediction"
]
},
{
"cell_type": "code",
"execution_count": 98,
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"class SupervisionFriendlyLabelBinarizer(LabelBinarizer):\n",
" def fit_transform(self, X, y=None):\n",
" return super(SupervisionFriendlyLabelBinarizer, self).fit_transform(X)\n",
"\n",
"# Replace the LabelBinarizer with a SupervisionFriendlyLabelBinarizer\n",
"cat_pipeline.steps[1] = (\"label_binarizer\", SupervisionFriendlyLabelBinarizer())\n",
"\n",
"# Now you can create a full pipeline with a supervised predictor at the end.\n",
"full_pipeline_with_predictor = Pipeline([\n",
" (\"preparation\", full_pipeline),\n",
" (\"linear\", LinearRegression())\n",
@ -1377,7 +1425,7 @@
},
{
"cell_type": "code",
"execution_count": 99,
"execution_count": 100,
"metadata": {
"collapsed": true
},
@ -1388,7 +1436,7 @@
},
{
"cell_type": "code",
"execution_count": 100,
"execution_count": 101,
"metadata": {
"collapsed": true
},
@ -1409,7 +1457,7 @@
},
{
"cell_type": "code",
"execution_count": 101,
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
@ -1447,7 +1495,7 @@
},
{
"cell_type": "code",
"execution_count": 102,
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
@ -1473,7 +1521,7 @@
},
{
"cell_type": "code",
"execution_count": 103,
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
@ -1491,7 +1539,7 @@
},
{
"cell_type": "code",
"execution_count": 104,
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
@ -1521,7 +1569,7 @@
},
{
"cell_type": "code",
"execution_count": 105,
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
@ -1554,7 +1602,7 @@
},
{
"cell_type": "code",
"execution_count": 106,
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
@ -1572,7 +1620,7 @@
},
{
"cell_type": "code",
"execution_count": 107,
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
@ -1595,7 +1643,7 @@
},
{
"cell_type": "code",
"execution_count": 108,
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
@ -1620,7 +1668,7 @@
},
{
"cell_type": "code",
"execution_count": 109,
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
@ -1659,7 +1707,7 @@
},
{
"cell_type": "code",
"execution_count": 110,
"execution_count": 111,
"metadata": {
"collapsed": true
},
@ -1697,7 +1745,7 @@
},
{
"cell_type": "code",
"execution_count": 111,
"execution_count": 112,
"metadata": {
"collapsed": true
},
@ -1715,7 +1763,7 @@
},
{
"cell_type": "code",
"execution_count": 112,
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
@ -1725,7 +1773,7 @@
},
{
"cell_type": "code",
"execution_count": 113,
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
@ -1741,7 +1789,7 @@
},
{
"cell_type": "code",
"execution_count": 114,
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
@ -1757,8 +1805,10 @@
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"execution_count": 116,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"preparation_and_feature_selection_pipeline = Pipeline([\n",
@ -1769,7 +1819,7 @@
},
{
"cell_type": "code",
"execution_count": 116,
"execution_count": 117,
"metadata": {
"collapsed": true
},
@ -1787,7 +1837,7 @@
},
{
"cell_type": "code",
"execution_count": 117,
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@ -1803,7 +1853,7 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
@ -1833,8 +1883,10 @@
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"execution_count": 120,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"prepare_select_and_predict_pipeline = Pipeline([\n",
@ -1846,7 +1898,7 @@
},
{
"cell_type": "code",
"execution_count": 120,
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
@ -1862,7 +1914,7 @@
},
{
"cell_type": "code",
"execution_count": 121,
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
@ -1896,7 +1948,7 @@
},
{
"cell_type": "code",
"execution_count": 122,
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
@ -1912,7 +1964,7 @@
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
@ -1928,7 +1980,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
@ -1959,7 +2011,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
"version": "3.5.2"
},
"nav_menu": {
"height": "279px",