From 46f547daeb0f997aee0f75c561991488681821dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 7 May 2018 19:52:01 +0200 Subject: [PATCH] Replace the CategoricalEncoder with the OneHotEncoder class --- 03_classification.ipynb | 344 ++++++---------------------------------- 1 file changed, 51 insertions(+), 293 deletions(-) diff --git a/03_classification.ipynb b/03_classification.ipynb index 284d0a6..6cfb7f6 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -26,9 +26,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -130,9 +128,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_digit(data):\n", @@ -145,9 +141,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# EXTRA\n", @@ -192,9 +186,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]" @@ -203,9 +195,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -224,9 +214,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_train_5 = (y_train == 5)\n", @@ -291,9 +279,7 @@ { "cell_type": "code", "execution_count": 18, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", @@ -317,9 +303,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from 
sklearn.model_selection import cross_val_predict\n", @@ -341,9 +325,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_train_perfect_predictions = y_train_5" @@ -428,9 +410,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "threshold = 0\n", @@ -460,9 +440,7 @@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,\n", @@ -488,9 +466,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# hack to work around issue #9589 introduced in Scikit-Learn 0.19.0\n", @@ -501,9 +477,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import precision_recall_curve\n", @@ -543,9 +517,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_train_pred_90 = (y_scores > 70000)" @@ -597,9 +569,7 @@ { "cell_type": "code", "execution_count": 44, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_curve\n", @@ -640,9 +610,7 @@ { "cell_type": "code", "execution_count": 47, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", @@ -654,9 +622,7 @@ { "cell_type": "code", "execution_count": 48, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class\n", @@ -834,9 +800,7 @@ { "cell_type": "code", "execution_count": 65, - "metadata": { - "collapsed": true 
- }, + "metadata": {}, "outputs": [], "source": [ "def plot_confusion_matrix(matrix):\n", @@ -861,9 +825,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "row_sums = conf_mx.sum(axis=1, keepdims=True)\n", @@ -962,9 +924,7 @@ { "cell_type": "code", "execution_count": 73, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "noise = np.random.randint(0, 100, (len(X_train), 784))\n", @@ -1017,9 +977,7 @@ { "cell_type": "code", "execution_count": 76, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.dummy import DummyClassifier\n", @@ -1061,9 +1019,7 @@ { "cell_type": "code", "execution_count": 79, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_knn_pred = knn_clf.predict(X_test)" @@ -1362,15 +1318,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's load the data:" + "First, log in to [Kaggle](https://www.kaggle.com/) and go to the [Titanic challenge](https://www.kaggle.com/c/titanic) to download `train.csv` and `test.csv`. Save them to the `datasets/titanic` directory."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's load the data:" ] }, { "cell_type": "code", "execution_count": 100, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -1381,9 +1342,7 @@ { "cell_type": "code", "execution_count": 101, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", @@ -1396,9 +1355,7 @@ { "cell_type": "code", "execution_count": 102, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "train_data = load_titanic_data(\"train.csv\")\n", @@ -1560,204 +1517,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The `CategoricalEncoder` class will allow us to convert categorical attributes to one-hot vectors. It will soon be added to Scikit-Learn, and in the meantime you can use the code below (copied from Pull Request #9151)." + "The `OneHotEncoder` class will allow us to convert categorical attributes to one-hot vectors. Since Scikit-Learn 0.20, this class can handle string categorical attributes, which is what we need. In case you are using an older version of Scikit-Learn, we get the latest version of this class from `future_encoders.py`." 
] }, { "cell_type": "code", "execution_count": 110, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "# Definition of the CategoricalEncoder class, copied from PR #9151.\n", - "# Just run this cell, or copy it to your code, no need to try to\n", - "# understand every line.\n", - "\n", - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "from sklearn.utils import check_array\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from scipy import sparse\n", - "\n", - "class CategoricalEncoder(BaseEstimator, TransformerMixin):\n", - " \"\"\"Encode categorical features as a numeric array.\n", - " The input to this transformer should be a matrix of integers or strings,\n", - " denoting the values taken on by categorical (discrete) features.\n", - " The features can be encoded using a one-hot aka one-of-K scheme\n", - " (``encoding='onehot'``, the default) or converted to ordinal integers\n", - " (``encoding='ordinal'``).\n", - " This encoding is needed for feeding categorical data to many scikit-learn\n", - " estimators, notably linear models and SVMs with the standard kernels.\n", - " Read more in the :ref:`User Guide `.\n", - " Parameters\n", - " ----------\n", - " encoding : str, 'onehot', 'onehot-dense' or 'ordinal'\n", - " The type of encoding to use (default is 'onehot'):\n", - " - 'onehot': encode the features using a one-hot aka one-of-K scheme\n", - " (or also called 'dummy' encoding). This creates a binary column for\n", - " each category and returns a sparse matrix.\n", - " - 'onehot-dense': the same as 'onehot' but returns a dense array\n", - " instead of a sparse matrix.\n", - " - 'ordinal': encode the features as ordinal integers. 
This results in\n", - " a single column of integers (0 to n_categories - 1) per feature.\n", - " categories : 'auto' or a list of lists/arrays of values.\n", - " Categories (unique values) per feature:\n", - " - 'auto' : Determine categories automatically from the training data.\n", - " - list : ``categories[i]`` holds the categories expected in the ith\n", - " column. The passed categories are sorted before encoding the data\n", - " (used categories can be found in the ``categories_`` attribute).\n", - " dtype : number type, default np.float64\n", - " Desired dtype of output.\n", - " handle_unknown : 'error' (default) or 'ignore'\n", - " Whether to raise an error or ignore if a unknown categorical feature is\n", - " present during transform (default is to raise). When this is parameter\n", - " is set to 'ignore' and an unknown category is encountered during\n", - " transform, the resulting one-hot encoded columns for this feature\n", - " will be all zeros.\n", - " Ignoring unknown categories is not supported for\n", - " ``encoding='ordinal'``.\n", - " Attributes\n", - " ----------\n", - " categories_ : list of arrays\n", - " The categories of each feature determined during fitting. When\n", - " categories were specified manually, this holds the sorted categories\n", - " (in order corresponding with output of `transform`).\n", - " Examples\n", - " --------\n", - " Given a dataset with three features and two samples, we let the encoder\n", - " find the maximum value per feature and transform the data to a binary\n", - " one-hot encoding.\n", - " >>> from sklearn.preprocessing import CategoricalEncoder\n", - " >>> enc = CategoricalEncoder(handle_unknown='ignore')\n", - " >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n", - " ... # doctest: +ELLIPSIS\n", - " CategoricalEncoder(categories='auto', dtype=<... 
'numpy.float64'>,\n", - " encoding='onehot', handle_unknown='ignore')\n", - " >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()\n", - " array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.],\n", - " [ 0., 1., 1., 0., 0., 0., 0., 0., 0.]])\n", - " See also\n", - " --------\n", - " sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of\n", - " integer ordinal features. The ``OneHotEncoder assumes`` that input\n", - " features take on values in the range ``[0, max(feature)]`` instead of\n", - " using the unique values.\n", - " sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of\n", - " dictionary items (also handles string-valued features).\n", - " sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot\n", - " encoding of dictionary items or strings.\n", - " \"\"\"\n", - "\n", - " def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,\n", - " handle_unknown='error'):\n", - " self.encoding = encoding\n", - " self.categories = categories\n", - " self.dtype = dtype\n", - " self.handle_unknown = handle_unknown\n", - "\n", - " def fit(self, X, y=None):\n", - " \"\"\"Fit the CategoricalEncoder to X.\n", - " Parameters\n", - " ----------\n", - " X : array-like, shape [n_samples, n_feature]\n", - " The data to determine the categories of each feature.\n", - " Returns\n", - " -------\n", - " self\n", - " \"\"\"\n", - "\n", - " if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:\n", - " template = (\"encoding should be either 'onehot', 'onehot-dense' \"\n", - " \"or 'ordinal', got %s\")\n", - " raise ValueError(template % self.handle_unknown)\n", - "\n", - " if self.handle_unknown not in ['error', 'ignore']:\n", - " template = (\"handle_unknown should be either 'error' or \"\n", - " \"'ignore', got %s\")\n", - " raise ValueError(template % self.handle_unknown)\n", - "\n", - " if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':\n", - " raise ValueError(\"handle_unknown='ignore' is not 
supported for\"\n", - " \" encoding='ordinal'\")\n", - "\n", - " X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)\n", - " n_samples, n_features = X.shape\n", - "\n", - " self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]\n", - "\n", - " for i in range(n_features):\n", - " le = self._label_encoders_[i]\n", - " Xi = X[:, i]\n", - " if self.categories == 'auto':\n", - " le.fit(Xi)\n", - " else:\n", - " valid_mask = np.in1d(Xi, self.categories[i])\n", - " if not np.all(valid_mask):\n", - " if self.handle_unknown == 'error':\n", - " diff = np.unique(Xi[~valid_mask])\n", - " msg = (\"Found unknown categories {0} in column {1}\"\n", - " \" during fit\".format(diff, i))\n", - " raise ValueError(msg)\n", - " le.classes_ = np.array(np.sort(self.categories[i]))\n", - "\n", - " self.categories_ = [le.classes_ for le in self._label_encoders_]\n", - "\n", - " return self\n", - "\n", - " def transform(self, X):\n", - " \"\"\"Transform X using one-hot encoding.\n", - " Parameters\n", - " ----------\n", - " X : array-like, shape [n_samples, n_features]\n", - " The data to encode.\n", - " Returns\n", - " -------\n", - " X_out : sparse matrix or a 2-d array\n", - " Transformed input.\n", - " \"\"\"\n", - " X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)\n", - " n_samples, n_features = X.shape\n", - " X_int = np.zeros_like(X, dtype=np.int)\n", - " X_mask = np.ones_like(X, dtype=np.bool)\n", - "\n", - " for i in range(n_features):\n", - " valid_mask = np.in1d(X[:, i], self.categories_[i])\n", - "\n", - " if not np.all(valid_mask):\n", - " if self.handle_unknown == 'error':\n", - " diff = np.unique(X[~valid_mask, i])\n", - " msg = (\"Found unknown categories {0} in column {1}\"\n", - " \" during transform\".format(diff, i))\n", - " raise ValueError(msg)\n", - " else:\n", - " # Set the problematic rows to an acceptable value and\n", - " # continue `The rows are marked `X_mask` and will be\n", - " # removed later.\n", - " X_mask[:, 
i] = valid_mask\n", - " X[:, i][~valid_mask] = self.categories_[i][0]\n", - " X_int[:, i] = self._label_encoders_[i].transform(X[:, i])\n", - "\n", - " if self.encoding == 'ordinal':\n", - " return X_int.astype(self.dtype, copy=False)\n", - "\n", - " mask = X_mask.ravel()\n", - " n_values = [cats.shape[0] for cats in self.categories_]\n", - " n_values = np.array([0] + n_values)\n", - " indices = np.cumsum(n_values)\n", - "\n", - " column_indices = (X_int + indices[:-1]).ravel()[mask]\n", - " row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),\n", - " n_features)[mask]\n", - " data = np.ones(n_samples * n_features)[mask]\n", - "\n", - " out = sparse.csc_matrix((data, (row_indices, column_indices)),\n", - " shape=(n_samples, indices[-1]),\n", - " dtype=self.dtype).tocsr()\n", - " if self.encoding == 'onehot-dense':\n", - " return out.toarray()\n", - " else:\n", - " return out" + "try:\n", + " from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n", + "except:\n", + " from future_encoders import OrdinalEncoder, OneHotEncoder" ] }, { @@ -1770,9 +1542,7 @@ { "cell_type": "code", "execution_count": 111, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", @@ -1798,9 +1568,7 @@ { "cell_type": "code", "execution_count": 112, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", @@ -1833,19 +1601,17 @@ { "cell_type": "code", "execution_count": 114, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Inspired from stackoverflow.com/questions/25239958\n", "class MostFrequentImputer(BaseEstimator, TransformerMixin):\n", " def fit(self, X, y=None):\n", - " self.most_frequent = pd.Series([X[c].value_counts().index[0] for c in X],\n", - " index=X.columns)\n", + " self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],\n", + " 
index=X.columns)\n", " return self\n", " def transform(self, X, y=None):\n", - " return X.fillna(self.most_frequent)" + " return X.fillna(self.most_frequent_)" ] }, { @@ -1858,15 +1624,13 @@ { "cell_type": "code", "execution_count": 115, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "cat_pipeline = Pipeline([\n", " (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n", " (\"imputer\", MostFrequentImputer()),\n", - " (\"cat_encoder\", CategoricalEncoder(encoding='onehot-dense')),\n", + " (\"cat_encoder\", OneHotEncoder(sparse=False)),\n", " ])" ] }, @@ -1889,9 +1653,7 @@ { "cell_type": "code", "execution_count": 117, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import FeatureUnion\n", @@ -1928,9 +1690,7 @@ { "cell_type": "code", "execution_count": 119, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "y_train = train_data[\"Survived\"]" @@ -1965,9 +1725,7 @@ { "cell_type": "code", "execution_count": 121, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "X_test = preprocess_pipeline.transform(test_data)\n", @@ -2735,7 +2493,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.5" }, "nav_menu": {}, "toc": {