Replace the CategoricalEncoder with the OneHotEncoder class
parent
bdd58b7c52
commit
46f547daeb
|
@ -26,9 +26,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# To support both python 2 and python 3\n",
|
"# To support both python 2 and python 3\n",
|
||||||
|
@ -130,9 +128,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 7,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def plot_digit(data):\n",
|
"def plot_digit(data):\n",
|
||||||
|
@ -145,9 +141,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 8,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# EXTRA\n",
|
"# EXTRA\n",
|
||||||
|
@ -192,9 +186,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 11,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]"
|
"X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]"
|
||||||
|
@ -203,9 +195,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 12,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
|
@ -224,9 +214,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": 13,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_train_5 = (y_train == 5)\n",
|
"y_train_5 = (y_train == 5)\n",
|
||||||
|
@ -291,9 +279,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 18,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.base import BaseEstimator\n",
|
"from sklearn.base import BaseEstimator\n",
|
||||||
|
@ -317,9 +303,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 20,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.model_selection import cross_val_predict\n",
|
"from sklearn.model_selection import cross_val_predict\n",
|
||||||
|
@ -341,9 +325,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 22,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_train_perfect_predictions = y_train_5"
|
"y_train_perfect_predictions = y_train_5"
|
||||||
|
@ -428,9 +410,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 31,
|
"execution_count": 31,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"threshold = 0\n",
|
"threshold = 0\n",
|
||||||
|
@ -460,9 +440,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 34,
|
"execution_count": 34,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,\n",
|
"y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,\n",
|
||||||
|
@ -488,9 +466,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 36,
|
"execution_count": 36,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# hack to work around issue #9589 introduced in Scikit-Learn 0.19.0\n",
|
"# hack to work around issue #9589 introduced in Scikit-Learn 0.19.0\n",
|
||||||
|
@ -501,9 +477,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 37,
|
"execution_count": 37,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.metrics import precision_recall_curve\n",
|
"from sklearn.metrics import precision_recall_curve\n",
|
||||||
|
@ -543,9 +517,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 40,
|
"execution_count": 40,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_train_pred_90 = (y_scores > 70000)"
|
"y_train_pred_90 = (y_scores > 70000)"
|
||||||
|
@ -597,9 +569,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 44,
|
"execution_count": 44,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.metrics import roc_curve\n",
|
"from sklearn.metrics import roc_curve\n",
|
||||||
|
@ -640,9 +610,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 47,
|
"execution_count": 47,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||||
|
@ -654,9 +622,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 48,
|
"execution_count": 48,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class\n",
|
"y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class\n",
|
||||||
|
@ -834,9 +800,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 65,
|
"execution_count": 65,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def plot_confusion_matrix(matrix):\n",
|
"def plot_confusion_matrix(matrix):\n",
|
||||||
|
@ -861,9 +825,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 67,
|
"execution_count": 67,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"row_sums = conf_mx.sum(axis=1, keepdims=True)\n",
|
"row_sums = conf_mx.sum(axis=1, keepdims=True)\n",
|
||||||
|
@ -962,9 +924,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 73,
|
"execution_count": 73,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"noise = np.random.randint(0, 100, (len(X_train), 784))\n",
|
"noise = np.random.randint(0, 100, (len(X_train), 784))\n",
|
||||||
|
@ -1017,9 +977,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 76,
|
"execution_count": 76,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.dummy import DummyClassifier\n",
|
"from sklearn.dummy import DummyClassifier\n",
|
||||||
|
@ -1061,9 +1019,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 79,
|
"execution_count": 79,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_knn_pred = knn_clf.predict(X_test)"
|
"y_knn_pred = knn_clf.predict(X_test)"
|
||||||
|
@ -1362,15 +1318,20 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Let's load the data:"
|
"First, log in to [Kaggle](https://www.kaggle.com/) and go to the [Titanic challenge](https://www.kaggle.com/c/titanic) to download `train.csv` and `test.csv`. Save them to the `datasets/titanic` directory."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Next, let's load the data:"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 100,
|
"execution_count": 100,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import os\n",
|
"import os\n",
|
||||||
|
@ -1381,9 +1342,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 101,
|
"execution_count": 101,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
|
@ -1396,9 +1355,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 102,
|
"execution_count": 102,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"train_data = load_titanic_data(\"train.csv\")\n",
|
"train_data = load_titanic_data(\"train.csv\")\n",
|
||||||
|
@ -1560,204 +1517,19 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"The `CategoricalEncoder` class will allow us to convert categorical attributes to one-hot vectors. It will soon be added to Scikit-Learn, and in the meantime you can use the code below (copied from Pull Request #9151)."
|
"The `OneHotEncoder` class will allow us to convert categorical attributes to one-hot vectors. Since Scikit-Learn 0.20, this class can handle string categorical attributes, which is what we need. If you are using an older version of Scikit-Learn, we import the latest version of this class from `future_encoders.py` instead."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 110,
|
"execution_count": 110,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Definition of the CategoricalEncoder class, copied from PR #9151.\n",
|
"try:\n",
|
||||||
"# Just run this cell, or copy it to your code, no need to try to\n",
|
" from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder\n",
|
||||||
"# understand every line.\n",
|
"except:\n",
|
||||||
"\n",
|
" from future_encoders import OrdinalEncoder, OneHotEncoder"
|
||||||
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
|
||||||
"from sklearn.utils import check_array\n",
|
|
||||||
"from sklearn.preprocessing import LabelEncoder\n",
|
|
||||||
"from scipy import sparse\n",
|
|
||||||
"\n",
|
|
||||||
"class CategoricalEncoder(BaseEstimator, TransformerMixin):\n",
|
|
||||||
" \"\"\"Encode categorical features as a numeric array.\n",
|
|
||||||
" The input to this transformer should be a matrix of integers or strings,\n",
|
|
||||||
" denoting the values taken on by categorical (discrete) features.\n",
|
|
||||||
" The features can be encoded using a one-hot aka one-of-K scheme\n",
|
|
||||||
" (``encoding='onehot'``, the default) or converted to ordinal integers\n",
|
|
||||||
" (``encoding='ordinal'``).\n",
|
|
||||||
" This encoding is needed for feeding categorical data to many scikit-learn\n",
|
|
||||||
" estimators, notably linear models and SVMs with the standard kernels.\n",
|
|
||||||
" Read more in the :ref:`User Guide <preprocessing_categorical_features>`.\n",
|
|
||||||
" Parameters\n",
|
|
||||||
" ----------\n",
|
|
||||||
" encoding : str, 'onehot', 'onehot-dense' or 'ordinal'\n",
|
|
||||||
" The type of encoding to use (default is 'onehot'):\n",
|
|
||||||
" - 'onehot': encode the features using a one-hot aka one-of-K scheme\n",
|
|
||||||
" (or also called 'dummy' encoding). This creates a binary column for\n",
|
|
||||||
" each category and returns a sparse matrix.\n",
|
|
||||||
" - 'onehot-dense': the same as 'onehot' but returns a dense array\n",
|
|
||||||
" instead of a sparse matrix.\n",
|
|
||||||
" - 'ordinal': encode the features as ordinal integers. This results in\n",
|
|
||||||
" a single column of integers (0 to n_categories - 1) per feature.\n",
|
|
||||||
" categories : 'auto' or a list of lists/arrays of values.\n",
|
|
||||||
" Categories (unique values) per feature:\n",
|
|
||||||
" - 'auto' : Determine categories automatically from the training data.\n",
|
|
||||||
" - list : ``categories[i]`` holds the categories expected in the ith\n",
|
|
||||||
" column. The passed categories are sorted before encoding the data\n",
|
|
||||||
" (used categories can be found in the ``categories_`` attribute).\n",
|
|
||||||
" dtype : number type, default np.float64\n",
|
|
||||||
" Desired dtype of output.\n",
|
|
||||||
" handle_unknown : 'error' (default) or 'ignore'\n",
|
|
||||||
" Whether to raise an error or ignore if a unknown categorical feature is\n",
|
|
||||||
" present during transform (default is to raise). When this is parameter\n",
|
|
||||||
" is set to 'ignore' and an unknown category is encountered during\n",
|
|
||||||
" transform, the resulting one-hot encoded columns for this feature\n",
|
|
||||||
" will be all zeros.\n",
|
|
||||||
" Ignoring unknown categories is not supported for\n",
|
|
||||||
" ``encoding='ordinal'``.\n",
|
|
||||||
" Attributes\n",
|
|
||||||
" ----------\n",
|
|
||||||
" categories_ : list of arrays\n",
|
|
||||||
" The categories of each feature determined during fitting. When\n",
|
|
||||||
" categories were specified manually, this holds the sorted categories\n",
|
|
||||||
" (in order corresponding with output of `transform`).\n",
|
|
||||||
" Examples\n",
|
|
||||||
" --------\n",
|
|
||||||
" Given a dataset with three features and two samples, we let the encoder\n",
|
|
||||||
" find the maximum value per feature and transform the data to a binary\n",
|
|
||||||
" one-hot encoding.\n",
|
|
||||||
" >>> from sklearn.preprocessing import CategoricalEncoder\n",
|
|
||||||
" >>> enc = CategoricalEncoder(handle_unknown='ignore')\n",
|
|
||||||
" >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])\n",
|
|
||||||
" ... # doctest: +ELLIPSIS\n",
|
|
||||||
" CategoricalEncoder(categories='auto', dtype=<... 'numpy.float64'>,\n",
|
|
||||||
" encoding='onehot', handle_unknown='ignore')\n",
|
|
||||||
" >>> enc.transform([[0, 1, 1], [1, 0, 4]]).toarray()\n",
|
|
||||||
" array([[ 1., 0., 0., 1., 0., 0., 1., 0., 0.],\n",
|
|
||||||
" [ 0., 1., 1., 0., 0., 0., 0., 0., 0.]])\n",
|
|
||||||
" See also\n",
|
|
||||||
" --------\n",
|
|
||||||
" sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of\n",
|
|
||||||
" integer ordinal features. The ``OneHotEncoder assumes`` that input\n",
|
|
||||||
" features take on values in the range ``[0, max(feature)]`` instead of\n",
|
|
||||||
" using the unique values.\n",
|
|
||||||
" sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of\n",
|
|
||||||
" dictionary items (also handles string-valued features).\n",
|
|
||||||
" sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot\n",
|
|
||||||
" encoding of dictionary items or strings.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
" def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,\n",
|
|
||||||
" handle_unknown='error'):\n",
|
|
||||||
" self.encoding = encoding\n",
|
|
||||||
" self.categories = categories\n",
|
|
||||||
" self.dtype = dtype\n",
|
|
||||||
" self.handle_unknown = handle_unknown\n",
|
|
||||||
"\n",
|
|
||||||
" def fit(self, X, y=None):\n",
|
|
||||||
" \"\"\"Fit the CategoricalEncoder to X.\n",
|
|
||||||
" Parameters\n",
|
|
||||||
" ----------\n",
|
|
||||||
" X : array-like, shape [n_samples, n_feature]\n",
|
|
||||||
" The data to determine the categories of each feature.\n",
|
|
||||||
" Returns\n",
|
|
||||||
" -------\n",
|
|
||||||
" self\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
" if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:\n",
|
|
||||||
" template = (\"encoding should be either 'onehot', 'onehot-dense' \"\n",
|
|
||||||
" \"or 'ordinal', got %s\")\n",
|
|
||||||
" raise ValueError(template % self.handle_unknown)\n",
|
|
||||||
"\n",
|
|
||||||
" if self.handle_unknown not in ['error', 'ignore']:\n",
|
|
||||||
" template = (\"handle_unknown should be either 'error' or \"\n",
|
|
||||||
" \"'ignore', got %s\")\n",
|
|
||||||
" raise ValueError(template % self.handle_unknown)\n",
|
|
||||||
"\n",
|
|
||||||
" if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':\n",
|
|
||||||
" raise ValueError(\"handle_unknown='ignore' is not supported for\"\n",
|
|
||||||
" \" encoding='ordinal'\")\n",
|
|
||||||
"\n",
|
|
||||||
" X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)\n",
|
|
||||||
" n_samples, n_features = X.shape\n",
|
|
||||||
"\n",
|
|
||||||
" self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]\n",
|
|
||||||
"\n",
|
|
||||||
" for i in range(n_features):\n",
|
|
||||||
" le = self._label_encoders_[i]\n",
|
|
||||||
" Xi = X[:, i]\n",
|
|
||||||
" if self.categories == 'auto':\n",
|
|
||||||
" le.fit(Xi)\n",
|
|
||||||
" else:\n",
|
|
||||||
" valid_mask = np.in1d(Xi, self.categories[i])\n",
|
|
||||||
" if not np.all(valid_mask):\n",
|
|
||||||
" if self.handle_unknown == 'error':\n",
|
|
||||||
" diff = np.unique(Xi[~valid_mask])\n",
|
|
||||||
" msg = (\"Found unknown categories {0} in column {1}\"\n",
|
|
||||||
" \" during fit\".format(diff, i))\n",
|
|
||||||
" raise ValueError(msg)\n",
|
|
||||||
" le.classes_ = np.array(np.sort(self.categories[i]))\n",
|
|
||||||
"\n",
|
|
||||||
" self.categories_ = [le.classes_ for le in self._label_encoders_]\n",
|
|
||||||
"\n",
|
|
||||||
" return self\n",
|
|
||||||
"\n",
|
|
||||||
" def transform(self, X):\n",
|
|
||||||
" \"\"\"Transform X using one-hot encoding.\n",
|
|
||||||
" Parameters\n",
|
|
||||||
" ----------\n",
|
|
||||||
" X : array-like, shape [n_samples, n_features]\n",
|
|
||||||
" The data to encode.\n",
|
|
||||||
" Returns\n",
|
|
||||||
" -------\n",
|
|
||||||
" X_out : sparse matrix or a 2-d array\n",
|
|
||||||
" Transformed input.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)\n",
|
|
||||||
" n_samples, n_features = X.shape\n",
|
|
||||||
" X_int = np.zeros_like(X, dtype=np.int)\n",
|
|
||||||
" X_mask = np.ones_like(X, dtype=np.bool)\n",
|
|
||||||
"\n",
|
|
||||||
" for i in range(n_features):\n",
|
|
||||||
" valid_mask = np.in1d(X[:, i], self.categories_[i])\n",
|
|
||||||
"\n",
|
|
||||||
" if not np.all(valid_mask):\n",
|
|
||||||
" if self.handle_unknown == 'error':\n",
|
|
||||||
" diff = np.unique(X[~valid_mask, i])\n",
|
|
||||||
" msg = (\"Found unknown categories {0} in column {1}\"\n",
|
|
||||||
" \" during transform\".format(diff, i))\n",
|
|
||||||
" raise ValueError(msg)\n",
|
|
||||||
" else:\n",
|
|
||||||
" # Set the problematic rows to an acceptable value and\n",
|
|
||||||
" # continue `The rows are marked `X_mask` and will be\n",
|
|
||||||
" # removed later.\n",
|
|
||||||
" X_mask[:, i] = valid_mask\n",
|
|
||||||
" X[:, i][~valid_mask] = self.categories_[i][0]\n",
|
|
||||||
" X_int[:, i] = self._label_encoders_[i].transform(X[:, i])\n",
|
|
||||||
"\n",
|
|
||||||
" if self.encoding == 'ordinal':\n",
|
|
||||||
" return X_int.astype(self.dtype, copy=False)\n",
|
|
||||||
"\n",
|
|
||||||
" mask = X_mask.ravel()\n",
|
|
||||||
" n_values = [cats.shape[0] for cats in self.categories_]\n",
|
|
||||||
" n_values = np.array([0] + n_values)\n",
|
|
||||||
" indices = np.cumsum(n_values)\n",
|
|
||||||
"\n",
|
|
||||||
" column_indices = (X_int + indices[:-1]).ravel()[mask]\n",
|
|
||||||
" row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),\n",
|
|
||||||
" n_features)[mask]\n",
|
|
||||||
" data = np.ones(n_samples * n_features)[mask]\n",
|
|
||||||
"\n",
|
|
||||||
" out = sparse.csc_matrix((data, (row_indices, column_indices)),\n",
|
|
||||||
" shape=(n_samples, indices[-1]),\n",
|
|
||||||
" dtype=self.dtype).tocsr()\n",
|
|
||||||
" if self.encoding == 'onehot-dense':\n",
|
|
||||||
" return out.toarray()\n",
|
|
||||||
" else:\n",
|
|
||||||
" return out"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1770,9 +1542,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 111,
|
"execution_count": 111,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
"from sklearn.base import BaseEstimator, TransformerMixin\n",
|
||||||
|
@ -1798,9 +1568,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 112,
|
"execution_count": 112,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
@ -1833,19 +1601,17 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 114,
|
"execution_count": 114,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Inspired from stackoverflow.com/questions/25239958\n",
|
"# Inspired from stackoverflow.com/questions/25239958\n",
|
||||||
"class MostFrequentImputer(BaseEstimator, TransformerMixin):\n",
|
"class MostFrequentImputer(BaseEstimator, TransformerMixin):\n",
|
||||||
" def fit(self, X, y=None):\n",
|
" def fit(self, X, y=None):\n",
|
||||||
" self.most_frequent = pd.Series([X[c].value_counts().index[0] for c in X],\n",
|
" self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],\n",
|
||||||
" index=X.columns)\n",
|
" index=X.columns)\n",
|
||||||
" return self\n",
|
" return self\n",
|
||||||
" def transform(self, X, y=None):\n",
|
" def transform(self, X, y=None):\n",
|
||||||
" return X.fillna(self.most_frequent)"
|
" return X.fillna(self.most_frequent_)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1858,15 +1624,13 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 115,
|
"execution_count": 115,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"cat_pipeline = Pipeline([\n",
|
"cat_pipeline = Pipeline([\n",
|
||||||
" (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n",
|
" (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n",
|
||||||
" (\"imputer\", MostFrequentImputer()),\n",
|
" (\"imputer\", MostFrequentImputer()),\n",
|
||||||
" (\"cat_encoder\", CategoricalEncoder(encoding='onehot-dense')),\n",
|
" (\"cat_encoder\", OneHotEncoder(sparse=False)),\n",
|
||||||
" ])"
|
" ])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -1889,9 +1653,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 117,
|
"execution_count": 117,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.pipeline import FeatureUnion\n",
|
"from sklearn.pipeline import FeatureUnion\n",
|
||||||
|
@ -1928,9 +1690,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 119,
|
"execution_count": 119,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"y_train = train_data[\"Survived\"]"
|
"y_train = train_data[\"Survived\"]"
|
||||||
|
@ -1965,9 +1725,7 @@
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 121,
|
"execution_count": 121,
|
||||||
"metadata": {
|
"metadata": {},
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"X_test = preprocess_pipeline.transform(test_data)\n",
|
"X_test = preprocess_pipeline.transform(test_data)\n",
|
||||||
|
@ -2735,7 +2493,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.2"
|
"version": "3.6.5"
|
||||||
},
|
},
|
||||||
"nav_menu": {},
|
"nav_menu": {},
|
||||||
"toc": {
|
"toc": {
|
||||||
|
|
Loading…
Reference in New Issue