# handson-ml/future_encoders.py

# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
# Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause
from __future__ import division
import numbers
import warnings
import numpy as np
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import six
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, FLOAT_DTYPES
from sklearn.preprocessing.label import LabelEncoder
# Not referenced in the visible part of this file.
# NOTE(review): presumably a numerical tolerance kept from the module this
# code was copied from -- confirm before removing.
BOUNDS_THRESHOLD = 1e-7
# Rebind the iteration helpers to their `six.moves` counterparts so the
# rest of the module uses the same names on Python 2 and Python 3.
zip = six.moves.zip
map = six.moves.map
range = six.moves.range
# Public names exported by `from <module> import *`.
__all__ = [
'OneHotEncoder',
'OrdinalEncoder'
]
def _argmax(arr_or_spmatrix, axis=None):
    """Return ``arr_or_spmatrix.argmax(axis=axis)``.

    Thin wrapper that delegates to the object's own ``argmax`` method, so
    it works for anything exposing that method (the callers in this file
    pass dense arrays or sparse matrices).
    """
    find_argmax = arr_or_spmatrix.argmax
    return find_argmax(axis=axis)
def _handle_zeros_in_scale(scale, copy=True):
    """Replace zero scale factors by one.

    A zero scale shows up for constant features; substituting 1 keeps a
    later division by the scale well defined.

    Parameters
    ----------
    scale : scalar or ndarray
        Scale factor(s). A scalar is possible when fitting on 1D arrays.
    copy : boolean, optional
        For ndarray input, work on a copy instead of modifying ``scale``
        in place.

    Returns
    -------
    scale : scalar or ndarray
        The scale with zeros replaced by ones. Input that is neither a
        scalar nor an ndarray yields ``None``.
    """
    if np.isscalar(scale):
        return 1. if scale == .0 else scale

    if not isinstance(scale, np.ndarray):
        return None

    # Copy first (when requested) to avoid side effects on the caller's array.
    fixed = scale.copy() if copy else scale
    fixed[fixed == 0.0] = 1.0
    return fixed
def _transform_selected(X, transform, selected="all", copy=True):
    """Apply a transform function to a selected subset of the features.

    The transformed columns come first in the result; the untouched
    columns are stacked to their right.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Dense array or sparse matrix.

    transform : callable
        A callable transform(X) -> X_transformed

    selected: "all" or array of indices or mask
        Specify which features to apply the transform to.

    copy : boolean, optional
        Copy X even if it could be avoided.

    Returns
    -------
    X : array or sparse matrix, shape=(n_samples, n_features_new)
    """
    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)

    wants_everything = (isinstance(selected, six.string_types)
                        and selected == "all")
    if wants_everything:
        return transform(X)

    if len(selected) == 0:
        return X

    # Normalise `selected` (indices or boolean mask) into a boolean mask.
    n_features = X.shape[1]
    chosen = np.zeros(n_features, dtype=bool)
    chosen[np.asarray(selected)] = True

    if not chosen.any():
        # No features selected.
        return X
    if chosen.all():
        # All features selected.
        return transform(X)

    transformed_part = transform(X[:, np.flatnonzero(chosen)])
    untouched_part = X[:, np.flatnonzero(~chosen)]

    pieces = (transformed_part, untouched_part)
    if sparse.issparse(transformed_part) or sparse.issparse(untouched_part):
        return sparse.hstack(pieces)
    return np.hstack(pieces)
class _BaseEncoder(BaseEstimator, TransformerMixin):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.
    """

    def _check_X(self, X):
        """Validate X with ``check_array`` and return the resulting array.

        Input without a ``dtype`` attribute (e.g. a list) that numpy turns
        into a string array is re-validated with ``object`` dtype instead.
        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            return check_array(X, dtype=object)
        return X_temp

    def _fit(self, X, handle_unknown='error'):
        """Determine the categories of every column of X.

        Sets ``_label_encoders_`` (one ``LabelEncoder`` per column) and
        ``categories_`` (the ``classes_`` of those encoders).

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        handle_unknown : str, default 'error'
            Only consulted when ``self.categories`` is given explicitly:
            with 'error', values of X missing from the given categories
            raise; with any other value they are not checked.

        Raises
        ------
        ValueError
            If explicit categories are unsorted, if their number differs
            from the number of columns, or if unknown values are found
            while ``handle_unknown == 'error'``.
        """
        X = self._check_X(X)
        n_features = X.shape[1]

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")
            if len(self.categories) != n_features:
                raise ValueError("Shape mismatch: if categories is an array,"
                                 " it has to be of shape (n_features,).")

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                if handle_unknown == 'error':
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # The user-provided categories are used as-is (no fitting).
                le.classes_ = np.array(self.categories[i])

        self.categories_ = [le.classes_ for le in self._label_encoders_]

    def _transform(self, X, handle_unknown='error'):
        """Encode every column of X as integer codes.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        handle_unknown : str, default 'error'
            With 'error', values not present in ``categories_`` raise.
            With any other value they are encoded as the first category
            and flagged as ``False`` in the returned mask.

        Returns
        -------
        X_int : ndarray of int, same shape as X
            Integer code of every entry.

        X_mask : ndarray of bool, same shape as X
            ``False`` where the entry was an unknown category.

        Raises
        ------
        ValueError
            If unknown categories are found and ``handle_unknown`` is
            'error'.
        """
        X = self._check_X(X)
        _, n_features = X.shape

        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        return X_int, X_mask
# Deprecation text emitted by OneHotEncoder._handle_deprecations when
# integer input is fitted with neither `categories` nor `n_values` given
# and handle_unknown is not 'ignore' (the case that enables legacy mode).
WARNING_MSG = (
"The handling of integer data will change in the future. Currently, the "
"categories are determined based on the range [0, max(values)], while "
"in the future they will be determined based on the unique values.\n"
"If you want the future behaviour, you can specify \"categories='auto'\"."
)
class OneHotEncoder(_BaseEncoder):
    """Encode categorical integer features as a one-hot numeric array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
    encoding scheme. This creates a binary column for each category and
    returns a sparse matrix or dense array.

    By default, the encoder derives the categories based on the unique values
    in each feature. Alternatively, you can also specify the `categories`
    manually.
    The OneHotEncoder previously assumed that the input features take on
    values in the range [0, max(values)]. This behaviour is deprecated.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Note: a one-hot encoding of y labels should use a LabelBinarizer
    instead.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    Parameters
    ----------
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories must be sorted and should not mix
          strings and numeric values.

        The used categories can be found in the ``categories_`` attribute.

    sparse : boolean, default=True
        Will return sparse matrix if set True else will return an array.

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature is
        present during transform (default is to raise). When this parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros. In the inverse transform, an unknown category
        will be denoted as None.

    n_values : 'auto', int or array of ints
        Number of values per feature.

        - 'auto' : determine value range from training data.
        - int : number of categorical values per feature.
                Each feature value should be in ``range(n_values)``
        - array : ``n_values[i]`` is the number of categorical values in
                  ``X[:, i]``. Each feature value should be
                  in ``range(n_values[i])``

        .. deprecated:: 0.20
            The `n_values` keyword is deprecated and will be removed in 0.22.
            Use `categories` instead.

    categorical_features : "all" or array of indices or mask
        Specify what features are treated as categorical.

        - 'all' (default): All features are treated as categorical.
        - array of indices: Array of categorical feature indices.
        - mask: Array of length n_features and with dtype=bool.

        Non-categorical features are always stacked to the right of the matrix.

        .. deprecated:: 0.20
            The `categorical_features` keyword is deprecated and will be
            removed in 0.22.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order corresponding with output of ``transform``).

    active_features_ : array
        Indices for active features, meaning values that actually occur
        in the training set. Only available when n_values is ``'auto'``.

        .. deprecated:: 0.20

    feature_indices_ : array of shape (n_features,)
        Indices to feature ranges.
        Feature ``i`` in the original data is mapped to features
        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
        (and then potentially masked by `active_features_` afterwards)

        .. deprecated:: 0.20

    n_values_ : array of shape (n_features,)
        Maximum number of values per feature.

        .. deprecated:: 0.20

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder
    >>> enc = OneHotEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    ... # doctest: +ELLIPSIS
    OneHotEncoder(categories='auto', dtype=<... 'numpy.float64'>,
           handle_unknown='ignore', sparse=True)
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[ 1.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  0.,  0.,  0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)

    See also
    --------
    sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer)
      encoding of the categorical features.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all
      fashion.
    sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of
      iterables and a multilabel format, e.g. a (samples x classes) binary
      matrix indicating the presence of a class label.
    """

    def __init__(self, n_values=None, categorical_features=None,
                 categories=None, sparse=True, dtype=np.float64,
                 handle_unknown='error'):
        # Remember whether the user passed `categories` explicitly; the
        # public attribute falls back to 'auto'.
        self._categories = categories
        self.categories = 'auto' if categories is None else categories
        self.sparse = sparse
        self.dtype = dtype
        self.handle_unknown = handle_unknown

        # The deprecated keywords are stored under private names and exposed
        # through warning properties. No warning is raised at construction;
        # `_handle_deprecations` warns at fit time.
        self._deprecated_n_values = "auto" if n_values is None else n_values
        self._deprecated_categorical_features = (
            "all" if categorical_features is None else categorical_features)

    # Deprecated keywords

    @property
    def n_values(self):
        warnings.warn("The 'n_values' parameter is deprecated.",
                      DeprecationWarning)
        return self._deprecated_n_values

    @n_values.setter
    def n_values(self, value):
        warnings.warn("The 'n_values' parameter is deprecated.",
                      DeprecationWarning)
        self._deprecated_n_values = value

    @property
    def categorical_features(self):
        warnings.warn("The 'categorical_features' parameter is deprecated.",
                      DeprecationWarning)
        return self._deprecated_categorical_features

    @categorical_features.setter
    def categorical_features(self, value):
        warnings.warn("The 'categorical_features' parameter is deprecated.",
                      DeprecationWarning)
        self._deprecated_categorical_features = value

    # Deprecated attributes

    @property
    def active_features_(self):
        check_is_fitted(self, 'categories_')
        warnings.warn("The 'active_features_' attribute is deprecated.",
                      DeprecationWarning)
        return self._active_features_

    @property
    def feature_indices_(self):
        check_is_fitted(self, 'categories_')
        warnings.warn("The 'feature_indices_' attribute is deprecated.",
                      DeprecationWarning)
        return self._feature_indices_

    @property
    def n_values_(self):
        check_is_fitted(self, 'categories_')
        warnings.warn("The 'n_values_' attribute is deprecated.",
                      DeprecationWarning)
        return self._n_values_

    def _validate_keywords(self):
        """Raise ValueError if ``handle_unknown`` is not a supported option."""
        if self.handle_unknown not in ('error', 'ignore'):
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

    def _handle_deprecations(self, X):
        """Decide between the new and the legacy code path.

        Sets ``self._legacy_mode``. When the deprecated ``n_values`` keyword
        is used, it is translated into ``self.categories`` and the deprecated
        ``_n_values_`` / ``_feature_indices_`` attributes are filled in.

        Raises
        ------
        TypeError
            If ``n_values`` is neither 'auto', an int nor convertible to an
            array of ints.
        ValueError
            If ``categorical_features`` is combined with ``categories``.
        """
        n_values_param = self._deprecated_n_values
        # Compare against 'auto' only for strings so that array-valued
        # `n_values` never triggers an element-wise comparison.
        n_values_is_auto = (isinstance(n_values_param, six.string_types)
                            and n_values_param == 'auto')

        user_set_categories = False
        if self._categories is not None:
            self._legacy_mode = False
            user_set_categories = True

        elif not n_values_is_auto:
            msg = (
                "Passing 'n_values' is deprecated and will be removed in a "
                "future release. You can use the 'categories' keyword instead."
                " 'n_values=n' corresponds to 'categories=[range(n)]'.")
            warnings.warn(msg, DeprecationWarning)

            # we internally translate this to the correct categories
            # and don't use legacy mode
            X = check_array(X, dtype=int)

            if isinstance(n_values_param, numbers.Integral):
                n_features = X.shape[1]
                self.categories = [
                    list(range(n_values_param))
                    for _ in range(n_features)]
                n_values = np.full(n_features, n_values_param, dtype=int)
            else:
                try:
                    n_values = np.asarray(n_values_param, dtype=int)
                    self.categories = [list(range(i))
                                       for i in n_values_param]
                except (ValueError, TypeError):
                    raise TypeError(
                        "Wrong type for parameter `n_values`. Expected 'auto',"
                        " int or array of ints, got %r"
                        % type(n_values_param))

            self._n_values_ = n_values
            n_values = np.hstack([[0], n_values])
            indices = np.cumsum(n_values)
            self._feature_indices_ = indices

            self._legacy_mode = False

        else:  # n_values = 'auto'
            if self.handle_unknown == 'ignore':
                # no change in behaviour, no need to raise deprecation warning
                self._legacy_mode = False
            else:
                # check if we have integer or categorical input
                try:
                    X = check_array(X, dtype=int)
                except ValueError:
                    self._legacy_mode = False
                else:
                    warnings.warn(WARNING_MSG, DeprecationWarning)
                    self._legacy_mode = True

        cat_features = self._deprecated_categorical_features
        all_categorical = (isinstance(cat_features, six.string_types)
                           and cat_features == 'all')
        if not all_categorical:
            if user_set_categories:
                raise ValueError(
                    "The 'categorical_features' keyword is deprecated, and "
                    "cannot be used together with specifying 'categories'.")
            warnings.warn("The 'categorical_features' keyword is deprecated.",
                          DeprecationWarning)
            # Selecting a subset of columns is only implemented by the
            # legacy code path.
            self._legacy_mode = True

    def fit(self, X, y=None):
        """Fit OneHotEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.

        Returns
        -------
        self
        """
        self._validate_keywords()
        self._handle_deprecations(X)

        if self._legacy_mode:
            # TODO not with _transform_selected ??
            self._legacy_fit_transform(X)
        else:
            self._fit(X, handle_unknown=self.handle_unknown)
        return self

    def _legacy_fit_transform(self, X):
        """Assumes X contains only categorical features."""
        self_n_values = self._deprecated_n_values
        n_values_is_auto = (isinstance(self_n_values, six.string_types)
                            and self_n_values == 'auto')

        dtype = getattr(X, 'dtype', None)
        X = check_array(X, dtype=int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape

        if n_values_is_auto:
            n_values = np.max(X, axis=0) + 1
        elif isinstance(self_n_values, numbers.Integral):
            if (np.max(X, axis=0) >= self_n_values).any():
                raise ValueError("Feature out of bounds for n_values=%d"
                                 % self_n_values)
            n_values = np.full(n_features, self_n_values, dtype=int)
        else:
            try:
                n_values = np.asarray(self_n_values, dtype=int)
            except (ValueError, TypeError):
                raise TypeError("Wrong type for parameter `n_values`. Expected"
                                " 'auto', int or array of ints, got %r"
                                % type(self_n_values))
            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self._n_values_ = n_values
        # One category per output column: values 0 .. n_val - 1.
        self.categories_ = [np.arange(n_val, dtype=dtype)
                            for n_val in n_values]
        n_values = np.hstack([[0], n_values])
        indices = np.cumsum(n_values)
        self._feature_indices_ = indices

        # Each entry X[r, f] switches on column `indices[f] + X[r, f]`.
        column_indices = (X + indices[:-1]).ravel()
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)
        data = np.ones(n_samples * n_features)
        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()

        if n_values_is_auto:
            # Keep only the columns of values that actually occur in X.
            mask = np.array(out.sum(axis=0)).ravel() != 0
            active_features = np.where(mask)[0]
            out = out[:, active_features]
            self._active_features_ = active_features

            self.categories_ = [
                np.unique(X[:, i]).astype(dtype) if dtype else np.unique(X[:, i])
                for i in range(n_features)]

        return out if self.sparse else out.toarray()

    def fit_transform(self, X, y=None):
        """Fit OneHotEncoder to X, then transform X.

        Equivalent to self.fit(X).transform(X), but more convenient and more
        efficient. See fit for the parameters, transform for the return value.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            Input array of type int.
        """
        self._validate_keywords()
        self._handle_deprecations(X)

        if self._legacy_mode:
            return _transform_selected(X, self._legacy_fit_transform,
                                       self._deprecated_categorical_features,
                                       copy=True)
        return self.fit(X).transform(X)

    def _legacy_transform(self, X):
        """Assumes X contains only categorical features."""
        self_n_values = self._deprecated_n_values
        X = check_array(X, dtype=int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape

        indices = self._feature_indices_
        if n_features != indices.shape[0] - 1:
            raise ValueError("X has different shape than during fitting."
                             " Expected %d, got %d."
                             % (indices.shape[0] - 1, n_features))

        # We use only those categorical features of X that are known using fit.
        # i.e lesser than n_values_ using mask.
        # This means, if self.handle_unknown is "ignore", the row_indices and
        # col_indices corresponding to the unknown categorical feature are
        # ignored.
        mask = (X < self._n_values_).ravel()
        if np.any(~mask):
            self._validate_keywords()
            if self.handle_unknown == 'error':
                raise ValueError("unknown categorical feature present %s "
                                 "during transform." % X.ravel()[~mask])

        column_indices = (X + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(np.sum(mask))
        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if (isinstance(self_n_values, six.string_types) and
                self_n_values == 'auto'):
            out = out[:, self._active_features_]

        return out if self.sparse else out.toarray()

    def _transform_new(self, X):
        """New implementation assuming categorical input"""
        # `_transform` validates X itself, so X is passed through unchanged.
        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
        n_samples, n_features = X_int.shape

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        feature_indices = np.cumsum(n_values)

        # Build the CSR arrays directly; entries flagged as unknown in
        # `X_mask` are dropped, leaving their block of columns all zero.
        indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = X_mask.sum(axis=1).cumsum()
        indptr = np.insert(indptr, 0, 0)
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csr_matrix((data, indices, indptr),
                                shape=(n_samples, feature_indices[-1]),
                                dtype=self.dtype)
        return out if self.sparse else out.toarray()

    def transform(self, X):
        """Transform X using one-hot encoding.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix if sparse=True else a 2-d array
            Transformed input.
        """
        # Fail with a clear "not fitted" error instead of a missing
        # `_legacy_mode` attribute.
        check_is_fitted(self, 'categories_')
        if self._legacy_mode:
            return _transform_selected(X, self._legacy_transform,
                                       self._deprecated_categorical_features,
                                       copy=True)
        return self._transform_new(X)

    def inverse_transform(self, X):
        """Convert back the data to the original representation.

        In case unknown categories are encountered (all zero's in the
        one-hot encoding), ``None`` is used to represent this category.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        Raises
        ------
        ValueError
            If the number of columns of X differs from the total number of
            categories.
        """
        # NOTE: legacy mode is not rejected here.
        check_is_fitted(self, 'categories_')
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)
        n_transformed_features = sum(len(cats) for cats in self.categories_)

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if X.shape[1] != n_transformed_features:
            raise ValueError(msg.format(n_transformed_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        j = 0
        found_unknown = {}

        for i in range(n_features):
            n_categories = len(self.categories_[i])
            sub = X[:, j:j + n_categories]

            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(_argmax(sub, axis=1)).flatten()
            X_tr[:, i] = self.categories_[i][labels]

            if self.handle_unknown == 'ignore':
                # ignored unknown categories: we have a row of all zero's
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                if unknown.any():
                    found_unknown[i] = unknown

            j += n_categories

        # if ignored are found: potentially need to upcast result to
        # insert None values
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        return X_tr
class OrdinalEncoder(_BaseEncoder):
    """Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    Parameters
    ----------
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories must be sorted and should not mix
          strings and numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default np.float64
        Desired dtype of output.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order corresponding with output of ``transform``).

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    ... # doctest: +ELLIPSIS
    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[ 0.,  2.],
           [ 1.,  0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)

    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      categorical features.
    sklearn.preprocessing.LabelEncoder : encodes target labels with values
      between 0 and n_classes-1.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, categories='auto', dtype=np.float64):
        self.categories = categories
        self.dtype = dtype

    def fit(self, X, y=None):
        """Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        Returns
        -------
        self
        """
        self._fit(X)
        return self

    def transform(self, X):
        """Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : 2-d array of dtype ``self.dtype``
            Transformed input.
        """
        # Fail with a clear "not fitted" error instead of a missing
        # `categories_` attribute inside `_transform`.
        check_is_fitted(self, 'categories_')
        X_int, _ = self._transform(X)
        return X_int.astype(self.dtype, copy=False)

    def inverse_transform(self, X):
        """Convert back the data to the original representation.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        Raises
        ------
        ValueError
            If the number of columns of X differs from the number of fitted
            features.
        """
        check_is_fitted(self, 'categories_')
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        for i in range(n_features):
            # The codes are positions into the fitted categories.
            labels = X[:, i].astype('int64')
            X_tr[:, i] = self.categories_[i][labels]

        return X_tr