# Authors: Andreas Mueller
#          Joris Van den Bossche
# License: BSD 3 clause

from __future__ import division

import numbers
import warnings

import numpy as np
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import six
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, FLOAT_DTYPES
from sklearn.preprocessing.label import LabelEncoder


BOUNDS_THRESHOLD = 1e-7

zip = six.moves.zip
map = six.moves.map
range = six.moves.range

__all__ = [
    'OneHotEncoder',
    'OrdinalEncoder'
]


def _argmax(arr_or_spmatrix, axis=None):
    """Return ``argmax`` of a dense array or sparse matrix along ``axis``."""
    return arr_or_spmatrix.argmax(axis=axis)


def _handle_zeros_in_scale(scale, copy=True):
    """Make sure that whenever scale is zero, we handle it correctly.

    This happens in most scalers when we have constant features.

    Parameters
    ----------
    scale : scalar or ndarray
        Scale value(s); zeros are replaced by one.

    copy : boolean, optional
        If True, an ndarray ``scale`` is copied before being modified.

    Returns
    -------
    scale : scalar or ndarray
        The scale with zeros replaced by one. Inputs that are neither
        scalars nor ndarrays yield ``None``.
    """
    # if we are fitting on 1D arrays, scale might be a scalar
    if np.isscalar(scale):
        if scale == .0:
            scale = 1.
        return scale
    elif isinstance(scale, np.ndarray):
        if copy:
            # New array to avoid side-effects
            scale = scale.copy()
        scale[scale == 0.0] = 1.0
        return scale


def _transform_selected(X, transform, selected="all", copy=True):
    """Apply a transform function to portion of selected features

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape [n_samples, n_features]
        Dense array or sparse matrix.

    transform : callable
        A callable transform(X) -> X_transformed

    selected : "all" or array of indices or mask
        Specify which features to apply the transform to.

    copy : boolean, optional
        Copy X even if it could be avoided.

    Returns
    -------
    X : array or sparse matrix, shape=(n_samples, n_features_new)
    """
    X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES)

    if isinstance(selected, six.string_types) and selected == "all":
        return transform(X)

    if len(selected) == 0:
        return X

    n_features = X.shape[1]
    ind = np.arange(n_features)
    sel = np.zeros(n_features, dtype=bool)
    sel[np.asarray(selected)] = True
    not_sel = np.logical_not(sel)
    n_selected = np.sum(sel)

    if n_selected == 0:
        # No features selected.
        return X
    elif n_selected == n_features:
        # All features selected.
        return transform(X)
    else:
        # Transformed columns come first, untouched columns are stacked
        # to the right.
        X_sel = transform(X[:, ind[sel]])
        X_not_sel = X[:, ind[not_sel]]

        if sparse.issparse(X_sel) or sparse.issparse(X_not_sel):
            return sparse.hstack((X_sel, X_not_sel))
        else:
            return np.hstack((X_sel, X_not_sel))


class _BaseEncoder(BaseEstimator, TransformerMixin):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.

    """

    def _check_X(self, X):
        """Validate X, keeping string data given as lists as object dtype.

        Input without a ``dtype`` attribute (e.g. nested lists) that
        ``check_array`` turns into a string array is re-validated with
        ``dtype=object``; everything else keeps the dtype chosen by
        ``check_array(X, dtype=None)``.
        """
        X_temp = check_array(X, dtype=None)
        if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_):
            return check_array(X, dtype=object)
        return X_temp

    def _fit(self, X, handle_unknown='error'):
        """Learn the categories of every column of X.

        Sets ``_label_encoders_`` (one ``LabelEncoder`` per feature) and
        ``categories_``.

        Raises
        ------
        ValueError
            If user-specified categories are unsorted, their number does
            not match the number of features, or (with
            ``handle_unknown='error'``) X contains values outside them.
        """
        X = self._check_X(X)

        n_samples, n_features = X.shape

        if self.categories != 'auto':
            for cats in self.categories:
                if not np.all(np.sort(cats) == np.array(cats)):
                    raise ValueError("Unsorted categories are not yet "
                                     "supported")
            if len(self.categories) != n_features:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                if handle_unknown == 'error':
                    valid_mask = np.in1d(Xi, self.categories[i])
                    if not np.all(valid_mask):
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # Categories were given by the user: install them directly
                # instead of learning them from the data.
                le.classes_ = np.array(self.categories[i])

        self.categories_ = [le.classes_ for le in self._label_encoders_]

    def _transform(self, X, handle_unknown='error'):
        """Encode X as integer codes.

        Returns
        -------
        X_int : ndarray of int, same shape as X
            Index of each value in the corresponding ``categories_[i]``.

        X_mask : ndarray of bool, same shape as X
            False where the value was an unknown category (only possible
            when ``handle_unknown`` is not ``'error'``).

        Raises
        ------
        ValueError
            If an unknown category is found and ``handle_unknown='error'``.
        """
        X = self._check_X(X)

        _, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            Xi = X[:, i]
            valid_mask = np.in1d(Xi, self.categories_[i])

            if not np.all(valid_mask):
                if handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    Xi = Xi.copy()
                    Xi[~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(Xi)

        return X_int, X_mask


WARNING_MSG = (
    "The handling of integer data will change in the future. Currently, the "
    "categories are determined based on the range [0, max(values)], while "
    "in the future they will be determined based on the unique values.\n"
    "If you want the future behaviour, you can specify \"categories='auto'\"."
)


class OneHotEncoder(_BaseEncoder):
    """Encode categorical integer features as a one-hot numeric array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
    encoding scheme. This creates a binary column for each category and
    returns a sparse matrix or dense array.

    By default, the encoder derives the categories based on the unique values
    in each feature. Alternatively, you can also specify the `categories`
    manually.
    The OneHotEncoder previously assumed that the input features take on
    values in the range [0, max(values)). This behaviour is deprecated.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Note: a one-hot encoding of y labels should use a LabelBinarizer
    instead.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    Parameters
    ----------
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories must be sorted and should not mix
          strings and numeric values.

        The used categories can be found in the ``categories_`` attribute.

    sparse : boolean, default=True
        Will return sparse matrix if set True else will return an array.

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : 'error' (default) or 'ignore'
        Whether to raise an error or ignore if a unknown categorical feature
        is present during transform (default is to raise). When this parameter
        is set to 'ignore' and an unknown category is encountered during
        transform, the resulting one-hot encoded columns for this feature
        will be all zeros. In the inverse transform, an unknown category
        will be denoted as None.

    n_values : 'auto', int or array of ints
        Number of values per feature.

        - 'auto' : determine value range from training data.
        - int : number of categorical values per feature.
                Each feature value should be in ``range(n_values)``
        - array : ``n_values[i]`` is the number of categorical values in
                  ``X[:, i]``. Each feature value should be
                  in ``range(n_values[i])``

        .. deprecated:: 0.20
            The `n_values` keyword is deprecated and will be removed in 0.22.
            Use `categories` instead.

    categorical_features : "all" or array of indices or mask
        Specify what features are treated as categorical.

        - 'all' (default): All features are treated as categorical.
        - array of indices: Array of categorical feature indices.
        - mask: Array of length n_features and with dtype=bool.

        Non-categorical features are always stacked to the right of the matrix.

        .. deprecated:: 0.20
            The `categorical_features` keyword is deprecated and will be
            removed in 0.22.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order corresponding with output of ``transform``).

    active_features_ : array
        Indices for active features, meaning values that actually occur
        in the training set. Only available when n_values is ``'auto'``.

        .. deprecated:: 0.20

    feature_indices_ : array of shape (n_features,)
        Indices to feature ranges.
        Feature ``i`` in the original data is mapped to features
        from ``feature_indices_[i]`` to ``feature_indices_[i+1]``
        (and then potentially masked by `active_features_` afterwards)

        .. deprecated:: 0.20

    n_values_ : array of shape (n_features,)
        Maximum number of values per feature.

        .. deprecated:: 0.20

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder
    >>> enc = OneHotEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    ... # doctest: +ELLIPSIS
    OneHotEncoder(categories='auto', dtype=<... 'numpy.float64'>,
           handle_unknown='ignore', sparse=True)
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[ 1.,  0.,  1.,  0.,  0.],
           [ 0.,  1.,  0.,  0.,  0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)

    See also
    --------
    sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer)
      encoding of the categorical features.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all
      fashion.
    sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of
      iterables and a multilabel format, e.g. a (samples x classes) binary
      matrix indicating the presence of a class label.
    """

    def __init__(self, n_values=None, categorical_features=None,
                 categories=None, sparse=True, dtype=np.float64,
                 handle_unknown='error'):
        # Remember whether the user passed `categories` explicitly; this
        # decides between the new and the legacy code path at fit time.
        self._categories = categories
        self.categories = 'auto' if categories is None else categories
        self.sparse = sparse
        self.dtype = dtype
        self.handle_unknown = handle_unknown

        # The deprecated keywords are stored under private names; the
        # public names are properties that emit a DeprecationWarning.
        self._deprecated_n_values = "auto" if n_values is None else n_values
        self._deprecated_categorical_features = (
            "all" if categorical_features is None else categorical_features)

    # Deprecated keywords

    @property
    def n_values(self):
        warnings.warn("The 'n_values' parameter is deprecated.",
                      DeprecationWarning)
        return self._deprecated_n_values

    @n_values.setter
    def n_values(self, value):
        warnings.warn("The 'n_values' parameter is deprecated.",
                      DeprecationWarning)
        self._deprecated_n_values = value

    @property
    def categorical_features(self):
        warnings.warn("The 'categorical_features' parameter is deprecated.",
                      DeprecationWarning)
        return self._deprecated_categorical_features

    @categorical_features.setter
    def categorical_features(self, value):
        warnings.warn("The 'categorical_features' parameter is deprecated.",
                      DeprecationWarning)
        self._deprecated_categorical_features = value

    # Deprecated attributes

    @property
    def active_features_(self):
        check_is_fitted(self, 'categories_')
        warnings.warn("The 'active_features_' attribute is deprecated.",
                      DeprecationWarning)
        return self._active_features_

    @property
    def feature_indices_(self):
        check_is_fitted(self, 'categories_')
        warnings.warn("The 'feature_indices_' attribute is deprecated.",
                      DeprecationWarning)
        return self._feature_indices_

    @property
    def n_values_(self):
        check_is_fitted(self, 'categories_')
        warnings.warn("The 'n_values_' attribute is deprecated.",
                      DeprecationWarning)
        return self._n_values_

    def _handle_deprecations(self, X):
        """Decide between the new and the legacy code path.

        Sets ``self._legacy_mode``. When ``n_values`` was passed, it is
        translated into ``self.categories`` and the deprecated
        ``_n_values_`` / ``_feature_indices_`` attributes are filled in.

        Raises
        ------
        TypeError
            If ``n_values`` is neither 'auto', an int nor an array of ints.
        ValueError
            If both ``categories`` and ``categorical_features`` are given.
        """
        user_set_categories = False
        deprecated_n_values = self._deprecated_n_values
        # Explicit string check: comparing an array-like `n_values`
        # against 'auto' with `!=` is not a reliable scalar test.
        n_values_is_auto = (isinstance(deprecated_n_values, six.string_types)
                            and deprecated_n_values == 'auto')

        if self._categories is not None:
            self._legacy_mode = False
            user_set_categories = True

        elif not n_values_is_auto:
            msg = (
                "Passing 'n_values' is deprecated and will be removed in a "
                "future release. You can use the 'categories' keyword instead."
                " 'n_values=n' corresponds to 'categories=[range(n)]'.")
            warnings.warn(msg, DeprecationWarning)

            # we internally translate this to the correct categories
            # and don't use legacy mode
            X = check_array(X, dtype=int)

            if isinstance(deprecated_n_values, numbers.Integral):
                n_features = X.shape[1]
                self.categories = [
                    list(range(deprecated_n_values))
                    for _ in range(n_features)]
                n_values = np.empty(n_features, dtype=int)
                n_values.fill(deprecated_n_values)
            else:
                try:
                    n_values = np.asarray(deprecated_n_values, dtype=int)
                    self.categories = [list(range(i))
                                       for i in deprecated_n_values]
                except (ValueError, TypeError):
                    raise TypeError(
                        "Wrong type for parameter `n_values`. Expected 'auto',"
                        " int or array of ints, got %r"
                        % type(deprecated_n_values))

            self._n_values_ = n_values
            n_values = np.hstack([[0], n_values])
            indices = np.cumsum(n_values)
            self._feature_indices_ = indices

            self._legacy_mode = False

        else:  # n_values = 'auto'
            if self.handle_unknown == 'ignore':
                # no change in behaviour, no need to raise deprecation warning
                self._legacy_mode = False
            else:
                # check if we have integer or categorical input
                try:
                    X = check_array(X, dtype=int)
                except ValueError:
                    self._legacy_mode = False
                else:
                    warnings.warn(WARNING_MSG, DeprecationWarning)
                    self._legacy_mode = True

        categorical_features = self._deprecated_categorical_features
        if not (isinstance(categorical_features, six.string_types)
                and categorical_features == 'all'):
            if user_set_categories:
                raise ValueError(
                    "The 'categorical_features' keyword is deprecated, and "
                    "cannot be used together with specifying 'categories'.")
            warnings.warn("The 'categorical_features' keyword is deprecated.",
                          DeprecationWarning)
            # Selecting a subset of columns is only implemented by the
            # legacy code path.
            self._legacy_mode = True

    def fit(self, X, y=None):
        """Fit OneHotEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.

        Returns
        -------
        self
        """
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        self._handle_deprecations(X)

        if self._legacy_mode:
            # Fit only on the selected categorical columns, exactly like
            # `fit_transform` does, so that a later `transform` (which also
            # goes through `_transform_selected`) sees matching shapes.
            _transform_selected(X, self._legacy_fit_transform,
                                self._deprecated_categorical_features,
                                copy=True)
            return self
        else:
            self._fit(X, handle_unknown=self.handle_unknown)
            return self

    def _legacy_fit_transform(self, X):
        """Assumes X contains only categorical features."""
        self_n_values = self._deprecated_n_values
        dtype = getattr(X, 'dtype', None)
        X = check_array(X, dtype=int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape
        if (isinstance(self_n_values, six.string_types) and
                self_n_values == 'auto'):
            n_values = np.max(X, axis=0) + 1
        elif isinstance(self_n_values, numbers.Integral):
            if (np.max(X, axis=0) >= self_n_values).any():
                raise ValueError("Feature out of bounds for n_values=%d"
                                 % self_n_values)
            n_values = np.empty(n_features, dtype=int)
            n_values.fill(self_n_values)
        else:
            try:
                n_values = np.asarray(self_n_values, dtype=int)
            except (ValueError, TypeError):
                raise TypeError("Wrong type for parameter `n_values`. Expected"
                                " 'auto', int or array of ints, got %r"
                                % type(X))
            if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]:
                raise ValueError("Shape mismatch: if n_values is an array,"
                                 " it has to be of shape (n_features,).")

        self._n_values_ = n_values
        # Feature i takes values in range(n_values[i]), i.e. exactly
        # n_values[i] categories.
        self.categories_ = [np.arange(n_val, dtype=dtype)
                            for n_val in n_values]
        n_values = np.hstack([[0], n_values])
        indices = np.cumsum(n_values)
        self._feature_indices_ = indices

        column_indices = (X + indices[:-1]).ravel()
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)
        data = np.ones(n_samples * n_features)
        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()

        if (isinstance(self_n_values, six.string_types) and
                self_n_values == 'auto'):
            # Keep only the columns of values that actually occur in X.
            mask = np.array(out.sum(axis=0)).ravel() != 0
            active_features = np.where(mask)[0]
            out = out[:, active_features]
            self._active_features_ = active_features

            self.categories_ = [
                np.unique(X[:, i]).astype(dtype) if dtype
                else np.unique(X[:, i]) for i in range(n_features)]

        return out if self.sparse else out.toarray()

    def fit_transform(self, X, y=None):
        """Fit OneHotEncoder to X, then transform X.

        Equivalent to self.fit(X).transform(X), but more convenient and more
        efficient. See fit for the parameters, transform for the return value.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            Input array of type int.
        """
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        self._handle_deprecations(X)

        if self._legacy_mode:
            return _transform_selected(X, self._legacy_fit_transform,
                                       self._deprecated_categorical_features,
                                       copy=True)
        else:
            return self.fit(X).transform(X)

    def _legacy_transform(self, X):
        """Assumes X contains only categorical features."""
        self_n_values = self._deprecated_n_values
        X = check_array(X, dtype=int)
        if np.any(X < 0):
            raise ValueError("X needs to contain only non-negative integers.")
        n_samples, n_features = X.shape

        indices = self._feature_indices_
        if n_features != indices.shape[0] - 1:
            raise ValueError("X has different shape than during fitting."
                             " Expected %d, got %d."
                             % (indices.shape[0] - 1, n_features))

        # We use only those categorical features of X that are known using fit.
        # i.e lesser than n_values_ using mask.
        # This means, if self.handle_unknown is "ignore", the row_indices and
        # col_indices corresponding to the unknown categorical feature are
        # ignored.
        mask = (X < self._n_values_).ravel()
        if np.any(~mask):
            if self.handle_unknown not in ['error', 'ignore']:
                raise ValueError("handle_unknown should be either error or "
                                 "unknown got %s" % self.handle_unknown)
            if self.handle_unknown == 'error':
                raise ValueError("unknown categorical feature present %s "
                                 "during transform." % X.ravel()[~mask])

        column_indices = (X + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(np.sum(mask))
        out = sparse.coo_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if (isinstance(self_n_values, six.string_types) and
                self_n_values == 'auto'):
            out = out[:, self._active_features_]

        return out if self.sparse else out.toarray()

    def _transform_new(self, X):
        """New implementation assuming categorical input"""
        X = self._check_X(X)

        n_samples, n_features = X.shape

        X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)

        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        feature_indices = np.cumsum(n_values)

        # Build the CSR matrix directly: each row holds one entry per
        # known category value, unknown values are dropped via `mask`.
        indices = (X_int + feature_indices[:-1]).ravel()[mask]
        indptr = X_mask.sum(axis=1).cumsum()
        indptr = np.insert(indptr, 0, 0)
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csr_matrix((data, indices, indptr),
                                shape=(n_samples, feature_indices[-1]),
                                dtype=self.dtype)
        if not self.sparse:
            return out.toarray()
        else:
            return out

    def transform(self, X):
        """Transform X using one-hot encoding.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix if sparse=True else a 2-d array
            Transformed input.
        """
        if not self._legacy_mode:
            return self._transform_new(X)
        else:
            return _transform_selected(X, self._legacy_transform,
                                       self._deprecated_categorical_features,
                                       copy=True)

    def inverse_transform(self, X):
        """Convert back the data to the original representation.

        In case unknown categories are encountered (all zero's in the
        one-hot encoding), ``None`` is used to represent this category.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        """
        check_is_fitted(self, 'categories_')
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)
        n_transformed_features = sum([len(cats) for cats in self.categories_])

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if X.shape[1] != n_transformed_features:
            raise ValueError(msg.format(n_transformed_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        j = 0
        found_unknown = {}

        for i in range(n_features):
            n_categories = len(self.categories_[i])
            sub = X[:, j:j + n_categories]

            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(_argmax(sub, axis=1)).flatten()
            X_tr[:, i] = self.categories_[i][labels]

            if self.handle_unknown == 'ignore':
                # ignored unknown categories: we have a row of all zero's
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                if unknown.any():
                    found_unknown[i] = unknown

            j += n_categories

        # if ignored are found: potentially need to upcast result to
        # insert None values
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        return X_tr


class OrdinalEncoder(_BaseEncoder):
    """Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

    Parameters
    ----------
    categories : 'auto' or a list of lists/arrays of values.
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories must be sorted and should not mix
          strings and numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default np.float64
        Desired dtype of output.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order corresponding with output of ``transform``).

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    ... # doctest: +ELLIPSIS
    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[ 0.,  2.],
           [ 1.,  0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)

    See also
    --------
    sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of
      categorical features.
    sklearn.preprocessing.LabelEncoder : encodes target labels with values
      between 0 and n_classes-1.
    sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of
      dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
      encoding of dictionary items or strings.
    """

    def __init__(self, categories='auto', dtype=np.float64):
        self.categories = categories
        self.dtype = dtype

    def fit(self, X, y=None):
        """Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        Returns
        -------
        self

        """
        self._fit(X)

        return self

    def transform(self, X):
        """Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : 2-d array
            Transformed input, cast to ``self.dtype``.

        """
        X_int, _ = self._transform(X)
        return X_int.astype(self.dtype, copy=False)

    def inverse_transform(self, X):
        """Convert back the data to the original representation.

        Parameters
        ----------
        X : array-like or sparse matrix, shape [n_samples, n_encoded_features]
            The transformed data.

        Returns
        -------
        X_tr : array-like, shape [n_samples, n_features]
            Inverse transformed array.

        """
        check_is_fitted(self, 'categories_')
        X = check_array(X, accept_sparse='csr')

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = ("Shape of the passed X data is not correct. Expected {0} "
               "columns, got {1}.")
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.find_common_type([cat.dtype for cat in self.categories_], [])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        for i in range(n_features):
            # The ordinal codes are positions in `categories_[i]`.
            labels = X[:, i].astype('int64')
            X_tr[:, i] = self.categories_[i][labels]

        return X_tr