Fix get_feature_names_out for FunctionTransformer

main
Aurélien Geron 2022-09-23 10:06:07 +12:00
parent 0573deb5d3
commit 576cec95d9
1 changed file with 74 additions and 80 deletions

View File

@ -2302,6 +2302,13 @@
"outlier_pred"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you wanted to drop outliers, you would run the following code:"
]
},
{
"cell_type": "code",
"execution_count": 61,
@ -3467,20 +3474,11 @@
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Monkey-patching SimpleImputer.get_feature_names_out()\n",
"Monkey-patching FunctionTransformer.get_feature_names_out()\n"
]
}
],
"outputs": [],
"source": [
"def monkey_patch_get_signature_names_out():\n",
" \"\"\"Monkey patch some classes which did not handle get_feature_names_out()\n",
" correctly in 1.0.0.\"\"\"\n",
" correctly in Scikit-Learn 1.0.*.\"\"\"\n",
" from inspect import Signature, signature, Parameter\n",
" import pandas as pd\n",
" from sklearn.impute import SimpleImputer\n",
@ -3508,12 +3506,10 @@
" Parameter(\"feature_names_out\", Parameter.KEYWORD_ONLY)])\n",
"\n",
" def get_feature_names_out(self, names=None):\n",
" if self.feature_names_out is None:\n",
" if callable(self.feature_names_out):\n",
" return self.feature_names_out(self, names)\n",
" assert self.feature_names_out == \"one-to-one\"\n",
" return default_get_feature_names_out(self, names)\n",
" elif callable(self.feature_names_out):\n",
" return self.feature_names_out(names)\n",
" else:\n",
" return self.feature_names_out\n",
"\n",
" FunctionTransformer.__init__ = __init__\n",
" FunctionTransformer.get_feature_names_out = get_feature_names_out\n",
@ -3896,28 +3892,28 @@
"def column_ratio(X):\n",
" return X[:, [0]] / X[:, [1]]\n",
"\n",
"def ratio_pipeline(name=None):\n",
"def ratio_name(function_transformer, feature_names_in):\n",
" return [\"ratio\"] # feature names out\n",
"\n",
"def ratio_pipeline():\n",
" return make_pipeline(\n",
" SimpleImputer(strategy=\"median\"),\n",
" FunctionTransformer(column_ratio,\n",
" feature_names_out=[name]),\n",
" FunctionTransformer(column_ratio, feature_names_out=ratio_name),\n",
" StandardScaler())\n",
"\n",
"log_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n",
" FunctionTransformer(np.log),\n",
"log_pipeline = make_pipeline(\n",
" SimpleImputer(strategy=\"median\"),\n",
" FunctionTransformer(np.log, feature_names_out=\"one-to-one\"),\n",
" StandardScaler())\n",
"cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n",
"default_num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n",
" StandardScaler())\n",
"preprocessing = ColumnTransformer([\n",
" (\"bedrooms_ratio\", ratio_pipeline(\"bedrooms_ratio\"),\n",
" [\"total_bedrooms\", \"total_rooms\"]),\n",
" (\"rooms_per_house\", ratio_pipeline(\"rooms_per_house\"),\n",
" [\"total_rooms\", \"households\"]),\n",
" (\"people_per_house\", ratio_pipeline(\"people_per_house\"),\n",
" [\"population\", \"households\"]),\n",
" (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\",\n",
" \"population\", \"households\", \"median_income\"]),\n",
" (\"bedrooms\", ratio_pipeline(), [\"total_bedrooms\", \"total_rooms\"]),\n",
" (\"rooms_per_house\", ratio_pipeline(), [\"total_rooms\", \"households\"]),\n",
" (\"people_per_house\", ratio_pipeline(), [\"population\", \"households\"]),\n",
" (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\", \"population\",\n",
" \"households\", \"median_income\"]),\n",
" (\"geo\", cluster_simil, [\"latitude\", \"longitude\"]),\n",
" (\"cat\", cat_pipeline, make_column_selector(dtype_include=object)),\n",
" ],\n",
@ -3953,9 +3949,8 @@
{
"data": {
"text/plain": [
"array(['bedrooms_ratio__bedrooms_ratio',\n",
" 'rooms_per_house__rooms_per_house',\n",
" 'people_per_house__people_per_house', 'log__total_bedrooms',\n",
"array(['bedrooms__ratio', 'rooms_per_house__ratio',\n",
" 'people_per_house__ratio', 'log__total_bedrooms',\n",
" 'log__total_rooms', 'log__population', 'log__households',\n",
" 'log__median_income', 'geo__Cluster 0 similarity',\n",
" 'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',\n",
@ -4004,12 +3999,12 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
" f...\n",
" FunctionTransformer(feature_names_out=<function ratio_name at 0x1a5...\n",
" 'households',\n",
" 'median_income']),\n",
" ('geo',\n",
" ClusterSimilarity(random_state=42),\n",
@ -4019,7 +4014,7 @@
" SimpleImputer(strategy='most_frequent')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore'))]),\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a57e3a00>)])),\n",
" ('linearregression', LinearRegression())])"
]
},
@ -4146,12 +4141,12 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
" f...\n",
" FunctionTransformer(feature_names_out=<function ratio_name at 0x1a5...\n",
" ('geo',\n",
" ClusterSimilarity(random_state=42),\n",
" ['latitude', 'longitude']),\n",
" ('cat',\n",
@ -4159,7 +4154,7 @@
" SimpleImputer(strategy='most_frequent')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore'))]),\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a57e3a00>)])),\n",
" ('decisiontreeregressor',\n",
" DecisionTreeRegressor(random_state=42))])"
]
@ -4399,12 +4394,12 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_names_...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
" FunctionTransformer(feature_names_out=<f...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a57e3a00>)])),\n",
" ('random_forest',\n",
" RandomForestRegressor(random_state=42))]),\n",
" param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],\n",
@ -4502,12 +4497,11 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
" func=...\n",
" FunctionTransformer(feature_names_out=<function ratio_name at 0x1a5b6fd...\n",
" ClusterSimilarity(n_clusters=15,\n",
" random_state=42),\n",
" ['latitude', 'longitude']),\n",
@ -4516,7 +4510,7 @@
" SimpleImputer(strategy='most_frequent')),\n",
" ('onehotencoder',\n",
" OneHotEncoder(handle_unknown='ignore'))]),\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b410ec490>)])),\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a5cdffd0>)])),\n",
" ('random_forest',\n",
" RandomForestRegressor(max_features=6, random_state=42))])"
]
@ -4657,13 +4651,6 @@
"## Randomized Search"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning:** the following cell may take a few minutes to run:"
]
},
{
"cell_type": "code",
"execution_count": 137,
@ -4681,6 +4668,13 @@
"Run 30 training rounds (`n_iter` × `cv`) over randomly sampled hyperparameter combinations:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning:** the following cell may take a few minutes to run:"
]
},
{
"cell_type": "code",
"execution_count": 138,
@ -4695,16 +4689,16 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
" FunctionTransformer(feature_names_...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a57e3a00>)])),\n",
" ('random_forest',\n",
" RandomForestRegressor(random_state=42))]),\n",
" param_distributions={'preprocessing__geo__n_clusters': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b103bb760>,\n",
" 'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b410decd0>},\n",
" param_distributions={'preprocessing__geo__n_clusters': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x1a4bfcb20>,\n",
" 'random_forest__max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x1a57c7bb0>},\n",
" random_state=42, scoring='neg_root_mean_squared_error')"
]
},
@ -5046,9 +5040,9 @@
"text/plain": [
"[(0.18694559869103852, 'log__median_income'),\n",
" (0.0748194905715524, 'cat__ocean_proximity_INLAND'),\n",
" (0.06926417748515576, 'bedrooms_ratio__bedrooms_ratio'),\n",
" (0.05446998753775219, 'rooms_per_house__rooms_per_house'),\n",
" (0.05262301809680712, 'people_per_house__people_per_house'),\n",
" (0.06926417748515576, 'bedrooms__ratio'),\n",
" (0.05446998753775219, 'rooms_per_house__ratio'),\n",
" (0.05262301809680712, 'people_per_house__ratio'),\n",
" (0.03819415873915732, 'geo__Cluster 0 similarity'),\n",
" (0.02879263999929514, 'geo__Cluster 28 similarity'),\n",
" (0.023530192521380392, 'geo__Cluster 24 similarity'),\n",
@ -5333,7 +5327,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Also works with pickle, but joblib is more efficient."
"You could use pickle instead, but joblib is more efficient."
]
},
{
@ -5371,12 +5365,12 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_names_...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
" FunctionTransformer(feature_names_out=<f...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a57e3a00>)])),\n",
" ('svr', SVR())]),\n",
" param_grid=[{'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,\n",
" 10000.0, 30000.0],\n",
@ -5508,16 +5502,16 @@
" SimpleImputer(strategy='median')),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" transformers=[('bedrooms_ratio',\n",
" transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x7f9b50613dc0>)])),\n",
" FunctionTransformer(feature_names_...\n",
" <sklearn.compose._column_transformer.make_column_selector object at 0x1a57e3a00>)])),\n",
" ('svr', SVR())]),\n",
" n_iter=50,\n",
" param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9ae254b9d0>,\n",
" 'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9b734dbe50>,\n",
" param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x1a5d4c3a0>,\n",
" 'svr__gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x1a5d9ca00>,\n",
" 'svr__kernel': ['linear', 'rbf']},\n",
" random_state=42, scoring='neg_root_mean_squared_error')"
]
@ -5970,20 +5964,19 @@
"text/plain": [
"RandomizedSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[('preprocessing',\n",
" ColumnTransformer(transformers=[('bedrooms_ratio',\n",
" ColumnTransformer(transformers=[('bedrooms',\n",
" Pipeline(steps=[('simpleimputer',\n",
" SimpleImputer(strategy='median')),\n",
" ('functiontransformer',\n",
" FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n",
" func=<function column_ratio at 0x7f9b505e5670>)),\n",
" FunctionTransformer(feature_names_out=<function ratio_name at 0x1a5b6fd90>,\n",
" func=<function column_ratio at 0x1a5695bd0>)),\n",
" ('standardscaler',\n",
" StandardScaler())]),\n",
" ['...\n",
" StandardScaler()...\n",
" param_distributions={'preprocessing__geo__estimator__n_neighbors': range(1, 30),\n",
" 'preprocessing__geo__estimator__weights': ['distance',\n",
" 'uniform'],\n",
" 'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9acb940bb0>,\n",
" 'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9acb940a30>},\n",
" 'svr__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x1a63fda80>,\n",
" 'svr__gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x1a63fe410>},\n",
" random_state=42, scoring='neg_root_mean_squared_error')"
]
},
@ -6186,6 +6179,7 @@
"source": [
"scaler = StandardScalerClone()\n",
"X_back = scaler.inverse_transform(scaler.fit_transform(X))\n",
"\n",
"assert np.allclose(X, X_back)"
]
},