diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index c2a34ff..fbc5746 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -2302,6 +2302,13 @@ "outlier_pred" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you wanted to drop outliers, you would run the following code:" + ] + }, { "cell_type": "code", "execution_count": 61, @@ -3467,20 +3474,11 @@ "cell_type": "code", "execution_count": 104, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Monkey-patching SimpleImputer.get_feature_names_out()\n", - "Monkey-patching FunctionTransformer.get_feature_names_out()\n" - ] - } - ], + "outputs": [], "source": [ "def monkey_patch_get_signature_names_out():\n", " \"\"\"Monkey patch some classes which did not handle get_feature_names_out()\n", - " correctly in 1.0.0.\"\"\"\n", + " correctly in Scikit-Learn 1.0.*.\"\"\"\n", " from inspect import Signature, signature, Parameter\n", " import pandas as pd\n", " from sklearn.impute import SimpleImputer\n", @@ -3508,12 +3506,10 @@ " Parameter(\"feature_names_out\", Parameter.KEYWORD_ONLY)])\n", "\n", " def get_feature_names_out(self, names=None):\n", - " if self.feature_names_out is None:\n", - " return default_get_feature_names_out(self, names)\n", - " elif callable(self.feature_names_out):\n", - " return self.feature_names_out(names)\n", - " else:\n", - " return self.feature_names_out\n", + " if callable(self.feature_names_out):\n", + " return self.feature_names_out(self, names)\n", + " assert self.feature_names_out == \"one-to-one\"\n", + " return default_get_feature_names_out(self, names)\n", "\n", " FunctionTransformer.__init__ = __init__\n", " FunctionTransformer.get_feature_names_out = get_feature_names_out\n", @@ -3896,28 +3892,28 @@ "def column_ratio(X):\n", " return X[:, [0]] / X[:, [1]]\n", "\n", - "def ratio_pipeline(name=None):\n", + "def ratio_name(function_transformer, feature_names_in):\n", + " return [\"ratio\"] # feature names out\n", + "\n", + "def ratio_pipeline():\n", " return make_pipeline(\n", " SimpleImputer(strategy=\"median\"),\n", - " FunctionTransformer(column_ratio,\n", - " feature_names_out=[name]),\n", + " FunctionTransformer(column_ratio, feature_names_out=ratio_name),\n", " StandardScaler())\n", "\n", - "log_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n", - " FunctionTransformer(np.log),\n", - " StandardScaler())\n", + "log_pipeline = make_pipeline(\n", + " SimpleImputer(strategy=\"median\"),\n", + " FunctionTransformer(np.log, feature_names_out=\"one-to-one\"),\n", + " StandardScaler())\n", "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n", "default_num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n", " StandardScaler())\n", "preprocessing = ColumnTransformer([\n", - " (\"bedrooms_ratio\", ratio_pipeline(\"bedrooms_ratio\"),\n", - " [\"total_bedrooms\", \"total_rooms\"]),\n", - " (\"rooms_per_house\", ratio_pipeline(\"rooms_per_house\"),\n", - " [\"total_rooms\", \"households\"]),\n", - " (\"people_per_house\", ratio_pipeline(\"people_per_house\"),\n", - " [\"population\", \"households\"]),\n", - " (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\",\n", - " \"population\", \"households\", \"median_income\"]),\n", + " (\"bedrooms\", ratio_pipeline(), [\"total_bedrooms\", \"total_rooms\"]),\n", + " (\"rooms_per_house\", ratio_pipeline(), [\"total_rooms\", \"households\"]),\n", + " (\"people_per_house\", ratio_pipeline(), [\"population\", \"households\"]),\n", + " (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\", \"population\",\n", + " \"households\", \"median_income\"]),\n", " (\"geo\", cluster_simil, [\"latitude\", \"longitude\"]),\n", " (\"cat\", cat_pipeline, make_column_selector(dtype_include=object)),\n", " ],\n", @@ -3953,9 +3949,8 @@ { "data": { "text/plain": [ - "array(['bedrooms_ratio__bedrooms_ratio',\n", - " 'rooms_per_house__rooms_per_house',\n", - " 'people_per_house__people_per_house', 'log__total_bedrooms',\n", + "array(['bedrooms__ratio', 'rooms_per_house__ratio',\n", + " 'people_per_house__ratio', 'log__total_bedrooms',\n", " 'log__total_rooms', 'log__population', 'log__households',\n", " 'log__median_income', 'geo__Cluster 0 similarity',\n", " 'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',\n", @@ -4004,12 +3999,12 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n", - " f...\n", + " FunctionTransformer(feature_names_out=)])),\n", + " )])),\n", " ('linearregression', LinearRegression())])" ] }, @@ -4146,12 +4141,12 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n", - " f...\n", + " FunctionTransformer(feature_names_out=)])),\n", + " )])),\n", " ('decisiontreeregressor',\n", " DecisionTreeRegressor(random_state=42))])" ] @@ -4399,12 +4394,12 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_names_...\n", - " )])),\n", + " FunctionTransformer(feature_names_out=)])),\n", " ('random_forest',\n", " RandomForestRegressor(random_state=42))]),\n", " param_grid=[{'preprocessing__geo__n_clusters': [5, 8, 10],\n", @@ -4502,12 +4497,11 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n", - " func=...\n", + " FunctionTransformer(feature_names_out=)])),\n", + " )])),\n", " ('random_forest',\n", " RandomForestRegressor(max_features=6, random_state=42))])" ] @@ -4657,13 +4651,6 @@ "## Randomized Search" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning:** the following cell may take a few minutes to run:" - ] - }, { "cell_type": "code", "execution_count": 137, @@ -4681,6 +4668,13 @@ "Try 30 (`n_iter` × `cv`) random combinations of hyperparameters:" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning:** the following cell may take a few minutes to run:" + ] + }, { "cell_type": "code", "execution_count": 138, @@ -4695,16 +4689,16 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_...\n", - " )])),\n", + " FunctionTransformer(feature_names_...\n", + " )])),\n", " ('random_forest',\n", " RandomForestRegressor(random_state=42))]),\n", - " param_distributions={'preprocessing__geo__n_clusters': ,\n", - " 'random_forest__max_features': },\n", + " param_distributions={'preprocessing__geo__n_clusters': ,\n", + " 'random_forest__max_features': },\n", " random_state=42, scoring='neg_root_mean_squared_error')" ] }, @@ -5046,9 +5040,9 @@ "text/plain": [ "[(0.18694559869103852, 'log__median_income'),\n", " (0.0748194905715524, 'cat__ocean_proximity_INLAND'),\n", - " (0.06926417748515576, 'bedrooms_ratio__bedrooms_ratio'),\n", - " (0.05446998753775219, 'rooms_per_house__rooms_per_house'),\n", - " (0.05262301809680712, 'people_per_house__people_per_house'),\n", + " (0.06926417748515576, 'bedrooms__ratio'),\n", + " (0.05446998753775219, 'rooms_per_house__ratio'),\n", + " (0.05262301809680712, 'people_per_house__ratio'),\n", " (0.03819415873915732, 'geo__Cluster 0 similarity'),\n", " (0.02879263999929514, 'geo__Cluster 28 similarity'),\n", " (0.023530192521380392, 'geo__Cluster 24 similarity'),\n", @@ -5333,7 +5327,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Also works with pickle, but joblib is more efficient." + "You could use pickle instead, but joblib is more efficient." ] }, { @@ -5371,12 +5365,12 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_names_...\n", - " )])),\n", + " FunctionTransformer(feature_names_out=)])),\n", " ('svr', SVR())]),\n", " param_grid=[{'svr__C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0,\n", " 10000.0, 30000.0],\n", @@ -5508,16 +5502,16 @@ " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", - " transformers=[('bedrooms_ratio',\n", + " transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_...\n", - " )])),\n", + " FunctionTransformer(feature_names_...\n", + " )])),\n", " ('svr', SVR())]),\n", " n_iter=50,\n", - " param_distributions={'svr__C': ,\n", - " 'svr__gamma': ,\n", + " param_distributions={'svr__C': ,\n", + " 'svr__gamma': ,\n", " 'svr__kernel': ['linear', 'rbf']},\n", " random_state=42, scoring='neg_root_mean_squared_error')" ] @@ -5970,20 +5964,19 @@ "text/plain": [ "RandomizedSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessing',\n", - " ColumnTransformer(transformers=[('bedrooms_ratio',\n", + " ColumnTransformer(transformers=[('bedrooms',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('functiontransformer',\n", - " FunctionTransformer(feature_names_out=['bedrooms_ratio'],\n", - " func=)),\n", + " FunctionTransformer(feature_names_out=,\n", + " func=)),\n", " ('standardscaler',\n", - " StandardScaler())]),\n", - " ['...\n", + " StandardScaler()...\n", " param_distributions={'preprocessing__geo__estimator__n_neighbors': range(1, 30),\n", " 'preprocessing__geo__estimator__weights': ['distance',\n", " 'uniform'],\n", - " 'svr__C': ,\n", - " 'svr__gamma': },\n", + " 'svr__C': ,\n", + " 'svr__gamma': },\n", " random_state=42, scoring='neg_root_mean_squared_error')" ] }, @@ -6186,6 +6179,7 @@ "source": [ "scaler = StandardScalerClone()\n", "X_back = scaler.inverse_transform(scaler.fit_transform(X))\n", + "\n", "assert np.allclose(X, X_back)" ] },