From c658c2b07c0550303fa4643587267b7cf333c19e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?=
Date: Mon, 15 Nov 2021 17:45:26 +1300
Subject: [PATCH] Move StandardScalerClone inverse_transform and
 get_feature_names_out to exercise

---
 02_end_to_end_machine_learning_project.ipynb | 389 +++++++++++++------
 1 file changed, 267 insertions(+), 122 deletions(-)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index 9971e97..e2657fb 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -1473,27 +1473,7 @@
     "        assert self.n_features_in_ == X.shape[1]\n",
     "        if self.with_mean:\n",
     "            X = X - self.mean_\n",
-    "        return X / self.scale_\n",
-    "    \n",
-    "    # not in the book (left as an exercise):\n",
-    "    def inverse_transform(self, X):\n",
-    "        check_is_fitted(self)\n",
-    "        X = check_array(X)\n",
-    "        assert self.n_features_in_ == X.shape[1]\n",
-    "        X = X * self.scale_\n",
-    "        return X + self.mean_ if self.with_mean else X\n",
-    "    \n",
-    "    # not in the book (left as an exercise):\n",
-    "    def get_feature_names_out(self, names=None):\n",
-    "        return names or getattr(self, \"feature_names_in_\",\n",
-    "                                [f\"x{i}\" for i in range(self.n_features_in_)])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's test our custom transformer:"
+    "        return X / self.scale_"
    ]
   },
   {
@@ -1500,30 +1480,6 @@
    "cell_type": "code",
    "execution_count": 100,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# Not in the book\n",
-    "from sklearn.utils.estimator_checks import check_estimator\n",
-    " \n",
-    "check_estimator(StandardScaler())\n",
-    "X = np.random.rand(1000, 3)\n",
-    "ss = StandardScaler()\n",
-    "ssc = StandardScalerClone()\n",
-    "X_scaled1 = ss.fit_transform(X)\n",
-    "X_scaled2 = ssc.fit_transform(X)\n",
-    "X_back1 = ss.inverse_transform(X_scaled1)\n",
-    "X_back2 = ssc.inverse_transform(X_scaled2)\n",
-    "assert np.allclose(X_scaled1, X_scaled2)\n",
-    "assert np.allclose(X_back1, X_back2)\n",
-    "assert ssc.n_features_in_ == 3\n",
-    "assert not hasattr(ssc, \"features_names_in_\")\n",
-    "assert ssc.get_feature_names_out() == [\"x0\", \"x1\", \"x2\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 101,
-   "metadata": {},
-   "outputs": [],
    "source": [
     "from sklearn.cluster import KMeans\n",
@@ -1548,7 +1504,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
+   "execution_count": 101,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1559,7 +1515,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 102,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1568,7 +1524,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 103,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1610,7 +1566,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 104,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1624,7 +1580,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 105,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1635,7 +1591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 106,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1648,7 +1604,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 107,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1658,7 +1614,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 108,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1707,7 +1663,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 109,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1719,7 +1675,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1728,7 +1684,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 111,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1737,7 +1693,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 112,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1746,7 +1702,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 113,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1755,7 +1711,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 114,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1764,7 +1720,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 115,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1786,7 +1742,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 116,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1800,7 +1756,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 117,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1809,7 +1765,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 118,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1822,7 +1778,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 119,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1859,7 +1815,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 120,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1869,7 +1825,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 121,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1892,7 +1848,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 122,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1911,7 +1867,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 123,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1928,7 +1884,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 124,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1937,7 +1893,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 125,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1948,7 +1904,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 126,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1961,7 +1917,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 127,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1973,7 +1929,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 128,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1992,7 +1948,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 129,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2004,7 +1960,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 130,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2013,7 +1969,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 131,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2032,7 +1988,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 133,
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2046,7 +2002,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 133,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2062,7 +2018,7 @@
   },
   {
    "cell_type": "code",
- "execution_count": 135, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2103,7 +2059,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -2133,7 +2089,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -2150,7 +2106,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ @@ -2159,7 +2115,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2175,7 +2131,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2209,7 +2165,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 140, "metadata": {}, "outputs": [], "source": [ @@ -2226,7 +2182,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -2245,7 +2201,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -2282,7 +2238,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 143, "metadata": { "tags": [] }, @@ -2343,7 +2299,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 144, "metadata": { "tags": [] }, @@ -2406,7 +2362,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -2417,7 +2373,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -2435,7 +2391,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ @@ -2457,7 +2413,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ @@ -2479,7 +2435,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ @@ -2500,7 +2456,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ @@ -2526,7 +2482,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 151, "metadata": {}, "outputs": [], "source": [ @@ -2544,7 +2500,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ @@ -2569,7 +2525,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ @@ -2601,12 +2557,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "_Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). 
+    "Exercise: _Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best `SVR` predictor perform?_"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 155,
+   "execution_count": 154,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2636,7 +2592,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 156,
+   "execution_count": 155,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2653,7 +2609,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 157,
+   "execution_count": 156,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2678,7 +2634,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "_Try replacing the `GridSearchCV` with a `RandomizedSearchCV`._"
+    "Exercise: _Try replacing the `GridSearchCV` with a `RandomizedSearchCV`._"
    ]
   },
   {
@@ -2690,7 +2646,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 158,
+   "execution_count": 157,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2724,7 +2680,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 159,
+   "execution_count": 158,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2741,7 +2697,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 160,
+   "execution_count": 159,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2764,7 +2720,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 161,
+   "execution_count": 160,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2792,7 +2748,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "_Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes._"
+    "Exercise: _Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes._"
    ]
   },
   {
@@ -2804,7 +2760,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 162,
+   "execution_count": 161,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2822,7 +2778,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 163,
+   "execution_count": 162,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2852,7 +2808,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "_Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts._"
+    "Exercise: _Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts._"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -2864,7 +2820,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 164,
+   "execution_count": 163,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2909,7 +2865,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
+   "execution_count": 164,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2925,7 +2881,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 166,
+   "execution_count": 165,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2944,7 +2900,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 167,
+   "execution_count": 166,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2960,7 +2916,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 168,
+   "execution_count": 167,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2976,7 +2932,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 169,
+   "execution_count": 168,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2990,7 +2946,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
+   "execution_count": 169,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3020,12 +2976,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Question: Automatically explore some preparation options using `RandomSearchCV`."
+    "Exercise: _Automatically explore some preparation options using `RandomizedSearchCV`._"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 171,
+   "execution_count": 170,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3047,7 +3003,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 172,
+   "execution_count": 171,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3066,7 +3022,196 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "That's all for today! 😀"
+    "## 6."
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Exercise: _Try to implement the `StandardScalerClone` class again from scratch, then add support for the `inverse_transform()` method: executing `scaler.inverse_transform(scaler.fit_transform(X))` should return an array very close to `X`. Then add support for feature names: set `feature_names_in_` in the `fit()` method if the input is a DataFrame. This attribute should be a NumPy array of column names. Lastly, implement the `get_feature_names_out()` method: it should have one optional `input_features=None` argument. If passed, the method should check that its length matches `n_features_in_`, and that it matches `feature_names_in_` if that attribute is defined; `input_features` should then be returned. If `input_features` is `None`, then the method should return `feature_names_in_` if it is defined or `np.array([\"x0\", \"x1\", ...])` with length `n_features_in_` otherwise._"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 172,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.base import BaseEstimator, TransformerMixin\n",
+    "from sklearn.utils.validation import check_array, check_is_fitted\n",
+    "\n",
+    "class StandardScalerClone(BaseEstimator, TransformerMixin):\n",
+    "    def __init__(self, with_mean=True):  # no *args or **kwargs!\n",
+    "        self.with_mean = with_mean\n",
+    "\n",
+    "    def fit(self, X, y=None):  # y is required even though we don't use it\n",
+    "        if hasattr(X, \"columns\"):  # record column names before X is converted\n",
+    "            self.feature_names_in_ = np.array(X.columns, dtype=object)\n",
+    "        X = check_array(X)  # checks that X is an array with finite float values\n",
+    "        self.mean_ = X.mean(axis=0)\n",
+    "        self.scale_ = X.std(axis=0)\n",
+    "        self.n_features_in_ = X.shape[1]  # every estimator stores this in fit()\n",
+    "        return self  # always return self!\n",
+    "\n",
+    "    def transform(self, X):\n",
+    "        check_is_fitted(self)  # looks for learned attributes (with trailing _)\n",
+    "        X = check_array(X)\n",
+    "        if self.n_features_in_ != X.shape[1]:\n",
+    "            raise ValueError(\"Unexpected number of features\")\n",
+    "        if self.with_mean:\n",
+    "            X = X - self.mean_\n",
+    "        return X / self.scale_\n",
+    "    \n",
+    "    def inverse_transform(self, X):\n",
+    "        check_is_fitted(self)\n",
+    "        X = check_array(X)\n",
+    "        if self.n_features_in_ != X.shape[1]:\n",
+    "            raise ValueError(\"Unexpected number of features\")\n",
+    "        X = X * self.scale_\n",
+    "        return X + self.mean_ if self.with_mean else X\n",
+    "    \n",
+    "    def get_feature_names_out(self, input_features=None):\n",
+    "        if input_features is None:\n",
+    "            return getattr(self, \"feature_names_in_\",\n",
+    "                           [f\"x{i}\" for i in range(self.n_features_in_)])\n",
+    "        else:\n",
+    "            if len(input_features) != self.n_features_in_:\n",
+    "                raise ValueError(\"Invalid number of features\")\n",
+    "            if hasattr(self, \"feature_names_in_\") and not np.all(\n",
+    "                self.feature_names_in_ == input_features\n",
+    "            ):\n",
+    "                raise ValueError(\"input_features ≠ feature_names_in_\")\n",
+    "            return input_features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's test our custom transformer:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 173,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.utils.estimator_checks import check_estimator\n",
+    " \n",
+    "check_estimator(StandardScalerClone())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "No errors, that's a great start: we respect the Scikit-Learn API."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's ensure the transformation works as expected:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 174,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.random.seed(42)\n",
+    "X = np.random.rand(1000, 3)\n",
+    "\n",
+    "scaler = StandardScalerClone()\n",
+    "X_scaled = scaler.fit_transform(X)\n",
+    "\n",
+    "assert np.allclose(X_scaled, (X - X.mean(axis=0)) / X.std(axis=0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How about setting `with_mean=False`?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 175,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scaler = StandardScalerClone(with_mean=False)\n",
+    "X_scaled_uncentered = scaler.fit_transform(X)\n",
+    "\n",
+    "assert np.allclose(X_scaled_uncentered, X / X.std(axis=0))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And does the inverse work?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 176,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scaler = StandardScalerClone()\n",
+    "X_back = scaler.inverse_transform(scaler.fit_transform(X))\n",
+    "assert np.allclose(X, X_back)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How about the feature names out?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 177,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert np.all(scaler.get_feature_names_out() == [\"x0\", \"x1\", \"x2\"])\n",
+    "assert np.all(scaler.get_feature_names_out([\"a\", \"b\", \"c\"]) == [\"a\", \"b\", \"c\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And if we fit a DataFrame, are the feature names in and out OK?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 178,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame({\"a\": np.random.rand(100), \"b\": np.random.rand(100)})\n",
+    "scaler = StandardScalerClone()\n",
+    "X_scaled = scaler.fit_transform(df)\n",
+    "\n",
+    "assert np.all(scaler.feature_names_in_ == [\"a\", \"b\"])\n",
+    "assert np.all(scaler.get_feature_names_out() == [\"a\", \"b\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All good! That's all for today! 😀"
+   ]
+  },
+  {