From b54ee1b60839b0c70a67c7491b38c6021b1b60d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 21 Dec 2018 10:18:31 +0800 Subject: [PATCH] Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20) --- 01_the_machine_learning_landscape.ipynb | 6 +- 02_end_to_end_machine_learning_project.ipynb | 59 +- 03_classification.ipynb | 437 +++++++------- 04_training_linear_models.ipynb | 15 +- 05_support_vector_machines.ipynb | 83 ++- 06_decision_trees.ipynb | 4 +- 07_ensemble_learning_and_random_forests.ipynb | 141 +++-- 08_dimensionality_reduction.ipynb | 535 +++++++++--------- 8 files changed, 694 insertions(+), 586 deletions(-) diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index 8b99fce..1ca8b0b 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -64,7 +64,7 @@ "\n", "# Ignore useless warnings (see SciPy issue #5998)\n", "import warnings\n", - "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" + "warnings.filterwarnings(action=\"ignore\", message=\"^internal gelsd\")" ] }, { @@ -407,7 +407,7 @@ "source": [ "cyprus_gdp_per_capita = gdp_per_capita.loc[\"Cyprus\"][\"GDP per capita\"]\n", "print(cyprus_gdp_per_capita)\n", - "cyprus_predicted_life_satisfaction = lin1.predict(cyprus_gdp_per_capita)[0][0]\n", + "cyprus_predicted_life_satisfaction = lin1.predict([[cyprus_gdp_per_capita]])[0][0]\n", "cyprus_predicted_life_satisfaction" ] }, @@ -719,7 +719,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.6.6" }, "nav_menu": {}, "toc": { diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 7e5480f..52ac5cb 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -661,15 +661,25 @@ "sample_incomplete_rows" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: Since Scikit-Learn 0.20, the `sklearn.preprocessing.Imputer` class was replaced by the `sklearn.impute.SimpleImputer` class." + ] + }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ - "from sklearn.preprocessing import Imputer\n", + "try:\n", + " from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+\n", + "except ImportError:\n", + " from sklearn.preprocessing import Imputer as SimpleImputer\n", "\n", - "imputer = Imputer(strategy=\"median\")" + "imputer = SimpleImputer(strategy=\"median\")" ] }, { @@ -798,7 +808,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: earlier versions of the book used the `LabelEncoder` class or Pandas' `Series.factorize()` method to encode string categorical attributes as integers. However, the `OrdinalEncoder` class that is planned to be introduced in Scikit-Learn 0.20 (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)) is preferable since it is designed for input features (`X` instead of labels `y`) and it plays well with pipelines (introduced later in this notebook). For now, we will import it from `future_encoders.py`, but once it is available you can import it directly from `sklearn.preprocessing`." + "**Warning**: earlier versions of the book used the `LabelEncoder` class or Pandas' `Series.factorize()` method to encode string categorical attributes as integers. 
However, the `OrdinalEncoder` class that was introduced in Scikit-Learn 0.20 (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)) is preferable since it is designed for input features (`X` instead of labels `y`) and it plays well with pipelines (introduced later in this notebook). If you are using an older version of Scikit-Learn (<0.20), then you can import it from `future_encoders.py` instead." ] }, { @@ -807,7 +817,10 @@ "metadata": {}, "outputs": [], "source": [ - "from future_encoders import OrdinalEncoder" + "try:\n", + " from sklearn.preprocessing import OrdinalEncoder\n", + "except ImportError:\n", + " from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20" ] }, { @@ -834,7 +847,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: earlier versions of the book used the `LabelBinarizer` or `CategoricalEncoder` classes to convert each categorical value to a one-hot vector. It is now preferable to use the `OneHotEncoder` class. Right now it can only handle integer categorical inputs, but in Scikit-Learn 0.20 it will also handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)). So for now we import it from `future_encoders.py`, but when Scikit-Learn 0.20 is released, you can import it from `sklearn.preprocessing` instead:" + "**Warning**: earlier versions of the book used the `LabelBinarizer` or `CategoricalEncoder` classes to convert each categorical value to a one-hot vector. It is now preferable to use the `OneHotEncoder` class. Since Scikit-Learn 0.20 it can handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)), not just integer categorical inputs. If you are using an older version of Scikit-Learn, you can import the new version from `future_encoders.py`:" ] }, { @@ -843,7 +856,11 @@ "metadata": {}, "outputs": [], "source": [ - "from future_encoders import OneHotEncoder\n", + "try:\n", + " from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20\n", + " from sklearn.preprocessing import OneHotEncoder\n", + "except ImportError:\n", + " from future_encoders import OneHotEncoder # Scikit-Learn < 0.20\n", "\n", "cat_encoder = OneHotEncoder()\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", @@ -959,7 +976,7 @@ "from sklearn.preprocessing import StandardScaler\n", "\n", "num_pipeline = Pipeline([\n", - " ('imputer', Imputer(strategy=\"median\")),\n", + " ('imputer', SimpleImputer(strategy=\"median\")),\n", " ('attribs_adder', CombinedAttributesAdder()),\n", " ('std_scaler', StandardScaler()),\n", " ])\n", @@ -980,7 +997,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: earlier versions of the book applied different transformations to different columns using a solution based on a `DataFrameSelector` transformer and a `FeatureUnion` (see below). It is now preferable to use the `ColumnTransformer` class that will be introduced in Scikit-Learn 0.20. For now we import it from `future_encoders.py`, but when Scikit-Learn 0.20 is released, you can import it from `sklearn.compose` instead:" + "**Warning**: earlier versions of the book applied different transformations to different columns using a solution based on a `DataFrameSelector` transformer and a `FeatureUnion` (see below). It is now preferable to use the `ColumnTransformer` class that was introduced in Scikit-Learn 0.20. 
If you are using an older version of Scikit-Learn, you can import it from `future_encoders.py`:" ] }, { @@ -989,7 +1006,10 @@ "metadata": {}, "outputs": [], "source": [ - "from future_encoders import ColumnTransformer" + "try:\n", + " from sklearn.compose import ColumnTransformer\n", + "except ImportError:\n", + " from future_encoders import ColumnTransformer # Scikit-Learn < 0.20" ] }, { @@ -1070,7 +1090,7 @@ "\n", "old_num_pipeline = Pipeline([\n", " ('selector', OldDataFrameSelector(num_attribs)),\n", - " ('imputer', Imputer(strategy=\"median\")),\n", + " ('imputer', SimpleImputer(strategy=\"median\")),\n", " ('attribs_adder', CombinedAttributesAdder()),\n", " ('std_scaler', StandardScaler()),\n", " ])\n", @@ -1275,6 +1295,13 @@ "display_scores(lin_rmse_scores)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: we specify `n_estimators=10` to avoid a warning about the fact that the default value is going to change to 100 in Scikit-Learn 0.22." + ] + }, { "cell_type": "code", "execution_count": 91, @@ -1283,7 +1310,7 @@ "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "\n", - "forest_reg = RandomForestRegressor(random_state=42)\n", + "forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)\n", "forest_reg.fit(housing_prepared, housing_labels)" ] }, @@ -2114,10 +2141,10 @@ "metadata": {}, "outputs": [], "source": [ - "param_grid = [\n", - " {'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", - " 'feature_selection__k': list(range(1, len(feature_importances) + 1))}\n", - "]\n", + "param_grid = [{\n", + " 'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", + " 'feature_selection__k': list(range(1, len(feature_importances) + 1))\n", + "}]\n", "\n", "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", " scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n", @@ -2164,7 +2191,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" }, "nav_menu": { "height": "279px", diff --git a/03_classification.ipynb b/03_classification.ipynb index 9812293..c25e2ef 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -66,15 +66,26 @@ "# MNIST" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: `fetch_mldata()` is deprecated since Scikit-Learn 0.20. You should use `fetch_openml()` instead. However, it returns the unsorted MNIST dataset, whereas `fetch_mldata()` returned the dataset sorted by target (the training set and the test test were sorted separately). 
In general, this is fine, but if you want to get the exact same results as before, you need to sort the dataset using the following function:" + ] + }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import fetch_mldata\n", - "mnist = fetch_mldata('MNIST original')\n", - "mnist" + "def sort_by_target(mnist):\n", + " reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]\n", + " reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]\n", + " mnist.data[:60000] = mnist.data[reorder_train]\n", + " mnist.target[:60000] = mnist.target[reorder_train]\n", + " mnist.data[60000:] = mnist.data[reorder_test + 60000]\n", + " mnist.target[60000:] = mnist.target[reorder_test + 60000]" ] }, { @@ -82,6 +93,32 @@ "execution_count": 3, "metadata": {}, "outputs": [], + "source": [ + "try:\n", + " from sklearn.datasets import fetch_openml\n", + " mnist = fetch_openml('mnist_784', version=1, cache=True)\n", + " mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings\n", + " sort_by_target(mnist) # fetch_openml() returns an unsorted dataset\n", + "except ImportError:\n", + " from sklearn.datasets import fetch_mldata\n", + " mnist = fetch_mldata('MNIST original')\n", + "mnist[\"data\"], mnist[\"target\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mnist.data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], "source": [ "X, y = mnist[\"data\"], mnist[\"target\"]\n", "X.shape" @@ -89,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -107,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -127,7 +164,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -140,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -163,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -176,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -185,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -194,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -213,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -221,21 +258,28 @@ "y_test_5 = (y_test == 5)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: a few hyperparameters will have a different default value in future versions of Scikit-Learn, so a warning is issued if you do not set them explicitly. This is why we set `max_iter=5` and `tol=-np.infty`, to get the same results as in the book, while avoiding the warnings." 
+ ] + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(max_iter=5, random_state=42)\n", + "sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)\n", "sgd_clf.fit(X_train, y_train_5)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -244,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -254,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -278,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -292,7 +336,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -302,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -313,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -324,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -333,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -342,7 +386,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -353,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -362,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -371,7 +415,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -380,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -390,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -399,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -409,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -419,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -428,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -439,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -456,7 +500,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -465,7 +509,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -476,7 +520,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -487,7 +531,7 @@ }, 
{ "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -516,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -525,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -534,7 +578,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -543,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -568,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -579,7 +623,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -598,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -607,21 +651,28 @@ "roc_auc_score(y_train_5, y_scores)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: we set `n_estimators=10` to avoid a warning about the fact that its default value will be set to 100 in Scikit-Learn 0.22." + ] + }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", - "forest_clf = RandomForestClassifier(random_state=42)\n", + "forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", "y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,\n", " method=\"predict_proba\")" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -631,7 +682,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -645,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -654,7 +705,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -664,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -680,7 +731,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -690,7 +741,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -700,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -709,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -718,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -727,19 +778,19 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "from sklearn.multiclass import OneVsOneClassifier\n", - "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, 
random_state=42))\n", + "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=-np.infty, random_state=42))\n", "ovo_clf.fit(X_train, y_train)\n", "ovo_clf.predict([some_digit])" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -748,7 +799,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -758,7 +809,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -767,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -776,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -788,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -799,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -813,7 +864,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -824,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -834,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -846,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -874,7 +925,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -890,7 +941,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -906,7 +957,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -923,7 +974,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -937,7 +988,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -950,7 +1001,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -976,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -988,7 +1039,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 79, "metadata": { "scrolled": true }, @@ -1007,7 +1058,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1018,7 +1069,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1027,7 +1078,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1037,7 +1088,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1050,7 +1101,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1068,7 +1119,7 @@ }, { "cell_type": "code", 
- "execution_count": 83, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1077,10 +1128,8 @@ }, { "cell_type": "code", - "execution_count": 84, - "metadata": { - "collapsed": true - }, + "execution_count": 86, + "metadata": {}, "outputs": [], "source": [ "y_knn_expanded_pred = knn_clf.predict(X_test)" @@ -1088,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1097,7 +1146,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1107,7 +1156,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1130,9 +1179,16 @@ "## 1. An MNIST Classifier With Over 97% Accuracy" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the next cell may take hours to run, depending on your hardware." + ] + }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1147,7 +1203,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1156,7 +1212,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1165,7 +1221,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1184,10 +1240,8 @@ }, { "cell_type": "code", - "execution_count": 92, - "metadata": { - "collapsed": true - }, + "execution_count": 94, + "metadata": {}, "outputs": [], "source": [ "from scipy.ndimage.interpolation import shift" @@ -1195,10 +1249,8 @@ }, { "cell_type": "code", - "execution_count": 93, - "metadata": { - "collapsed": true - }, + "execution_count": 95, + "metadata": {}, "outputs": [], "source": [ "def shift_image(image, dx, dy):\n", @@ -1209,7 +1261,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1232,10 +1284,8 @@ }, { "cell_type": "code", - "execution_count": 95, - "metadata": { - "collapsed": true - }, + "execution_count": 97, + "metadata": {}, "outputs": [], "source": [ "X_train_augmented = [image for image in X_train]\n", @@ -1252,10 +1302,8 @@ }, { "cell_type": "code", - "execution_count": 96, - "metadata": { - "collapsed": true - }, + "execution_count": 98, + "metadata": {}, "outputs": [], "source": [ "shuffle_idx = np.random.permutation(len(X_train_augmented))\n", @@ -1265,10 +1313,8 @@ }, { "cell_type": "code", - "execution_count": 97, - "metadata": { - "collapsed": true - }, + "execution_count": 99, + "metadata": {}, "outputs": [], "source": [ "knn_clf = KNeighborsClassifier(**grid_search.best_params_)" @@ -1276,7 +1322,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1285,7 +1331,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1330,7 +1376,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1341,7 +1387,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1354,7 +1400,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 104, "metadata": {}, "outputs": [], 
"source": [ @@ -1378,7 +1424,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1410,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1440,7 +1486,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1465,7 +1511,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1481,7 +1527,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1490,7 +1536,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1499,7 +1545,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1522,7 +1568,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -1543,29 +1589,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's build the pipeline for the numerical attributes:" + "Let's build the pipeline for the numerical attributes:\n", + "\n", + "**Warning**: Since Scikit-Learn 0.20, the `sklearn.preprocessing.Imputer` class was replaced by the `sklearn.impute.SimpleImputer` class." ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import Imputer\n", - "\n", - "imputer = Imputer(strategy=\"median\")\n", + "try:\n", + " from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+\n", + "except ImportError:\n", + " from sklearn.preprocessing import Imputer as SimpleImputer\n", "\n", "num_pipeline = Pipeline([\n", " (\"select_numeric\", DataFrameSelector([\"Age\", \"SibSp\", \"Parch\", \"Fare\"])),\n", - " (\"imputer\", Imputer(strategy=\"median\")),\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", " ])" ] }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1576,12 +1625,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will also need an imputer for the string categorical columns (the regular `Imputer` does not work on those):" + "We will also need an imputer for the string categorical columns (the regular `SimpleImputer` does not work on those):" ] }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1599,16 +1648,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can convert each categorical value to a one-hot vector using a `OneHotEncoder`. Right now this class can only handle integer categorical inputs, but in Scikit-Learn 0.20 it will also handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)). So for now we import it from `future_encoders.py`, but when Scikit-Learn 0.20 is released, you can import it from `sklearn.preprocessing` instead:" + "**Warning**: earlier versions of the book used the `LabelBinarizer` or `CategoricalEncoder` classes to convert each categorical value to a one-hot vector. It is now preferable to use the `OneHotEncoder` class. 
Since Scikit-Learn 0.20 it can handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)), not just integer categorical inputs. If you are using an older version of Scikit-Learn, you can import the new version from `future_encoders.py`:" ] }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ - "from future_encoders import OneHotEncoder" + "try:\n", + " from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20\n", + " from sklearn.preprocessing import OneHotEncoder\n", + "except ImportError:\n", + " from future_encoders import OneHotEncoder # Scikit-Learn < 0.20" ] }, { @@ -1620,7 +1673,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -1633,7 +1686,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -1649,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -1669,7 +1722,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -1686,7 +1739,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -1702,13 +1755,13 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "\n", - "svm_clf = SVC()\n", + "svm_clf = SVC(gamma=\"auto\")\n", "svm_clf.fit(X_train, y_train)" ] }, @@ -1721,7 +1774,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -1738,7 +1791,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -1764,13 +1817,13 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", - "forest_clf = RandomForestClassifier(random_state=42)\n", + "forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", "forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)\n", "forest_scores.mean()" ] @@ -1791,7 +1844,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -1817,7 +1870,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -1827,7 +1880,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -1851,7 +1904,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -1878,7 +1931,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -1894,7 +1947,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -1906,7 +1959,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -1915,7 +1968,7 @@ }, { "cell_type": "code", - "execution_count": 131, + 
"execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -1931,7 +1984,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -1946,7 +1999,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -1963,7 +2016,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -1972,7 +2025,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ @@ -1988,7 +2041,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2007,7 +2060,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2023,7 +2076,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 140, "metadata": {}, "outputs": [], "source": [ @@ -2032,7 +2085,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -2055,7 +2108,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -2072,7 +2125,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -2088,7 +2141,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -2110,7 +2163,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -2134,7 +2187,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -2153,7 +2206,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ @@ -2169,7 +2222,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ @@ -2193,7 +2246,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ @@ -2211,7 +2264,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ @@ -2237,7 +2290,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 151, "metadata": {}, "outputs": [], "source": [ @@ -2260,7 +2313,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ @@ -2312,7 +2365,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ @@ -2337,7 +2390,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ @@ -2369,7 +2422,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -2380,7 +2433,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -2396,7 +2449,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 157, "metadata": {}, "outputs": [], "source": [ @@ -2412,7 +2465,7 @@ }, { "cell_type": "code", - 
"execution_count": 156, + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ @@ -2428,14 +2481,14 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 159, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import cross_val_score\n", "\n", - "log_clf = LogisticRegression(random_state=42)\n", + "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n", "score.mean()" ] @@ -2451,7 +2504,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 160, "metadata": {}, "outputs": [], "source": [ @@ -2459,7 +2512,7 @@ "\n", "X_test_transformed = preprocess_pipeline.transform(X_test)\n", "\n", - "log_clf = LogisticRegression(random_state=42)\n", + "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", "log_clf.fit(X_train_transformed, y_train)\n", "\n", "y_pred = log_clf.predict(X_test_transformed)\n", @@ -2492,7 +2545,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.6" }, "nav_menu": {}, "toc": { diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index 1845e8e..1326008 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -65,7 +65,7 @@ "\n", "# Ignore useless warnings (see SciPy issue #5998)\n", "import warnings\n", - "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" + "warnings.filterwarnings(action=\"ignore\", message=\"^internal gelsd\")" ] }, { @@ -384,7 +384,7 @@ "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", - "sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=50, tol=-np.infty, penalty=None, eta0=0.1, random_state=42)\n", "sgd_reg.fit(X, y.ravel())" ] }, @@ -727,7 +727,7 @@ "metadata": {}, "outputs": [], "source": [ - "sgd_reg = SGDRegressor(max_iter=5, penalty=\"l2\", random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=50, tol=-np.infty, penalty=\"l2\", random_state=42)\n", "sgd_reg.fit(X, y.ravel())\n", "sgd_reg.predict([[1.5]])" ] @@ -810,6 +810,7 @@ "X_val_poly_scaled = poly_scaler.transform(X_val)\n", "\n", "sgd_reg = SGDRegressor(max_iter=1,\n", + " tol=-np.infty,\n", " penalty=None,\n", " eta0=0.0005,\n", " warm_start=True,\n", @@ -854,7 +855,7 @@ "outputs": [], "source": [ "from sklearn.base import clone\n", - "sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,\n", + "sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True, penalty=None,\n", " learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", "\n", "minimum_val_error = float(\"inf\")\n", @@ -1043,7 +1044,7 @@ "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", - "log_reg = LogisticRegression(random_state=42)\n", + "log_reg = LogisticRegression(solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X, y)" ] }, @@ -1123,7 +1124,7 @@ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", "y = (iris[\"target\"] == 2).astype(np.int)\n", "\n", - "log_reg = LogisticRegression(C=10**10, random_state=42)\n", + "log_reg = LogisticRegression(solver=\"liblinear\", C=10**10, random_state=42)\n", "log_reg.fit(X, y)\n", "\n", "x0, x1 = np.meshgrid(\n", @@ -1742,7 +1743,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", - "version": "3.6.4" + "version": "3.6.6" }, "nav_menu": {}, "toc": { diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index ef7c39f..40ac5a0 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -774,6 +774,13 @@ "y = (0.2 + 0.1 * X + 0.5 * X**2 + np.random.randn(m, 1)/10).ravel()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the default value of `gamma` will change from `'auto'` to `'scale'` in version 0.22 to better account for unscaled features. To preserve the same results as in the book, we explicitly set it to `'auto'`, but you should probably just use the default in your own code." + ] + }, { "cell_type": "code", "execution_count": 27, @@ -782,7 +789,7 @@ "source": [ "from sklearn.svm import SVR\n", "\n", - "svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1)\n", + "svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"auto\")\n", "svm_poly_reg.fit(X, y)" ] }, @@ -794,8 +801,8 @@ "source": [ "from sklearn.svm import SVR\n", "\n", - "svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1)\n", - "svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01, epsilon=0.1)\n", + "svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"auto\")\n", + "svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01, epsilon=0.1, gamma=\"auto\")\n", "svm_poly_reg1.fit(X, y)\n", "svm_poly_reg2.fit(X, y)" ] @@ -876,7 +883,7 @@ "ax1 = fig.add_subplot(111, projection='3d')\n", "plot_3D_decision_function(ax1, w=svm_clf2.coef_[0], b=svm_clf2.intercept_[0])\n", "\n", - "save_fig(\"iris_3D_plot\")\n", + "#save_fig(\"iris_3D_plot\")\n", "plt.show()" ] }, @@ -1165,7 +1172,7 @@ "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(loss=\"hinge\", alpha = 0.017, max_iter = 50, random_state=42)\n", + "sgd_clf = SGDClassifier(loss=\"hinge\", alpha = 0.017, max_iter = 50, tol=-np.infty, random_state=42)\n", "sgd_clf.fit(X, y.ravel())\n", "\n", "m = len(X)\n", @@ -1265,7 +1272,7 @@ "lin_clf = LinearSVC(loss=\"hinge\", C=C, random_state=42)\n", "svm_clf = SVC(kernel=\"linear\", C=C)\n", "sgd_clf = SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001, alpha=alpha,\n", - " max_iter=100000, random_state=42)\n", + " max_iter=100000, tol=-np.infty, random_state=42)\n", "\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)\n", @@ -1354,9 +1361,13 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import fetch_mldata\n", + "try:\n", + " from sklearn.datasets import fetch_openml\n", + " mnist = fetch_openml('mnist_784', version=1, cache=True)\n", + "except ImportError:\n", + " from sklearn.datasets import fetch_mldata\n", + " mnist = fetch_mldata('MNIST original')\n", "\n", - "mnist = fetch_mldata(\"MNIST original\")\n", "X = mnist[\"data\"]\n", "y = mnist[\"target\"]\n", "\n", @@ -1425,7 +1436,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Wow, 82% accuracy on MNIST is a really bad performance. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" + "Wow, 86% accuracy on MNIST is a really bad performance. 
This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" ] }, { @@ -1474,7 +1485,7 @@ "metadata": {}, "outputs": [], "source": [ - "svm_clf = SVC(decision_function_shape=\"ovr\")\n", + "svm_clf = SVC(decision_function_shape=\"ovr\", gamma=\"auto\")\n", "svm_clf.fit(X_train_scaled[:10000], y_train[:10000])" ] }, @@ -1505,7 +1516,7 @@ "from scipy.stats import reciprocal, uniform\n", "\n", "param_distributions = {\"gamma\": reciprocal(0.001, 0.1), \"C\": uniform(1, 10)}\n", - "rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2)\n", + "rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2, cv=3)\n", "rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])" ] }, @@ -1536,7 +1547,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1545,7 +1556,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1562,7 +1573,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1600,7 +1611,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1620,7 +1631,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1638,7 +1649,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1658,7 +1669,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1677,7 +1688,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1697,7 +1708,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1713,7 +1724,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1722,13 +1733,13 @@ "from scipy.stats import reciprocal, uniform\n", "\n", "param_distributions = {\"gamma\": reciprocal(0.001, 0.1), \"C\": uniform(1, 10)}\n", - "rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42)\n", + "rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, cv=3, random_state=42)\n", "rnd_search_cv.fit(X_train_scaled, y_train)" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1744,7 +1755,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1762,7 +1773,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1771,6 +1782,26 @@ "np.sqrt(mse)" ] }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "cmap = matplotlib.cm.get_cmap(\"jet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_openml\n", + "mnist = fetch_openml(\"mnist_784\", version=1)\n", + "print(mnist.data.shape)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1795,7 +1826,7 @@ "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.6" }, "nav_menu": {}, "toc": { diff --git a/06_decision_trees.ipynb b/06_decision_trees.ipynb index ba8bd50..908abf7 100644 --- a/06_decision_trees.ipynb +++ b/06_decision_trees.ipynb @@ -531,7 +531,7 @@ "from sklearn.model_selection import GridSearchCV\n", "\n", "params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}\n", - "grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1)\n", + "grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1, cv=3)\n", "\n", "grid_search_cv.fit(X_train, y_train)" ] @@ -710,7 +710,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.6" }, "nav_menu": { "height": "309px", diff --git a/07_ensemble_learning_and_random_forests.ipynb b/07_ensemble_learning_and_random_forests.ipynb index eed1f84..a7697cf 100644 --- a/07_ensemble_learning_and_random_forests.ipynb +++ b/07_ensemble_learning_and_random_forests.ipynb @@ -115,6 +115,13 @@ "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: In Scikit-Learn 0.20, some hyperparameters (`solver`, `n_estimators`, `gamma`, etc.) start issuing warnings about the fact that their default value will change in Scikit-Learn 0.22. To avoid these warnings and ensure that this notebooks keeps producing the same outputs as in the book, I set the hyperparameters to their old default value. In your own code, you can simply rely on the latest default values instead." + ] + }, { "cell_type": "code", "execution_count": 5, @@ -126,9 +133,9 @@ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "\n", - "log_clf = LogisticRegression(random_state=42)\n", - "rnd_clf = RandomForestClassifier(random_state=42)\n", - "svm_clf = SVC(random_state=42)\n", + "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", + "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", + "svm_clf = SVC(gamma=\"auto\", random_state=42)\n", "\n", "voting_clf = VotingClassifier(\n", " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", @@ -164,9 +171,9 @@ "metadata": {}, "outputs": [], "source": [ - "log_clf = LogisticRegression(random_state=42)\n", - "rnd_clf = RandomForestClassifier(random_state=42)\n", - "svm_clf = SVC(probability=True, random_state=42)\n", + "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", + "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", + "svm_clf = SVC(gamma=\"auto\", probability=True, random_state=42)\n", "\n", "voting_clf = VotingClassifier(\n", " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", @@ -420,8 +427,13 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import fetch_mldata\n", - "mnist = fetch_mldata('MNIST original')" + "try:\n", + " from sklearn.datasets import fetch_openml\n", + " mnist = fetch_openml('mnist_784', version=1)\n", + " mnist.target = mnist.target.astype(np.int64)\n", + "except ImportError:\n", + " from sklearn.datasets import fetch_mldata\n", + " mnist = fetch_mldata('MNIST original')" ] }, { @@ -430,7 +442,7 @@ "metadata": {}, "outputs": [], "source": [ - "rnd_clf = RandomForestClassifier(random_state=42)\n", + "rnd_clf = 
RandomForestClassifier(n_estimators=10, random_state=42)\n", "rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])" ] }, @@ -505,7 +517,7 @@ " sample_weights = np.ones(m)\n", " plt.subplot(subplot)\n", " for i in range(5):\n", - " svm_clf = SVC(kernel=\"rbf\", C=0.05, random_state=42)\n", + " svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"auto\", random_state=42)\n", " svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n", " y_pred = svm_clf.predict(X_train)\n", " sample_weights[y_pred != y_train] *= (1 + learning_rate)\n", @@ -911,36 +923,25 @@ "Exercise: _Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing)._" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The MNIST dataset was loaded earlier." + ] + }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], - "source": [ - "from sklearn.datasets import fetch_mldata" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "mnist = fetch_mldata('MNIST original')" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -959,7 +960,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -970,19 +971,19 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ - "random_forest_clf = RandomForestClassifier(random_state=42)\n", - "extra_trees_clf = ExtraTreesClassifier(random_state=42)\n", + "random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", + "extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)\n", "svm_clf = LinearSVC(random_state=42)\n", "mlp_clf = MLPClassifier(random_state=42)" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -994,7 +995,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1017,7 +1018,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1026,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1040,7 +1041,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1049,7 +1050,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1058,7 +1059,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1067,7 +1068,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1083,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1099,16 +1100,7 @@ }, { "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [], - "source": [ - "voting_clf.estimators" - ] - }, - { - "cell_type": "code", - "execution_count": 71, 
+ "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1124,7 +1116,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1140,7 +1132,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1156,7 +1148,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1167,12 +1159,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Much better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `\"soft\"`:" + "A bit better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `\"soft\"`:" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1181,7 +1173,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1204,7 +1196,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1213,7 +1205,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1224,7 +1216,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The voting classifier reduced the error rate from about 4.9% for our best model (the `MLPClassifier`) to just 3.5%. That's about 28% less errors, not bad!" + "The voting classifier reduced the error rate from about 4.0% for our best model (the `MLPClassifier`) to just 3.1%. That's about 22.5% less errors, not bad!" ] }, { @@ -1243,7 +1235,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1255,7 +1247,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1264,7 +1256,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1274,7 +1266,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1297,7 +1289,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1309,7 +1301,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1318,7 +1310,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1327,7 +1319,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1338,15 +1330,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This stacking ensemble does not perform as well as the soft voting classifier we trained earlier, but it still beats all the individual classifiers." + "This stacking ensemble does not perform as well as the soft voting classifier we trained earlier, it's just as good as the best individual classifier." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1365,7 +1350,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.6.6" }, "nav_menu": { "height": "252px", diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index e5ef84c..bd18d4b 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -60,7 +60,7 @@ "\n", "# Ignore useless warnings (see SciPy issue #5998)\n", "import warnings\n", - "warnings.filterwarnings(action=\"ignore\", module=\"scipy\", message=\"^internal gelsd\")" + "warnings.filterwarnings(action=\"ignore\", message=\"^internal gelsd\")" ] }, { @@ -504,6 +504,10 @@ "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", + "# Workaround for https://github.com/matplotlib/matplotlib/issues/12239\n", + "for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", "save_fig(\"dataset_3d_plot\")\n", "plt.show()" ] @@ -567,6 +571,10 @@ "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", + "# Workaround for https://github.com/matplotlib/matplotlib/issues/12239\n", + "for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", "save_fig(\"swiss_roll_plot\")\n", "plt.show()" ] @@ -627,6 +635,10 @@ "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", + "# Workaround for https://github.com/matplotlib/matplotlib/issues/12239\n", + "for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", "save_fig(\"manifold_decision_boundary_plot1\")\n", "plt.show()\n", "\n", @@ -659,6 +671,10 @@ "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", + "# Workaround for https://github.com/matplotlib/matplotlib/issues/12239\n", + "for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", "save_fig(\"manifold_decision_boundary_plot3\")\n", "plt.show()\n", "\n", @@ -764,8 +780,13 @@ "outputs": [], "source": [ "from six.moves import urllib\n", - "from sklearn.datasets import fetch_mldata\n", - "mnist = fetch_mldata('MNIST original')" + "try:\n", + " from sklearn.datasets import fetch_openml\n", + " mnist = fetch_openml('mnist_784', version=1)\n", + " mnist.target = mnist.target.astype(np.int64)\n", + "except ImportError:\n", + " from sklearn.datasets import fetch_mldata\n", + " mnist = fetch_mldata('MNIST original')" ] }, { @@ -1244,6 +1265,10 @@ "ax.set_yticklabels([])\n", "ax.set_zticklabels([])\n", "\n", + "# Workaround for https://github.com/matplotlib/matplotlib/issues/12239\n", + "for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", "save_fig(\"preimage_plot\", tight_layout=False)\n", "plt.show()" ] @@ -1276,7 +1301,7 @@ "\n", "clf = Pipeline([\n", " (\"kpca\", KernelPCA(n_components=2)),\n", - " (\"log_reg\", LogisticRegression())\n", + " (\"log_reg\", LogisticRegression(solver=\"liblinear\"))\n", " ])\n", "\n", "param_grid = [{\n", @@ -1474,7 +1499,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1483,7 +1508,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1495,24 +1520,24 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9, 3.5))\n", "\n", "plt.subplot(121)\n", - "plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", 
label=\"Iris-Setosa\")\n", - "plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris-Versicolor\")\n", - "plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris-Virginica\")\n", + "plt.plot(X[y==0, 2], X[y==0, 3], \"yo\", label=\"Iris-Setosa\")\n", + "plt.plot(X[y==1, 2], X[y==1, 3], \"bs\", label=\"Iris-Versicolor\")\n", + "plt.plot(X[y==2, 2], X[y==2, 3], \"g^\", label=\"Iris-Virginica\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", "plt.legend(fontsize=12)\n", "\n", "plt.subplot(122)\n", - "plt.scatter(X[:, 0], X[:, 1], c=\"k\", marker=\".\")\n", + "plt.scatter(X[:, 2], X[:, 3], c=\"k\", marker=\".\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.tick_params(labelleft='off')\n", + "plt.tick_params(labelleft=False)\n", "\n", "save_fig(\"classification_vs_clustering_diagram\")\n", "plt.show()" @@ -1522,12 +1547,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A Gaussian mixture model (explained below) can actually separate these clusters pretty well." + "A Gaussian mixture model (explained below) can actually separate these clusters pretty well (using all 4 features: petal length & width, and sepal length & width)." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1536,7 +1561,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1547,22 +1572,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ - "plt.plot(X[y_pred==0, 0], X[y_pred==0, 1], \"yo\", label=\"Cluster 1\")\n", - "plt.plot(X[y_pred==1, 0], X[y_pred==1, 1], \"bs\", label=\"Cluster 2\")\n", - "plt.plot(X[y_pred==2, 0], X[y_pred==2, 1], \"g^\", label=\"Cluster 3\")\n", + "plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], \"yo\", label=\"Cluster 1\")\n", + "plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], \"bs\", label=\"Cluster 2\")\n", + "plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], \"g^\", label=\"Cluster 3\")\n", "plt.xlabel(\"Petal length\", fontsize=14)\n", "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.legend(loc=\"upper right\", fontsize=12)\n", + "plt.legend(loc=\"upper left\", fontsize=12)\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1571,7 +1596,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1594,7 +1619,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1603,7 +1628,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1618,7 +1643,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1635,7 +1660,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1647,7 +1672,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1673,7 +1698,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1682,7 +1707,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ 
-1700,7 +1725,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1709,7 +1734,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1725,7 +1750,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1741,7 +1766,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1757,7 +1782,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1781,7 +1806,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1818,16 +1843,16 @@ " if show_xlabels:\n", " plt.xlabel(\"$x_1$\", fontsize=14)\n", " else:\n", - " plt.tick_params(labelbottom='off')\n", + " plt.tick_params(labelbottom=False)\n", " if show_ylabels:\n", " plt.ylabel(\"$x_2$\", fontsize=14, rotation=0)\n", " else:\n", - " plt.tick_params(labelleft='off')" + " plt.tick_params(labelleft=False)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1860,7 +1885,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1876,7 +1901,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1917,7 +1942,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1941,7 +1966,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1951,7 +1976,7 @@ "plot_data(X)\n", "plot_centroids(kmeans_iter1.cluster_centers_, circle_color='r', cross_color='w')\n", "plt.ylabel(\"$x_2$\", fontsize=14, rotation=0)\n", - "plt.tick_params(labelbottom='off')\n", + "plt.tick_params(labelbottom=False)\n", "plt.title(\"Update the centroids (initially randomly)\", fontsize=14)\n", "\n", "plt.subplot(322)\n", @@ -1994,7 +2019,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -2017,7 +2042,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -2049,7 +2074,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -2065,7 +2090,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -2082,7 +2107,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -2105,7 +2130,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -2114,7 +2139,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -2137,7 +2162,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -2155,7 +2180,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -2197,7 +2222,7 @@ }, { 
"cell_type": "code", - "execution_count": 40, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -2206,7 +2231,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -2239,7 +2264,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -2248,7 +2273,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 111, "metadata": { "scrolled": true }, @@ -2273,7 +2298,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -2282,7 +2307,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -2292,7 +2317,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -2308,7 +2333,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -2319,7 +2344,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 116, "metadata": { "scrolled": false }, @@ -2338,7 +2363,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -2355,7 +2380,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -2364,7 +2389,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -2396,7 +2421,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -2412,7 +2437,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -2421,7 +2446,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -2437,7 +2462,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -2446,7 +2471,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -2464,7 +2489,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -2508,7 +2533,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -2529,7 +2554,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -2538,7 +2563,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -2554,7 +2579,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -2565,7 +2590,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -2594,7 +2619,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -2618,7 +2643,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -2627,7 +2652,7 @@ }, { "cell_type": 
"code", - "execution_count": 134, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -2636,7 +2661,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2646,7 +2671,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -2675,7 +2700,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -2712,7 +2737,7 @@ " plt.gca().set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])\n", " plt.xlabel(\"Silhouette Coefficient\")\n", " else:\n", - " plt.tick_params(labelbottom='off')\n", + " plt.tick_params(labelbottom=False)\n", "\n", " plt.axvline(x=silhouette_scores[k - 2], color=\"red\", linestyle=\"--\")\n", " plt.title(\"$k={}$\".format(k), fontsize=16)\n", @@ -2730,7 +2755,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ @@ -2744,7 +2769,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2753,7 +2778,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2765,7 +2790,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 140, "metadata": { "scrolled": false }, @@ -2794,7 +2819,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -2805,7 +2830,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -2817,7 +2842,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -2831,7 +2856,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -2869,7 +2894,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -2878,7 +2903,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -2894,7 +2919,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ @@ -2903,7 +2928,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ @@ -2919,7 +2944,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ @@ -2928,17 +2953,17 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train, y_train)" ] }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 151, "metadata": {}, "outputs": [], "source": [ @@ -2954,7 +2979,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ @@ -2963,20 +2988,20 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline([\n", " (\"kmeans\", KMeans(n_clusters=50, 
random_state=42)),\n", - " (\"log_reg\", LogisticRegression(random_state=42)),\n", + " (\"log_reg\", LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)),\n", "])\n", "pipeline.fit(X_train, y_train)" ] }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ @@ -2985,7 +3010,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -3001,7 +3026,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -3010,7 +3035,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 157, "metadata": {}, "outputs": [], "source": [ @@ -3021,7 +3046,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ @@ -3030,7 +3055,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 159, "metadata": {}, "outputs": [], "source": [ @@ -3067,7 +3092,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 160, "metadata": {}, "outputs": [], "source": [ @@ -3076,11 +3101,11 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])\n", "log_reg.score(X_test, y_test)" ] @@ -3094,7 +3119,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ @@ -3103,7 +3128,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ @@ -3122,7 +3147,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ @@ -3138,7 +3163,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 165, "metadata": {}, "outputs": [], "source": [ @@ -3159,11 +3184,11 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 166, "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_representative_digits, y_representative_digits)\n", "log_reg.score(X_test, y_test)" ] @@ -3184,7 +3209,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 167, "metadata": {}, "outputs": [], "source": [ @@ -3195,17 +3220,17 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 168, "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train, y_train_propagated)" ] }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ @@ -3221,7 +3246,7 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 170, "metadata": {}, "outputs": [], "source": [ @@ -3238,7 +3263,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 171, "metadata": {}, "outputs": [], "source": [ @@ -3249,17 +3274,17 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 172, 
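The repeated switch to `LogisticRegression(multi_class="ovr", solver="liblinear", random_state=42)` in these hunks spells out the old defaults explicitly, because Scikit-Learn 0.20 warns that the default `solver` and `multi_class` values will change in a later release. A hedged sketch of the clustering-as-preprocessing pipeline updated just above, using the small digits dataset as a stand-in for the notebook's own training arrays:

    # Sketch of the KMeans -> LogisticRegression pipeline, with solver and
    # multi_class spelled out to avoid Scikit-Learn 0.20's FutureWarnings.
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_digits
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline

    X_digits, y_digits = load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, random_state=42)

    pipeline = Pipeline([
        # KMeans.transform() maps each image to its 50 distances to the centroids,
        # which the logistic regression then uses as features.
        ("kmeans", KMeans(n_clusters=50, random_state=42)),
        ("log_reg", LogisticRegression(multi_class="ovr", solver="liblinear",
                                       random_state=42)),
    ])
    pipeline.fit(X_train, y_train)
    print(pipeline.score(X_test, y_test))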
"metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", "log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)" ] }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 173, "metadata": {}, "outputs": [], "source": [ @@ -3282,7 +3307,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 174, "metadata": {}, "outputs": [], "source": [ @@ -3307,7 +3332,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 175, "metadata": {}, "outputs": [], "source": [ @@ -3316,7 +3341,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 176, "metadata": {}, "outputs": [], "source": [ @@ -3325,7 +3350,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 177, "metadata": {}, "outputs": [], "source": [ @@ -3334,7 +3359,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 178, "metadata": {}, "outputs": [], "source": [ @@ -3344,7 +3369,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 179, "metadata": {}, "outputs": [], "source": [ @@ -3353,7 +3378,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 180, "metadata": {}, "outputs": [], "source": [ @@ -3362,7 +3387,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 181, "metadata": {}, "outputs": [], "source": [ @@ -3371,7 +3396,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 182, "metadata": {}, "outputs": [], "source": [ @@ -3380,7 +3405,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 183, "metadata": {}, "outputs": [], "source": [ @@ -3389,7 +3414,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 184, "metadata": {}, "outputs": [], "source": [ @@ -3399,7 +3424,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 185, "metadata": {}, "outputs": [], "source": [ @@ -3422,17 +3447,17 @@ " if show_xlabels:\n", " plt.xlabel(\"$x_1$\", fontsize=14)\n", " else:\n", - " plt.tick_params(labelbottom='off')\n", + " plt.tick_params(labelbottom=False)\n", " if show_ylabels:\n", " plt.ylabel(\"$x_2$\", fontsize=14, rotation=0)\n", " else:\n", - " plt.tick_params(labelleft='off')\n", + " plt.tick_params(labelleft=False)\n", " plt.title(\"eps={:.2f}, min_samples={}\".format(dbscan.eps, dbscan.min_samples), fontsize=14)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 186, "metadata": {}, "outputs": [], "source": [ @@ -3450,7 +3475,7 @@ }, { "cell_type": "code", - "execution_count": 188, + "execution_count": 187, "metadata": {}, "outputs": [], "source": [ @@ -3459,7 +3484,7 @@ }, { "cell_type": "code", - "execution_count": 189, + "execution_count": 188, "metadata": {}, "outputs": [], "source": [ @@ -3468,7 +3493,7 @@ }, { "cell_type": "code", - "execution_count": 190, + "execution_count": 189, "metadata": {}, "outputs": [], "source": [ @@ -3478,7 +3503,7 @@ }, { "cell_type": "code", - "execution_count": 191, + "execution_count": 190, "metadata": {}, "outputs": [], "source": [ @@ -3488,7 +3513,7 @@ }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 191, "metadata": {}, "outputs": [], "source": [ @@ -3497,7 +3522,7 @@ }, { "cell_type": "code", - "execution_count": 193, + "execution_count": 192, "metadata": {}, "outputs": [], "source": [ @@ -3510,7 +3535,7 @@ 
}, { "cell_type": "code", - "execution_count": 194, + "execution_count": 193, "metadata": {}, "outputs": [], "source": [ @@ -3536,7 +3561,7 @@ }, { "cell_type": "code", - "execution_count": 195, + "execution_count": 194, "metadata": {}, "outputs": [], "source": [ @@ -3545,7 +3570,7 @@ }, { "cell_type": "code", - "execution_count": 196, + "execution_count": 195, "metadata": {}, "outputs": [], "source": [ @@ -3555,7 +3580,7 @@ }, { "cell_type": "code", - "execution_count": 197, + "execution_count": 196, "metadata": {}, "outputs": [], "source": [ @@ -3565,7 +3590,7 @@ }, { "cell_type": "code", - "execution_count": 198, + "execution_count": 197, "metadata": {}, "outputs": [], "source": [ @@ -3574,7 +3599,7 @@ }, { "cell_type": "code", - "execution_count": 199, + "execution_count": 198, "metadata": {}, "outputs": [], "source": [ @@ -3586,17 +3611,17 @@ " if show_xlabels:\n", " plt.xlabel(\"$x_1$\", fontsize=14)\n", " else:\n", - " plt.tick_params(labelbottom='off')\n", + " plt.tick_params(labelbottom=False)\n", " if show_ylabels:\n", " plt.ylabel(\"$x_2$\", fontsize=14, rotation=0)\n", " else:\n", - " plt.tick_params(labelleft='off')\n", + " plt.tick_params(labelleft=False)\n", " plt.title(\"RBF gamma={}\".format(sc.gamma), fontsize=14)" ] }, { "cell_type": "code", - "execution_count": 200, + "execution_count": 199, "metadata": {}, "outputs": [], "source": [ @@ -3620,7 +3645,7 @@ }, { "cell_type": "code", - "execution_count": 201, + "execution_count": 200, "metadata": {}, "outputs": [], "source": [ @@ -3629,7 +3654,7 @@ }, { "cell_type": "code", - "execution_count": 202, + "execution_count": 201, "metadata": {}, "outputs": [], "source": [ @@ -3639,7 +3664,7 @@ }, { "cell_type": "code", - "execution_count": 203, + "execution_count": 202, "metadata": {}, "outputs": [], "source": [ @@ -3648,7 +3673,7 @@ }, { "cell_type": "code", - "execution_count": 204, + "execution_count": 203, "metadata": { "scrolled": true }, @@ -3666,7 +3691,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 204, "metadata": {}, "outputs": [], "source": [ @@ -3687,7 +3712,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 205, "metadata": {}, "outputs": [], "source": [ @@ -3696,7 +3721,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 206, "metadata": {}, "outputs": [], "source": [ @@ -3713,7 +3738,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 207, "metadata": {}, "outputs": [], "source": [ @@ -3722,7 +3747,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 208, "metadata": {}, "outputs": [], "source": [ @@ -3731,7 +3756,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 209, "metadata": {}, "outputs": [], "source": [ @@ -3747,7 +3772,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 210, "metadata": {}, "outputs": [], "source": [ @@ -3763,7 +3788,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 211, "metadata": {}, "outputs": [], "source": [ @@ -3779,7 +3804,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 212, "metadata": {}, "outputs": [], "source": [ @@ -3788,7 +3813,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 213, "metadata": {}, "outputs": [], "source": [ @@ -3804,7 +3829,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 214, "metadata": {}, "outputs": [], "source": [ @@ -3814,7 +3839,7 @@ }, { "cell_type": "code", - 
"execution_count": 29, + "execution_count": 215, "metadata": {}, "outputs": [], "source": [ @@ -3837,7 +3862,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 216, "metadata": {}, "outputs": [], "source": [ @@ -3853,7 +3878,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 217, "metadata": {}, "outputs": [], "source": [ @@ -3876,7 +3901,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 218, "metadata": {}, "outputs": [], "source": [ @@ -3910,12 +3935,12 @@ " if show_ylabels:\n", " plt.ylabel(\"$x_2$\", fontsize=14, rotation=0)\n", " else:\n", - " plt.tick_params(labelleft='off')" + " plt.tick_params(labelleft=False)" ] }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 219, "metadata": {}, "outputs": [], "source": [ @@ -3940,7 +3965,7 @@ }, { "cell_type": "code", - "execution_count": 221, + "execution_count": 220, "metadata": {}, "outputs": [], "source": [ @@ -3956,7 +3981,7 @@ }, { "cell_type": "code", - "execution_count": 222, + "execution_count": 221, "metadata": {}, "outputs": [], "source": [ @@ -3974,7 +3999,7 @@ }, { "cell_type": "code", - "execution_count": 223, + "execution_count": 222, "metadata": {}, "outputs": [], "source": [ @@ -3986,7 +4011,7 @@ }, { "cell_type": "code", - "execution_count": 224, + "execution_count": 223, "metadata": {}, "outputs": [], "source": [ @@ -4011,7 +4036,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 224, "metadata": {}, "outputs": [], "source": [ @@ -4022,7 +4047,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 225, "metadata": {}, "outputs": [], "source": [ @@ -4030,7 +4055,7 @@ "\n", "plot_gaussian_mixture(gm, X)\n", "plt.scatter(anomalies[:, 0], anomalies[:, 1], color='r', marker='*')\n", - "plt.ylim(ymax=5.1)\n", + "plt.ylim(top=5.1)\n", "\n", "save_fig(\"mixture_anomaly_detection_diagram\")\n", "plt.show()" @@ -4062,7 +4087,7 @@ }, { "cell_type": "code", - "execution_count": 227, + "execution_count": 226, "metadata": {}, "outputs": [], "source": [ @@ -4071,7 +4096,7 @@ }, { "cell_type": "code", - "execution_count": 228, + "execution_count": 227, "metadata": {}, "outputs": [], "source": [ @@ -4087,7 +4112,7 @@ }, { "cell_type": "code", - "execution_count": 229, + "execution_count": 228, "metadata": {}, "outputs": [], "source": [ @@ -4104,7 +4129,7 @@ }, { "cell_type": "code", - "execution_count": 230, + "execution_count": 229, "metadata": {}, "outputs": [], "source": [ @@ -4113,7 +4138,7 @@ }, { "cell_type": "code", - "execution_count": 231, + "execution_count": 230, "metadata": {}, "outputs": [], "source": [ @@ -4136,7 +4161,7 @@ }, { "cell_type": "code", - "execution_count": 232, + "execution_count": 231, "metadata": {}, "outputs": [], "source": [ @@ -4146,7 +4171,7 @@ }, { "cell_type": "code", - "execution_count": 233, + "execution_count": 232, "metadata": {}, "outputs": [], "source": [ @@ -4156,7 +4181,7 @@ }, { "cell_type": "code", - "execution_count": 234, + "execution_count": 233, "metadata": {}, "outputs": [], "source": [ @@ -4187,7 +4212,7 @@ }, { "cell_type": "code", - "execution_count": 235, + "execution_count": 234, "metadata": {}, "outputs": [], "source": [ @@ -4206,7 +4231,7 @@ }, { "cell_type": "code", - "execution_count": 236, + "execution_count": 235, "metadata": {}, "outputs": [], "source": [ @@ -4215,7 +4240,7 @@ }, { "cell_type": "code", - "execution_count": 237, + "execution_count": 236, "metadata": {}, "outputs": [], "source": [ @@ -4238,7 +4263,7 @@ }, { 
"cell_type": "code", - "execution_count": 157, + "execution_count": 237, "metadata": {}, "outputs": [], "source": [ @@ -4247,7 +4272,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 238, "metadata": {}, "outputs": [], "source": [ @@ -4264,7 +4289,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 239, "metadata": {}, "outputs": [], "source": [ @@ -4273,7 +4298,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 240, "metadata": {}, "outputs": [], "source": [ @@ -4284,7 +4309,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 241, "metadata": {}, "outputs": [], "source": [ @@ -4299,7 +4324,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 242, "metadata": {}, "outputs": [], "source": [ @@ -4308,7 +4333,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 243, "metadata": {}, "outputs": [], "source": [ @@ -4317,7 +4342,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 244, "metadata": {}, "outputs": [], "source": [ @@ -4344,7 +4369,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 245, "metadata": {}, "outputs": [], "source": [ @@ -4353,7 +4378,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 246, "metadata": { "scrolled": true }, @@ -4365,7 +4390,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 247, "metadata": {}, "outputs": [], "source": [ @@ -4525,18 +4550,15 @@ ] }, { - "cell_type": "code", - "execution_count": 251, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from sklearn.datasets import fetch_mldata\n", - "mnist = fetch_mldata('MNIST original')" + "The MNIST dataset was loaded earlier." 
] }, { "cell_type": "code", - "execution_count": 252, + "execution_count": 251, "metadata": {}, "outputs": [], "source": [ @@ -4556,18 +4578,18 @@ }, { "cell_type": "code", - "execution_count": 253, + "execution_count": 252, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", - "rnd_clf = RandomForestClassifier(random_state=42)" + "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)" ] }, { "cell_type": "code", - "execution_count": 254, + "execution_count": 253, "metadata": {}, "outputs": [], "source": [ @@ -4580,7 +4602,7 @@ }, { "cell_type": "code", - "execution_count": 255, + "execution_count": 254, "metadata": {}, "outputs": [], "source": [ @@ -4589,7 +4611,7 @@ }, { "cell_type": "code", - "execution_count": 256, + "execution_count": 255, "metadata": {}, "outputs": [], "source": [ @@ -4608,7 +4630,7 @@ }, { "cell_type": "code", - "execution_count": 257, + "execution_count": 256, "metadata": {}, "outputs": [], "source": [ @@ -4627,11 +4649,11 @@ }, { "cell_type": "code", - "execution_count": 258, + "execution_count": 257, "metadata": {}, "outputs": [], "source": [ - "rnd_clf2 = RandomForestClassifier(random_state=42)\n", + "rnd_clf2 = RandomForestClassifier(n_estimators=10, random_state=42)\n", "t0 = time.time()\n", "rnd_clf2.fit(X_train_reduced, y_train)\n", "t1 = time.time()" @@ -4639,7 +4661,7 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 258, "metadata": {}, "outputs": [], "source": [ @@ -4662,7 +4684,7 @@ }, { "cell_type": "code", - "execution_count": 260, + "execution_count": 259, "metadata": {}, "outputs": [], "source": [ @@ -4683,7 +4705,7 @@ }, { "cell_type": "code", - "execution_count": 261, + "execution_count": 260, "metadata": {}, "outputs": [], "source": [ @@ -4697,7 +4719,7 @@ }, { "cell_type": "code", - "execution_count": 262, + "execution_count": 261, "metadata": {}, "outputs": [], "source": [ @@ -4706,7 +4728,7 @@ }, { "cell_type": "code", - "execution_count": 263, + "execution_count": 262, "metadata": {}, "outputs": [], "source": [ @@ -4723,7 +4745,7 @@ }, { "cell_type": "code", - "execution_count": 264, + "execution_count": 263, "metadata": {}, "outputs": [], "source": [ @@ -4735,7 +4757,7 @@ }, { "cell_type": "code", - "execution_count": 265, + "execution_count": 264, "metadata": {}, "outputs": [], "source": [ @@ -4746,12 +4768,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Nice! Reducing dimensionality led to a 4× speedup. :) Let's the model's accuracy:" + "Nice! Reducing dimensionality led to a 4× speedup. :) Let's check the model's accuracy:" ] }, { "cell_type": "code", - "execution_count": 266, + "execution_count": 265, "metadata": {}, "outputs": [], "source": [ @@ -4791,18 +4813,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's start by loading the MNIST dataset (again):" - ] - }, - { - "cell_type": "code", - "execution_count": 267, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import fetch_mldata\n", - "\n", - "mnist = fetch_mldata('MNIST original')" + "The MNIST dataset was loaded above." 
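The timing comparisons in this exercise all follow the same pattern: reduce the training set with PCA, train the same model on both the raw and the reduced data, and compare wall-clock times with `time.time()`. A rough sketch of that pattern (the `n_components=0.95` threshold, i.e. keeping about 95% of the variance, is an assumption for illustration, and `X_train`/`y_train` stand for the MNIST arrays prepared above):

    import time
    from sklearn.decomposition import PCA
    from sklearn.ensemble import RandomForestClassifier

    pca = PCA(n_components=0.95)  # assumed variance threshold, not read from the hunks above
    X_train_reduced = pca.fit_transform(X_train)

    for name, X_tr in [("original", X_train), ("PCA-reduced", X_train_reduced)]:
        clf = RandomForestClassifier(n_estimators=10, random_state=42)
        t0 = time.time()
        clf.fit(X_tr, y_train)
        print("{} data: trained in {:.1f}s".format(name, time.time() - t0))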
] }, { @@ -4814,7 +4825,7 @@ }, { "cell_type": "code", - "execution_count": 268, + "execution_count": 266, "metadata": {}, "outputs": [], "source": [ @@ -4836,7 +4847,7 @@ }, { "cell_type": "code", - "execution_count": 269, + "execution_count": 267, "metadata": {}, "outputs": [], "source": [ @@ -4855,7 +4866,7 @@ }, { "cell_type": "code", - "execution_count": 270, + "execution_count": 268, "metadata": {}, "outputs": [], "source": [ @@ -4882,14 +4893,14 @@ }, { "cell_type": "code", - "execution_count": 271, + "execution_count": 269, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9,9))\n", "cmap = matplotlib.cm.get_cmap(\"jet\")\n", "for digit in (2, 3, 5):\n", - " plt.scatter(X_reduced[y == digit, 0], X_reduced[y == digit, 1], c=cmap(digit / 9))\n", + " plt.scatter(X_reduced[y == digit, 0], X_reduced[y == digit, 1], c=[cmap(digit / 9)])\n", "plt.axis('off')\n", "plt.show()" ] @@ -4903,7 +4914,7 @@ }, { "cell_type": "code", - "execution_count": 272, + "execution_count": 270, "metadata": {}, "outputs": [], "source": [ @@ -4917,13 +4928,13 @@ }, { "cell_type": "code", - "execution_count": 273, + "execution_count": 271, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9,9))\n", "for digit in (2, 3, 5):\n", - " plt.scatter(X_subset_reduced[y_subset == digit, 0], X_subset_reduced[y_subset == digit, 1], c=cmap(digit / 9))\n", + " plt.scatter(X_subset_reduced[y_subset == digit, 0], X_subset_reduced[y_subset == digit, 1], c=[cmap(digit / 9)])\n", "plt.axis('off')\n", "plt.show()" ] @@ -4953,7 +4964,7 @@ }, { "cell_type": "code", - "execution_count": 274, + "execution_count": 272, "metadata": {}, "outputs": [], "source": [ @@ -4972,7 +4983,7 @@ " cmap = matplotlib.cm.get_cmap(\"jet\")\n", " digits = np.unique(y)\n", " for digit in digits:\n", - " plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1], c=cmap(digit / 9))\n", + " plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1], c=[cmap(digit / 9)])\n", " plt.axis(\"off\")\n", " ax = plt.gcf().gca() # get current axes in current figure\n", " for index, image_coord in enumerate(X_normalized):\n", @@ -4997,7 +5008,7 @@ }, { "cell_type": "code", - "execution_count": 275, + "execution_count": 273, "metadata": {}, "outputs": [], "source": [ @@ -5013,7 +5024,7 @@ }, { "cell_type": "code", - "execution_count": 276, + "execution_count": 274, "metadata": {}, "outputs": [], "source": [ @@ -5022,7 +5033,7 @@ }, { "cell_type": "code", - "execution_count": 277, + "execution_count": 275, "metadata": {}, "outputs": [], "source": [ @@ -5045,7 +5056,7 @@ }, { "cell_type": "code", - "execution_count": 278, + "execution_count": 276, "metadata": {}, "outputs": [], "source": [ @@ -5069,7 +5080,7 @@ }, { "cell_type": "code", - "execution_count": 279, + "execution_count": 277, "metadata": {}, "outputs": [], "source": [ @@ -5092,7 +5103,7 @@ }, { "cell_type": "code", - "execution_count": 280, + "execution_count": 278, "metadata": {}, "outputs": [], "source": [ @@ -5126,7 +5137,7 @@ }, { "cell_type": "code", - "execution_count": 281, + "execution_count": 279, "metadata": {}, "outputs": [], "source": [ @@ -5150,7 +5161,7 @@ }, { "cell_type": "code", - "execution_count": 282, + "execution_count": 280, "metadata": {}, "outputs": [], "source": [ @@ -5184,7 +5195,7 @@ }, { "cell_type": "code", - "execution_count": 283, + "execution_count": 281, "metadata": {}, "outputs": [], "source": [ @@ -5214,7 +5225,7 @@ }, { "cell_type": "code", - "execution_count": 284, + "execution_count": 282, "metadata": {}, "outputs": [], 
"source": [ @@ -5237,7 +5248,7 @@ }, { "cell_type": "code", - "execution_count": 285, + "execution_count": 283, "metadata": {}, "outputs": [], "source": [ @@ -5284,7 +5295,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4,