From 6b8dff91d086f61dadbc10bfc4190f74c9e55cf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 18 Jan 2019 23:08:37 +0800 Subject: [PATCH] Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview --- 01_the_machine_learning_landscape.ipynb | 6 +- 02_end_to_end_machine_learning_project.ipynb | 459 +++-- 03_classification.ipynb | 419 +++-- 04_training_linear_models.ipynb | 348 ++-- 05_support_vector_machines.ipynb | 228 +-- 06_decision_trees.ipynb | 80 +- 07_ensemble_learning_and_random_forests.ipynb | 325 ++-- 08_dimensionality_reduction.ipynb | 204 ++- 09_unsupervised_learning.ipynb | 114 +- 10_neural_nets_with_keras.ipynb | 10 +- future_encoders.py | 1610 ----------------- requirements.txt | 8 +- 12 files changed, 1186 insertions(+), 2625 deletions(-) delete mode 100644 future_encoders.py diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index e944e1d..e387ba1 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -102,7 +102,6 @@ "outputs": [], "source": [ "# Code example\n", - "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", @@ -495,7 +494,6 @@ "outputs": [], "source": [ "# Code example\n", - "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", @@ -684,6 +682,7 @@ "outputs": [], "source": [ "# Replace this linear model:\n", + "import sklearn.linear_model\n", "model = sklearn.linear_model.LinearRegression()" ] }, @@ -694,6 +693,7 @@ "outputs": [], "source": [ "# with this k-neighbors regression model:\n", + "import sklearn.neighbors\n", "model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)" ] }, @@ -717,7 +717,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index 192438d..cec2144 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -71,7 +71,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Get the data" + "This notebook assumes you have installed Scikit-Learn ≥0.20." 
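The chapter-1 hunks above make each model's module import explicit right before it is used. Since both estimators share the same Scikit-Learn fit/predict API, the swap the notebook demonstrates is a one-line change. A minimal sketch, assuming made-up toy numbers in place of the notebook's OECD life-satisfaction data:

```python
import numpy as np
import sklearn.linear_model
import sklearn.neighbors

X = np.array([[22587.0], [27195.0], [37675.0]])  # toy GDP-per-capita values
y = np.array([5.8, 6.1, 6.5])                    # toy life-satisfaction scores

# Both models expose fit()/predict(), so they are drop-in replacements.
for model in (sklearn.linear_model.LinearRegression(),
              sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)):
    model.fit(X, y)
    print(model.__class__.__name__, model.predict([[25000.0]]))
```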
] }, { @@ -79,12 +79,29 @@ "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get the data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "import tarfile\n", "from six.moves import urllib\n", "\n", - "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml/master/\"\n", + "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n", "HOUSING_PATH = os.path.join(\"datasets\", \"housing\")\n", "HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n", "\n", @@ -100,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -122,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -132,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -150,7 +167,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -159,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -172,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -182,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -199,17 +216,26 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "train_set, test_set = split_train_test(housing, 0.2)\n", - "print(len(train_set), \"train +\", len(test_set), \"test\")" + "len(train_set)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "len(test_set)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -233,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -252,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -262,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -272,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -280,26 +306,6 @@ "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"id\")" ] }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "test_set.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)" - 
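`split_train_test_by_id` is only called in the hunks above; its definition sits in unchanged context. For reference, a sketch of the hash-based splitter it refers to, roughly as the book defines it (stable across dataset refreshes, because each row's inclusion depends only on its id):

```python
from zlib import crc32

import numpy as np

def test_set_check(identifier, test_ratio):
    # Keep an instance in the test set iff its hashed id falls into the
    # lowest test_ratio fraction of the 32-bit hash space.
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    return data.loc[~in_test_set], data.loc[in_test_set]
```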
] - }, { "cell_type": "code", "execution_count": 20, @@ -315,7 +321,9 @@ "metadata": {}, "outputs": [], "source": [ - "housing[\"median_income\"].hist()" + "from sklearn.model_selection import train_test_split\n", + "\n", + "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)" ] }, { @@ -323,6 +331,24 @@ "execution_count": 22, "metadata": {}, "outputs": [], + "source": [ + "test_set.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "housing[\"median_income\"].hist()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], "source": [ "# Divide by 1.5 to limit the number of income categories\n", "housing[\"income_cat\"] = np.ceil(housing[\"median_income\"] / 1.5)\n", @@ -332,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -350,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -364,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -373,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -382,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -402,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -411,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -428,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -437,7 +463,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -447,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -459,12 +485,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The argument `sharex=False` fixes a display bug (the x-axis values and legend were not displayed). This is a temporary fix (see: https://github.com/pandas-dev/pandas/issues/10611). Thanks to Wilmer Arellano for pointing it out." + "The argument `sharex=False` fixes a display bug (the x-axis values and legend were not displayed). This is a temporary fix (see: https://github.com/pandas-dev/pandas/issues/10611 ). Thanks to Wilmer Arellano for pointing it out." 
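The `income_cat` cells renumbered above exist to drive a stratified split, whose source is unchanged context in this diff. The step being exercised is roughly the following, using the `housing` DataFrame loaded earlier:

```python
from sklearn.model_selection import StratifiedShuffleSplit

# Sample so the test set preserves each income category's overall proportion.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
```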
] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -478,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -516,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -525,7 +551,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -540,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -552,7 +578,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -570,7 +596,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -580,7 +606,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -592,7 +618,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -608,7 +634,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -618,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -628,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -637,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -646,33 +672,30 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "median = housing[\"total_bedrooms\"].median()\n", - "sample_incomplete_rows[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n", - "sample_incomplete_rows" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: Since Scikit-Learn 0.20, the `sklearn.preprocessing.Imputer` class was replaced by the `sklearn.impute.SimpleImputer` class." 
+ "sample_incomplete_rows[\"total_bedrooms\"].fillna(median, inplace=True) # option 3" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "try:\n", - " from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+\n", - "except ImportError:\n", - " from sklearn.preprocessing import Imputer as SimpleImputer\n", - "\n", + "sample_incomplete_rows" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.impute import SimpleImputer\n", "imputer = SimpleImputer(strategy=\"median\")" ] }, @@ -685,17 +708,17 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ - "housing_num = housing.drop('ocean_proximity', axis=1)\n", + "housing_num = housing.drop(\"ocean_proximity\", axis=1)\n", "# alternatively: housing_num = housing.select_dtypes(include=[np.number])" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -704,7 +727,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -720,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -736,7 +759,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -745,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -755,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -764,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -773,11 +796,19 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "housing_tr = pd.DataFrame(X, columns=housing_num.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ - "housing_tr = pd.DataFrame(X, columns=housing_num.columns)\n", "housing_tr.head()" ] }, @@ -790,39 +821,22 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ - "housing_cat = housing[['ocean_proximity']]\n", + "housing_cat = housing[[\"ocean_proximity\"]]\n", "housing_cat.head(10)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: earlier versions of the book used the `LabelEncoder` class or Pandas' `Series.factorize()` method to encode string categorical attributes as integers. However, the `OrdinalEncoder` class that was introduced in Scikit-Learn 0.20 (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)) is preferable since it is designed for input features (`X` instead of labels `y`) and it plays well with pipelines (introduced later in this notebook). If you are using an older version of Scikit-Learn (<0.20), then you can import it from `future_encoders.py` instead." 
- ] - }, { "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " from sklearn.preprocessing import OrdinalEncoder\n", - "except ImportError:\n", - " from future_encoders import OrdinalEncoder # Scikit-Learn < 0.20" - ] - }, - { - "cell_type": "code", - "execution_count": 61, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ + "from sklearn.preprocessing import OrdinalEncoder\n", + "\n", "ordinal_encoder = OrdinalEncoder()\n", "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)\n", "housing_cat_encoded[:10]" @@ -830,31 +844,20 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "ordinal_encoder.categories_" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: earlier versions of the book used the `LabelBinarizer` or `CategoricalEncoder` classes to convert each categorical value to a one-hot vector. It is now preferable to use the `OneHotEncoder` class. Since Scikit-Learn 0.20 it can handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)), not just integer categorical inputs. If you are using an older version of Scikit-Learn, you can import the new version from `future_encoders.py`:" - ] - }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ - "try:\n", - " from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20\n", - " from sklearn.preprocessing import OneHotEncoder\n", - "except ImportError:\n", - " from future_encoders import OneHotEncoder # Scikit-Learn < 0.20\n", + "from sklearn.preprocessing import OneHotEncoder\n", "\n", "cat_encoder = OneHotEncoder()\n", "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", @@ -870,7 +873,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -886,7 +889,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -897,7 +900,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -913,14 +916,14 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", "# column index\n", - "rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6\n", + "rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6\n", "\n", "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n", " def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs\n", @@ -928,8 +931,8 @@ " def fit(self, X, y=None):\n", " return self # nothing else to do\n", " def transform(self, X, y=None):\n", - " rooms_per_household = X[:, rooms_ix] / X[:, household_ix]\n", - " population_per_household = X[:, population_ix] / X[:, household_ix]\n", + " rooms_per_household = X[:, rooms_ix] / X[:, households_ix]\n", + " population_per_household = X[:, population_ix] / X[:, households_ix]\n", " if self.add_bedrooms_per_room:\n", " bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n", " return np.c_[X, rooms_per_household, population_per_household,\n", @@ -943,7 +946,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 71, "metadata": {}, "outputs": 
[], "source": [ @@ -962,7 +965,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -980,38 +983,21 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "housing_num_tr" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: earlier versions of the book applied different transformations to different columns using a solution based on a `DataFrameSelector` transformer and a `FeatureUnion` (see below). It is now preferable to use the `ColumnTransformer` class that was introduced in Scikit-Learn 0.20. If you are using an older version of Scikit-Learn, you can import it from `future_encoders.py`:" - ] - }, { "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " from sklearn.compose import ColumnTransformer\n", - "except ImportError:\n", - " from future_encoders import ColumnTransformer # Scikit-Learn < 0.20" - ] - }, - { - "cell_type": "code", - "execution_count": 72, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ + "from sklearn.compose import ColumnTransformer\n", + "\n", "num_attribs = list(housing_num)\n", "cat_attribs = [\"ocean_proximity\"]\n", "\n", @@ -1025,7 +1011,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1034,7 +1020,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1050,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1075,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1097,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1111,7 +1097,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1128,7 +1114,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1144,7 +1130,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1156,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1177,7 +1163,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1186,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1195,7 +1181,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1209,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1221,7 +1207,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1233,7 +1219,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1252,7 +1238,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 90, "metadata": {}, 
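The hunk above likewise drops the `future_encoders` fallback for `ColumnTransformer`. In rough outline, the preprocessing pipeline those cells assemble looks like the sketch below (the notebook's version also inserts its `CombinedAttributesAdder` step between the imputer and the scaler):

```python
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, list(housing_num)),      # numerical columns
    ("cat", OneHotEncoder(), ["ocean_proximity"]), # categorical column
])

housing_prepared = full_pipeline.fit_transform(housing)
```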
"outputs": [], "source": [ @@ -1265,7 +1251,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1279,7 +1265,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1293,24 +1279,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Note**: we specify `n_estimators=10` to avoid a warning about the fact that the default value is going to change to 100 in Scikit-Learn 0.22." + "**Note**: we specify `n_estimators=100` to be future-proof since the default value is going to change to 100 in Scikit-Learn 0.22 (for simplicity, this is not shown in the book)." ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestRegressor\n", "\n", - "forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)\n", + "forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)\n", "forest_reg.fit(housing_prepared, housing_labels)" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1322,7 +1308,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1336,7 +1322,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1346,7 +1332,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1362,7 +1348,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1378,7 +1364,8 @@ "forest_reg = RandomForestRegressor(random_state=42)\n", "# train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n", "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", - " scoring='neg_mean_squared_error', return_train_score=True)\n", + " scoring='neg_mean_squared_error',\n", + " return_train_score=True)\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, @@ -1391,7 +1378,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1400,7 +1387,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1416,7 +1403,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1427,7 +1414,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1436,7 +1423,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1456,7 +1443,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1467,7 +1454,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1477,7 +1464,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1491,7 +1478,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1509,7 +1496,7 @@ }, { "cell_type": "code", - 
"execution_count": 106, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1525,26 +1512,16 @@ }, { "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "from scipy import stats" - ] - }, - { - "cell_type": "code", - "execution_count": 108, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ + "from scipy import stats\n", + "\n", "confidence = 0.95\n", "squared_errors = (final_predictions - y_test) ** 2\n", - "mean = squared_errors.mean()\n", - "m = len(squared_errors)\n", - "\n", - "np.sqrt(stats.t.interval(confidence, m - 1,\n", - " loc=np.mean(squared_errors),\n", + "np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,\n", + " loc=squared_errors.mean(),\n", " scale=stats.sem(squared_errors)))" ] }, @@ -1557,10 +1534,12 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ + "m = len(squared_errors)\n", + "mean = squared_errors.mean()\n", "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n", "tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)\n", "np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)" @@ -1575,7 +1554,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1600,7 +1579,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -1622,7 +1601,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1631,7 +1610,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1650,7 +1629,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1688,7 +1667,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -1701,7 +1680,7 @@ " ]\n", "\n", "svm_reg = SVR()\n", - "grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n", + "grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)\n", "grid_search.fit(housing_prepared, housing_labels)" ] }, @@ -1714,7 +1693,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -1732,7 +1711,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -1762,7 +1741,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -1782,7 +1761,7 @@ "svm_reg = SVR()\n", "rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,\n", " n_iter=50, cv=5, scoring='neg_mean_squared_error',\n", - " verbose=2, n_jobs=4, random_state=42)\n", + " verbose=2, random_state=42)\n", "rnd_search.fit(housing_prepared, housing_labels)" ] }, @@ -1795,7 +1774,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -1813,7 +1792,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -1836,7 +1815,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 122, "metadata": {}, "outputs": [], 
"source": [ @@ -1861,7 +1840,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -1900,7 +1879,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -1936,7 +1915,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -1952,7 +1931,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -1962,7 +1941,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -1978,7 +1957,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -1994,7 +1973,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -2006,7 +1985,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -2022,7 +2001,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -2038,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -2068,7 +2047,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -2081,7 +2060,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2097,7 +2076,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -2141,7 +2120,7 @@ "}]\n", "\n", "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", - " scoring='neg_mean_squared_error', verbose=2, n_jobs=4)\n", + " scoring='neg_mean_squared_error', verbose=2)\n", "grid_search_prep.fit(housing, housing_labels)" ] }, @@ -2171,7 +2150,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/03_classification.ipynb b/03_classification.ipynb index 96eef1a..e1b0ec4 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -64,14 +64,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# MNIST" + "This notebook assumes you have installed Scikit-Learn ≥0.20." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: `fetch_mldata()` is deprecated since Scikit-Learn 0.20. You should use `fetch_openml()` instead. However, it returns the unsorted MNIST dataset, whereas `fetch_mldata()` returned the dataset sorted by target (the training set and the test test were sorted separately). 
In general, this is fine, but if you want to get the exact same results as before, you need to sort the dataset using the following function:" + "# MNIST" ] }, { @@ -82,8 +92,7 @@ "source": [ "from sklearn.datasets import fetch_openml\n", "mnist = fetch_openml('mnist_784', version=1)\n", - "mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings\n", - "mnist[\"data\"], mnist[\"target\"]" + "mnist.keys()" ] }, { @@ -91,15 +100,6 @@ "execution_count": 3, "metadata": {}, "outputs": [], - "source": [ - "mnist.data.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], "source": [ "X, y = mnist[\"data\"], mnist[\"target\"]\n", "X.shape" @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -133,7 +133,7 @@ "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "\n", - "some_digit = X[36000]\n", + "some_digit = X[0]\n", "some_digit_image = some_digit.reshape(28, 28)\n", "plt.imshow(some_digit_image, cmap = mpl.cm.binary, interpolation=\"nearest\")\n", "plt.axis(\"off\")\n", @@ -142,11 +142,29 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "y[0]" + ] + }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], + "source": [ + "y = y.astype(np.uint8)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], "source": [ "def plot_digit(data):\n", " image = data.reshape(28, 28)\n", @@ -157,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -178,26 +196,17 @@ " plt.axis(\"off\")" ] }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=(9,9))\n", - "example_images = np.r_[X[:12000:600], X[13000:30600:600], X[30600:60000:590]]\n", - "plot_digits(example_images, images_per_row=10)\n", - "save_fig(\"more_digits_plot\")\n", - "plt.show()" - ] - }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "y[36000]" + "plt.figure(figsize=(9,9))\n", + "example_images = X[:100]\n", + "plot_digits(example_images, images_per_row=10)\n", + "save_fig(\"more_digits_plot\")\n", + "plt.show()" ] }, { @@ -206,7 +215,7 @@ "metadata": {}, "outputs": [], "source": [ - "X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]" + "y[0]" ] }, { @@ -215,10 +224,7 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "\n", - "shuffle_index = np.random.permutation(60000)\n", - "X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]" + "X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]" ] }, { @@ -242,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Note**: a few hyperparameters will have a different default value in future versions of Scikit-Learn, so a warning is issued if you do not set them explicitly. This is why we set `max_iter=5` and `tol=-np.infty`, to get the same results as in the book, while avoiding the warnings." 
+ "**Note**: some hyperparameters will have a different defaut value in future versions of Scikit-Learn, such as `max_iter` and `tol`. To be future-proof, we explicitly set these hyperparameters to their future default values. For simplicity, this is not shown in the book." ] }, { @@ -253,7 +259,7 @@ "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)\n", + "sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)\n", "sgd_clf.fit(X_train, y_train_5)" ] }, @@ -290,9 +296,9 @@ "for train_index, test_index in skfolds.split(X_train, y_train_5):\n", " clone_clf = clone(sgd_clf)\n", " X_train_folds = X_train[train_index]\n", - " y_train_folds = (y_train_5[train_index])\n", + " y_train_folds = y_train_5[train_index]\n", " X_test_fold = X_train[test_index]\n", - " y_test_fold = (y_train_5[test_index])\n", + " y_test_fold = y_train_5[test_index]\n", "\n", " clone_clf.fit(X_train_folds, y_train_folds)\n", " y_pred = clone_clf.predict(X_test_fold)\n", @@ -351,9 +357,7 @@ "execution_count": 23, "metadata": {}, "outputs": [], - "source": [ - "y_train_perfect_predictions = y_train_5" - ] + "source": [] }, { "cell_type": "code", @@ -361,6 +365,7 @@ "metadata": {}, "outputs": [], "source": [ + "y_train_perfect_predictions = y_train_5 # pretend we reached perfection\n", "confusion_matrix(y_train_5, y_train_perfect_predictions)" ] }, @@ -377,16 +382,16 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "4432 / (4432 + 1607)" + "4096 / (4096 + 1522)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -395,11 +400,11 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "4432 / (4432 + 989)" + "4096 / (4096 + 1325)" ] }, { @@ -409,21 +414,22 @@ "outputs": [], "source": [ "from sklearn.metrics import f1_score\n", + "\n", "f1_score(y_train_5, y_train_pred)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "4432 / (4432 + (1607 + 989)/2)" + "4096 / (4096 + (1522 + 1325) / 2)" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -433,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -443,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -452,18 +458,18 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "threshold = -200000\n", + "threshold = 8000\n", "y_some_digit_pred = (y_scores > threshold)\n", "y_some_digit_pred" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -473,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -484,27 +490,32 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):\n", " plt.plot(thresholds, precisions[:-1], \"b--\", label=\"Precision\", linewidth=2)\n", " plt.plot(thresholds, recalls[:-1], \"g-\", 
label=\"Recall\", linewidth=2)\n", - " plt.xlabel(\"Threshold\", fontsize=16)\n", - " plt.legend(loc=\"upper left\", fontsize=16)\n", - " plt.ylim([0, 1])\n", + " plt.legend(loc=\"center right\", fontsize=16) # Not shown in the book\n", + " plt.xlabel(\"Threshold\", fontsize=16) # Not shown\n", + " plt.grid(True) # Not shown\n", + " plt.axis([-50000, 50000, 0, 1]) # Not shown\n", "\n", - "plt.figure(figsize=(8, 4))\n", + "plt.figure(figsize=(8, 4)) # Not shown\n", "plot_precision_recall_vs_threshold(precisions, recalls, thresholds)\n", - "plt.xlim([-700000, 700000])\n", - "save_fig(\"precision_recall_vs_threshold_plot\")\n", + "plt.plot([7813, 7813], [0., 0.9], \"r:\") # Not shown\n", + "plt.plot([-50000, 7813], [0.9, 0.9], \"r:\") # Not shown\n", + "plt.plot([-50000, 7813], [0.4368, 0.4368], \"r:\")# Not shown\n", + "plt.plot([7813], [0.9], \"ro\") # Not shown\n", + "plt.plot([7813], [0.4368], \"ro\") # Not shown\n", + "save_fig(\"precision_recall_vs_threshold_plot\") # Not shown\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -513,34 +524,7 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "y_train_pred_90 = (y_scores > 70000)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "precision_score(y_train_5, y_train_pred_90)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "recall_score(y_train_5, y_train_pred_90)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -549,13 +533,62 @@ " plt.xlabel(\"Recall\", fontsize=16)\n", " plt.ylabel(\"Precision\", fontsize=16)\n", " plt.axis([0, 1, 0, 1])\n", + " plt.grid(True)\n", "\n", "plt.figure(figsize=(8, 6))\n", "plot_precision_vs_recall(precisions, recalls)\n", + "plt.plot([0.4368, 0.4368], [0., 0.9], \"r:\")\n", + "plt.plot([0.0, 0.4368], [0.9, 0.9], \"r:\")\n", + "plt.plot([0.4368], [0.9], \"ro\")\n", "save_fig(\"precision_vs_recall_plot\")\n", "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)] # == 7813" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold_90_precision" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "y_train_pred_90 = (y_scores >= threshold_90_precision)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "precision_score(y_train_5, y_train_pred_90)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "recall_score(y_train_5, y_train_pred_90)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -565,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -576,26 +609,30 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def plot_roc_curve(fpr, tpr, label=None):\n", " plt.plot(fpr, tpr, linewidth=2, label=label)\n", - " plt.plot([0, 1], [0, 1], 'k--')\n", - " plt.axis([0, 1, 0, 1])\n", - " plt.xlabel('False Positive Rate', 
fontsize=16)\n", - " plt.ylabel('True Positive Rate', fontsize=16)\n", + " plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal\n", + " plt.axis([0, 1, 0, 1]) # Not shown in the book\n", + " plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16) # Not shown\n", + " plt.ylabel('True Positive Rate (Recall)', fontsize=16) # Not shown\n", + " plt.grid(True) # Not shown\n", "\n", - "plt.figure(figsize=(8, 6))\n", + "plt.figure(figsize=(8, 6)) # Not shown\n", "plot_roc_curve(fpr, tpr)\n", - "save_fig(\"roc_curve_plot\")\n", + "plt.plot([4.837e-3, 4.837e-3], [0., 0.4368], \"r:\") # Not shown\n", + "plt.plot([0.0, 4.837e-3], [0.4368, 0.4368], \"r:\") # Not shown\n", + "plt.plot([4.837e-3], [0.4368], \"ro\") # Not shown\n", + "save_fig(\"roc_curve_plot\") # Not shown\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -608,24 +645,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Note**: we set `n_estimators=10` to avoid a warning about the fact that its default value will be set to 100 in Scikit-Learn 0.22." + "**Note**: we set `n_estimators=100` to be future-proof since this will be the default value in Scikit-Learn 0.22." ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", - "forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", + "forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", "y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,\n", " method=\"predict_proba\")" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -635,13 +672,19 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8, 6))\n", "plt.plot(fpr, tpr, \"b:\", linewidth=2, label=\"SGD\")\n", "plot_roc_curve(fpr_forest, tpr_forest, \"Random Forest\")\n", + "plt.plot([4.837e-3, 4.837e-3], [0., 0.4368], \"r:\")\n", + "plt.plot([0.0, 4.837e-3], [0.4368, 0.4368], \"r:\")\n", + "plt.plot([4.837e-3], [0.4368], \"ro\")\n", + "plt.plot([4.837e-3, 4.837e-3], [0., 0.9487], \"r:\")\n", + "plt.plot([4.837e-3], [0.9487], \"ro\")\n", + "plt.grid(True)\n", "plt.legend(loc=\"lower right\", fontsize=16)\n", "save_fig(\"roc_curve_comparison_plot\")\n", "plt.show()" @@ -649,7 +692,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -658,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -668,7 +711,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -684,17 +727,17 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ - "sgd_clf.fit(X_train, y_train)\n", + "sgd_clf.fit(X_train, y_train) # y_train, not y_train_5\n", "sgd_clf.predict([some_digit])" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -704,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -713,7 +756,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 56, "metadata": 
{}, "outputs": [], "source": [ @@ -722,7 +765,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -731,19 +774,19 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "from sklearn.multiclass import OneVsOneClassifier\n", - "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=-np.infty, random_state=42))\n", + "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=1000, tol=1e-3, random_state=42))\n", "ovo_clf.fit(X_train, y_train)\n", "ovo_clf.predict([some_digit])" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -752,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -762,7 +805,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -771,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -780,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -792,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -803,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -817,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -828,7 +871,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -838,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -850,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -878,7 +921,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -894,7 +937,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -910,11 +953,11 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ - "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)\n", + "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)\n", "f1_score(y_multilabel, y_train_knn_pred, average=\"macro\")" ] }, @@ -927,7 +970,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -941,11 +984,11 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ - "some_index = 5500\n", + "some_index = 0\n", "plt.subplot(121); plot_digit(X_test_mod[some_index])\n", "plt.subplot(122); plot_digit(y_test_mod[some_index])\n", "save_fig(\"noisy_digit_example_plot\")\n", @@ -954,7 +997,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -980,7 +1023,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 77, "metadata": {}, "outputs": [], 
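The chapter-3 exercise cells renumbered below include the data-augmentation exercise, which shifts each training image by one pixel; its helper is unchanged context in this diff. A sketch of such a helper, assuming `scipy.ndimage` and flattened 28x28 MNIST digits:

```python
import numpy as np
from scipy.ndimage import shift

def shift_image(image, dx, dy):
    # Move a flattened 28x28 digit dx pixels right and dy pixels down,
    # filling the vacated border with black (0).
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0)
    return shifted_image.reshape([-1])
```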
"source": [ @@ -992,7 +1035,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 78, "metadata": { "scrolled": true }, @@ -1011,18 +1054,18 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", - "knn_clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=4)\n", + "knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=4)\n", "knn_clf.fit(X_train, y_train)" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1031,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1041,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1054,7 +1097,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1072,7 +1115,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1081,7 +1124,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1090,7 +1133,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1099,7 +1142,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1141,7 +1184,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1150,13 +1193,13 @@ "param_grid = [{'weights': [\"uniform\", \"distance\"], 'n_neighbors': [3, 4, 5]}]\n", "\n", "knn_clf = KNeighborsClassifier()\n", - "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)\n", + "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)\n", "grid_search.fit(X_train, y_train)" ] }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1165,7 +1208,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1174,7 +1217,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1193,7 +1236,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1202,7 +1245,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1214,7 +1257,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1237,7 +1280,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1255,7 +1298,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1266,7 +1309,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1275,7 +1318,7 @@ }, { 
"cell_type": "code", - "execution_count": 100, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1284,7 +1327,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1372,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1340,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1512,6 +1555,13 @@ "The Embarked attribute tells us where the passenger embarked: C=Cherbourg, Q=Queenstown, S=Southampton." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: the code below uses a mix of `Pipeline`, `FeatureUnion` and a custom `DataFrameSelector` to preprocess some columns differently. Since Scikit-Learn 0.20, it is preferable to use a `ColumnTransformer`, like in the previous chapter." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1527,8 +1577,6 @@ "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", "\n", - "# A class to select numerical or categorical columns \n", - "# since Scikit-Learn doesn't handle DataFrames yet\n", "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", " def __init__(self, attribute_names):\n", " self.attribute_names = attribute_names\n", @@ -1542,9 +1590,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's build the pipeline for the numerical attributes:\n", - "\n", - "**Warning**: Since Scikit-Learn 0.20, the `sklearn.preprocessing.Imputer` class was replaced by the `sklearn.impute.SimpleImputer` class." + "Let's build the pipeline for the numerical attributes:" ] }, { @@ -1554,10 +1600,7 @@ "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", - "try:\n", - " from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+\n", - "except ImportError:\n", - " from sklearn.preprocessing import Imputer as SimpleImputer\n", + "from sklearn.impute import SimpleImputer\n", "\n", "num_pipeline = Pipeline([\n", " (\"select_numeric\", DataFrameSelector([\"Age\", \"SibSp\", \"Parch\", \"Fare\"])),\n", @@ -1597,24 +1640,13 @@ " return X.fillna(self.most_frequent_)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: earlier versions of the book used the `LabelBinarizer` or `CategoricalEncoder` classes to convert each categorical value to a one-hot vector. It is now preferable to use the `OneHotEncoder` class. Since Scikit-Learn 0.20 it can handle string categorical inputs (see [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)), not just integer categorical inputs. If you are using an older version of Scikit-Learn, you can import the new version from `future_encoders.py`:" - ] - }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ - "try:\n", - " from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20\n", - " from sklearn.preprocessing import OneHotEncoder\n", - "except ImportError:\n", - " from future_encoders import OneHotEncoder # Scikit-Learn < 0.20" + "from sklearn.preprocessing import OneHotEncoder" ] }, { @@ -2432,6 +2464,13 @@ "X_train_transformed = preprocess_pipeline.fit_transform(X_train)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: to be future-proof, we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22." 
+ ] + }, { "cell_type": "code", "execution_count": 159, @@ -2441,7 +2480,7 @@ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import cross_val_score\n", "\n", - "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", + "log_clf = LogisticRegression(solver=\"lbfgs\", random_state=42)\n", "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n", "score.mean()" ] @@ -2465,7 +2504,7 @@ "\n", "X_test_transformed = preprocess_pipeline.transform(X_test)\n", "\n", - "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", + "log_clf = LogisticRegression(solver=\"lbfgs\", random_state=42)\n", "log_clf.fit(X_train_transformed, y_train)\n", "\n", "y_pred = log_clf.predict(X_test_transformed)\n", @@ -2484,7 +2523,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index cf52355..efada9b 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -73,7 +73,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Linear regression using the Normal Equation" + "This notebook assumes you have installed Scikit-Learn ≥0.20." ] }, { @@ -81,6 +81,23 @@ "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Linear regression using the Normal Equation" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "import numpy as np\n", "\n", @@ -90,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -123,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -135,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -154,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -170,11 +187,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", + "\n", "lin_reg = LinearRegression()\n", "lin_reg.fit(X, y)\n", "lin_reg.intercept_, lin_reg.coef_" @@ -182,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -198,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -215,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -238,14 +256,15 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "eta = 0.1\n", + "eta = 0.1 # learning rate\n", "n_iterations = 1000\n", "m = 100\n", - "theta = np.random.randn(2,1)\n", + "\n", + 
"theta = np.random.randn(2,1) # random initialization\n", "\n", "for iteration in range(n_iterations):\n", " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", @@ -254,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -263,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -272,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -298,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -324,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -335,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -371,8 +390,10 @@ }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, + "execution_count": 21, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "theta" @@ -380,18 +401,19 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", - "sgd_reg = SGDRegressor(max_iter=50, tol=-np.infty, penalty=None, eta0=0.1, random_state=42)\n", + "\n", + "sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1, random_state=42)\n", "sgd_reg.fit(X, y.ravel())" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -407,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -440,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -449,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -460,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -485,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -497,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -508,7 +530,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -522,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -534,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -543,7 +565,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -554,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -573,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -604,7 +626,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -630,7 +652,7 
@@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -643,7 +665,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -669,17 +691,47 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import Ridge\n", - "\n", "np.random.seed(42)\n", "m = 20\n", "X = 3 * np.random.rand(m, 1)\n", "y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5\n", - "X_new = np.linspace(0, 3, 100).reshape(100, 1)\n", + "X_new = np.linspace(0, 3, 100).reshape(100, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import Ridge\n", + "ridge_reg = Ridge(alpha=1, solver=\"cholesky\", random_state=42)\n", + "ridge_reg.fit(X, y)\n", + "ridge_reg.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", + "ridge_reg.fit(X, y)\n", + "ridge_reg.predict([[1.5]])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import Ridge\n", "\n", "def plot_model(model_class, polynomial, alphas, **model_kargs):\n", " for alpha, style in zip(alphas, (\"b-\", \"g--\", \"r:\")):\n", @@ -711,42 +763,26 @@ ] }, { - "cell_type": "code", - "execution_count": 39, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from sklearn.linear_model import Ridge\n", - "ridge_reg = Ridge(alpha=1, solver=\"cholesky\", random_state=42)\n", - "ridge_reg.fit(X, y)\n", - "ridge_reg.predict([[1.5]])" + "**Note**: to be future-proof, we set `max_iter=1000` and `tol=1e-3` because these will be the default values in Scikit-Learn 0.21." 
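For readers wondering what `tol` actually controls here: `SGDRegressor` stops training once the loss improves by less than `tol` for a few consecutive epochs, so `max_iter=1000` is only an upper bound. A minimal sketch (toy data mirroring the linear example above; all names are illustrative):

```python
import numpy as np
from sklearn.linear_model import SGDRegressor

# Toy linear data: y = 4 + 3x + Gaussian noise
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = (4 + 3 * X + np.random.randn(100, 1)).ravel()

# With tol=1e-3, training halts as soon as the loss stops improving by
# more than tol, usually well before the max_iter ceiling is reached.
sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None,
                       eta0=0.1, random_state=42)
sgd_reg.fit(X, y)
print(sgd_reg.n_iter_)  # epochs actually run, typically far below 1000
```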
] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "sgd_reg = SGDRegressor(max_iter=50, tol=-np.infty, penalty=\"l2\", random_state=42)\n", + "sgd_reg = SGDRegressor(penalty=\"l2\", max_iter=1000, tol=1e-3, random_state=42)\n", "sgd_reg.fit(X, y.ravel())\n", "sgd_reg.predict([[1.5]])" ] }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", - "ridge_reg.fit(X, y)\n", - "ridge_reg.predict([[1.5]])" - ] - }, - { - "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -757,7 +793,7 @@ "plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)\n", "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", "plt.subplot(122)\n", - "plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1, random_state=42)\n", + "plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), random_state=42)\n", "\n", "save_fig(\"lasso_regression_plot\")\n", "plt.show()" @@ -765,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -777,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -789,10 +825,8 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": { - "scrolled": true - }, + "execution_count": 47, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -800,23 +834,65 @@ "X = 6 * np.random.rand(m, 1) - 3\n", "y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)\n", "\n", - "X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)\n", + "X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Early stopping example:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklearn.base import clone\n", "\n", "poly_scaler = Pipeline([\n", " (\"poly_features\", PolynomialFeatures(degree=90, include_bias=False)),\n", - " (\"std_scaler\", StandardScaler()),\n", + " (\"std_scaler\", StandardScaler())\n", " ])\n", "\n", "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", "X_val_poly_scaled = poly_scaler.transform(X_val)\n", "\n", - "sgd_reg = SGDRegressor(max_iter=1,\n", - " tol=-np.infty,\n", - " penalty=None,\n", - " eta0=0.0005,\n", - " warm_start=True,\n", - " learning_rate=\"constant\",\n", - " random_state=42)\n", + "sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,\n", + " penalty=None, learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", + "\n", + "minimum_val_error = float(\"inf\")\n", + "best_epoch = None\n", + "best_model = None\n", + "for epoch in range(1000):\n", + " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", + " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", + " val_error = mean_squared_error(y_val, y_val_predict)\n", + " if val_error < minimum_val_error:\n", + " minimum_val_error = val_error\n", + " best_epoch = epoch\n", + " best_model = clone(sgd_reg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the graph:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": 
[], + "source": [ + "sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,\n", + " penalty=None, learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", "\n", "n_epochs = 500\n", "train_errors, val_errors = [], []\n", @@ -851,30 +927,7 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.base import clone\n", - "sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True, penalty=None,\n", - " learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", - "\n", - "minimum_val_error = float(\"inf\")\n", - "best_epoch = None\n", - "best_model = None\n", - "for epoch in range(1000):\n", - " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", - " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " val_error = mean_squared_error(y_val, y_val_predict)\n", - " if val_error < minimum_val_error:\n", - " minimum_val_error = val_error\n", - " best_epoch = epoch\n", - " best_model = clone(sgd_reg)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -883,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -894,7 +947,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -921,7 +974,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -989,7 +1042,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -1010,7 +1063,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1021,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1030,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -1038,20 +1091,27 @@ "y = (iris[\"target\"] == 2).astype(np.int) # 1 if Iris-Virginica, else 0" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: To be future-proof we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22." 
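As a quick sanity check that switching to `solver="lbfgs"` changes only the optimization routine, not the model being fit, here is a sketch on the same iris task used in this section (small numerical differences between solvers are expected, since they handle regularization details slightly differently):

```python
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
X = iris["data"][:, 3:]                # petal width
y = (iris["target"] == 2).astype(int)  # 1 if Iris-Virginica, else 0

# Same loss, different optimizers: the fitted parameters nearly match.
for solver in ("liblinear", "lbfgs"):
    log_reg = LogisticRegression(solver=solver, random_state=42)
    log_reg.fit(X, y)
    print(solver, log_reg.intercept_, log_reg.coef_)
```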
+ ] + }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", - "log_reg = LogisticRegression(solver=\"liblinear\", random_state=42)\n", + "log_reg = LogisticRegression(solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X, y)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1071,7 +1131,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1098,7 +1158,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1107,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1116,7 +1176,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1125,7 +1185,7 @@ "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", "y = (iris[\"target\"] == 2).astype(np.int)\n", "\n", - "log_reg = LogisticRegression(solver=\"liblinear\", C=10**10, random_state=42)\n", + "log_reg = LogisticRegression(solver=\"lbfgs\", C=10**10, random_state=42)\n", "log_reg.fit(X, y)\n", "\n", "x0, x1 = np.meshgrid(\n", @@ -1160,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1173,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1211,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1220,7 +1280,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1265,7 +1325,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1282,7 +1342,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1298,7 +1358,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1314,7 +1374,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1345,7 +1405,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1366,7 +1426,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1375,7 +1435,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1391,7 +1451,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1411,7 +1471,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1430,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1458,7 +1518,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1489,7 +1549,7 @@ }, { "cell_type": 
"code", - "execution_count": 76, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1505,7 +1565,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1526,7 +1586,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1560,7 +1620,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1588,7 +1648,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1628,7 +1688,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1656,7 +1716,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1701,7 +1761,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1730,7 +1790,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index 2c578dc..c470144 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -60,6 +67,23 @@ " plt.savefig(path, format='png', dpi=300)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook assumes you have installed Scikit-Learn ≥0.20." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -76,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -160,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -204,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -269,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -293,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -309,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -332,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -356,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -393,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -432,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -453,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -472,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -495,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -510,7 +534,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -523,7 +547,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -545,7 +569,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": { "scrolled": true }, @@ -613,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -625,7 +649,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -638,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": { "scrolled": true }, @@ -681,7 +705,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -693,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -705,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -728,7 +752,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -765,7 +789,7 @@ }, { 
"cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -779,19 +803,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the default value of `gamma` will change from `'auto'` to `'scale'` in version 0.22 to better account for unscaled features. To preserve the same results as in the book, we explicitly set it to `'auto'`, but you should probably just use the default in your own code." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.svm import SVR\n", - "\n", - "svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"auto\")\n", - "svm_poly_reg.fit(X, y)" + "**Note**: to be future-proof, we set `gamma=\"scale\"`, as this will be the default value in Scikit-Learn 0.22." ] }, { @@ -802,15 +814,27 @@ "source": [ "from sklearn.svm import SVR\n", "\n", - "svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"auto\")\n", - "svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01, epsilon=0.1, gamma=\"auto\")\n", + "svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"scale\")\n", + "svm_poly_reg.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVR\n", + "\n", + "svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"scale\")\n", + "svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01, epsilon=0.1, gamma=\"scale\")\n", "svm_poly_reg1.fit(X, y)\n", "svm_poly_reg2.fit(X, y)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -835,7 +859,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -846,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -874,17 +898,17 @@ " ax.plot_wireframe(x1, x2, df, alpha=0.3, color=\"k\")\n", " ax.plot(X_crop[:, 0][y_crop==0], X_crop[:, 1][y_crop==0], 0, \"bs\")\n", " ax.axis(x1_lim + x2_lim)\n", - " ax.text(4.5, 2.5, 3.8, \"Decision function $h$\", fontsize=15)\n", - " ax.set_xlabel(r\"Petal length\", fontsize=15)\n", - " ax.set_ylabel(r\"Petal width\", fontsize=15)\n", - " ax.set_zlabel(r\"$h = \\mathbf{w}^T \\mathbf{x} + b$\", fontsize=18)\n", + " ax.text(4.5, 2.5, 3.8, \"Decision function $h$\", fontsize=16)\n", + " ax.set_xlabel(r\"Petal length\", fontsize=16, labelpad=10)\n", + " ax.set_ylabel(r\"Petal width\", fontsize=16, labelpad=10)\n", + " ax.set_zlabel(r\"$h = \\mathbf{w}^T \\mathbf{x} + b$\", fontsize=18, labelpad=5)\n", " ax.legend(loc=\"upper left\", fontsize=16)\n", "\n", "fig = plt.figure(figsize=(11, 6))\n", "ax1 = fig.add_subplot(111, projection='3d')\n", "plot_3D_decision_function(ax1, w=svm_clf2.coef_[0], b=svm_clf2.intercept_[0])\n", "\n", - "#save_fig(\"iris_3D_plot\")\n", + "save_fig(\"iris_3D_plot\")\n", "plt.show()" ] }, @@ -897,7 +921,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -931,7 +955,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -956,7 +980,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -992,7 +1016,7 @@ }, { "cell_type": "code", - 
"execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1003,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -1021,7 +1045,11 @@ " tols.append(tol)\n", " print(i, tol, t2-t1)\n", " tol /= 10\n", - "plt.semilogx(tols, times)" + "plt.semilogx(tols, times, \"bo-\")\n", + "plt.xlabel(\"Tolerance\", fontsize=16)\n", + "plt.ylabel(\"Time (seconds)\", fontsize=16)\n", + "plt.grid(True)\n", + "plt.show()" ] }, { @@ -1033,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1044,7 +1072,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1137,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -1119,7 +1147,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1128,7 +1156,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1139,7 +1167,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1165,7 +1193,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": { "scrolled": true }, @@ -1173,7 +1201,7 @@ "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(loss=\"hinge\", alpha = 0.017, max_iter = 50, tol=-np.infty, random_state=42)\n", + "sgd_clf = SGDClassifier(loss=\"hinge\", alpha=0.017, max_iter=1000, tol=1e-3, random_state=42)\n", "sgd_clf.fit(X, y.ravel())\n", "\n", "m = len(X)\n", @@ -1242,7 +1270,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1259,7 +1287,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -1273,7 +1301,7 @@ "lin_clf = LinearSVC(loss=\"hinge\", C=C, random_state=42)\n", "svm_clf = SVC(kernel=\"linear\", C=C)\n", "sgd_clf = SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001, alpha=alpha,\n", - " max_iter=100000, tol=-np.infty, random_state=42)\n", + " max_iter=1000, tol=1e-3, random_state=42)\n", "\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)\n", @@ -1296,7 +1324,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1358,19 +1386,15 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ - "try:\n", - " from sklearn.datasets import fetch_openml\n", - " mnist = fetch_openml('mnist_784', version=1, cache=True)\n", - "except ImportError:\n", - " from sklearn.datasets import fetch_mldata\n", - " mnist = fetch_mldata('MNIST original')\n", + "from sklearn.datasets import fetch_openml\n", + "mnist = fetch_openml('mnist_784', version=1, cache=True)\n", "\n", "X = mnist[\"data\"]\n", - "y = mnist[\"target\"]\n", + "y = mnist[\"target\"].astype(np.uint8)\n", "\n", "X_train = X[:60000]\n", "y_train = y[:60000]\n", @@ -1382,31 +1406,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Many training algorithms are sensitive to the order of the training instances, so it's generally good 
practice to shuffle them first:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "rnd_idx = np.random.permutation(60000)\n", - "X_train = X_train[rnd_idx]\n", - "y_train = y_train[rnd_idx]" + "Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first. However, the dataset is already shuffled, so we do not need to do it." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!" + "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!\n", + "\n", + "**Warning**: this may take a few minutes depending on your hardware." ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1423,7 +1437,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1437,12 +1451,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Wow, 86% accuracy on MNIST is a really bad performance. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" + "Okay, 89.5% accuracy on MNIST is pretty bad. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1451,9 +1465,16 @@ "X_test_scaled = scaler.transform(X_test.astype(np.float32))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: this may take a few minutes depending on your hardware." + ] + }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1463,7 +1484,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1475,24 +1496,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "That's much better (we cut the error rate in two), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an `SVC` with an RBF kernel (the default).\n", - "\n", - "**Warning**: if you are using Scikit-Learn ≤ 0.19, the `SVC` class will use the One-vs-One (OvO) strategy by default, so you must explicitly set `decision_function_shape=\"ovr\"` if you want to use the OvR strategy instead (OvR is the default since 0.19)." + "That's much better (we cut the error rate by about 25%), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an `SVC` with an RBF kernel (the default)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: to be future-proof we set `gamma=\"scale\"` since it will be the default value in Scikit-Learn 0.22." 
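For reference, `gamma="scale"` sets the RBF kernel coefficient to `1 / (n_features * X.var())`, whereas the old `gamma="auto"` used `1 / n_features`, so the new default adapts to the variance of the inputs. A small self-contained sketch (the toy data is purely illustrative):

```python
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(42)
X = rng.rand(200, 2) * 10                 # deliberately unscaled features
y = (X[:, 0] + X[:, 1] > 10).astype(int)

n_features = X.shape[1]
print("gamma='auto' :", 1 / n_features)              # ignores the data
print("gamma='scale':", 1 / (n_features * X.var()))  # shrinks as variance grows

svm_clf = SVC(kernel="rbf", gamma="scale")  # uses the second value internally
svm_clf.fit(X, y)
```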
] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ - "svm_clf = SVC(decision_function_shape=\"ovr\", gamma=\"auto\")\n", + "svm_clf = SVC(gamma=\"scale\")\n", "svm_clf.fit(X_train_scaled[:10000], y_train[:10000])" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1793,7 +1819,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/06_decision_trees.ipynb b/06_decision_trees.ipynb index c091db1..b609a5c 100644 --- a/06_decision_trees.ipynb +++ b/06_decision_trees.ipynb @@ -71,7 +71,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Training and visualizing" + "This notebook assumes you have installed Scikit-Learn ≥0.20." ] }, { @@ -79,6 +79,23 @@ "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training and visualizing" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "from sklearn.datasets import load_iris\n", "from sklearn.tree import DecisionTreeClassifier\n", @@ -93,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -165,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -174,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -190,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -199,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -213,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -230,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -256,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -275,7 +292,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -311,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -325,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -337,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -384,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -399,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -489,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], 
"source": [ @@ -507,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -525,21 +542,21 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", "params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}\n", - "grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1, verbose=1, cv=3)\n", + "grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3)\n", "\n", "grid_search_cv.fit(X_train, y_train)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -562,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -595,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -622,7 +639,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -650,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -662,7 +679,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -680,24 +697,17 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/07_ensemble_learning_and_random_forests.ipynb b/07_ensemble_learning_and_random_forests.ipynb index 75517ef..9a6f179 100644 --- a/07_ensemble_learning_and_random_forests.ipynb +++ b/07_ensemble_learning_and_random_forests.ipynb @@ -71,7 +71,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Voting classifiers" + "This notebook assumes you have installed Scikit-Learn ≥0.20." ] }, { @@ -79,6 +79,23 @@ "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Voting classifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], "source": [ "heads_proba = 0.51\n", "coin_tosses = (np.random.rand(10000, 10) < heads_proba).astype(np.int32)\n", @@ -87,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -105,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -120,27 +137,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: In Scikit-Learn 0.20, some hyperparameters (`solver`, `n_estimators`, `gamma`, etc.) start issuing warnings about the fact that their default value will change in Scikit-Learn 0.22. To avoid these warnings and ensure that this notebooks keeps producing the same outputs as in the book, I set the hyperparameters to their old default value. 
In your own code, you can simply rely on the latest default values instead." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.ensemble import VotingClassifier\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.svm import SVC\n", - "\n", - "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", - "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", - "svm_clf = SVC(gamma=\"auto\", random_state=42)\n", - "\n", - "voting_clf = VotingClassifier(\n", - " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", - " voting='hard')" + "**Note**: to be future-proof, we set `solver=\"lbfgs\"`, `n_estimators=100`, and `gamma=\"scale\"` since these will be the default values in upcoming Scikit-Learn versions." ] }, { @@ -149,7 +146,18 @@ "metadata": {}, "outputs": [], "source": [ - "voting_clf.fit(X_train, y_train)" + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.ensemble import VotingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "\n", + "log_clf = LogisticRegression(solver=\"lbfgs\", random_state=42)\n", + "rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "svm_clf = SVC(gamma=\"scale\", random_state=42)\n", + "\n", + "voting_clf = VotingClassifier(\n", + " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", + " voting='hard')" ] }, { @@ -157,6 +165,15 @@ "execution_count": 7, "metadata": {}, "outputs": [], + "source": [ + "voting_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "\n", @@ -166,15 +183,22 @@ " print(clf.__class__.__name__, accuracy_score(y_test, y_pred))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Soft voting:" + ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n", - "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", - "svm_clf = SVC(gamma=\"auto\", probability=True, random_state=42)\n", + "log_clf = LogisticRegression(solver=\"lbfgs\", random_state=42)\n", + "rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "svm_clf = SVC(gamma=\"scale\", probability=True, random_state=42)\n", "\n", "voting_clf = VotingClassifier(\n", " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", @@ -184,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -205,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -214,14 +238,14 @@ "\n", "bag_clf = BaggingClassifier(\n", " DecisionTreeClassifier(random_state=42), n_estimators=500,\n", - " max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)\n", + " max_samples=100, bootstrap=True, random_state=42)\n", "bag_clf.fit(X_train, y_train)\n", "y_pred = bag_clf.predict(X_test)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -231,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": 
{}, "outputs": [], "source": [ @@ -243,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -269,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -293,18 +317,18 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "bag_clf = BaggingClassifier(\n", " DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n", - " n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)" + " n_estimators=500, max_samples=1.0, bootstrap=True, random_state=42)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -314,39 +338,25 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", - "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)\n", + "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)\n", "rnd_clf.fit(X_train, y_train)\n", "\n", "y_pred_rf = rnd_clf.predict(X_test)" ] }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "np.sum(y_pred == y_pred_rf) / len(y_pred) # almost identical predictions" - ] - }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import load_iris\n", - "iris = load_iris()\n", - "rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)\n", - "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n", - "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n", - " print(name, score)" + "np.sum(y_pred == y_pred_rf) / len(y_pred) # almost identical predictions" ] }, { @@ -355,7 +365,12 @@ "metadata": {}, "outputs": [], "source": [ - "rnd_clf.feature_importances_" + "from sklearn.datasets import load_iris\n", + "iris = load_iris()\n", + "rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)\n", + "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n", + "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n", + " print(name, score)" ] }, { @@ -363,6 +378,15 @@ "execution_count": 21, "metadata": {}, "outputs": [], + "source": [ + "rnd_clf.feature_importances_" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], "source": [ "plt.figure(figsize=(6, 4))\n", "\n", @@ -384,20 +408,20 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "bag_clf = BaggingClassifier(\n", " DecisionTreeClassifier(random_state=42), n_estimators=500,\n", - " bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)\n", + " bootstrap=True, oob_score=True, random_state=40)\n", "bag_clf.fit(X_train, y_train)\n", "bag_clf.oob_score_" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -406,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -422,29 +446,16 @@ "## Feature importance" ] }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " from 
sklearn.datasets import fetch_openml\n", - " mnist = fetch_openml('mnist_784', version=1)\n", - " mnist.target = mnist.target.astype(np.int64)\n", - "except ImportError:\n", - " from sklearn.datasets import fetch_mldata\n", - " mnist = fetch_mldata('MNIST original')" - ] - }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", - "rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])" + "from sklearn.datasets import fetch_openml\n", + "\n", + "mnist = fetch_openml('mnist_784', version=1)\n", + "mnist.target = mnist.target.astype(np.uint8)" ] }, { @@ -452,6 +463,16 @@ "execution_count": 27, "metadata": {}, "outputs": [], + "source": [ + "rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], "source": [ "def plot_digit(data):\n", " image = data.reshape(28, 28)\n", @@ -462,7 +483,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -484,7 +505,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -498,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -507,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -518,7 +539,7 @@ " sample_weights = np.ones(m)\n", " plt.subplot(subplot)\n", " for i in range(5):\n", - " svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"auto\", random_state=42)\n", + " svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"scale\", random_state=42)\n", " svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n", " y_pred = svm_clf.predict(X_train)\n", " sample_weights[y_pred != y_train] *= (1 + learning_rate)\n", @@ -537,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -553,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -564,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -576,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -587,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -598,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -607,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -616,7 +637,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -625,7 +646,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -674,7 +695,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -686,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], 
"source": [ @@ -696,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -723,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -746,7 +767,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -755,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -781,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -805,7 +826,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -814,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -830,7 +851,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -843,7 +864,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -851,22 +872,8 @@ " xgb_reg = xgboost.XGBRegressor(random_state=42)\n", " xgb_reg.fit(X_train, y_train)\n", " y_pred = xgb_reg.predict(X_val)\n", - " val_error = mean_squared_error(y_val, y_pred)\n", - " print(\"Validation MSE:\", val_error)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "if xgboost is not None: # not shown in the book\n", - " xgb_reg.fit(X_train, y_train,\n", - " eval_set=[(X_val, y_val)], early_stopping_rounds=2)\n", - " y_pred = xgb_reg.predict(X_val)\n", - " val_error = mean_squared_error(y_val, y_pred)\n", - " print(\"Validation MSE:\", val_error)" + " val_error = mean_squared_error(y_val, y_pred) # Not shown\n", + " print(\"Validation MSE:\", val_error) # Not shown" ] }, { @@ -875,7 +882,12 @@ "metadata": {}, "outputs": [], "source": [ - "%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None" + "if xgboost is not None: # not shown in the book\n", + " xgb_reg.fit(X_train, y_train,\n", + " eval_set=[(X_val, y_val)], early_stopping_rounds=2)\n", + " y_pred = xgb_reg.predict(X_val)\n", + " val_error = mean_squared_error(y_val, y_pred) # Not shown\n", + " print(\"Validation MSE:\", val_error) # Not shown" ] }, { @@ -883,6 +895,15 @@ "execution_count": 54, "metadata": {}, "outputs": [], + "source": [ + "%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], "source": [ "%timeit GradientBoostingRegressor().fit(X_train, y_train)" ] @@ -933,7 +954,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -942,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -961,7 +982,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -972,19 +993,19 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ - "random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", - "extra_trees_clf = 
ExtraTreesClassifier(n_estimators=10, random_state=42)\n", + "random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)\n", "svm_clf = LinearSVC(random_state=42)\n", "mlp_clf = MLPClassifier(random_state=42)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -996,7 +1017,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1019,7 +1040,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1028,7 +1049,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1042,7 +1063,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1051,7 +1072,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1060,7 +1081,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1069,7 +1090,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1085,7 +1106,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1101,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1117,7 +1138,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1133,7 +1154,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1149,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1165,7 +1186,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1174,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1197,7 +1218,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1206,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1236,7 +1257,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1248,7 +1269,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1257,7 +1278,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1267,7 +1288,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1290,7 +1311,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1302,7 +1323,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1311,7 +1332,7 @@ 
}, { "cell_type": "code", - "execution_count": 82, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1320,7 +1341,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1337,7 +1358,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index 9ecd2da..1576e2a 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -64,6 +64,23 @@ "warnings.filterwarnings(action=\"ignore\", message=\"^internal gelsd\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook assumes you have installed Scikit-Learn ≥0.20." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "assert sklearn.__version__ >= \"0.20\"" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -74,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -115,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -127,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -139,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -158,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -181,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -193,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -202,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -220,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -236,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -252,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -268,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -284,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -300,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -316,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -332,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ 
@@ -378,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -394,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -445,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -469,7 +486,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -498,9 +515,9 @@ "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k+\")\n", "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k.\")\n", "ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], \"bo\")\n", - "ax.set_xlabel(\"$x_1$\", fontsize=18)\n", - "ax.set_ylabel(\"$x_2$\", fontsize=18)\n", - "ax.set_zlabel(\"$x_3$\", fontsize=18)\n", + "ax.set_xlabel(\"$x_1$\", fontsize=18, labelpad=10)\n", + "ax.set_ylabel(\"$x_2$\", fontsize=18, labelpad=10)\n", + "ax.set_zlabel(\"$x_3$\", fontsize=18, labelpad=10)\n", "ax.set_xlim(axes[0:2])\n", "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", @@ -519,7 +536,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -548,7 +565,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -558,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -582,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -607,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -695,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -768,23 +785,19 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "from six.moves import urllib\n", - "try:\n", - " from sklearn.datasets import fetch_openml\n", - " mnist = fetch_openml('mnist_784', version=1)\n", - " mnist.target = mnist.target.astype(np.int64)\n", - "except ImportError:\n", - " from sklearn.datasets import fetch_mldata\n", - " mnist = fetch_mldata('MNIST original')" + "from sklearn.datasets import fetch_openml\n", + "\n", + "mnist = fetch_openml('mnist_784', version=1)\n", + "mnist.target = mnist.target.astype(np.uint8)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -798,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -810,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -819,7 +832,28 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(6,4))\n", + "plt.plot(cumsum, linewidth=3)\n", + "plt.axis([0, 400, 0, 1])\n", + "plt.xlabel(\"Dimensions\")\n", + "plt.ylabel(\"Explained Variance\")\n", + "plt.plot([d, d], [0, 0.95], \"k:\")\n", + "plt.plot([0, 
d], [0.95, 0.95], \"k:\")\n", + "plt.plot(d, 0.95, \"ko\")\n", + "plt.annotate(\"Elbow\", xy=(65, 0.85), xytext=(70, 0.7),\n", + " arrowprops=dict(arrowstyle=\"->\"), fontsize=16)\n", + "plt.grid(True)\n", + "save_fig(\"explained_variance_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -829,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -838,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -847,7 +881,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -858,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -880,7 +914,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -897,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -913,7 +947,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -930,7 +964,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -939,7 +973,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -953,7 +987,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -969,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -985,7 +1019,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -1008,7 +1042,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1028,7 +1062,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -1044,7 +1078,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -1057,7 +1091,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -1081,7 +1115,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1143,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -1146,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 55, "metadata": { "scrolled": true }, @@ -1185,7 +1219,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1194,7 +1228,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -1206,7 +1240,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1240,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 59, "metadata": {}, 
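Two notes on the hunks above. First, `fetch_openml('mnist_784', version=1)` returns the MNIST labels as strings, hence the `astype(np.uint8)` cast in the simplified loader. Second, the newly inserted explained-variance plot refers to `cumsum` and `d`, which are defined in cells this diff does not touch; a minimal sketch of that assumed context:

import numpy as np
from sklearn.decomposition import PCA

# Assumed context for the inserted plot cell: cumsum is the cumulative
# explained variance ratio of a full PCA fit, and d is the smallest number
# of dimensions that preserves 95% of the training set's variance.
pca = PCA()
pca.fit(X_train)  # X_train: the MNIST training set loaded above (assumed name)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1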
"outputs": [], "source": [ @@ -1264,7 +1298,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1280,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1290,7 +1324,7 @@ "\n", "clf = Pipeline([\n", " (\"kpca\", KernelPCA(n_components=2)),\n", - " (\"log_reg\", LogisticRegression(solver=\"liblinear\"))\n", + " (\"log_reg\", LogisticRegression(solver=\"lbfgs\"))\n", " ])\n", "\n", "param_grid = [{\n", @@ -1304,7 +1338,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1313,7 +1347,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1325,7 +1359,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1343,7 +1377,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1352,7 +1386,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1364,7 +1398,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1388,7 +1422,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1400,7 +1434,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1412,7 +1446,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1424,7 +1458,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1439,7 +1473,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1535,7 +1569,7 @@ "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", - "rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)" + "rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)" ] }, { @@ -1604,7 +1638,7 @@ "metadata": {}, "outputs": [], "source": [ - "rnd_clf2 = RandomForestClassifier(n_estimators=10, random_state=42)\n", + "rnd_clf2 = RandomForestClassifier(n_estimators=100, random_state=42)\n", "t0 = time.time()\n", "rnd_clf2.fit(X_train_reduced, y_train)\n", "t1 = time.time()" @@ -2232,7 +2266,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/09_unsupervised_learning.ipynb b/09_unsupervised_learning.ipynb index cedcbe1..d7cd98b 100644 --- a/09_unsupervised_learning.ipynb +++ b/09_unsupervised_learning.ipynb @@ -1573,7 +1573,7 @@ "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X_train, y_train)" ] }, @@ -1610,7 +1610,7 @@ "source": [ "pipeline = Pipeline([\n", " (\"kmeans\", KMeans(n_clusters=50, random_state=42)),\n", - " (\"log_reg\", LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)),\n", + " (\"log_reg\", 
LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)),\n", "])\n", "pipeline.fit(X_train, y_train)" ] @@ -1721,7 +1721,7 @@ "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])\n", "log_reg.score(X_test, y_test)" ] @@ -1804,7 +1804,7 @@ "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X_representative_digits, y_representative_digits)\n", "log_reg.score(X_test, y_test)" ] @@ -1840,7 +1840,7 @@ "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X_train, y_train_propagated)" ] }, @@ -1894,7 +1894,7 @@ "metadata": {}, "outputs": [], "source": [ - "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"liblinear\", random_state=42)\n", + "log_reg = LogisticRegression(multi_class=\"ovr\", solver=\"lbfgs\", random_state=42)\n", "log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)" ] }, @@ -2280,7 +2280,7 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -2291,7 +2291,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2300,7 +2300,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 135, "metadata": { "scrolled": true }, @@ -2318,7 +2318,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -2339,7 +2339,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ @@ -2348,7 +2348,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2365,7 +2365,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2374,7 +2374,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 140, "metadata": {}, "outputs": [], "source": [ @@ -2383,7 +2383,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -2399,7 +2399,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -2415,7 +2415,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -2431,7 +2431,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -2440,7 +2440,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -2456,7 +2456,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -2466,7 +2466,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 147, 
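The `liblinear` to `lbfgs` swaps in these hunks, in the kernel-PCA pipeline of notebook 08 and the clustering pipelines of notebook 09, adopt the solver that becomes the `LogisticRegression` default in Scikit-Learn 0.22, avoiding the FutureWarning that 0.20 emits when no solver is pinned. For reference, a hedged reconstruction of the grid search whose `param_grid` is truncated in the notebook 08 hunk; the pipeline comes from the diff, while the search ranges and the `cv=3` setting are assumptions:

import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression(solver="lbfgs")),
])

# Tune kernel PCA's kernel and gamma by cross-validated accuracy of the
# downstream classifier (the ranges here are assumed, not from the diff).
param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"],
}]
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)  # X, y: the Swiss roll data used earlier (assumed names)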
"metadata": {}, "outputs": [], "source": [ @@ -2489,7 +2489,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ @@ -2505,7 +2505,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ @@ -2528,7 +2528,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ @@ -2567,7 +2567,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 151, "metadata": {}, "outputs": [], "source": [ @@ -2592,7 +2592,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ @@ -2608,7 +2608,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ @@ -2626,7 +2626,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ @@ -2638,7 +2638,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -2663,7 +2663,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -2674,7 +2674,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 157, "metadata": {}, "outputs": [], "source": [ @@ -2714,7 +2714,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ @@ -2723,7 +2723,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 159, "metadata": {}, "outputs": [], "source": [ @@ -2739,7 +2739,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 160, "metadata": {}, "outputs": [], "source": [ @@ -2756,7 +2756,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ @@ -2765,7 +2765,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ @@ -2788,7 +2788,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ @@ -2798,7 +2798,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ @@ -2808,7 +2808,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 165, "metadata": {}, "outputs": [], "source": [ @@ -2839,7 +2839,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 166, "metadata": {}, "outputs": [], "source": [ @@ -2858,7 +2858,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 167, "metadata": {}, "outputs": [], "source": [ @@ -2867,7 +2867,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 168, "metadata": {}, "outputs": [], "source": [ @@ -2890,7 +2890,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ @@ -2899,7 +2899,7 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 170, "metadata": {}, "outputs": [], "source": [ @@ -2916,7 +2916,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 171, "metadata": {}, "outputs": [], "source": [ @@ -2925,7 +2925,7 @@ }, { "cell_type": "code", - "execution_count": 173, + 
"execution_count": 172, "metadata": {}, "outputs": [], "source": [ @@ -2936,7 +2936,7 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 173, "metadata": {}, "outputs": [], "source": [ @@ -2951,7 +2951,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 174, "metadata": {}, "outputs": [], "source": [ @@ -2960,7 +2960,7 @@ }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 175, "metadata": {}, "outputs": [], "source": [ @@ -2969,7 +2969,7 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 176, "metadata": {}, "outputs": [], "source": [ @@ -2996,7 +2996,7 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 177, "metadata": {}, "outputs": [], "source": [ @@ -3005,7 +3005,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 178, "metadata": { "scrolled": true }, @@ -3017,7 +3017,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 179, "metadata": {}, "outputs": [], "source": [ @@ -3051,7 +3051,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 180, "metadata": {}, "outputs": [], "source": [ @@ -3060,7 +3060,7 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 181, "metadata": {}, "outputs": [], "source": [ @@ -3073,7 +3073,7 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 182, "metadata": {}, "outputs": [], "source": [ @@ -3163,7 +3163,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/10_neural_nets_with_keras.ipynb b/10_neural_nets_with_keras.ipynb index 630205c..7d15449 100644 --- a/10_neural_nets_with_keras.ipynb +++ b/10_neural_nets_with_keras.ipynb @@ -92,7 +92,7 @@ "X = iris.data[:, (2, 3)] # petal length, petal width\n", "y = (iris.target == 0).astype(np.int)\n", "\n", - "per_clf = Perceptron(max_iter=100, tol=-np.infty, random_state=42)\n", + "per_clf = Perceptron(max_iter=1000, tol=1e-3, random_state=42)\n", "per_clf.fit(X, y)\n", "\n", "y_pred = per_clf.predict([[2, 0.5]])" @@ -474,7 +474,7 @@ " plt.axis('off')\n", " plt.title(class_names[y_train[index]], fontsize=12)\n", "plt.subplots_adjust(wspace=0.2, hspace=0.5)\n", - "save_fig('fashion_mnist', tight_layout=False)\n", + "save_fig('fashion_mnist_diagram', tight_layout=False)\n", "plt.show()" ] }, @@ -668,7 +668,7 @@ "source": [ "model.compile(loss=keras.losses.sparse_categorical_crossentropy,\n", " optimizer=keras.optimizers.SGD(),\n", - " metrics=[keras.metrics.Accuracy()])" + " metrics=[keras.metrics.sparse_categorical_accuracy])" ] }, { @@ -719,7 +719,7 @@ "pd.DataFrame(history.history).plot(figsize=(8, 5))\n", "plt.grid(True)\n", "plt.gca().set_ylim(0, 1)\n", - "save_fig(\"keras_learning_curve_graph\")\n", + "save_fig(\"keras_learning_curves_graph\")\n", "plt.show()" ] }, @@ -1656,7 +1656,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 - tf2", + "display_name": "Python 3", "language": "python", "name": "python3" }, diff --git a/future_encoders.py b/future_encoders.py deleted file mode 100644 index 31380cd..0000000 --- a/future_encoders.py +++ /dev/null @@ -1,1610 +0,0 @@ -""" -This module merges two files from Scikit-Learn 0.20 to make a few encoders -available for users using an earlier version: - * sklearn/preprocessing/data.py (OneHotEncoder and CategoricalEncoder) - * sklearn/compose/_column_transformer.py (ColumnTransformer) -I just copy/pasted the 
contents, fixed the imports and __all__, and also -copied the definitions of three pipeline functions whose signature changes -in 0.20: _fit_one_transformer, _transform_one and _fit_transform_one. -The original authors are listed below. ----- -The :mod:`sklearn.compose._column_transformer` module implements utilities -to work with heterogeneous data and to apply different transformers to -different columns. -""" -# Authors: Andreas Mueller -# Joris Van den Bossche -# License: BSD 3 clause - -from __future__ import division - -import numbers -import warnings - -import numpy as np -from scipy import sparse - -from sklearn.base import clone, BaseEstimator, TransformerMixin -from sklearn.externals import six -from sklearn.utils import Bunch, check_array -from sklearn.externals.joblib.parallel import delayed, Parallel -from sklearn.utils.metaestimators import _BaseComposition -from sklearn.utils.validation import check_is_fitted, FLOAT_DTYPES -from sklearn.pipeline import _name_estimators -from sklearn.preprocessing import FunctionTransformer -from sklearn.preprocessing.label import LabelEncoder - -from itertools import chain - - -# weight and fit_params are not used but it allows _fit_one_transformer, -# _transform_one and _fit_transform_one to have the same signature to -# factorize the code in ColumnTransformer -def _fit_one_transformer(transformer, X, y, weight=None, **fit_params): - return transformer.fit(X, y) - - -def _transform_one(transformer, X, y, weight, **fit_params): - res = transformer.transform(X) - # if we have a weight for this transformer, multiply output - if weight is None: - return res - return res * weight - - -def _fit_transform_one(transformer, X, y, weight, **fit_params): - if hasattr(transformer, 'fit_transform'): - res = transformer.fit_transform(X, y, **fit_params) - else: - res = transformer.fit(X, y, **fit_params).transform(X) - # if we have a weight for this transformer, multiply output - if weight is None: - return res, transformer - return res * weight, transformer - - -BOUNDS_THRESHOLD = 1e-7 - - -zip = six.moves.zip -map = six.moves.map -range = six.moves.range - -__all__ = [ - 'OneHotEncoder', - 'OrdinalEncoder', - 'ColumnTransformer', - 'make_column_transformer' -] - - -def _argmax(arr_or_spmatrix, axis=None): - return arr_or_spmatrix.argmax(axis=axis) - - -def _handle_zeros_in_scale(scale, copy=True): - ''' Makes sure that whenever scale is zero, we handle it correctly. - - This happens in most scalers when we have constant features.''' - - # if we are fitting on 1D arrays, scale might be a scalar - if np.isscalar(scale): - if scale == .0: - scale = 1. - return scale - elif isinstance(scale, np.ndarray): - if copy: - # New array to avoid side-effects - scale = scale.copy() - scale[scale == 0.0] = 1.0 - return scale - - -def _transform_selected(X, transform, selected="all", copy=True): - """Apply a transform function to portion of selected features - - Parameters - ---------- - X : {array-like, sparse matrix}, shape [n_samples, n_features] - Dense array or sparse matrix. - - transform : callable - A callable transform(X) -> X_transformed - - copy : boolean, optional - Copy X even if it could be avoided. - - selected: "all" or array of indices or mask - Specify which features to apply the transform to. 
- - Returns - ------- - X : array or sparse matrix, shape=(n_samples, n_features_new) - """ - X = check_array(X, accept_sparse='csc', copy=copy, dtype=FLOAT_DTYPES) - - if isinstance(selected, six.string_types) and selected == "all": - return transform(X) - - if len(selected) == 0: - return X - - n_features = X.shape[1] - ind = np.arange(n_features) - sel = np.zeros(n_features, dtype=bool) - sel[np.asarray(selected)] = True - not_sel = np.logical_not(sel) - n_selected = np.sum(sel) - - if n_selected == 0: - # No features selected. - return X - elif n_selected == n_features: - # All features selected. - return transform(X) - else: - X_sel = transform(X[:, ind[sel]]) - X_not_sel = X[:, ind[not_sel]] - - if sparse.issparse(X_sel) or sparse.issparse(X_not_sel): - return sparse.hstack((X_sel, X_not_sel)) - else: - return np.hstack((X_sel, X_not_sel)) - - -class _BaseEncoder(BaseEstimator, TransformerMixin): - """ - Base class for encoders that includes the code to categorize and - transform the input features. - - """ - - def _fit(self, X, handle_unknown='error'): - - X_temp = check_array(X, dtype=None) - if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=np.object) - else: - X = X_temp - - n_samples, n_features = X.shape - - if self.categories != 'auto': - for cats in self.categories: - if not np.all(np.sort(cats) == np.array(cats)): - raise ValueError("Unsorted categories are not yet " - "supported") - if len(self.categories) != n_features: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - - self._label_encoders_ = [LabelEncoder() for _ in range(n_features)] - - for i in range(n_features): - le = self._label_encoders_[i] - Xi = X[:, i] - if self.categories == 'auto': - le.fit(Xi) - else: - if handle_unknown == 'error': - valid_mask = np.in1d(Xi, self.categories[i]) - if not np.all(valid_mask): - diff = np.unique(Xi[~valid_mask]) - msg = ("Found unknown categories {0} in column {1}" - " during fit".format(diff, i)) - raise ValueError(msg) - le.classes_ = np.array(self.categories[i]) - - self.categories_ = [le.classes_ for le in self._label_encoders_] - - def _transform(self, X, handle_unknown='error'): - - X_temp = check_array(X, dtype=None) - if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=np.object) - else: - X = X_temp - - _, n_features = X.shape - X_int = np.zeros_like(X, dtype=np.int) - X_mask = np.ones_like(X, dtype=np.bool) - - for i in range(n_features): - Xi = X[:, i] - valid_mask = np.in1d(Xi, self.categories_[i]) - - if not np.all(valid_mask): - if handle_unknown == 'error': - diff = np.unique(X[~valid_mask, i]) - msg = ("Found unknown categories {0} in column {1}" - " during transform".format(diff, i)) - raise ValueError(msg) - else: - # Set the problematic rows to an acceptable value and - # continue. The rows are marked in `X_mask` and will be - # removed later. - X_mask[:, i] = valid_mask - Xi = Xi.copy() - Xi[~valid_mask] = self.categories_[i][0] - X_int[:, i] = self._label_encoders_[i].transform(Xi) - - return X_int, X_mask - - -WARNING_MSG = ( - "The handling of integer data will change in the future. Currently, the " - "categories are determined based on the range [0, max(values)], while " - "in the future they will be determined based on the unique values.\n" - "If you want the future behaviour, you can specify \"categories='auto'\"."
-) - - -class OneHotEncoder(_BaseEncoder): - """Encode categorical integer features as a one-hot numeric array. - - The input to this transformer should be an array-like of integers or - strings, denoting the values taken on by categorical (discrete) features. - The features are encoded using a one-hot (aka 'one-of-K' or 'dummy') - encoding scheme. This creates a binary column for each category and - returns a sparse matrix or dense array. - - By default, the encoder derives the categories based on the unique values - in each feature. Alternatively, you can also specify the `categories` - manually. - The OneHotEncoder previously assumed that the input features take on - values in the range [0, max(values)). This behaviour is deprecated. - - This encoding is needed for feeding categorical data to many scikit-learn - estimators, notably linear models and SVMs with the standard kernels. - - Note: a one-hot encoding of y labels should use a LabelBinarizer - instead. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - categories : 'auto' or a list of lists/arrays of values. - Categories (unique values) per feature: - - - 'auto' : Determine categories automatically from the training data. - - list : ``categories[i]`` holds the categories expected in the ith - column. The passed categories must be sorted and should not mix - strings and numeric values. - - The used categories can be found in the ``categories_`` attribute. - - sparse : boolean, default=True - Will return sparse matrix if set True else will return an array. - - dtype : number type, default=np.float - Desired dtype of output. - - handle_unknown : 'error' (default) or 'ignore' - Whether to raise an error or ignore if an unknown categorical feature is - present during transform (default is to raise). When this parameter - is set to 'ignore' and an unknown category is encountered during - transform, the resulting one-hot encoded columns for this feature - will be all zeros. In the inverse transform, an unknown category - will be denoted as None. - - n_values : 'auto', int or array of ints - Number of values per feature. - - - 'auto' : determine value range from training data. - - int : number of categorical values per feature. - Each feature value should be in ``range(n_values)`` - - array : ``n_values[i]`` is the number of categorical values in - ``X[:, i]``. Each feature value should be - in ``range(n_values[i])`` - - .. deprecated:: 0.20 - The `n_values` keyword is deprecated and will be removed in 0.22. - Use `categories` instead. - - categorical_features : "all" or array of indices or mask - Specify what features are treated as categorical. - - - 'all' (default): All features are treated as categorical. - - array of indices: Array of categorical feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-categorical features are always stacked to the right of the matrix. - - .. deprecated:: 0.20 - The `categorical_features` keyword is deprecated and will be - removed in 0.22. - - Attributes - ---------- - categories_ : list of arrays - The categories of each feature determined during fitting - (in order corresponding with output of ``transform``). - - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. Only available when n_values is ``'auto'``. - - .. deprecated:: 0.20 - - feature_indices_ : array of shape (n_features,) - Indices to feature ranges.
- Feature ``i`` in the original data is mapped to features - from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) - - .. deprecated:: 0.20 - - n_values_ : array of shape (n_features,) - Maximum number of values per feature. - - .. deprecated:: 0.20 - - Examples - -------- - Given a dataset with two features, we let the encoder find the unique - values per feature and transform the data to a binary one-hot encoding. - - >>> from sklearn.preprocessing import OneHotEncoder - >>> enc = OneHotEncoder(handle_unknown='ignore') - >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] - >>> enc.fit(X) - ... # doctest: +ELLIPSIS - OneHotEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_unknown='ignore', sparse=True) - - >>> enc.categories_ - [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] - >>> enc.transform([['Female', 1], ['Male', 4]]).toarray() - array([[ 1., 0., 1., 0., 0.], - [ 0., 1., 0., 0., 0.]]) - >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]) - array([['Male', 1], - [None, 2]], dtype=object) - - See also - -------- - sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) - encoding of the categorical features. - sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot - encoding of dictionary items or strings. - sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all - fashion. - sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of - iterables and a multilabel format, e.g. a (samples x classes) binary - matrix indicating the presence of a class label. 
- """ - - def __init__(self, n_values=None, categorical_features=None, - categories=None, sparse=True, dtype=np.float64, - handle_unknown='error'): - self._categories = categories - if categories is None: - self.categories = 'auto' - else: - self.categories = categories - self.sparse = sparse - self.dtype = dtype - self.handle_unknown = handle_unknown - - if n_values is not None: - pass - # warnings.warn("Deprecated", DeprecationWarning) - else: - n_values = "auto" - self._deprecated_n_values = n_values - - if categorical_features is not None: - pass - # warnings.warn("Deprecated", DeprecationWarning) - else: - categorical_features = "all" - self._deprecated_categorical_features = categorical_features - - # Deprecated keywords - - @property - def n_values(self): - warnings.warn("The 'n_values' parameter is deprecated.", - DeprecationWarning) - return self._deprecated_n_values - - @n_values.setter - def n_values(self, value): - warnings.warn("The 'n_values' parameter is deprecated.", - DeprecationWarning) - self._deprecated_n_values = value - - @property - def categorical_features(self): - warnings.warn("The 'categorical_features' parameter is deprecated.", - DeprecationWarning) - return self._deprecated_categorical_features - - @categorical_features.setter - def categorical_features(self, value): - warnings.warn("The 'categorical_features' parameter is deprecated.", - DeprecationWarning) - self._deprecated_categorical_features = value - - # Deprecated attributes - - @property - def active_features_(self): - check_is_fitted(self, 'categories_') - warnings.warn("The 'active_features_' attribute is deprecated.", - DeprecationWarning) - return self._active_features_ - - @property - def feature_indices_(self): - check_is_fitted(self, 'categories_') - warnings.warn("The 'feature_indices_' attribute is deprecated.", - DeprecationWarning) - return self._feature_indices_ - - @property - def n_values_(self): - check_is_fitted(self, 'categories_') - warnings.warn("The 'n_values_' attribute is deprecated.", - DeprecationWarning) - return self._n_values_ - - def _handle_deprecations(self, X): - - user_set_categories = False - - if self._categories is not None: - self._legacy_mode = False - user_set_categories = True - - elif self._deprecated_n_values != 'auto': - msg = ( - "Passing 'n_values' is deprecated and will be removed in a " - "future release. You can use the 'categories' keyword instead." - " 'n_values=n' corresponds to 'n_values=[range(n)]'.") - warnings.warn(msg, DeprecationWarning) - - # we internally translate this to the correct categories - # and don't use legacy mode - X = check_array(X, dtype=np.int) - - if isinstance(self._deprecated_n_values, numbers.Integral): - n_features = X.shape[1] - self.categories = [ - list(range(self._deprecated_n_values)) - for _ in range(n_features)] - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self._deprecated_n_values) - else: - try: - n_values = np.asarray(self._deprecated_n_values, dtype=int) - self.categories = [list(range(i)) - for i in self._deprecated_n_values] - except (ValueError, TypeError): - raise TypeError( - "Wrong type for parameter `n_values`. 
Expected 'auto'," - " int or array of ints, got %r" % type(X)) - - self._n_values_ = n_values - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self._feature_indices_ = indices - - self._legacy_mode = False - - else: # n_values = 'auto' - if self.handle_unknown == 'ignore': - # no change in behaviour, no need to raise deprecation warning - self._legacy_mode = False - else: - - # check if we have integer or categorical input - try: - X = check_array(X, dtype=np.int) - except ValueError: - self._legacy_mode = False - else: - warnings.warn(WARNING_MSG, DeprecationWarning) - self._legacy_mode = True - - if (not isinstance(self._deprecated_categorical_features, - six.string_types) - or (isinstance(self._deprecated_categorical_features, - six.string_types) - and self._deprecated_categorical_features != 'all')): - if user_set_categories: - raise ValueError( - "The 'categorical_features' keyword is deprecated, and " - "cannot be used together with specifying 'categories'.") - warnings.warn("The 'categorical_features' keyword is deprecated.", - DeprecationWarning) - self._legacy_mode = True - - def fit(self, X, y=None): - """Fit OneHotEncoder to X. - - Parameters - ---------- - X : array-like, shape [n_samples, n_feature] - The data to determine the categories of each feature. - - Returns - ------- - self - """ - if self.handle_unknown not in ['error', 'ignore']: - template = ("handle_unknown should be either 'error' or " - "'ignore', got %s") - raise ValueError(template % self.handle_unknown) - - self._handle_deprecations(X) - - if self._legacy_mode: - # TODO not with _transform_selected ?? - self._legacy_fit_transform(X) - return self - else: - self._fit(X, handle_unknown=self.handle_unknown) - return self - - def _legacy_fit_transform(self, X): - """Assumes X contains only categorical features.""" - self_n_values = self._deprecated_n_values - dtype = getattr(X, 'dtype', None) - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - if (isinstance(self_n_values, six.string_types) and - self_n_values == 'auto'): - n_values = np.max(X, axis=0) + 1 - elif isinstance(self_n_values, numbers.Integral): - if (np.max(X, axis=0) >= self_n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self_n_values) - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self_n_values) - else: - try: - n_values = np.asarray(self_n_values, dtype=int) - except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. 
Expected" - " 'auto', int or array of ints, got %r" - % type(X)) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," - " it has to be of shape (n_features,).") - - self._n_values_ = n_values - self.categories_ = [np.arange(n_val - 1, dtype=dtype) - for n_val in n_values] - n_values = np.hstack([[0], n_values]) - indices = np.cumsum(n_values) - self._feature_indices_ = indices - - column_indices = (X + indices[:-1]).ravel() - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features) - data = np.ones(n_samples * n_features) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - - if (isinstance(self_n_values, six.string_types) and - self_n_values == 'auto'): - mask = np.array(out.sum(axis=0)).ravel() != 0 - active_features = np.where(mask)[0] - out = out[:, active_features] - self._active_features_ = active_features - - self.categories_ = [ - np.unique(X[:, i]).astype(dtype) if dtype else np.unique(X[:, i]) - for i in range(n_features)] - #import pdb; pdb.set_trace() - - return out if self.sparse else out.toarray() - - def fit_transform(self, X, y=None): - """Fit OneHotEncoder to X, then transform X. - - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. - - Parameters - ---------- - X : array-like, shape [n_samples, n_feature] - Input array of type int. - """ - if self.handle_unknown not in ['error', 'ignore']: - template = ("handle_unknown should be either 'error' or " - "'ignore', got %s") - raise ValueError(template % self.handle_unknown) - - self._handle_deprecations(X) - - if self._legacy_mode: - return _transform_selected(X, self._legacy_fit_transform, - self._deprecated_categorical_features, - copy=True) - else: - return self.fit(X).transform(X) - - def _legacy_transform(self, X): - """Assumes X contains only categorical features.""" - self_n_values = self._deprecated_n_values - X = check_array(X, dtype=np.int) - if np.any(X < 0): - raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = X.shape - - indices = self._feature_indices_ - if n_features != indices.shape[0] - 1: - raise ValueError("X has different shape than during fitting." - " Expected %d, got %d." - % (indices.shape[0] - 1, n_features)) - - # We use only those categorical features of X that are known using fit. - # i.e lesser than n_values_ using mask. - # This means, if self.handle_unknown is "ignore", the row_indices and - # col_indices corresponding to the unknown categorical feature are - # ignored. - mask = (X < self._n_values_).ravel() - if np.any(~mask): - if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either error or " - "unknown got %s" % self.handle_unknown) - if self.handle_unknown == 'error': - raise ValueError("unknown categorical feature present %s " - "during transform." 
% X.ravel()[~mask]) - - column_indices = (X + indices[:-1]).ravel()[mask] - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - n_features)[mask] - data = np.ones(np.sum(mask)) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() - if (isinstance(self_n_values, six.string_types) and - self_n_values == 'auto'): - out = out[:, self._active_features_] - - return out if self.sparse else out.toarray() - - def _transform_new(self, X): - """New implementation assuming categorical input""" - X_temp = check_array(X, dtype=None) - if not hasattr(X, 'dtype') and np.issubdtype(X_temp.dtype, np.str_): - X = check_array(X, dtype=np.object) - else: - X = X_temp - - n_samples, n_features = X.shape - - X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) - - mask = X_mask.ravel() - n_values = [cats.shape[0] for cats in self.categories_] - n_values = np.array([0] + n_values) - feature_indices = np.cumsum(n_values) - - indices = (X_int + feature_indices[:-1]).ravel()[mask] - indptr = X_mask.sum(axis=1).cumsum() - indptr = np.insert(indptr, 0, 0) - data = np.ones(n_samples * n_features)[mask] - - out = sparse.csr_matrix((data, indices, indptr), - shape=(n_samples, feature_indices[-1]), - dtype=self.dtype) - if not self.sparse: - return out.toarray() - else: - return out - - def transform(self, X): - """Transform X using one-hot encoding. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to encode. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array - Transformed input. - """ - if not self._legacy_mode: - return self._transform_new(X) - else: - return _transform_selected(X, self._legacy_transform, - self._deprecated_categorical_features, - copy=True) - - def inverse_transform(self, X): - """Convert back the data to the original representation. - - In case unknown categories are encountered (all zero's in the - one-hot encoding), ``None`` is used to represent this category. - - Parameters - ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] - The transformed data. - - Returns - ------- - X_tr : array-like, shape [n_samples, n_features] - Inverse transformed array. - - """ - # if self._legacy_mode: - # raise ValueError("only supported for categorical features") - - check_is_fitted(self, 'categories_') - X = check_array(X, accept_sparse='csr') - - n_samples, _ = X.shape - n_features = len(self.categories_) - n_transformed_features = sum([len(cats) for cats in self.categories_]) - - # validate shape of passed X - msg = ("Shape of the passed X data is not correct. 
Expected {0} " - "columns, got {1}.") - if X.shape[1] != n_transformed_features: - raise ValueError(msg.format(n_transformed_features, X.shape[1])) - - # create resulting array of appropriate dtype - dt = np.find_common_type([cat.dtype for cat in self.categories_], []) - X_tr = np.empty((n_samples, n_features), dtype=dt) - - j = 0 - found_unknown = {} - - for i in range(n_features): - n_categories = len(self.categories_[i]) - sub = X[:, j:j + n_categories] - - # for sparse X argmax returns 2D matrix, ensure 1D array - labels = np.asarray(_argmax(sub, axis=1)).flatten() - X_tr[:, i] = self.categories_[i][labels] - - if self.handle_unknown == 'ignore': - # ignored unknown categories: we have a row of all zeros - unknown = np.asarray(sub.sum(axis=1) == 0).flatten() - if unknown.any(): - found_unknown[i] = unknown - - j += n_categories - - # if ignored are found: potentially need to upcast result to - # insert None values - if found_unknown: - if X_tr.dtype != object: - X_tr = X_tr.astype(object) - - for idx, mask in found_unknown.items(): - X_tr[mask, idx] = None - - return X_tr - - -class OrdinalEncoder(_BaseEncoder): - """Encode categorical features as an integer array. - - The input to this transformer should be an array-like of integers or - strings, denoting the values taken on by categorical (discrete) features. - The features are converted to ordinal integers. This results in - a single column of integers (0 to n_categories - 1) per feature. - - Read more in the :ref:`User Guide `. - - Parameters - ---------- - categories : 'auto' or a list of lists/arrays of values. - Categories (unique values) per feature: - - - 'auto' : Determine categories automatically from the training data. - - list : ``categories[i]`` holds the categories expected in the ith - column. The passed categories must be sorted and should not mix - strings and numeric values. - - The used categories can be found in the ``categories_`` attribute. - - dtype : number type, default np.float64 - Desired dtype of output. - - Attributes - ---------- - categories_ : list of arrays - The categories of each feature determined during fitting - (in order corresponding with output of ``transform``). - - Examples - -------- - Given a dataset with two features, we let the encoder find the unique - values per feature and transform the data to an ordinal encoding. - - >>> from sklearn.preprocessing import OrdinalEncoder - >>> enc = OrdinalEncoder() - >>> X = [['Male', 1], ['Female', 3], ['Female', 2]] - >>> enc.fit(X) - ... # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) - >>> enc.categories_ - [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)] - >>> enc.transform([['Female', 3], ['Male', 1]]) - array([[ 0., 2.], - [ 1., 0.]]) - - >>> enc.inverse_transform([[1, 0], [0, 1]]) - array([['Male', 1], - ['Female', 2]], dtype=object) - - See also - -------- - sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of - categorical features. - sklearn.preprocessing.LabelEncoder : encodes target labels with values - between 0 and n_classes-1. - sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot - encoding of dictionary items or strings.
- """ - - def __init__(self, categories='auto', dtype=np.float64): - self.categories = categories - self.dtype = dtype - - def fit(self, X, y=None): - """Fit the OrdinalEncoder to X. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to determine the categories of each feature. - - Returns - ------- - self - - """ - self._fit(X) - - return self - - def transform(self, X): - """Transform X to ordinal codes. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - The data to encode. - - Returns - ------- - X_out : sparse matrix or a 2-d array - Transformed input. - - """ - X_int, _ = self._transform(X) - return X_int.astype(self.dtype, copy=False) - - def inverse_transform(self, X): - """Convert back the data to the original representation. - - Parameters - ---------- - X : array-like or sparse matrix, shape [n_samples, n_encoded_features] - The transformed data. - - Returns - ------- - X_tr : array-like, shape [n_samples, n_features] - Inverse transformed array. - - """ - check_is_fitted(self, 'categories_') - X = check_array(X, accept_sparse='csr') - - n_samples, _ = X.shape - n_features = len(self.categories_) - - # validate shape of passed X - msg = ("Shape of the passed X data is not correct. Expected {0} " - "columns, got {1}.") - if X.shape[1] != n_features: - raise ValueError(msg.format(n_features, X.shape[1])) - - # create resulting array of appropriate dtype - dt = np.find_common_type([cat.dtype for cat in self.categories_], []) - X_tr = np.empty((n_samples, n_features), dtype=dt) - - for i in range(n_features): - labels = X[:, i].astype('int64') - X_tr[:, i] = self.categories_[i][labels] - - return X_tr - - -_ERR_MSG_1DCOLUMN = ("1D data passed to a transformer that expects 2D data. " - "Try to specify the column selection as a list of one " - "item instead of a scalar.") - - -class ColumnTransformer(_BaseComposition, TransformerMixin): - """Applies transformers to columns of an array or pandas DataFrame. - - EXPERIMENTAL: some behaviors may change between releases without - deprecation. - - This estimator allows different columns or column subsets of the input - to be transformed separately and the results combined into a single - feature space. - This is useful for heterogeneous or columnar data, to combine several - feature extraction mechanisms or transformations into a single transformer. - - Read more in the :ref:`User Guide `. - - .. versionadded:: 0.20 - - Parameters - ---------- - transformers : list of tuples - List of (name, transformer, column(s)) tuples specifying the - transformer objects to be applied to subsets of the data. - - name : string - Like in Pipeline and FeatureUnion, this allows the transformer and - its parameters to be set using ``set_params`` and searched in grid - search. - transformer : estimator or {'passthrough', 'drop'} - Estimator must support `fit` and `transform`. Special-cased - strings 'drop' and 'passthrough' are accepted as well, to - indicate to drop the columns or to pass them through untransformed, - respectively. - column(s) : string or int, array-like of string or int, slice, \ -boolean mask array or callable - Indexes the data on its second axis. Integers are interpreted as - positional columns, while strings can reference DataFrame columns - by name. A scalar string or int should be used where - ``transformer`` expects X to be a 1d array-like (vector), - otherwise a 2d array will be passed to the transformer. 
- A callable is passed the input data `X` and can return any of the - above. - - remainder : {'drop', 'passthrough'} or estimator, default 'drop' - By default, only the specified columns in `transformers` are - transformed and combined in the output, and the non-specified - columns are dropped. (default of ``'drop'``). - By specifying ``remainder='passthrough'``, all remaining columns that - were not specified in `transformers` will be automatically passed - through. This subset of columns is concatenated with the output of - the transformers. - By setting ``remainder`` to be an estimator, the remaining - non-specified columns will use the ``remainder`` estimator. The - estimator must support `fit` and `transform`. - - sparse_threshold : float, default = 0.3 - If the transformed output consists of a mix of sparse and dense data, - it will be stacked as a sparse matrix if the density is lower than this - value. Use ``sparse_threshold=0`` to always return dense. - When the transformed output consists of all sparse or all dense data, - the stacked result will be sparse or dense, respectively, and this - keyword will be ignored. - - n_jobs : int, optional - Number of jobs to run in parallel (default 1). - - transformer_weights : dict, optional - Multiplicative weights for features per transformer. The output of the - transformer is multiplied by these weights. Keys are transformer names, - values the weights. - - Attributes - ---------- - transformers_ : list - The collection of fitted transformers as tuples of - (name, fitted_transformer, column). `fitted_transformer` can be an - estimator, 'drop', or 'passthrough'. If there are remaining columns, - the final element is a tuple of the form: - ('remainder', transformer, remaining_columns) corresponding to the - ``remainder`` parameter. If there are remaining columns, then - ``len(transformers_)==len(transformers)+1``, otherwise - ``len(transformers_)==len(transformers)``. - - named_transformers_ : Bunch object, a dictionary with attribute access - Read-only attribute to access any transformer by given name. - Keys are transformer names and values are the fitted transformer - objects. - - sparse_output_ : boolean - Boolean flag indicating whether the output of ``transform`` is a - sparse matrix or a dense numpy array, which depends on the output - of the individual transformers and the `sparse_threshold` keyword. - - Notes - ----- - The order of the columns in the transformed feature matrix follows the - order of how the columns are specified in the `transformers` list. - Columns of the original feature matrix that are not specified are - dropped from the resulting transformed feature matrix, unless specified - in the `passthrough` keyword. Those columns specified with `passthrough` - are added at the right to the output of the transformers. - - See also - -------- - sklearn.compose.make_column_transformer : convenience function for - combining the outputs of multiple transformer objects applied to - column subsets of the original feature space. - - Examples - -------- - >>> from sklearn.compose import ColumnTransformer - >>> from sklearn.preprocessing import Normalizer - >>> ct = ColumnTransformer( - ... [("norm1", Normalizer(norm='l1'), [0, 1]), - ... ("norm2", Normalizer(norm='l1'), slice(2, 4))]) - >>> X = np.array([[0., 1., 2., 2.], - ... [1., 1., 0., 1.]]) - >>> # Normalizer scales each row of X to unit norm. A separate scaling - >>> # is applied for the two first and two last elements of each - >>> # row independently.
- >>> ct.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE - array([[0. , 1. , 0.5, 0.5], - [0.5, 0.5, 0. , 1. ]]) - - """ - - def __init__(self, transformers, remainder='drop', sparse_threshold=0.3, - n_jobs=1, transformer_weights=None): - self.transformers = transformers - self.remainder = remainder - self.sparse_threshold = sparse_threshold - self.n_jobs = n_jobs - self.transformer_weights = transformer_weights - - @property - def _transformers(self): - """ - Internal list of transformer only containing the name and - transformers, dropping the columns. This is for the implementation - of get_params via BaseComposition._get_params which expects lists - of tuples of len 2. - """ - return [(name, trans) for name, trans, _ in self.transformers] - - @_transformers.setter - def _transformers(self, value): - self.transformers = [ - (name, trans, col) for ((name, trans), (_, _, col)) - in zip(value, self.transformers)] - - def get_params(self, deep=True): - """Get parameters for this estimator. - - Parameters - ---------- - deep : boolean, optional - If True, will return the parameters for this estimator and - contained subobjects that are estimators. - - Returns - ------- - params : mapping of string to any - Parameter names mapped to their values. - """ - return self._get_params('_transformers', deep=deep) - - def set_params(self, **kwargs): - """Set the parameters of this estimator. - - Valid parameter keys can be listed with ``get_params()``. - - Returns - ------- - self - """ - self._set_params('_transformers', **kwargs) - return self - - def _iter(self, X=None, fitted=False, replace_strings=False): - """Generate (name, trans, column, weight) tuples - """ - if fitted: - transformers = self.transformers_ - else: - transformers = self.transformers - if self._remainder[2] is not None: - transformers = chain(transformers, [self._remainder]) - get_weight = (self.transformer_weights or {}).get - - for name, trans, column in transformers: - sub = None if X is None else _get_column(X, column) - - if replace_strings: - # replace 'passthrough' with identity transformer and - # skip in case of 'drop' - if trans == 'passthrough': - trans = FunctionTransformer( - validate=False, accept_sparse=True, - check_inverse=False) - elif trans == 'drop': - continue - - yield (name, trans, sub, get_weight(name)) - - def _validate_transformers(self): - if not self.transformers: - return - - names, transformers, _ = zip(*self.transformers) - - # validate names - self._validate_names(names) - - # validate estimators - for t in transformers: - if t in ('drop', 'passthrough'): - continue - if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not - hasattr(t, "transform")): - raise TypeError("All estimators should implement fit and " - "transform, or can be 'drop' or 'passthrough' " - "specifiers. '%s' (type %s) doesn't." % - (t, type(t))) - - def _validate_remainder(self, X): - """ - Validates ``remainder`` and defines ``_remainder`` targeting - the remaining columns. - """ - is_transformer = ((hasattr(self.remainder, "fit") - or hasattr(self.remainder, "fit_transform")) - and hasattr(self.remainder, "transform")) - if (self.remainder not in ('drop', 'passthrough') - and not is_transformer): - raise ValueError( - "The remainder keyword needs to be one of 'drop', " - "'passthrough', or estimator. 
'%s' was passed instead" % - self.remainder) - - n_columns = X.shape[1] - cols = [] - for _, _, columns in self.transformers: - cols.extend(_get_column_indices(X, columns)) - remaining_idx = sorted(list(set(range(n_columns)) - set(cols))) or None - - self._remainder = ('remainder', self.remainder, remaining_idx) - - @property - def named_transformers_(self): - """Access the fitted transformer by name. - - Read-only attribute to access any transformer by given name. - Keys are transformer names and values are the fitted transformer - objects. - - """ - # Use Bunch object to improve autocomplete - return Bunch(**dict([(name, trans) for name, trans, _ - in self.transformers_])) - - def get_feature_names(self): - """Get feature names from all transformers. - - Returns - ------- - feature_names : list of strings - Names of the features produced by transform. - """ - check_is_fitted(self, 'transformers_') - feature_names = [] - for name, trans, _, _ in self._iter(fitted=True): - if trans == 'drop': - continue - elif trans == 'passthrough': - raise NotImplementedError( - "get_feature_names is not yet supported when using " - "a 'passthrough' transformer.") - elif not hasattr(trans, 'get_feature_names'): - raise AttributeError("Transformer %s (type %s) does not " - "provide get_feature_names." - % (str(name), type(trans).__name__)) - feature_names.extend([name + "__" + f for f in - trans.get_feature_names()]) - return feature_names - - def _update_fitted_transformers(self, transformers): - # transformers are fitted; excludes 'drop' cases - transformers = iter(transformers) - transformers_ = [] - - transformer_iter = self.transformers - if self._remainder[2] is not None: - transformer_iter = chain(transformer_iter, [self._remainder]) - - for name, old, column in transformer_iter: - if old == 'drop': - trans = 'drop' - elif old == 'passthrough': - # FunctionTransformer is present in list of transformers, - # so get next transformer, but save original string - next(transformers) - trans = 'passthrough' - else: - trans = next(transformers) - transformers_.append((name, trans, column)) - - # sanity check that transformers is exhausted - assert not list(transformers) - self.transformers_ = transformers_ - - def _validate_output(self, result): - """ - Ensure that the output of each transformer is 2D. Otherwise - hstack can raise an error or produce incorrect results. - """ - names = [name for name, _, _, _ in self._iter(replace_strings=True)] - for Xs, name in zip(result, names): - if not getattr(Xs, 'ndim', 0) == 2: - raise ValueError( - "The output of the '{0}' transformer should be 2D (scipy " - "matrix, array, or pandas DataFrame).".format(name)) - - def _fit_transform(self, X, y, func, fitted=False): - """ - Private function to fit and/or transform on demand. - - Return value (transformers and/or transformed X data) depends - on the passed function. - ``fitted=True`` ensures the fitted transformers are used. - """ - try: - return Parallel(n_jobs=self.n_jobs)( - delayed(func)(clone(trans) if not fitted else trans, - X_sel, y, weight) - for _, trans, X_sel, weight in self._iter( - X=X, fitted=fitted, replace_strings=True)) - except ValueError as e: - if "Expected 2D array, got 1D array instead" in str(e): - raise ValueError(_ERR_MSG_1DCOLUMN) - else: - raise - - def fit(self, X, y=None): - """Fit all transformers using X. - - Parameters - ---------- - X : array-like or DataFrame of shape [n_samples, n_features] - Input data, of which specified subsets are used to fit the - transformers. 
- 
- y : array-like, shape (n_samples, ...), optional
- Targets for supervised learning.
-
- Returns
- -------
- self : ColumnTransformer
- This estimator
-
- """
- # we use fit_transform to make sure to set sparse_output_ (for which
- # we need the transformed data) so that transform has a consistent
- # output type
- self.fit_transform(X, y=y)
- return self
-
- def fit_transform(self, X, y=None):
- """Fit all transformers, transform the data and concatenate results.
-
- Parameters
- ----------
- X : array-like or DataFrame of shape [n_samples, n_features]
- Input data, of which specified subsets are used to fit the
- transformers.
-
- y : array-like, shape (n_samples, ...), optional
- Targets for supervised learning.
-
- Returns
- -------
- X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
- hstack of results of transformers. sum_n_components is the
- sum of n_components (output dimension) over transformers. If
- any result is a sparse matrix, the stacked result will be sparse
- only if the overall density is lower than ``sparse_threshold``.
-
- """
- self._validate_remainder(X)
- self._validate_transformers()
-
- result = self._fit_transform(X, y, _fit_transform_one)
-
- if not result:
- self._update_fitted_transformers([])
- # All transformers are None
- return np.zeros((X.shape[0], 0))
-
- Xs, transformers = zip(*result)
-
- # determine if concatenated output will be sparse or not
- if all(sparse.issparse(X) for X in Xs):
- self.sparse_output_ = True
- elif any(sparse.issparse(X) for X in Xs):
- nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs)
- total = sum(X.shape[0] * X.shape[1] if sparse.issparse(X)
- else X.size for X in Xs)
- density = nnz / total
- self.sparse_output_ = density < self.sparse_threshold
- else:
- self.sparse_output_ = False
-
- self._update_fitted_transformers(transformers)
- self._validate_output(Xs)
-
- return self._hstack(list(Xs))
-
- def transform(self, X):
- """Transform X separately by each transformer, concatenate results.
-
- Parameters
- ----------
- X : array-like or DataFrame of shape [n_samples, n_features]
- The data to be transformed by subset.
-
- Returns
- -------
- X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
- hstack of results of transformers. sum_n_components is the
- sum of n_components (output dimension) over transformers. If
- any result is a sparse matrix, the stacked result will be sparse
- only if the overall density is lower than ``sparse_threshold``.
-
- """
- check_is_fitted(self, 'transformers_')
-
- Xs = self._fit_transform(X, None, _transform_one, fitted=True)
- self._validate_output(Xs)
-
- if not Xs:
- # All transformers are None
- return np.zeros((X.shape[0], 0))
-
- return self._hstack(list(Xs))
-
- def _hstack(self, Xs):
- """Stacks Xs horizontally.
-
- This allows subclasses to control the stacking behavior, while reusing
- everything else from ColumnTransformer.
-
- Parameters
- ----------
- Xs : List of numpy arrays, sparse arrays, or DataFrames
- """
- if self.sparse_output_:
- return sparse.hstack(Xs).tocsr()
- else:
- Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs]
- return np.hstack(Xs)
-
-
-def _check_key_type(key, superclass):
- """
- Check that scalar, list or slice is of a certain type.
-
- This is only used in _get_column and _get_column_indices to check
- if the `key` (column specification) is fully integer or fully string-like. 
- - Parameters - ---------- - key : scalar, list, slice, array-like - The column specification to check - superclass : int or six.string_types - The type for which to check the `key` - - """ - if isinstance(key, superclass): - return True - if isinstance(key, slice): - return (isinstance(key.start, (superclass, type(None))) and - isinstance(key.stop, (superclass, type(None)))) - if isinstance(key, list): - return all(isinstance(x, superclass) for x in key) - if hasattr(key, 'dtype'): - if superclass is int: - return key.dtype.kind == 'i' - else: - # superclass = six.string_types - return key.dtype.kind in ('O', 'U', 'S') - return False - - -def _get_column(X, key): - """ - Get feature column(s) from input data X. - - Supported input types (X): numpy arrays, sparse arrays and DataFrames - - Supported key types (key): - - scalar: output is 1D - - lists, slices, boolean masks: output is 2D - - callable that returns any of the above - - Supported key data types: - - - integer or boolean mask (positional): - - supported for arrays, sparse matrices and dataframes - - string (key-based): - - only supported for dataframes - - So no keys other than strings are allowed (while in principle you - can use any hashable object as key). - - """ - if callable(key): - key = key(X) - - # check whether we have string column names or integers - if _check_key_type(key, int): - column_names = False - elif _check_key_type(key, six.string_types): - column_names = True - elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): - # boolean mask - column_names = False - if hasattr(X, 'loc'): - # pandas boolean masks don't work with iloc, so take loc path - column_names = True - else: - raise ValueError("No valid specification of the columns. Only a " - "scalar, list or slice of all integers or all " - "strings, or boolean mask is allowed") - - if column_names: - if hasattr(X, 'loc'): - # pandas dataframes - return X.loc[:, key] - else: - raise ValueError("Specifying the columns using strings is only " - "supported for pandas DataFrames") - else: - if hasattr(X, 'iloc'): - # pandas dataframes - return X.iloc[:, key] - else: - # numpy arrays, sparse arrays - return X[:, key] - - -def _get_column_indices(X, key): - """ - Get feature column indices for input data X and key. - - For accepted values of `key`, see the docstring of _get_column - - """ - n_columns = X.shape[1] - - if callable(key): - key = key(X) - - if _check_key_type(key, int): - if isinstance(key, int): - return [key] - elif isinstance(key, slice): - return list(range(n_columns)[key]) - else: - return list(key) - - elif _check_key_type(key, six.string_types): - try: - all_columns = list(X.columns) - except AttributeError: - raise ValueError("Specifying the columns using strings is only " - "supported for pandas DataFrames") - if isinstance(key, six.string_types): - columns = [key] - elif isinstance(key, slice): - start, stop = key.start, key.stop - if start is not None: - start = all_columns.index(start) - if stop is not None: - # pandas indexing with strings is endpoint included - stop = all_columns.index(stop) + 1 - else: - stop = n_columns + 1 - return list(range(n_columns)[slice(start, stop)]) - else: - columns = list(key) - - return [all_columns.index(col) for col in columns] - - elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool_): - # boolean mask - return list(np.arange(n_columns)[key]) - else: - raise ValueError("No valid specification of the columns. 
Only a " - "scalar, list or slice of all integers or all " - "strings, or boolean mask is allowed") - - -def _get_transformer_list(estimators): - """ - Construct (name, trans, column) tuples from list - - """ - transformers = [trans[1] for trans in estimators] - columns = [trans[0] for trans in estimators] - names = [trans[0] for trans in _name_estimators(transformers)] - - transformer_list = list(zip(names, transformers, columns)) - return transformer_list - - -def make_column_transformer(*transformers, **kwargs): - """Construct a ColumnTransformer from the given transformers. - - This is a shorthand for the ColumnTransformer constructor; it does not - require, and does not permit, naming the transformers. Instead, they will - be given names automatically based on their types. It also does not allow - weighting. - - Parameters - ---------- - *transformers : tuples of column selections and transformers - - remainder : {'drop', 'passthrough'} or estimator, default 'drop' - By default, only the specified columns in `transformers` are - transformed and combined in the output, and the non-specified - columns are dropped. (default of ``'drop'``). - By specifying ``remainder='passthrough'``, all remaining columns that - were not specified in `transformers` will be automatically passed - through. This subset of columns is concatenated with the output of - the transformers. - By setting ``remainder`` to be an estimator, the remaining - non-specified columns will use the ``remainder`` estimator. The - estimator must support `fit` and `transform`. - - n_jobs : int, optional - Number of jobs to run in parallel (default 1). - - Returns - ------- - ct : ColumnTransformer - - See also - -------- - sklearn.compose.ColumnTransformer : Class that allows combining the - outputs of multiple transformer objects used on column subsets - of the data into a single feature space. - - Examples - -------- - >>> from sklearn.preprocessing import StandardScaler, OneHotEncoder - >>> from sklearn.compose import make_column_transformer - >>> make_column_transformer( - ... (['numerical_column'], StandardScaler()), - ... (['categorical_column'], OneHotEncoder())) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - ColumnTransformer(n_jobs=1, remainder='drop', sparse_threshold=0.3, - transformer_weights=None, - transformers=[('standardscaler', - StandardScaler(...), - ['numerical_column']), - ('onehotencoder', - OneHotEncoder(...), - ['categorical_column'])]) - - """ - n_jobs = kwargs.pop('n_jobs', 1) - remainder = kwargs.pop('remainder', 'drop') - if kwargs: - raise TypeError('Unknown keyword arguments: "{}"' - .format(list(kwargs.keys())[0])) - transformer_list = _get_transformer_list(transformers) - return ColumnTransformer(transformer_list, n_jobs=n_jobs, - remainder=remainder) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 063aed4..e47a36f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,10 +45,12 @@ xgboost==0.81 # you need a GPU card with CUDA Compute Capability 3.0 or higher support, and # you must install CUDA, cuDNN and more: see tensorflow.org for the detailed # installation instructions. -tensorflow==1.12.0 +tf-nightly-2.0-preview +tf-nightly-gpu-2.0-preview +#tensorflow==1.12.0 #tensorflow-gpu==1.12.0 -tensorflow-hub==0.2.0 -tensorflow-probability==0.5.0 +#tensorflow-hub==0.2.0 +#tensorflow-probability==0.5.0 # Optional: OpenAI gym is only needed for the Reinforcement Learning chapter.
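
The deleted `future_encoders.py` was a vendored preview of `ColumnTransformer`, so this patch assumes the class that ships in Scikit-Learn 0.20 as `sklearn.compose.ColumnTransformer`. As a quick reference, here is a minimal sketch of that built-in API; the DataFrame and its column names are hypothetical, chosen only to illustrate the (name, transformer, columns) tuples and the `remainder` / `sparse_threshold` keywords documented above:

    # Minimal sketch of the built-in replacement for the deleted backport.
    # Requires Scikit-Learn >= 0.20; the DataFrame below is hypothetical.
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    X = pd.DataFrame({
        "rooms":  [3.0, 5.0, 2.0],                     # numerical
        "income": [1.2, 3.4, 2.2],                     # numerical
        "ocean":  ["NEAR BAY", "INLAND", "NEAR BAY"],  # categorical
    })

    ct = ColumnTransformer(
        [("num", StandardScaler(), ["rooms", "income"]),  # (name, transformer, columns)
         ("cat", OneHotEncoder(), ["ocean"])],
        remainder="drop",      # default: columns not listed above are dropped
        sparse_threshold=0.3,  # mixed output is stacked sparse only below this density
    )

    X_t = ct.fit_transform(X)                    # 2 scaled + 2 one-hot columns
    cat_encoder = ct.named_transformers_["cat"]  # access fitted transformers by name
    print(cat_encoder.categories_)               # [array(['INLAND', 'NEAR BAY'], dtype=object)]

Setting `remainder="passthrough"` instead would append the unlisted columns, untouched, to the right of the transformed output.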
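
On the requirements side, `tf-nightly-2.0-preview` and `tf-nightly-gpu-2.0-preview` are dated development builds rather than fixed releases, so an exact pin like the old `tensorflow==1.12.0` no longer applies. A hedged sketch of a loose version guard (the version string in the comment is only an example of the nightly format):

    # Loose version guard for the TF 2.0 preview listed in requirements.txt.
    # The nightly packages install as the regular `tensorflow` module, with a
    # dated dev version string such as "2.0.0-dev20190118" (example only).
    import tensorflow as tf

    assert tf.__version__ >= "2.0", "expects the TF 2.0 preview or later"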