From 608d9ce8155675200bd2f71e2dbd902b49a58204 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 23 Nov 2021 22:46:24 +1300 Subject: [PATCH] Big update of chapter 5 for 3rd edition --- 05_support_vector_machines.ipynb | 1229 ++++++++++++++---------------- 1 file changed, 577 insertions(+), 652 deletions(-) diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index 97677f8..13e26ea 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -20,17 +20,19 @@ "source": [ "\n", " \n", " \n", "
\n", - " \"Open\n", + " \"Open\n", " \n", - " \n", + " \n", "
" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "# Setup" ] @@ -39,7 +41,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." + "This project requires Python 3.8 or above:" ] }, { @@ -48,30 +50,64 @@ "metadata": {}, "outputs": [], "source": [ - "# Python ≥3.8 is required\n", "import sys\n", - "assert sys.version_info >= (3, 8)\n", "\n", - "# Scikit-Learn ≥1.0 is required\n", + "assert sys.version_info >= (3, 8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It also requires Scikit-Learn ≥ 1.0.1:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ "import sklearn\n", - "assert sklearn.__version__ >= \"1.0\"\n", "\n", - "# Common imports\n", - "import numpy as np\n", + "assert sklearn.__version__ >= \"1.0.1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we did in previous chapters, let's define the default font sizes to make the figures prettier:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "\n", + "mpl.rc('font', size=12)\n", + "mpl.rc('axes', labelsize=14, titlesize=14)\n", + "mpl.rc('legend', fontsize=14)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And let's create the `images/svm` folder (if it doesn't already exist), and define the `save_fig()` function which is used through this notebook to save the figures in high-res for the book:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ "from pathlib import Path\n", "\n", - "# to make this notebook's output stable across runs\n", - "np.random.seed(42)\n", - "\n", - "# To plot pretty figures\n", - "%matplotlib inline\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "mpl.rc('axes', labelsize=14)\n", - "mpl.rc('xtick', labelsize=12)\n", - "mpl.rc('ytick', labelsize=12)\n", - "\n", - "# Where to save the figures\n", "IMAGES_PATH = Path() / \"images\" / \"svm\"\n", "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -89,27 +125,22 @@ "# Linear SVM Classification" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The next few code cells generate the first figures in chapter 5. The first actual code sample comes after.\n", - "\n", - "**Code to generate Figure 5–1. 
Large margin classification**" - ] - }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–1\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", "from sklearn.svm import SVC\n", "from sklearn import datasets\n", "\n", - "iris = datasets.load_iris()\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = iris[\"target\"]\n", + "iris = datasets.load_iris(as_frame=True)\n", + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = iris.target\n", "\n", "setosa_or_versicolor = (y == 0) | (y == 1)\n", "X = X[setosa_or_versicolor]\n", @@ -117,18 +148,11 @@ "\n", "# SVM Classifier model\n", "svm_clf = SVC(kernel=\"linear\", C=float(\"inf\"))\n", - "svm_clf.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ + "svm_clf.fit(X, y)\n", + "\n", "# Bad models\n", "x0 = np.linspace(0, 5.5, 200)\n", - "pred_1 = 5*x0 - 20\n", + "pred_1 = 5 * x0 - 20\n", "pred_2 = x0 - 1.8\n", "pred_3 = 0.1 * x0 + 0.5\n", "\n", @@ -139,7 +163,7 @@ " # At the decision boundary, w0*x0 + w1*x1 + b = 0\n", " # => x1 = -w0/w1 * x0 - b/w1\n", " x0 = np.linspace(xmin, xmax, 200)\n", - " decision_boundary = -w[0]/w[1] * x0 - b/w[1]\n", + " decision_boundary = -w[0] / w[1] * x0 - b / w[1]\n", "\n", " margin = 1/w[1]\n", " gutter_up = decision_boundary + margin\n", @@ -159,81 +183,82 @@ "plt.plot(x0, pred_3, \"r-\", linewidth=2)\n", "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"bs\", label=\"Iris versicolor\")\n", "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"yo\", label=\"Iris setosa\")\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.legend(loc=\"upper left\", fontsize=14)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.legend(loc=\"upper left\")\n", "plt.axis([0, 5.5, 0, 2])\n", + "plt.grid()\n", "\n", "plt.sca(axes[1])\n", "plot_svc_decision_boundary(svm_clf, 0, 5.5)\n", "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"bs\")\n", "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"yo\")\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", + "plt.xlabel(\"Petal length\")\n", "plt.axis([0, 5.5, 0, 2])\n", + "plt.grid()\n", "\n", "save_fig(\"large_margin_classification_plot\")\n", "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# not in the book – this cell generates and saves Figure 5–2\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)\n", + "ys = np.array([0, 0, 1, 1])\n", + "svm_clf = SVC(kernel=\"linear\", C=100).fit(Xs, ys)\n", + "\n", + "scaler = StandardScaler()\n", + "X_scaled = scaler.fit_transform(Xs)\n", + "svm_clf_scaled = SVC(kernel=\"linear\", C=100).fit(X_scaled, ys)\n", + "\n", + "plt.figure(figsize=(9,2.7))\n", + "plt.subplot(121)\n", + "plt.plot(Xs[:, 0][ys==1], Xs[:, 1][ys==1], \"bo\")\n", + "plt.plot(Xs[:, 0][ys==0], Xs[:, 1][ys==0], \"ms\")\n", + "plot_svc_decision_boundary(svm_clf, 0, 6)\n", + "plt.xlabel(\"$x_0$\")\n", + "plt.ylabel(\"$x_1$    \", rotation=0)\n", + "plt.title(\"Unscaled\")\n", + "plt.axis([0, 6, 0, 90])\n", + "plt.grid()\n", + "\n", + "plt.subplot(122)\n", + "plt.plot(X_scaled[:, 0][ys==1], X_scaled[:, 1][ys==1], \"bo\")\n", + "plt.plot(X_scaled[:, 0][ys==0], X_scaled[:, 1][ys==0], \"ms\")\n", + 
"plot_svc_decision_boundary(svm_clf_scaled, -2, 2)\n", + "plt.xlabel(\"$x'_0$\")\n", + "plt.ylabel(\"$x'_1$ \", rotation=0)\n", + "plt.title(\"Scaled\")\n", + "plt.axis([-2, 2, -2, 2])\n", + "plt.grid()\n", + "\n", + "save_fig(\"sensitivity_to_feature_scales_plot\")\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Code to generate Figure 5–2. Sensitivity to feature scales**" + "## Soft Margin Classification" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)\n", - "ys = np.array([0, 0, 1, 1])\n", - "svm_clf = SVC(kernel=\"linear\", C=100)\n", - "svm_clf.fit(Xs, ys)\n", + "# not in the book – this cell generates and saves Figure 5–3\n", "\n", - "plt.figure(figsize=(9,2.7))\n", - "plt.subplot(121)\n", - "plt.plot(Xs[:, 0][ys==1], Xs[:, 1][ys==1], \"bo\")\n", - "plt.plot(Xs[:, 0][ys==0], Xs[:, 1][ys==0], \"ms\")\n", - "plot_svc_decision_boundary(svm_clf, 0, 6)\n", - "plt.xlabel(\"$x_0$\", fontsize=20)\n", - "plt.ylabel(\"$x_1$    \", fontsize=20, rotation=0)\n", - "plt.title(\"Unscaled\", fontsize=16)\n", - "plt.axis([0, 6, 0, 90])\n", - "\n", - "from sklearn.preprocessing import StandardScaler\n", - "scaler = StandardScaler()\n", - "X_scaled = scaler.fit_transform(Xs)\n", - "svm_clf.fit(X_scaled, ys)\n", - "\n", - "plt.subplot(122)\n", - "plt.plot(X_scaled[:, 0][ys==1], X_scaled[:, 1][ys==1], \"bo\")\n", - "plt.plot(X_scaled[:, 0][ys==0], X_scaled[:, 1][ys==0], \"ms\")\n", - "plot_svc_decision_boundary(svm_clf, -2, 2)\n", - "plt.xlabel(\"$x'_0$\", fontsize=20)\n", - "plt.ylabel(\"$x'_1$ \", fontsize=20, rotation=0)\n", - "plt.title(\"Scaled\", fontsize=16)\n", - "plt.axis([-2, 2, -2, 2])\n", - "\n", - "save_fig(\"sensitivity_to_feature_scales_plot\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Soft Margin Classification\n", - "**Code to generate Figure 5–3. 
Hard margin sensitivity to outliers**" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ "X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])\n", "y_outliers = np.array([0, 0])\n", "Xo1 = np.concatenate([X, X_outliers[:1]], axis=0)\n", @@ -249,31 +274,33 @@ "plt.sca(axes[0])\n", "plt.plot(Xo1[:, 0][yo1==1], Xo1[:, 1][yo1==1], \"bs\")\n", "plt.plot(Xo1[:, 0][yo1==0], Xo1[:, 1][yo1==0], \"yo\")\n", - "plt.text(0.3, 1.0, \"Impossible!\", fontsize=24, color=\"red\")\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", + "plt.text(0.3, 1.0, \"Impossible!\", color=\"red\", fontsize=18)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", "plt.annotate(\"Outlier\",\n", " xy=(X_outliers[0][0], X_outliers[0][1]),\n", " xytext=(2.5, 1.7),\n", " ha=\"center\",\n", " arrowprops=dict(facecolor='black', shrink=0.1),\n", - " fontsize=16,\n", + " fontsize=14,\n", " )\n", "plt.axis([0, 5.5, 0, 2])\n", + "plt.grid()\n", "\n", "plt.sca(axes[1])\n", "plt.plot(Xo2[:, 0][yo2==1], Xo2[:, 1][yo2==1], \"bs\")\n", "plt.plot(Xo2[:, 0][yo2==0], Xo2[:, 1][yo2==0], \"yo\")\n", "plot_svc_decision_boundary(svm_clf2, 0, 5.5)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", + "plt.xlabel(\"Petal length\")\n", "plt.annotate(\"Outlier\",\n", " xy=(X_outliers[1][0], X_outliers[1][1]),\n", " xytext=(3.2, 0.08),\n", " ha=\"center\",\n", " arrowprops=dict(facecolor='black', shrink=0.1),\n", - " fontsize=16,\n", + " fontsize=14,\n", " )\n", "plt.axis([0, 5.5, 0, 2])\n", + "plt.grid()\n", "\n", "save_fig(\"sensitivity_to_outliers_plot\")\n", "plt.show()" @@ -288,73 +315,52 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn import datasets\n", - "from sklearn.pipeline import Pipeline\n", + "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.svm import LinearSVC\n", "\n", - "iris = datasets.load_iris()\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = (iris[\"target\"] == 2).astype(np.float64) # Iris virginica\n", - "\n", - "svm_clf = Pipeline([\n", - " (\"scaler\", StandardScaler()),\n", - " (\"linear_svc\", LinearSVC(C=1, loss=\"hinge\", random_state=42)),\n", - " ])\n", + "iris = datasets.load_iris(as_frame=True)\n", + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = (iris.target == 2) # Iris virginica\n", "\n", + "svm_clf = make_pipeline(StandardScaler(),\n", + " LinearSVC(C=1, loss=\"hinge\", random_state=42))\n", "svm_clf.fit(X, y)" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "svm_clf.predict([[5.5, 1.7]])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–4. 
Large margin versus fewer margin violations**" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "scaler = StandardScaler()\n", - "svm_clf1 = LinearSVC(C=1, loss=\"hinge\", random_state=42)\n", - "svm_clf2 = LinearSVC(C=100, loss=\"hinge\", random_state=42)\n", - "\n", - "scaled_svm_clf1 = Pipeline([\n", - " (\"scaler\", scaler),\n", - " (\"linear_svc\", svm_clf1),\n", - " ])\n", - "scaled_svm_clf2 = Pipeline([\n", - " (\"scaler\", scaler),\n", - " (\"linear_svc\", svm_clf2),\n", - " ])\n", - "\n", - "scaled_svm_clf1.fit(X, y)\n", - "scaled_svm_clf2.fit(X, y)" - ] - }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ + "svm_clf.predict([[5.5, 1.7]])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# not in the book – this cell generates and saves Figure 5–4\n", + "\n", + "scaler = StandardScaler()\n", + "svm_clf1 = LinearSVC(C=1, loss=\"hinge\", max_iter=10_000, random_state=42)\n", + "svm_clf2 = LinearSVC(C=100, loss=\"hinge\", max_iter=10_000, random_state=42)\n", + "\n", + "scaled_svm_clf1 = make_pipeline(scaler, svm_clf1)\n", + "scaled_svm_clf2 = make_pipeline(scaler, svm_clf2)\n", + "\n", + "scaled_svm_clf1.fit(X, y)\n", + "scaled_svm_clf2.fit(X, y)\n", + "\n", "# Convert to unscaled parameters\n", "b1 = svm_clf1.decision_function([-scaler.mean_ / scaler.scale_])\n", "b2 = svm_clf2.decision_function([-scaler.mean_ / scaler.scale_])\n", @@ -370,36 +376,32 @@ "support_vectors_idx1 = (t * (X.dot(w1) + b1) < 1).ravel()\n", "support_vectors_idx2 = (t * (X.dot(w2) + b2) < 1).ravel()\n", "svm_clf1.support_vectors_ = X[support_vectors_idx1]\n", - "svm_clf2.support_vectors_ = X[support_vectors_idx2]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ + "svm_clf2.support_vectors_ = X[support_vectors_idx2]\n", + "\n", "fig, axes = plt.subplots(ncols=2, figsize=(10,2.7), sharey=True)\n", "\n", "plt.sca(axes[0])\n", "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\", label=\"Iris virginica\")\n", "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\", label=\"Iris versicolor\")\n", "plot_svc_decision_boundary(svm_clf1, 4, 5.9)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.legend(loc=\"upper left\", fontsize=14)\n", - "plt.title(\"$C = {}$\".format(svm_clf1.C), fontsize=16)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.legend(loc=\"upper left\")\n", + "plt.title(\"$C = {}$\".format(svm_clf1.C))\n", "plt.axis([4, 5.9, 0.8, 2.8])\n", + "plt.grid()\n", "\n", "plt.sca(axes[1])\n", "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")\n", "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n", "plot_svc_decision_boundary(svm_clf2, 4, 5.99)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.title(\"$C = {}$\".format(svm_clf2.C), fontsize=16)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.title(\"$C = {}$\".format(svm_clf2.C))\n", "plt.axis([4, 5.9, 0.8, 2.8])\n", + "plt.grid()\n", "\n", - "save_fig(\"regularization_plot\")" + "save_fig(\"regularization_plot\")\n", + "plt.show()" ] }, { @@ -409,19 +411,14 @@ "# Nonlinear SVM Classification" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–5. 
Adding features to make a dataset linearly separable**" - ] - }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–5\n", + "\n", "X1D = np.linspace(-4, 4, 9).reshape(-1, 1)\n", "X2D = np.c_[X1D, X1D**2]\n", "y = np.array([0, 0, 1, 1, 1, 1, 1, 0, 0])\n", @@ -434,7 +431,7 @@ "plt.plot(X1D[:, 0][y==0], np.zeros(4), \"bs\")\n", "plt.plot(X1D[:, 0][y==1], np.zeros(5), \"g^\")\n", "plt.gca().get_yaxis().set_ticks([])\n", - "plt.xlabel(r\"$x_1$\", fontsize=20)\n", + "plt.xlabel(r\"$x_1$\")\n", "plt.axis([-4.5, 4.5, -0.2, 0.2])\n", "\n", "plt.subplot(122)\n", @@ -443,8 +440,8 @@ "plt.axvline(x=0, color='k')\n", "plt.plot(X2D[:, 0][y==0], X2D[:, 1][y==0], \"bs\")\n", "plt.plot(X2D[:, 0][y==1], X2D[:, 1][y==1], \"g^\")\n", - "plt.xlabel(r\"$x_1$\", fontsize=20)\n", - "plt.ylabel(r\"$x_2$  \", fontsize=20, rotation=0)\n", + "plt.xlabel(r\"$x_1$\")\n", + "plt.ylabel(r\"$x_2$  \", rotation=0)\n", "plt.gca().get_yaxis().set_ticks([0, 4, 8, 12, 16])\n", "plt.plot([-4.5, 4.5], [6.5, 6.5], \"r--\", linewidth=3)\n", "plt.axis([-4.5, 4.5, -1, 17])\n", @@ -455,27 +452,6 @@ "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import make_moons\n", - "X, y = make_moons(n_samples=100, noise=0.15, random_state=42)\n", - "\n", - "def plot_dataset(X, y, axes):\n", - " plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n", - " plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")\n", - " plt.axis(axes)\n", - " plt.grid(True, which='both')\n", - " plt.xlabel(r\"$x_1$\", fontsize=20)\n", - " plt.ylabel(r\"$x_2$\", fontsize=20, rotation=0)\n", - "\n", - "plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])\n", - "plt.show()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -485,36 +461,40 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", - "from sklearn.pipeline import Pipeline\n", + "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import PolynomialFeatures\n", "\n", - "polynomial_svm_clf = Pipeline([\n", - " (\"poly_features\", PolynomialFeatures(degree=3)),\n", - " (\"scaler\", StandardScaler()),\n", - " (\"svm_clf\", LinearSVC(C=10, loss=\"hinge\", random_state=42))\n", - " ])\n", + "X, y = make_moons(n_samples=100, noise=0.15, random_state=42)\n", "\n", + "polynomial_svm_clf = make_pipeline(\n", + " PolynomialFeatures(degree=3),\n", + " StandardScaler(),\n", + " LinearSVC(C=10, loss=\"hinge\", max_iter=10_000, random_state=42)\n", + ")\n", "polynomial_svm_clf.fit(X, y)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–6. 
Linear SVM classifier using polynomial features**" - ] - }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–6\n", + "\n", + "def plot_dataset(X, y, axes):\n", + " plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n", + " plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")\n", + " plt.axis(axes)\n", + " plt.grid(True, which='both')\n", + " plt.xlabel(r\"$x_1$\")\n", + " plt.ylabel(r\"$x_2$\", rotation=0)\n", + "\n", "def plot_predictions(clf, axes):\n", " x0s = np.linspace(axes[0], axes[1], 100)\n", " x1s = np.linspace(axes[2], axes[3], 100)\n", @@ -548,56 +528,44 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", "\n", - "poly_kernel_svm_clf = Pipeline([\n", - " (\"scaler\", StandardScaler()),\n", - " (\"svm_clf\", SVC(kernel=\"poly\", degree=3, coef0=1, C=5))\n", - " ])\n", + "poly_kernel_svm_clf = make_pipeline(\n", + " StandardScaler(),\n", + " SVC(kernel=\"poly\", degree=3, coef0=1, C=5)\n", + ")\n", "poly_kernel_svm_clf.fit(X, y)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–7. SVM classifiers with a polynomial kernel**" - ] - }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "poly100_kernel_svm_clf = Pipeline([\n", - " (\"scaler\", StandardScaler()),\n", - " (\"svm_clf\", SVC(kernel=\"poly\", degree=10, coef0=100, C=5))\n", - " ])\n", - "poly100_kernel_svm_clf.fit(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–7\n", + "\n", + "poly100_kernel_svm_clf = make_pipeline(\n", + " StandardScaler(),\n", + " SVC(kernel=\"poly\", degree=10, coef0=100, C=5)\n", + ")\n", + "poly100_kernel_svm_clf.fit(X, y)\n", + "\n", "fig, axes = plt.subplots(ncols=2, figsize=(10.5, 4), sharey=True)\n", "\n", "plt.sca(axes[0])\n", "plot_predictions(poly_kernel_svm_clf, [-1.5, 2.45, -1, 1.5])\n", "plot_dataset(X, y, [-1.5, 2.4, -1, 1.5])\n", - "plt.title(r\"$d=3, r=1, C=5$\", fontsize=18)\n", + "plt.title(r\"$d=3, r=1, C=5$\")\n", "\n", "plt.sca(axes[1])\n", "plot_predictions(poly100_kernel_svm_clf, [-1.5, 2.45, -1, 1.5])\n", "plot_dataset(X, y, [-1.5, 2.4, -1, 1.5])\n", - "plt.title(r\"$d=10, r=100, C=5$\", fontsize=18)\n", + "plt.title(r\"$d=10, r=100, C=5$\")\n", "plt.ylabel(\"\")\n", "\n", "save_fig(\"moons_kernelized_polynomial_svc_plot\")\n", @@ -611,21 +579,16 @@ "## Similarity Features" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–8. 
Similarity features using the Gaussian RBF**" - ] - }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–8\n", + "\n", "def gaussian_rbf(x, landmark, gamma):\n", " return np.exp(-gamma * np.linalg.norm(x - landmark, axis=1)**2)\n", "\n", @@ -649,17 +612,18 @@ "plt.plot(x1s, x2s, \"g--\")\n", "plt.plot(x1s, x3s, \"b:\")\n", "plt.gca().get_yaxis().set_ticks([0, 0.25, 0.5, 0.75, 1])\n", - "plt.xlabel(r\"$x_1$\", fontsize=20)\n", - "plt.ylabel(r\"Similarity\", fontsize=14)\n", - "plt.annotate(r'$\\mathbf{x}$',\n", - " xy=(X1D[3, 0], 0),\n", - " xytext=(-0.5, 0.20),\n", - " ha=\"center\",\n", - " arrowprops=dict(facecolor='black', shrink=0.1),\n", - " fontsize=18,\n", - " )\n", - "plt.text(-2, 0.9, \"$x_2$\", ha=\"center\", fontsize=20)\n", - "plt.text(1, 0.9, \"$x_3$\", ha=\"center\", fontsize=20)\n", + "plt.xlabel(r\"$x_1$\")\n", + "plt.ylabel(r\"Similarity\")\n", + "plt.annotate(\n", + " r'$\\mathbf{x}$',\n", + " xy=(X1D[3, 0], 0),\n", + " xytext=(-0.5, 0.20),\n", + " ha=\"center\",\n", + " arrowprops=dict(facecolor='black', shrink=0.1),\n", + " fontsize=16,\n", + ")\n", + "plt.text(-2, 0.9, \"$x_2$\", ha=\"center\", fontsize=15)\n", + "plt.text(1, 0.9, \"$x_3$\", ha=\"center\", fontsize=15)\n", "plt.axis([-4.5, 4.5, -0.1, 1.1])\n", "\n", "plt.subplot(122)\n", @@ -668,15 +632,16 @@ "plt.axvline(x=0, color='k')\n", "plt.plot(XK[:, 0][yk==0], XK[:, 1][yk==0], \"bs\")\n", "plt.plot(XK[:, 0][yk==1], XK[:, 1][yk==1], \"g^\")\n", - "plt.xlabel(r\"$x_2$\", fontsize=20)\n", - "plt.ylabel(r\"$x_3$  \", fontsize=20, rotation=0)\n", - "plt.annotate(r'$\\phi\\left(\\mathbf{x}\\right)$',\n", - " xy=(XK[3, 0], XK[3, 1]),\n", - " xytext=(0.65, 0.50),\n", - " ha=\"center\",\n", - " arrowprops=dict(facecolor='black', shrink=0.1),\n", - " fontsize=18,\n", - " )\n", + "plt.xlabel(r\"$x_2$\")\n", + "plt.ylabel(r\"$x_3$  \", rotation=0)\n", + "plt.annotate(\n", + " r'$\\phi\\left(\\mathbf{x}\\right)$',\n", + " xy=(XK[3, 0], XK[3, 1]),\n", + " xytext=(0.65, 0.50),\n", + " ha=\"center\",\n", + " arrowprops=dict(facecolor='black', shrink=0.1),\n", + " fontsize=16,\n", + ")\n", "plt.plot([-0.1, 1.1], [0.57, -0.1], \"r--\", linewidth=3)\n", "plt.axis([-0.1, 1.1, -0.1, 1.1])\n", " \n", @@ -688,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -714,32 +679,27 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "rbf_kernel_svm_clf = Pipeline([\n", - " (\"scaler\", StandardScaler()),\n", - " (\"svm_clf\", SVC(kernel=\"rbf\", gamma=5, C=0.001))\n", - " ])\n", + "rbf_kernel_svm_clf = make_pipeline(\n", + " StandardScaler(),\n", + " SVC(kernel=\"rbf\", gamma=5, C=0.001)\n", + ")\n", "rbf_kernel_svm_clf.fit(X, y)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–9. 
SVM classifiers using an RBF kernel**" - ] - }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–9\n", + "\n", "from sklearn.svm import SVC\n", "\n", "gamma1, gamma2 = 0.1, 5\n", @@ -748,10 +708,10 @@ "\n", "svm_clfs = []\n", "for gamma, C in hyperparams:\n", - " rbf_kernel_svm_clf = Pipeline([\n", - " (\"scaler\", StandardScaler()),\n", - " (\"svm_clf\", SVC(kernel=\"rbf\", gamma=gamma, C=C))\n", - " ])\n", + " rbf_kernel_svm_clf = make_pipeline(\n", + " StandardScaler(),\n", + " SVC(kernel=\"rbf\", gamma=gamma, C=C)\n", + " )\n", " rbf_kernel_svm_clf.fit(X, y)\n", " svm_clfs.append(rbf_kernel_svm_clf)\n", "\n", @@ -762,7 +722,7 @@ " plot_predictions(svm_clf, [-1.5, 2.45, -1, 1.5])\n", " plot_dataset(X, y, [-1.5, 2.45, -1, 1.5])\n", " gamma, C = hyperparams[i]\n", - " plt.title(r\"$\\gamma = {}, C = {}$\".format(gamma, C), fontsize=16)\n", + " plt.title(r\"$\\gamma = {}, C = {}$\".format(gamma, C))\n", " if i in (0, 1):\n", " plt.xlabel(\"\")\n", " if i in (1, 3):\n", @@ -781,7 +741,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -800,7 +760,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -810,42 +770,19 @@ "svm_reg.fit(X, y)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–10. SVM Regression**" - ] - }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n", - "svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)\n", - "svm_reg1.fit(X, y)\n", - "svm_reg2.fit(X, y)\n", + "# not in the book – this cell generates and saves Figure 5–10\n", "\n", "def find_support_vectors(svm_reg, X, y):\n", " y_pred = svm_reg.predict(X)\n", " off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)\n", " return np.argwhere(off_margin)\n", "\n", - "svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)\n", - "svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)\n", - "\n", - "eps_x1 = 1\n", - "eps_y_pred = svm_reg1.predict([[eps_x1]])" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ "def plot_svm_regression(svm_reg, X, y, axes):\n", " x1s = np.linspace(axes[0], axes[1], 100).reshape(100, 1)\n", " y_pred = svm_reg.predict(x1s)\n", @@ -854,32 +791,44 @@ " plt.plot(x1s, y_pred - svm_reg.epsilon, \"k--\")\n", " plt.scatter(X[svm_reg.support_], y[svm_reg.support_], s=180, facecolors='#FFAAAA')\n", " plt.plot(X, y, \"bo\")\n", - " plt.xlabel(r\"$x_1$\", fontsize=18)\n", - " plt.legend(loc=\"upper left\", fontsize=18)\n", + " plt.xlabel(r\"$x_1$\")\n", + " plt.legend(loc=\"upper left\")\n", " plt.axis(axes)\n", "\n", + "svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n", + "svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)\n", + "svm_reg1.fit(X, y)\n", + "svm_reg2.fit(X, y)\n", + "\n", + "svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)\n", + "svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)\n", + "\n", + "eps_x1 = 1\n", + "eps_y_pred = svm_reg1.predict([[eps_x1]])\n", + "\n", "fig, axes = plt.subplots(ncols=2, figsize=(9, 4), sharey=True)\n", "plt.sca(axes[0])\n", "plot_svm_regression(svm_reg1, X, y, [0, 2, 3, 11])\n", - "plt.title(r\"$\\epsilon = 
{}$\".format(svm_reg1.epsilon), fontsize=18)\n", - "plt.ylabel(r\"$y$\", fontsize=18, rotation=0)\n", - "#plt.plot([eps_x1, eps_x1], [eps_y_pred, eps_y_pred - svm_reg1.epsilon], \"k-\", linewidth=2)\n", + "plt.title(r\"$\\epsilon = {}$\".format(svm_reg1.epsilon))\n", + "plt.ylabel(r\"$y$\", rotation=0)\n", + "plt.grid()\n", "plt.annotate(\n", " '', xy=(eps_x1, eps_y_pred), xycoords='data',\n", " xytext=(eps_x1, eps_y_pred - svm_reg1.epsilon),\n", " textcoords='data', arrowprops={'arrowstyle': '<->', 'linewidth': 1.5}\n", " )\n", - "plt.text(0.91, 5.6, r\"$\\epsilon$\", fontsize=20)\n", + "plt.text(0.91, 5.6, r\"$\\epsilon$\", fontsize=16)\n", "plt.sca(axes[1])\n", "plot_svm_regression(svm_reg2, X, y, [0, 2, 3, 11])\n", - "plt.title(r\"$\\epsilon = {}$\".format(svm_reg2.epsilon), fontsize=18)\n", + "plt.title(r\"$\\epsilon = {}$\".format(svm_reg2.epsilon))\n", + "plt.grid()\n", "save_fig(\"svm_regression_plot\")\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -889,13 +838,6 @@ "y = (0.2 + 0.1 * X + 0.5 * X**2 + np.random.randn(m, 1)/10).ravel()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: to be future-proof, we set `gamma=\"scale\"`, as this will be the default value in Scikit-Learn 0.22." - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -905,51 +847,51 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", "\n", - "svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"scale\")\n", + "svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100)\n", "svm_poly_reg.fit(X, y)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–11. 
SVM Regression using a second-degree polynomial kernel**" - ] - }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "from sklearn.svm import SVR\n", - "\n", - "svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100, epsilon=0.1, gamma=\"scale\")\n", - "svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01, epsilon=0.1, gamma=\"scale\")\n", + "svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100)\n", + "svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01)\n", "svm_poly_reg1.fit(X, y)\n", "svm_poly_reg2.fit(X, y)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ + "# not in the book – this cell generates and saves Figure 5–11\n", + "\n", "fig, axes = plt.subplots(ncols=2, figsize=(9, 4), sharey=True)\n", "plt.sca(axes[0])\n", "plot_svm_regression(svm_poly_reg1, X, y, [-1, 1, 0, 1])\n", - "plt.title(r\"$degree={}, C={}, \\epsilon = {}$\".format(svm_poly_reg1.degree, svm_poly_reg1.C, svm_poly_reg1.epsilon), fontsize=18)\n", - "plt.ylabel(r\"$y$\", fontsize=18, rotation=0)\n", + "plt.title(f\"$degree={svm_poly_reg1.degree}, \"\n", + " f\"C={svm_poly_reg1.C}, \"\n", + " f\"\\\\epsilon={svm_poly_reg1.epsilon}$\")\n", + "plt.ylabel(r\"$y$\", rotation=0)\n", + "plt.grid()\n", + "\n", "plt.sca(axes[1])\n", "plot_svm_regression(svm_poly_reg2, X, y, [-1, 1, 0, 1])\n", - "plt.title(r\"$degree={}, C={}, \\epsilon = {}$\".format(svm_poly_reg2.degree, svm_poly_reg2.C, svm_poly_reg2.epsilon), fontsize=18)\n", + "plt.title(f\"$degree={svm_poly_reg2.degree}, \"\n", + " f\"C={svm_poly_reg2.C}, \"\n", + " f\"\\\\epsilon={svm_poly_reg2.epsilon}$\")\n", + "plt.grid()\n", "save_fig(\"svm_with_polynomial_kernel_plot\")\n", "plt.show()" ] @@ -958,31 +900,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Under the Hood\n", - "## Decision Function and Predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–12. 
Decision function for the iris dataset**" + "# Extra Material – Under the Hood" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "iris = datasets.load_iris()\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = (iris[\"target\"] == 2).astype(np.float64) # Iris virginica" + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = (iris.target == 2)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -998,9 +931,9 @@ " xs = np.c_[x1.ravel(), x2.ravel()]\n", " df = (xs.dot(w) + b).reshape(x1.shape)\n", " m = 1 / np.linalg.norm(w)\n", - " boundary_x2s = -x1s*(w[0]/w[1])-b/w[1]\n", - " margin_x2s_1 = -x1s*(w[0]/w[1])-(b-1)/w[1]\n", - " margin_x2s_2 = -x1s*(w[0]/w[1])-(b+1)/w[1]\n", + " boundary_x2s = -x1s * (w[0] / w[1]) - b / w[1]\n", + " margin_x2s_1 = -x1s * (w[0] / w[1]) - (b - 1) / w[1]\n", + " margin_x2s_2 = -x1s * (w[0] / w[1]) - (b + 1) / w[1]\n", " ax.plot_surface(x1s, x2, np.zeros_like(x1),\n", " color=\"b\", alpha=0.2, cstride=100, rstride=100)\n", " ax.plot(x1s, boundary_x2s, 0, \"k-\", linewidth=2, label=r\"$h=0$\")\n", @@ -1010,11 +943,11 @@ " ax.plot_wireframe(x1, x2, df, alpha=0.3, color=\"k\")\n", " ax.plot(X_crop[:, 0][y_crop==0], X_crop[:, 1][y_crop==0], 0, \"bs\")\n", " ax.axis(x1_lim + x2_lim)\n", - " ax.text(4.5, 2.5, 3.8, \"Decision function $h$\", fontsize=16)\n", - " ax.set_xlabel(r\"Petal length\", fontsize=16, labelpad=10)\n", - " ax.set_ylabel(r\"Petal width\", fontsize=16, labelpad=10)\n", - " ax.set_zlabel(r\"$h = \\mathbf{w}^T \\mathbf{x} + b$\", fontsize=18, labelpad=5)\n", - " ax.legend(loc=\"upper left\", fontsize=16)\n", + " ax.text(4.5, 2.5, 3.8, \"Decision function $h$\", fontsize=14)\n", + " ax.set_xlabel(r\"Petal length\", labelpad=10)\n", + " ax.set_ylabel(r\"Petal width\", labelpad=10)\n", + " ax.set_zlabel(r\"$h = \\mathbf{w}^T \\mathbf{x} + b$\", labelpad=5)\n", + " ax.legend(loc=\"upper left\")\n", "\n", "fig = plt.figure(figsize=(11, 6))\n", "ax1 = fig.add_subplot(111, projection='3d')\n", @@ -1024,65 +957,45 @@ "plt.show()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Code to generate Figure 5–13. 
A smaller weight vector results in a larger margin**" - ] - }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ + "import matplotlib.patches as patches\n", + "\n", "def plot_2D_decision_function(w, b, ylabel=True, x1_lim=[-3, 3]):\n", " x1 = np.linspace(x1_lim[0], x1_lim[1], 200)\n", " y = w * x1 + b\n", " m = 1 / w\n", "\n", " plt.plot(x1, y)\n", - " plt.plot(x1_lim, [1, 1], \"k:\")\n", - " plt.plot(x1_lim, [-1, -1], \"k:\")\n", " plt.axhline(y=0, color='k')\n", " plt.axvline(x=0, color='k')\n", - " plt.plot([m, m], [0, 1], \"k--\")\n", - " plt.plot([-m, -m], [0, -1], \"k--\")\n", + " rect = patches.Rectangle((-3, -1), 6, 2, edgecolor='none', facecolor='blue',\n", + " alpha=0.1)\n", + " plt.gca().add_patch(rect)\n", + " plt.plot([m, m], [0, 1], \"b--\")\n", + " plt.plot([-m, -m], [0, -1], \"b--\")\n", " plt.plot([-m, m], [0, 0], \"k-o\", linewidth=3)\n", " plt.axis(x1_lim + [-2, 2])\n", - " plt.xlabel(r\"$x_1$\", fontsize=16)\n", + " plt.xlabel(r\"$x_1$\")\n", " if ylabel:\n", - " plt.ylabel(r\"$w_1 x_1$  \", rotation=0, fontsize=16)\n", - " plt.title(r\"$w_1 = {}$\".format(w), fontsize=16)\n", + " plt.ylabel(r\"$w_1 x_1$  \", rotation=0)\n", + " plt.title(r\"$w_1 = {}$\".format(w))\n", "\n", "fig, axes = plt.subplots(ncols=2, figsize=(9, 3.2), sharey=True)\n", "plt.sca(axes[0])\n", "plot_2D_decision_function(1, 0)\n", + "plt.grid()\n", "plt.sca(axes[1])\n", "plot_2D_decision_function(0.5, 0, ylabel=False)\n", + "plt.grid()\n", "save_fig(\"small_w_large_margin_plot\")\n", "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.svm import SVC\n", - "from sklearn import datasets\n", - "\n", - "iris = datasets.load_iris()\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = (iris[\"target\"] == 2).astype(np.float64) # Iris virginica\n", - "\n", - "svm_clf = SVC(kernel=\"linear\", C=1)\n", - "svm_clf.fit(X, y)\n", - "svm_clf.predict([[5.3, 1.3]])" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1092,7 +1005,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1100,14 +1013,14 @@ "h = np.where(1 - t < 0, 0, 1 - t) # max(0, 1-t)\n", "\n", "plt.figure(figsize=(5,2.8))\n", - "plt.plot(t, h, \"b-\", linewidth=2, label=\"$max(0, 1 - t)$\")\n", + "plt.plot(t, h, \"b-\", linewidth=2, label=\"$max(0, 1 - t)$\", zorder=10)\n", "plt.grid(True, which='both')\n", "plt.axhline(y=0, color='k')\n", "plt.axvline(x=0, color='k')\n", "plt.yticks(np.arange(-1, 2.5, 1))\n", - "plt.xlabel(\"$t$\", fontsize=16)\n", + "plt.xlabel(\"$t$\")\n", "plt.axis([-2, 4, -1, 2.5])\n", - "plt.legend(loc=\"upper right\", fontsize=16)\n", + "plt.legend(loc=\"upper right\")\n", "save_fig(\"hinge_plot\")\n", "plt.show()" ] @@ -1116,30 +1029,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Extra material" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training time" + "# Extra material – Training Time" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n", + "\n", "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n", - "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")" + "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")\n", + "plt.grid()\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 36, + 
"execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -1149,18 +1058,17 @@ "tols = []\n", "times = []\n", "for i in range(10):\n", - " svm_clf = SVC(kernel=\"poly\", gamma=3, C=10, tol=tol, verbose=1)\n", + " svm_clf = SVC(kernel=\"poly\", gamma=3, C=10, tol=tol)\n", " t1 = time.time()\n", " svm_clf.fit(X, y)\n", " t2 = time.time()\n", " times.append(t2-t1)\n", " tols.append(tol)\n", - " print(i, tol, t2-t1)\n", " tol /= 10\n", "plt.semilogx(tols, times, \"bo-\")\n", - "plt.xlabel(\"Tolerance\", fontsize=16)\n", - "plt.ylabel(\"Time (seconds)\", fontsize=16)\n", - "plt.grid(True)\n", + "plt.xlabel(\"Tolerance\")\n", + "plt.ylabel(\"Time (seconds)\")\n", + "plt.grid()\n", "plt.show()" ] }, @@ -1168,30 +1076,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Linear SVM classifier implementation using Batch Gradient Descent" + "# Extra Material – Linear SVM classifier implementation using Batch Gradient Descent" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "# Training set\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = (iris[\"target\"] == 2).astype(np.float64).reshape(-1, 1) # Iris virginica" + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = (iris.target == 2)" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", "\n", "class MyLinearSVC(BaseEstimator):\n", - " def __init__(self, C=1, eta0=1, eta_d=10000, n_epochs=1000, random_state=None):\n", + " def __init__(self, C=1, eta0=1, eta_d=10000, n_epochs=1000,\n", + " random_state=None):\n", " self.C = C\n", " self.eta0 = eta0\n", " self.n_epochs = n_epochs\n", @@ -1205,11 +1113,11 @@ " # Random initialization\n", " if self.random_state:\n", " np.random.seed(self.random_state)\n", - " w = np.random.randn(X.shape[1], 1) # n feature weights\n", + " w = np.random.randn(X.shape[1], 1) # n feature weights\n", " b = 0\n", "\n", " m = len(X)\n", - " t = y * 2 - 1 # -1 if t==0, +1 if t==1\n", + " t = np.array(y, dtype=np.float64).reshape(-1, 1) * 2 - 1\n", " X_t = X * t\n", " self.Js=[]\n", "\n", @@ -1219,11 +1127,11 @@ " X_t_sv = X_t[support_vectors_idx]\n", " t_sv = t[support_vectors_idx]\n", "\n", - " J = 1/2 * np.sum(w * w) + self.C * (np.sum(1 - X_t_sv.dot(w)) - b * np.sum(t_sv))\n", + " J = 1/2 * (w * w).sum() + self.C * ((1 - X_t_sv.dot(w)).sum() - b * t_sv.sum())\n", " self.Js.append(J)\n", "\n", - " w_gradient_vector = w - self.C * np.sum(X_t_sv, axis=0).reshape(-1, 1)\n", - " b_derivative = -self.C * np.sum(t_sv)\n", + " w_gradient_vector = w - self.C * X_t_sv.sum(axis=0).reshape(-1, 1)\n", + " b_derivative = -self.C * t_sv.sum()\n", " \n", " w = w - self.eta(epoch) * w_gradient_vector\n", " b = b - self.eta(epoch) * b_derivative\n", @@ -1239,27 +1147,39 @@ " return X.dot(self.coef_[0]) + self.intercept_[0]\n", "\n", " def predict(self, X):\n", - " return (self.decision_function(X) >= 0).astype(np.float64)\n", - "\n", - "C=2\n", - "svm_clf = MyLinearSVC(C=C, eta0 = 10, eta_d = 1000, n_epochs=60000, random_state=2)\n", + " return self.decision_function(X) >= 0" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "C = 2\n", + "svm_clf = MyLinearSVC(C=C, eta0 = 10, eta_d = 1000, n_epochs=60000,\n", + " random_state=2)\n", "svm_clf.fit(X, y)\n", "svm_clf.predict(np.array([[5, 2], [4, 1]]))" ] }, { "cell_type": 
"code", - "execution_count": 39, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "plt.plot(range(svm_clf.n_epochs), svm_clf.Js)\n", - "plt.axis([0, svm_clf.n_epochs, 0, 100])" + "plt.axis([0, svm_clf.n_epochs, 0, 100])\n", + "plt.xlabel(\"Epochs\")\n", + "plt.ylabel(\"Loss\")\n", + "plt.grid()\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -1268,7 +1188,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1279,7 +1199,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -1289,24 +1209,28 @@ "plt.plot(X[:, 0][yr==1], X[:, 1][yr==1], \"g^\", label=\"Iris virginica\")\n", "plt.plot(X[:, 0][yr==0], X[:, 1][yr==0], \"bs\", label=\"Not Iris virginica\")\n", "plot_svc_decision_boundary(svm_clf, 4, 6)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.title(\"MyLinearSVC\", fontsize=14)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.title(\"MyLinearSVC\")\n", "plt.axis([4, 6, 0.8, 2.8])\n", "plt.legend(loc=\"upper left\")\n", + "plt.grid()\n", "\n", "plt.sca(axes[1])\n", "plt.plot(X[:, 0][yr==1], X[:, 1][yr==1], \"g^\")\n", "plt.plot(X[:, 0][yr==0], X[:, 1][yr==0], \"bs\")\n", "plot_svc_decision_boundary(svm_clf2, 4, 6)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.title(\"SVC\", fontsize=14)\n", - "plt.axis([4, 6, 0.8, 2.8])\n" + "plt.xlabel(\"Petal length\")\n", + "plt.title(\"SVC\")\n", + "plt.axis([4, 6, 0.8, 2.8])\n", + "plt.grid()\n", + "\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 40, "metadata": { "scrolled": true }, @@ -1314,11 +1238,12 @@ "source": [ "from sklearn.linear_model import SGDClassifier\n", "\n", - "sgd_clf = SGDClassifier(loss=\"hinge\", alpha=0.017, max_iter=1000, tol=1e-3, random_state=42)\n", - "sgd_clf.fit(X, y.ravel())\n", + "sgd_clf = SGDClassifier(loss=\"hinge\", alpha=0.017, max_iter=1000, tol=1e-3,\n", + " random_state=42)\n", + "sgd_clf.fit(X, y)\n", "\n", "m = len(X)\n", - "t = y * 2 - 1 # -1 if t==0, +1 if t==1\n", + "t = np.array(y).reshape(-1, 1) * 2 - 1 # -1 if t==0, +1 if t==1\n", "X_b = np.c_[np.ones((m, 1)), X] # Add bias input x0=1\n", "X_b_t = X_b * t\n", "sgd_theta = np.r_[sgd_clf.intercept_[0], sgd_clf.coef_[0]]\n", @@ -1331,10 +1256,12 @@ "plt.plot(X[:, 0][yr==1], X[:, 1][yr==1], \"g^\")\n", "plt.plot(X[:, 0][yr==0], X[:, 1][yr==0], \"bs\")\n", "plot_svc_decision_boundary(sgd_clf, 4, 6)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.title(\"SGDClassifier\", fontsize=14)\n", - "plt.axis([4, 6, 0.8, 2.8])\n" + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.title(\"SGDClassifier\")\n", + "plt.axis([4, 6, 0.8, 2.8])\n", + "\n", + "plt.show()" ] }, { @@ -1381,15 +1308,15 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "from sklearn import datasets\n", "\n", - "iris = datasets.load_iris()\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = iris[\"target\"]\n", + "iris = datasets.load_iris(as_frame=True)\n", + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = iris.target\n", "\n", "setosa_or_versicolor = (y == 0) | (y == 1)\n", "X = 
X[setosa_or_versicolor]\n", @@ -1398,7 +1325,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1411,8 +1338,8 @@ "\n", "lin_clf = LinearSVC(loss=\"hinge\", C=C, random_state=42)\n", "svm_clf = SVC(kernel=\"linear\", C=C)\n", - "sgd_clf = SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001, alpha=alpha,\n", - " max_iter=1000, tol=1e-3, random_state=42)\n", + "sgd_clf = SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001,\n", + " alpha=alpha, max_iter=1000, tol=1e-3, random_state=42)\n", "\n", "scaler = StandardScaler()\n", "X_scaled = scaler.fit_transform(X)\n", @@ -1423,7 +1350,8 @@ "\n", "print(\"LinearSVC: \", lin_clf.intercept_, lin_clf.coef_)\n", "print(\"SVC: \", svm_clf.intercept_, svm_clf.coef_)\n", - "print(\"SGDClassifier(alpha={:.5f}):\".format(sgd_clf.alpha), sgd_clf.intercept_, sgd_clf.coef_)" + "print(f\"SGDClassifier(alpha={sgd_clf.alpha:.1e}):\",\n", + " sgd_clf.intercept_, sgd_clf.coef_)" ] }, { @@ -1435,17 +1363,17 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# Compute the slope and bias of each decision boundary\n", - "w1 = -lin_clf.coef_[0, 0]/lin_clf.coef_[0, 1]\n", - "b1 = -lin_clf.intercept_[0]/lin_clf.coef_[0, 1]\n", - "w2 = -svm_clf.coef_[0, 0]/svm_clf.coef_[0, 1]\n", - "b2 = -svm_clf.intercept_[0]/svm_clf.coef_[0, 1]\n", - "w3 = -sgd_clf.coef_[0, 0]/sgd_clf.coef_[0, 1]\n", - "b3 = -sgd_clf.intercept_[0]/sgd_clf.coef_[0, 1]\n", + "w1 = -lin_clf.coef_[0, 0] / lin_clf.coef_[0, 1]\n", + "b1 = -lin_clf.intercept_[0] / lin_clf.coef_[0, 1]\n", + "w2 = -svm_clf.coef_[0, 0] / svm_clf.coef_[0, 1]\n", + "b2 = -svm_clf.intercept_[0] / svm_clf.coef_[0, 1]\n", + "w3 = -sgd_clf.coef_[0, 0] / sgd_clf.coef_[0, 1]\n", + "b3 = -sgd_clf.intercept_[0] / sgd_clf.coef_[0, 1]\n", "\n", "# Transform the decision boundary lines back to the original scale\n", "line1 = scaler.inverse_transform([[-10, -10 * w1 + b1], [10, 10 * w1 + b1]])\n", @@ -1459,10 +1387,11 @@ "plt.plot(line3[:, 0], line3[:, 1], \"r-\", label=\"SGDClassifier\")\n", "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"bs\") # label=\"Iris versicolor\"\n", "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"yo\") # label=\"Iris setosa\"\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.legend(loc=\"upper center\", fontsize=14)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.legend(loc=\"upper center\")\n", "plt.axis([0, 5.5, 0, 2])\n", + "plt.grid()\n", "\n", "plt.show()" ] @@ -1485,21 +1414,46 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "_Exercise: train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?_" + "_Exercise: train an SVM classifier on the Wine dataset, which you can load using `sklearn.datasets.load_wine()`. This dataset contains the chemical analysis of 178 wine samples produced by 3 different cultivators: the goal is to train a classification model capable of predicting the cultivator based on the wine's chemical analysis. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 3 classes. You may want to tune the hyperparameters using small validation sets to speed up the process. 
What accuracy can you reach?_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's load the dataset and split it into a training set and a test set. We could use `train_test_split()` but people usually just take the first 60,000 instances for the training set, and the last 10,000 instances for the test set (this makes it possible to compare your model's performance with others): " + "First, let's fetch the dataset, look at its description, then split it into a training set and a test set:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 44, "metadata": {}, + "outputs": [], "source": [ - "**Warning:** since Scikit-Learn 0.24, `fetch_openml()` returns a Pandas `DataFrame` by default. To avoid this, we use `as_frame=False`." + "from sklearn.datasets import load_wine\n", + "\n", + "wine = load_wine(as_frame=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "print(wine.DESCR)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " wine.data, wine.target, random_state=42)" ] }, { @@ -1508,32 +1462,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.datasets import fetch_openml\n", - "mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)\n", - "\n", - "X = mnist[\"data\"]\n", - "y = mnist[\"target\"].astype(np.uint8)\n", - "\n", - "X_train = X[:60000]\n", - "y_train = y[:60000]\n", - "X_test = X[60000:]\n", - "y_test = y[60000:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first. However, the dataset is already shuffled, so we do not need to do it." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!\n", - "\n", - "**Warning**: this may take a few minutes depending on your hardware." + "X_train.head()" ] }, { @@ -1541,6 +1470,22 @@ "execution_count": 48, "metadata": {}, "outputs": [], + "source": [ + "y_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do to handle multiple classes. Easy, right?" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", "lin_clf.fit(X_train, y_train)" @@ -1550,26 +1495,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's make predictions on the training set and measure the accuracy (we don't want to measure it on the test set yet, since we have not selected and trained the final model yet):" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "\n", - "y_pred = lin_clf.predict(X_train)\n", - "accuracy_score(y_train, y_pred)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Okay, 89.5% accuracy on MNIST is pretty bad. 
This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" + "Oh no! It failed to converge. Can you guess why? Do you think we must just increase the number of training iterations? Let's see:" ] }, { @@ -1578,16 +1504,17 @@ "metadata": {}, "outputs": [], "source": [ - "scaler = StandardScaler()\n", - "X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))\n", - "X_test_scaled = scaler.transform(X_test.astype(np.float32))" + "lin_clf = LinearSVC(max_iter=1_000_000, random_state=42)\n", + "lin_clf.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: this may take a few minutes depending on your hardware." + "Even with one million iterations, it still did not converge. There must be another problem.\n", + "\n", + "Let's still evaluate this model with `cross_val_score`, it will serve as a baseline:" ] }, { @@ -1596,8 +1523,18 @@ "metadata": {}, "outputs": [], "source": [ - "lin_clf = LinearSVC(random_state=42)\n", - "lin_clf.fit(X_train_scaled, y_train)" + "from sklearn.model_selection import cross_val_score\n", + "\n", + "cross_val_score(lin_clf, X_train, y_train).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Well 91% accuracy on this dataset is not great. So did you guess what the problem is?\n", + "\n", + "That's right, we forgot to scale the features! Always remember to scale the features when using SVMs:" ] }, { @@ -1606,22 +1543,16 @@ "metadata": {}, "outputs": [], "source": [ - "y_pred = lin_clf.predict(X_train_scaled)\n", - "accuracy_score(y_train, y_pred)" + "lin_clf = make_pipeline(StandardScaler(),\n", + " LinearSVC(random_state=42))\n", + "lin_clf.fit(X_train, y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's much better (we cut the error rate by about 25%), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. Let's try an `SVC` with an RBF kernel (the default)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: to be future-proof we set `gamma=\"scale\"` since it will be the default value in Scikit-Learn 0.22." + "Now it converges without any problem. Let's measure its performance:" ] }, { @@ -1630,8 +1561,23 @@ "metadata": {}, "outputs": [], "source": [ - "svm_clf = SVC(gamma=\"scale\")\n", - "svm_clf.fit(X_train_scaled[:10000], y_train[:10000])" + "from sklearn.model_selection import cross_val_score\n", + "\n", + "cross_val_score(lin_clf, X_train, y_train).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice! We get 97.7% accuracy, that's much better." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see if a kernelized SVM will do better. We will use a default `SVC` for now:" ] }, { @@ -1640,15 +1586,15 @@ "metadata": {}, "outputs": [], "source": [ - "y_pred = svm_clf.predict(X_train_scaled)\n", - "accuracy_score(y_train, y_pred)" + "svm_clf = make_pipeline(StandardScaler(), SVC(random_state=42))\n", + "cross_val_score(svm_clf, X_train, y_train).mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's promising, we get better performance even though we trained the model on 6 times less data. Let's tune the hyperparameters by doing a randomized search with cross validation. 
We will do this on a small dataset just to speed up the process:" + "That's not better, but perhaps we need to do a bit of hyperparameter tuning:" ] }, { @@ -1660,23 +1606,19 @@ "from sklearn.model_selection import RandomizedSearchCV\n", "from scipy.stats import reciprocal, uniform\n", "\n", - "param_distributions = {\"gamma\": reciprocal(0.001, 0.1), \"C\": uniform(1, 10)}\n", - "rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2, cv=3)\n", - "rnd_search_cv.fit(X_train_scaled[:1000], y_train[:1000])" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ + "param_distrib = {\n", + " \"svc__gamma\": reciprocal(0.001, 0.1),\n", + " \"svc__C\": uniform(1, 10)\n", + "}\n", + "rnd_search_cv = RandomizedSearchCV(svm_clf, param_distrib, n_iter=100, cv=5,\n", + " random_state=42)\n", + "rnd_search_cv.fit(X_train, y_train)\n", "rnd_search_cv.best_estimator_" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1687,57 +1629,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This looks pretty low but remember we only trained the model on 1,000 instances. Let's retrain the best estimator on the whole training set:" + "Ah, this looks excellent! Let's select this model. Now we can test it on the test set:" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "rnd_search_cv.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the following cell may take hours to run, depending on your hardware." - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", - "accuracy_score(y_train, y_pred)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ah, this looks good! Let's select this model. Now we can test it on the test set:" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", - "accuracy_score(y_test, y_pred)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Not too bad, but apparently the model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing `C` and/or `gamma`), but we would run the risk of overfitting the test set. Other people have found that the hyperparameters `C=5` and `gamma=0.005` yield even better performance (over 98% accuracy). By running the randomized search for longer and on a larger part of the training set, you may be able to find this as well." + "This tuned kernelized SVM performs better than the `LinearSVC` model, but we get a lower score on the test set than we measured using cross-validation. This is quite common: since we did so much hyperparameter tuning, we ended up slightly overfitting the cross-validation test sets. It's tempting to tweak the hyperparameters a bit more until we get a better result on the test set, but we this would probably not help, as we would just start overfitting the test set. Anyway, this score is not bad at all, so let's stop here." 
] }, {
@@ -1751,27 +1659,27 @@
"cell_type": "markdown", "metadata": {}, "source": [
- "_Exercise: train an SVM regressor on the California housing dataset._"
+ "_Exercise: Train and fine-tune an SVM regressor on the California housing dataset. You can use the original dataset rather than the tweaked version we used in Chapter 2. The original dataset can be fetched using `sklearn.datasets.fetch_california_housing()`. Since there are over 20,000 instances, SVMs can be slow, so for hyperparameter tuning you should use far fewer instances (e.g., 2,000) so that you can test many more hyperparameter combinations._"
] }, {
"cell_type": "markdown", "metadata": {}, "source": [
- "Let's load the dataset using Scikit-Learn's `fetch_california_housing()` function:"
+ "Let's load the dataset:"
] }, {
"cell_type": "code", - "execution_count": 61, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [
"from sklearn.datasets import fetch_california_housing\n",
"\n",
"housing = fetch_california_housing()\n",
- "X = housing[\"data\"]\n",
- "y = housing[\"target\"]"
+ "X = housing.data\n",
+ "y = housing.target"
] }, {
@@ -1783,7 +1691,7 @@
}, {
"cell_type": "code", - "execution_count": 62, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [
@@ -1799,19 +1707,6 @@
"Don't forget to scale the data:"
] },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.preprocessing import StandardScaler\n",
- "\n",
- "scaler = StandardScaler()\n",
- "X_train_scaled = scaler.fit_transform(X_train)\n",
- "X_test_scaled = scaler.transform(X_test)"
- ]
- },
{
"cell_type": "markdown", "metadata": {}, "source": [
@@ -1821,14 +1716,32 @@
}, {
"cell_type": "code", - "execution_count": 64, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [
"from sklearn.svm import LinearSVR\n",
"\n",
- "lin_svr = LinearSVR(random_state=42)\n",
- "lin_svr.fit(X_train_scaled, y_train)"
+ "lin_svr = make_pipeline(StandardScaler(), LinearSVR(random_state=42))\n",
+ "lin_svr.fit(X_train, y_train)"
] }, {
"cell_type": "markdown", "metadata": {}, "source": [
+ "It did not converge, so let's increase `max_iter`:"
] }, {
"cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [
+ "lin_svr = make_pipeline(StandardScaler(),\n",
+ "                        LinearSVR(max_iter=5000, random_state=42))\n",
+ "lin_svr.fit(X_train, y_train)"
] }, {
@@ -1840,13 +1753,13 @@
}, {
"cell_type": "code", - "execution_count": 65, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
- "y_pred = lin_svr.predict(X_train_scaled)\n",
+ "y_pred = lin_svr.predict(X_train)\n",
"mse = mean_squared_error(y_train, y_pred)\n",
"mse"
] }, {
@@ -1860,7 +1773,7 @@
}, {
"cell_type": "code", - "execution_count": 66, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [
"np.sqrt(mse)"
] },
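A quick aside (not from the notebook) that may make the RMSE easier to read: the California housing targets are expressed in units of $100,000, so the RMSE can be converted directly into dollars. The sketch assumes the `lin_svr` pipeline and the training split from the cells above.

```python
# Not from the notebook – convert the training RMSE into dollars.
# Assumes lin_svr, X_train and y_train are defined; targets are in units of $100,000.
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_train, lin_svr.predict(X_train), squared=False)
print(f"Training RMSE ≈ ${rmse * 100_000:,.0f}")
```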
{ "cell_type": "markdown", "metadata": {}, "source": [
- "In this training set, the targets are tens of thousands of dollars. The RMSE gives a rough idea of the kind of error you should expect (with a higher weight for large errors): so with this model we can expect errors somewhere around $10,000. Not great. Let's see if we can do better with an RBF Kernel. We will use randomized search with cross validation to find the appropriate hyperparameter values for `C` and `gamma`:"
+ "In this dataset, the targets represent hundreds of thousands of dollars. The RMSE gives a rough idea of the kind of error you should expect (with a higher weight for large errors): so with this model we can expect errors close to $98,000! Not great. Let's see if we can do better with an RBF kernel. We will use randomized search with cross validation to find the appropriate hyperparameter values for `C` and `gamma`:"
] }, {
"cell_type": "code", - "execution_count": 67, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [
"from sklearn.svm import SVR\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from scipy.stats import reciprocal, uniform\n",
"\n",
- "param_distributions = {\"gamma\": reciprocal(0.001, 0.1), \"C\": uniform(1, 10)}\n",
- "rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, cv=3, random_state=42)\n",
- "rnd_search_cv.fit(X_train_scaled, y_train)"
+ "svm_clf = make_pipeline(StandardScaler(), SVR())\n",
+ "\n",
+ "param_distrib = {\n",
+ "    \"svr__gamma\": reciprocal(0.001, 0.1),\n",
+ "    \"svr__C\": uniform(1, 10)\n",
+ "}\n",
+ "rnd_search_cv = RandomizedSearchCV(svm_clf, param_distrib,\n",
+ "                                   n_iter=100, cv=3, random_state=42)\n",
+ "rnd_search_cv.fit(X_train[:2000], y_train[:2000])"
] }, {
"cell_type": "code", - "execution_count": 68, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [
"rnd_search_cv.best_estimator_"
] },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now let's measure the RMSE on the training set:"
- ]
- },
{
"cell_type": "code", - "execution_count": 69, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [
- "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n",
- "mse = mean_squared_error(y_train, y_pred)\n",
- "np.sqrt(mse)"
+ "-cross_val_score(rnd_search_cv.best_estimator_, X_train, y_train,\n",
+ "                 scoring=\"neg_root_mean_squared_error\")"
] }, {
@@ -1925,13 +1836,27 @@
}, {
"cell_type": "code", - "execution_count": 70, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [
- "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n",
- "mse = mean_squared_error(y_test, y_pred)\n",
- "np.sqrt(mse)"
+ "y_pred = rnd_search_cv.best_estimator_.predict(X_test)\n",
+ "rmse = mean_squared_error(y_test, y_pred, squared=False)\n",
+ "rmse"
] }, {
"cell_type": "markdown", "metadata": {}, "source": [
+ "So SVMs worked very well on the Wine dataset, but not so much on the California Housing dataset. In Chapter 2, we found that Random Forests worked better for that dataset."
] }, {
"cell_type": "markdown", "metadata": {}, "source": [
+ "And that's all for today!"
] }, {
@@ -1944,7 +1869,7 @@
], "metadata": { "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python", "name": "python3"
},