Big update of chapter 5 for 3rd edition
parent 608d9ce815
commit 8e97aab84b
@@ -125,6 +125,13 @@
 "# Linear SVM Classification"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"The book starts with a few figures, before the first code example, so the next three cells generate and save these figures. You can skip them if you want."
+]
+},
 {
 "cell_type": "code",
 "execution_count": 5,
@@ -168,12 +175,13 @@
 "    margin = 1/w[1]\n",
 "    gutter_up = decision_boundary + margin\n",
 "    gutter_down = decision_boundary - margin\n",
 "\n",
 "    svs = svm_clf.support_vectors_\n",
-"    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#FFAAAA')\n",
-"    plt.plot(x0, decision_boundary, \"k-\", linewidth=2)\n",
-"    plt.plot(x0, gutter_up, \"k--\", linewidth=2)\n",
-"    plt.plot(x0, gutter_down, \"k--\", linewidth=2)\n",
-"\n",
+"    plt.plot(x0, decision_boundary, \"k-\", linewidth=2, zorder=-2)\n",
+"    plt.plot(x0, gutter_up, \"k--\", linewidth=2, zorder=-2)\n",
+"    plt.plot(x0, gutter_down, \"k--\", linewidth=2, zorder=-2)\n",
+"    plt.scatter(svs[:, 0], svs[:, 1], s=180, facecolors='#AAA',\n",
+"                zorder=-1)\n",
+"\n",
 "fig, axes = plt.subplots(ncols=2, figsize=(10,2.7), sharey=True)\n",
 "\n",
@@ -187,6 +195,7 @@
 "plt.ylabel(\"Petal width\")\n",
 "plt.legend(loc=\"upper left\")\n",
 "plt.axis([0, 5.5, 0, 2])\n",
+"plt.gca().set_aspect(\"equal\")\n",
 "plt.grid()\n",
 "\n",
 "plt.sca(axes[1])\n",
@@ -195,6 +204,7 @@
 "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"yo\")\n",
 "plt.xlabel(\"Petal length\")\n",
 "plt.axis([0, 5.5, 0, 2])\n",
+"plt.gca().set_aspect(\"equal\")\n",
 "plt.grid()\n",
 "\n",
 "save_fig(\"large_margin_classification_plot\")\n",
@@ -320,17 +330,17 @@
 "outputs": [],
 "source": [
 "import numpy as np\n",
-"from sklearn import datasets\n",
+"from sklearn.datasets import load_iris\n",
 "from sklearn.pipeline import make_pipeline\n",
 "from sklearn.preprocessing import StandardScaler\n",
 "from sklearn.svm import LinearSVC\n",
 "\n",
-"iris = datasets.load_iris(as_frame=True)\n",
+"iris = load_iris(as_frame=True)\n",
 "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n",
 "y = (iris.target == 2)  # Iris virginica\n",
 "\n",
 "svm_clf = make_pipeline(StandardScaler(),\n",
-"                        LinearSVC(C=1, loss=\"hinge\", random_state=42))\n",
+"                        LinearSVC(C=1, random_state=42))\n",
 "svm_clf.fit(X, y)"
 ]
 },
@@ -340,7 +350,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"svm_clf.predict([[5.5, 1.7]])"
+"X_new = [[5.5, 1.7], [5.0, 1.5]]\n",
+"svm_clf.predict(X_new)"
 ]
 },
 {
@@ -348,12 +359,21 @@
 "execution_count": 10,
 "metadata": {},
 "outputs": [],
 "source": [
+"svm_clf.decision_function(X_new)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 11,
+"metadata": {},
+"outputs": [],
+"source": [
+"# not in the book – this cell generates and saves Figure 5–4\n",
+"\n",
 "scaler = StandardScaler()\n",
-"svm_clf1 = LinearSVC(C=1, loss=\"hinge\", max_iter=10_000, random_state=42)\n",
-"svm_clf2 = LinearSVC(C=100, loss=\"hinge\", max_iter=10_000, random_state=42)\n",
+"svm_clf1 = LinearSVC(C=1, max_iter=10_000, random_state=42)\n",
+"svm_clf2 = LinearSVC(C=100, max_iter=10_000, random_state=42)\n",
 "\n",
 "scaled_svm_clf1 = make_pipeline(scaler, svm_clf1)\n",
 "scaled_svm_clf2 = make_pipeline(scaler, svm_clf2)\n",
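The updated cells above assemble into a short standalone script. This is a sketch only: the samples and hyperparameters are taken directly from the new code, but the output comments are illustrative, not copied from the notebook's outputs.

```python
from sklearn.datasets import load_iris
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

iris = load_iris(as_frame=True)
X = iris.data[["petal length (cm)", "petal width (cm)"]].values
y = (iris.target == 2)  # Iris virginica

# Scaling matters for SVMs, hence the StandardScaler step in the pipeline.
svm_clf = make_pipeline(StandardScaler(),
                        LinearSVC(C=1, random_state=42))
svm_clf.fit(X, y)

X_new = [[5.5, 1.7], [5.0, 1.5]]
print(svm_clf.predict(X_new))            # class predictions (True = virginica)
print(svm_clf.decision_function(X_new))  # signed scores; the sign decides the class
```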
@@ -387,7 +407,7 @@
 "plt.xlabel(\"Petal length\")\n",
 "plt.ylabel(\"Petal width\")\n",
 "plt.legend(loc=\"upper left\")\n",
-"plt.title(\"$C = {}$\".format(svm_clf1.C))\n",
+"plt.title(f\"$C = {svm_clf1.C}$\")\n",
 "plt.axis([4, 5.9, 0.8, 2.8])\n",
 "plt.grid()\n",
 "\n",
@@ -396,7 +416,7 @@
 "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n",
 "plot_svc_decision_boundary(svm_clf2, 4, 5.99)\n",
 "plt.xlabel(\"Petal length\")\n",
-"plt.title(\"$C = {}$\".format(svm_clf2.C))\n",
+"plt.title(f\"$C = {svm_clf2.C}$\")\n",
 "plt.axis([4, 5.9, 0.8, 2.8])\n",
 "plt.grid()\n",
 "\n",
@@ -413,7 +433,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -461,12 +481,11 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 13,
 "metadata": {},
 "outputs": [],
 "source": [
 "from sklearn.datasets import make_moons\n",
-"from sklearn.pipeline import make_pipeline\n",
 "from sklearn.preprocessing import PolynomialFeatures\n",
 "\n",
 "X, y = make_moons(n_samples=100, noise=0.15, random_state=42)\n",
@@ -474,14 +493,14 @@
 "polynomial_svm_clf = make_pipeline(\n",
 "    PolynomialFeatures(degree=3),\n",
 "    StandardScaler(),\n",
-"    LinearSVC(C=10, loss=\"hinge\", max_iter=10_000, random_state=42)\n",
+"    LinearSVC(C=10, max_iter=10_000, random_state=42)\n",
 ")\n",
 "polynomial_svm_clf.fit(X, y)"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 13,
+"execution_count": 14,
 "metadata": {},
 "outputs": [],
 "source": [
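Note that the new code drops `loss="hinge"` here too, falling back to `LinearSVC`'s default squared hinge loss. A minimal standalone sketch of the updated pipeline, with a quick sanity check added (the printed accuracy is not taken from the notebook):

```python
from sklearn.datasets import make_moons
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC

X, y = make_moons(n_samples=100, noise=0.15, random_state=42)

polynomial_svm_clf = make_pipeline(
    PolynomialFeatures(degree=3),  # adds all feature products up to degree 3
    StandardScaler(),
    LinearSVC(C=10, max_iter=10_000, random_state=42)
)
polynomial_svm_clf.fit(X, y)
print(polynomial_svm_clf.score(X, y))  # training accuracy as a sanity check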
@@ -528,7 +547,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 15,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -543,7 +562,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": 16,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -581,7 +600,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 16,
+"execution_count": 17,
 "metadata": {
 "scrolled": true
 },
@@ -651,18 +670,6 @@
 "plt.show()"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": 17,
-"metadata": {},
-"outputs": [],
-"source": [
-"x1_example = X1D[3, 0]\n",
-"for landmark in (-2, 1):\n",
-"    k = gaussian_rbf(np.array([[x1_example]]), np.array([[landmark]]), gamma)\n",
-"    print(\"Phi({}, {}) = {}\".format(x1_example, landmark, k))"
-]
-},
 {
 "cell_type": "markdown",
 "metadata": {},
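The removed cell printed the Gaussian RBF similarity of one sample to two landmarks. A hedged stand-in sketch, since `gaussian_rbf`, `gamma`, and `X1D` are defined in cells not shown in this diff (the values below are assumptions, and the function uses the standard formula exp(-γ‖x − landmark‖²)):

```python
import numpy as np

# Stand-in for the notebook's gaussian_rbf (standard Gaussian RBF similarity).
def gaussian_rbf(X, landmark, gamma):
    return np.exp(-gamma * np.linalg.norm(X - landmark, axis=1) ** 2)

gamma = 0.3        # assumed value; defined earlier in the notebook
x1_example = -1.0  # assumed stand-in for X1D[3, 0]
for landmark in (-2, 1):
    k = gaussian_rbf(np.array([[x1_example]]), np.array([[landmark]]), gamma)
    print(f"Phi({x1_example}, {landmark}) = {k}")
```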
@@ -722,7 +729,7 @@
 "    plot_predictions(svm_clf, [-1.5, 2.45, -1, 1.5])\n",
 "    plot_dataset(X, y, [-1.5, 2.45, -1, 1.5])\n",
 "    gamma, C = hyperparams[i]\n",
-"    plt.title(r\"$\\gamma = {}, C = {}$\".format(gamma, C))\n",
+"    plt.title(fr\"$\\gamma = {gamma}, C = {C}$\")\n",
 "    if i in (0, 1):\n",
 "        plt.xlabel(\"\")\n",
 "    if i in (1, 3):\n",
@@ -745,6 +752,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# not in the book – this code generates a simple linear dataset\n",
 "np.random.seed(42)\n",
 "m = 50\n",
 "X = 2 * np.random.rand(m, 1)\n",
@@ -766,7 +774,8 @@
 "source": [
 "from sklearn.svm import LinearSVR\n",
 "\n",
-"svm_reg = LinearSVR(epsilon=1.5, random_state=42)\n",
+"svm_reg = make_pipeline(StandardScaler(),\n",
+"                        LinearSVR(epsilon=0.5, random_state=42))\n",
 "svm_reg.fit(X, y)"
 ]
 },
@@ -780,47 +789,49 @@
 "\n",
 "def find_support_vectors(svm_reg, X, y):\n",
 "    y_pred = svm_reg.predict(X)\n",
-"    off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)\n",
+"    epsilon = svm_reg[-1].epsilon\n",
+"    off_margin = np.abs(y - y_pred) >= epsilon\n",
 "    return np.argwhere(off_margin)\n",
 "\n",
 "def plot_svm_regression(svm_reg, X, y, axes):\n",
 "    x1s = np.linspace(axes[0], axes[1], 100).reshape(100, 1)\n",
 "    y_pred = svm_reg.predict(x1s)\n",
-"    plt.plot(x1s, y_pred, \"k-\", linewidth=2, label=r\"$\\hat{y}$\")\n",
-"    plt.plot(x1s, y_pred + svm_reg.epsilon, \"k--\")\n",
-"    plt.plot(x1s, y_pred - svm_reg.epsilon, \"k--\")\n",
-"    plt.scatter(X[svm_reg.support_], y[svm_reg.support_], s=180, facecolors='#FFAAAA')\n",
+"    epsilon = svm_reg[-1].epsilon\n",
+"    plt.plot(x1s, y_pred, \"k-\", linewidth=2, label=r\"$\\hat{y}$\", zorder=-2)\n",
+"    plt.plot(x1s, y_pred + epsilon, \"k--\", zorder=-2)\n",
+"    plt.plot(x1s, y_pred - epsilon, \"k--\", zorder=-2)\n",
+"    plt.scatter(X[svm_reg._support], y[svm_reg._support], s=180,\n",
+"                facecolors='#AAA', zorder=-1)\n",
 "    plt.plot(X, y, \"bo\")\n",
 "    plt.xlabel(r\"$x_1$\")\n",
 "    plt.legend(loc=\"upper left\")\n",
 "    plt.axis(axes)\n",
 "\n",
-"svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n",
-"svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)\n",
-"svm_reg1.fit(X, y)\n",
+"svm_reg2 = make_pipeline(StandardScaler(),\n",
+"                         LinearSVR(epsilon=1.2, random_state=42))\n",
 "svm_reg2.fit(X, y)\n",
 "\n",
-"svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)\n",
-"svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)\n",
+"svm_reg._support = find_support_vectors(svm_reg, X, y)\n",
+"svm_reg2._support = find_support_vectors(svm_reg2, X, y)\n",
 "\n",
 "eps_x1 = 1\n",
-"eps_y_pred = svm_reg1.predict([[eps_x1]])\n",
+"eps_y_pred = svm_reg2.predict([[eps_x1]])\n",
 "\n",
 "fig, axes = plt.subplots(ncols=2, figsize=(9, 4), sharey=True)\n",
 "plt.sca(axes[0])\n",
-"plot_svm_regression(svm_reg1, X, y, [0, 2, 3, 11])\n",
-"plt.title(r\"$\\epsilon = {}$\".format(svm_reg1.epsilon))\n",
+"plot_svm_regression(svm_reg, X, y, [0, 2, 3, 11])\n",
+"plt.title(fr\"$\\epsilon = {svm_reg[-1].epsilon}$\")\n",
 "plt.ylabel(r\"$y$\", rotation=0)\n",
 "plt.grid()\n",
-"plt.annotate(\n",
-"    '', xy=(eps_x1, eps_y_pred), xycoords='data',\n",
-"    xytext=(eps_x1, eps_y_pred - svm_reg1.epsilon),\n",
-"    textcoords='data', arrowprops={'arrowstyle': '<->', 'linewidth': 1.5}\n",
-"    )\n",
-"plt.text(0.91, 5.6, r\"$\\epsilon$\", fontsize=16)\n",
 "plt.sca(axes[1])\n",
 "plot_svm_regression(svm_reg2, X, y, [0, 2, 3, 11])\n",
-"plt.title(r\"$\\epsilon = {}$\".format(svm_reg2.epsilon))\n",
+"plt.title(fr\"$\\epsilon = {svm_reg2[-1].epsilon}$\")\n",
+"plt.annotate(\n",
+"    '', xy=(eps_x1, eps_y_pred), xycoords='data',\n",
+"    xytext=(eps_x1, eps_y_pred - svm_reg2[-1].epsilon),\n",
+"    textcoords='data', arrowprops={'arrowstyle': '<->', 'linewidth': 1.5}\n",
+"    )\n",
+"plt.text(0.90, 5.4, r\"$\\epsilon$\", fontsize=16)\n",
+"plt.grid()\n",
 "save_fig(\"svm_regression_plot\")\n",
 "plt.show()"
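Why the code now reads `svm_reg[-1].epsilon`: `svm_reg` is a `Pipeline`, and indexing a pipeline returns one of its steps, so `[-1]` is the final estimator (the `LinearSVR` itself). A minimal sketch on made-up data (the dataset here is an assumption, not the notebook's):

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR

np.random.seed(42)
X = 2 * np.random.rand(50, 1)          # made-up linear data for illustration
y = (4 + 3 * X[:, 0] + np.random.randn(50))

svm_reg = make_pipeline(StandardScaler(),
                        LinearSVR(epsilon=0.5, random_state=42)).fit(X, y)
print(svm_reg[0])           # StandardScaler() – the first pipeline step
print(svm_reg[-1].epsilon)  # 0.5 – hyperparameter of the final step
```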
@@ -832,10 +843,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# not in the book – this code generates a simple quadratic dataset\n",
 "np.random.seed(42)\n",
-"m = 100\n",
+"m = 50\n",
 "X = 2 * np.random.rand(m, 1) - 1\n",
-"y = (0.2 + 0.1 * X + 0.5 * X**2 + np.random.randn(m, 1)/10).ravel()"
+"y = (0.2 + 0.1 * X + 0.5 * X ** 2 + np.random.randn(m, 1) / 10).ravel()"
 ]
 },
 {
@@ -853,7 +865,8 @@
 "source": [
 "from sklearn.svm import SVR\n",
 "\n",
-"svm_poly_reg = SVR(kernel=\"poly\", degree=2, C=100)\n",
+"svm_poly_reg = make_pipeline(StandardScaler(),\n",
+"                             SVR(kernel=\"poly\", degree=2, C=0.01, epsilon=0.1))\n",
 "svm_poly_reg.fit(X, y)"
 ]
 },
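Assembled with the quadratic dataset from the "not in the book" cell above, the updated cell becomes the following standalone sketch. Note the regularization change: `C` went from 100 down to 0.01, i.e., a much more strongly regularized model.

```python
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

np.random.seed(42)
m = 50
X = 2 * np.random.rand(m, 1) - 1
y = (0.2 + 0.1 * X + 0.5 * X ** 2 + np.random.randn(m, 1) / 10).ravel()

# Kernelized SVR: the poly kernel fits a quadratic without expanding features.
svm_poly_reg = make_pipeline(StandardScaler(),
                             SVR(kernel="poly", degree=2, C=0.01, epsilon=0.1))
svm_poly_reg.fit(X, y)
```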
@@ -862,35 +875,30 @@
 "execution_count": 25,
 "metadata": {},
 "outputs": [],
 "source": [
-"svm_poly_reg1 = SVR(kernel=\"poly\", degree=2, C=100)\n",
-"svm_poly_reg2 = SVR(kernel=\"poly\", degree=2, C=0.01)\n",
-"svm_poly_reg1.fit(X, y)\n",
-"svm_poly_reg2.fit(X, y)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 26,
-"metadata": {},
-"outputs": [],
-"source": [
+"# not in the book – this cell generates and saves Figure 5–11\n",
+"\n",
+"svm_poly_reg2 = make_pipeline(StandardScaler(),\n",
+"                              SVR(kernel=\"poly\", degree=2, C=100))\n",
+"svm_poly_reg2.fit(X, y)\n",
+"\n",
+"svm_poly_reg._support = find_support_vectors(svm_poly_reg, X, y)\n",
+"svm_poly_reg2._support = find_support_vectors(svm_poly_reg2, X, y)\n",
+"\n",
 "fig, axes = plt.subplots(ncols=2, figsize=(9, 4), sharey=True)\n",
 "plt.sca(axes[0])\n",
-"plot_svm_regression(svm_poly_reg1, X, y, [-1, 1, 0, 1])\n",
-"plt.title(f\"$degree={svm_poly_reg1.degree}, \"\n",
-"          f\"C={svm_poly_reg1.C}, \"\n",
-"          f\"\\\\epsilon={svm_poly_reg1.epsilon}$\")\n",
+"plot_svm_regression(svm_poly_reg, X, y, [-1, 1, 0, 1])\n",
+"plt.title(f\"$degree={svm_poly_reg[-1].degree}, \"\n",
+"          f\"C={svm_poly_reg[-1].C}, \"\n",
+"          fr\"\\epsilon={svm_poly_reg[-1].epsilon}$\")\n",
 "plt.ylabel(r\"$y$\", rotation=0)\n",
 "plt.grid()\n",
 "\n",
 "plt.sca(axes[1])\n",
 "plot_svm_regression(svm_poly_reg2, X, y, [-1, 1, 0, 1])\n",
-"plt.title(f\"$degree={svm_poly_reg2.degree}, \"\n",
-"          f\"C={svm_poly_reg2.C}, \"\n",
-"          f\"\\\\epsilon={svm_poly_reg2.epsilon}$\")\n",
+"plt.title(f\"$degree={svm_poly_reg2[-1].degree}, \"\n",
+"          f\"C={svm_poly_reg2[-1].C}, \"\n",
+"          fr\"\\epsilon={svm_poly_reg2[-1].epsilon}$\")\n",
 "plt.grid()\n",
 "save_fig(\"svm_with_polynomial_kernel_plot\")\n",
 "plt.show()"
@@ -900,90 +908,46 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Extra Material – Under the Hood"
+"# Under the hood"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 27,
+"execution_count": 26,
 "metadata": {},
 "outputs": [],
 "source": [
 "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n",
 "y = (iris.target == 2)"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 28,
 "metadata": {},
 "outputs": [],
 "source": [
-"from mpl_toolkits.mplot3d import Axes3D\n",
+"# not in the book – this cell generates and saves Figure 5–12\n",
 "\n",
-"def plot_3D_decision_function(ax, w, b, x1_lim=[4, 6], x2_lim=[0.8, 2.8]):\n",
-"    x1_in_bounds = (X[:, 0] > x1_lim[0]) & (X[:, 0] < x1_lim[1])\n",
-"    X_crop = X[x1_in_bounds]\n",
-"    y_crop = y[x1_in_bounds]\n",
-"    x1s = np.linspace(x1_lim[0], x1_lim[1], 20)\n",
-"    x2s = np.linspace(x2_lim[0], x2_lim[1], 20)\n",
-"    x1, x2 = np.meshgrid(x1s, x2s)\n",
-"    xs = np.c_[x1.ravel(), x2.ravel()]\n",
-"    df = (xs.dot(w) + b).reshape(x1.shape)\n",
-"    m = 1 / np.linalg.norm(w)\n",
-"    boundary_x2s = -x1s * (w[0] / w[1]) - b / w[1]\n",
-"    margin_x2s_1 = -x1s * (w[0] / w[1]) - (b - 1) / w[1]\n",
-"    margin_x2s_2 = -x1s * (w[0] / w[1]) - (b + 1) / w[1]\n",
-"    ax.plot_surface(x1s, x2, np.zeros_like(x1),\n",
-"                    color=\"b\", alpha=0.2, cstride=100, rstride=100)\n",
-"    ax.plot(x1s, boundary_x2s, 0, \"k-\", linewidth=2, label=r\"$h=0$\")\n",
-"    ax.plot(x1s, margin_x2s_1, 0, \"k--\", linewidth=2, label=r\"$h=\\pm 1$\")\n",
-"    ax.plot(x1s, margin_x2s_2, 0, \"k--\", linewidth=2)\n",
-"    ax.plot(X_crop[:, 0][y_crop==1], X_crop[:, 1][y_crop==1], 0, \"g^\")\n",
-"    ax.plot_wireframe(x1, x2, df, alpha=0.3, color=\"k\")\n",
-"    ax.plot(X_crop[:, 0][y_crop==0], X_crop[:, 1][y_crop==0], 0, \"bs\")\n",
-"    ax.axis(x1_lim + x2_lim)\n",
-"    ax.text(4.5, 2.5, 3.8, \"Decision function $h$\", fontsize=14)\n",
-"    ax.set_xlabel(r\"Petal length\", labelpad=10)\n",
-"    ax.set_ylabel(r\"Petal width\", labelpad=10)\n",
-"    ax.set_zlabel(r\"$h = \\mathbf{w}^T \\mathbf{x} + b$\", labelpad=5)\n",
-"    ax.legend(loc=\"upper left\")\n",
-"\n",
-"fig = plt.figure(figsize=(11, 6))\n",
-"ax1 = fig.add_subplot(111, projection='3d')\n",
-"plot_3D_decision_function(ax1, w=svm_clf2.coef_[0], b=svm_clf2.intercept_[0])\n",
-"\n",
-"save_fig(\"iris_3D_plot\")\n",
-"plt.show()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 29,
-"metadata": {},
-"outputs": [],
-"source": [
 "import matplotlib.patches as patches\n",
 "\n",
 "def plot_2D_decision_function(w, b, ylabel=True, x1_lim=[-3, 3]):\n",
 "    x1 = np.linspace(x1_lim[0], x1_lim[1], 200)\n",
 "    y = w * x1 + b\n",
-"    m = 1 / w\n",
+"    half_margin = 1 / w\n",
 "\n",
-"    plt.plot(x1, y)\n",
-"    plt.axhline(y=0, color='k')\n",
-"    plt.axvline(x=0, color='k')\n",
-"    rect = patches.Rectangle((-3, -1), 6, 2, edgecolor='none', facecolor='blue',\n",
-"                             alpha=0.1)\n",
+"    plt.plot(x1, y, \"b-\", linewidth=2, label=r\"$s = w_1 x_1$\")\n",
+"    plt.axhline(y=0, color='k', linewidth=1)\n",
+"    plt.axvline(x=0, color='k', linewidth=1)\n",
+"    rect = patches.Rectangle((-half_margin, -2), 2 * half_margin, 4,\n",
+"                             edgecolor='none', facecolor='gray', alpha=0.2)\n",
 "    plt.gca().add_patch(rect)\n",
-"    plt.plot([m, m], [0, 1], \"b--\")\n",
-"    plt.plot([-m, -m], [0, -1], \"b--\")\n",
-"    plt.plot([-m, m], [0, 0], \"k-o\", linewidth=3)\n",
 "    plt.plot([-3, 3], [1, 1], \"k--\", linewidth=1)\n",
 "    plt.plot([-3, 3], [-1, -1], \"k--\", linewidth=1)\n",
+"    plt.plot(half_margin, 1, \"k.\")\n",
+"    plt.plot(-half_margin, -1, \"k.\")\n",
 "    plt.axis(x1_lim + [-2, 2])\n",
 "    plt.xlabel(r\"$x_1$\")\n",
 "    if ylabel:\n",
-"        plt.ylabel(r\"$w_1 x_1$  \", rotation=0)\n",
-"    plt.title(r\"$w_1 = {}$\".format(w))\n",
+"        plt.ylabel(\"$s$\", rotation=0, labelpad=5)\n",
+"        plt.legend()\n",
+"    plt.text(1.02, -1.6, \"Margin\", ha=\"left\", va=\"center\",\n",
+"             color=\"k\", fontsize=14)\n",
+"    plt.annotate(\n",
+"        '', xy=(-half_margin, -1.6), xytext=(half_margin, -1.6),\n",
+"        arrowprops={'ec': 'k', 'arrowstyle': '<->', 'linewidth': 1.5}\n",
+"    )\n",
+"    plt.title(fr\"$w_1 = {w}$\")\n",
 "\n",
 "fig, axes = plt.subplots(ncols=2, figsize=(9, 3.2), sharey=True)\n",
 "plt.sca(axes[0])\n",
@@ -996,31 +960,36 @@
 "plt.show()"
 ]
 },
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"**Code to generate the Hinge Loss figure:**"
-]
-},
 {
 "cell_type": "code",
-"execution_count": 30,
+"execution_count": 27,
 "metadata": {},
 "outputs": [],
 "source": [
-"t = np.linspace(-2, 4, 200)\n",
-"h = np.where(1 - t < 0, 0, 1 - t)  # max(0, 1-t)\n",
+"# not in the book – this cell generates and saves Figure 5–13\n",
+"\n",
+"s = np.linspace(-2.5, 2.5, 200)\n",
+"hinge_pos = np.where(1 - s < 0, 0, 1 - s)  # max(0, 1 - s)\n",
+"hinge_neg = np.where(1 + s < 0, 0, 1 + s)  # max(0, 1 + s)\n",
+"\n",
+"titles = (r\"Hinge loss = $max(0, 1 - s\\,t)$\", r\"Squared Hinge loss\")\n",
+"\n",
+"fix, axs = plt.subplots(1, 2, sharey=True, figsize=(8.2, 3))\n",
+"\n",
+"for ax, loss_pos, loss_neg, title in zip(\n",
+"        axs, (hinge_pos, hinge_pos ** 2), (hinge_neg, hinge_neg ** 2), titles):\n",
+"    ax.plot(s, loss_pos, \"g-\", linewidth=2, zorder=10, label=\"$t=1$\")\n",
+"    ax.plot(s, loss_neg, \"r--\", linewidth=2, zorder=10, label=\"$t=-1$\")\n",
+"    ax.grid(True, which='both')\n",
+"    ax.axhline(y=0, color='k')\n",
+"    ax.axvline(x=0, color='k')\n",
+"    ax.set_xlabel(r\"$s = \\mathbf{w}^\\intercal \\mathbf{x} + b$\")\n",
+"    ax.axis([-2.5, 2.5, -0.5, 2.5])\n",
+"    ax.legend(loc=\"center right\")\n",
+"    ax.set_title(title)\n",
+"    ax.set_yticks(np.arange(0, 2.5, 1))\n",
+"    ax.set_aspect(\"equal\")\n",
+"\n",
-"plt.figure(figsize=(5,2.8))\n",
-"plt.plot(t, h, \"b-\", linewidth=2, label=\"$max(0, 1 - t)$\", zorder=10)\n",
-"plt.grid(True, which='both')\n",
-"plt.axhline(y=0, color='k')\n",
-"plt.axvline(x=0, color='k')\n",
-"plt.yticks(np.arange(-1, 2.5, 1))\n",
-"plt.xlabel(\"$t$\")\n",
-"plt.axis([-2, 4, -1, 2.5])\n",
-"plt.legend(loc=\"upper right\")\n",
 "save_fig(\"hinge_plot\")\n",
 "plt.show()"
 ]
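The two losses plotted above, written out as plain functions of the score s for a label t: hinge(s, t) = max(0, 1 − s·t), and the squared hinge is simply its square. A tiny self-contained check (values computed by hand, not notebook output):

```python
import numpy as np

def hinge(s, t):
    # max(0, 1 - s*t): zero once the score is on the correct side by >= 1
    return np.maximum(0, 1 - s * t)

s = np.array([-1.0, 0.0, 0.5, 1.0, 2.0])
print(hinge(s, t=1))       # [2.  1.  0.5 0.  0. ]
print(hinge(s, t=1) ** 2)  # [4.   1.   0.25 0.   0.  ]
```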
@@ -1029,59 +998,19 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Extra material – Training Time"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 31,
-"metadata": {},
-"outputs": [],
-"source": [
-"X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n",
-"\n",
-"plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"bs\")\n",
-"plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"g^\")\n",
-"plt.grid()\n",
-"plt.show()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 32,
-"metadata": {},
-"outputs": [],
-"source": [
-"import time\n",
-"\n",
-"tol = 0.1\n",
-"tols = []\n",
-"times = []\n",
-"for i in range(10):\n",
-"    svm_clf = SVC(kernel=\"poly\", gamma=3, C=10, tol=tol)\n",
-"    t1 = time.time()\n",
-"    svm_clf.fit(X, y)\n",
-"    t2 = time.time()\n",
-"    times.append(t2-t1)\n",
-"    tols.append(tol)\n",
-"    tol /= 10\n",
-"plt.semilogx(tols, times, \"bo-\")\n",
-"plt.xlabel(\"Tolerance\")\n",
-"plt.ylabel(\"Time (seconds)\")\n",
-"plt.grid()\n",
-"plt.show()"
+"# Extra Material"
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Extra Material – Linear SVM classifier implementation using Batch Gradient Descent"
+"## Linear SVM classifier implementation using Batch Gradient Descent"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 33,
+"execution_count": 28,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1091,7 +1020,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 34,
+"execution_count": 29,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1152,7 +1081,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 35,
+"execution_count": 30,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1165,7 +1094,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 36,
+"execution_count": 31,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1179,7 +1108,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 37,
+"execution_count": 32,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1188,7 +1117,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 38,
+"execution_count": 33,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1199,7 +1128,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 39,
+"execution_count": 34,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1230,7 +1159,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 40,
+"execution_count": 35,
 "metadata": {
 "scrolled": true
 },
@@ -1275,7 +1204,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## 1. to 7."
+"## 1. to 8."
 ]
 },
 {
@@ -1289,14 +1218,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# 8."
+"# 9."
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"_Exercise: train a `LinearSVC` on a linearly separable dataset. Then train an `SVC` and a `SGDClassifier` on the same dataset. See if you can get them to produce roughly the same model._"
+"_Exercise: Train a `LinearSVC` on a linearly separable dataset. Then train an `SVC` and a `SGDClassifier` on the same dataset. See if you can get them to produce roughly the same model._"
 ]
 },
@@ -1308,7 +1237,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 41,
+"execution_count": 36,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1323,9 +1252,19 @@
 "y = y[setosa_or_versicolor]"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Now let's build and train 3 models:\n",
+"* Remember that `LinearSVC` uses `loss=\"squared_hinge\"` by default, so if we want all 3 models to produce similar results, we need to set `loss=\"hinge\"`.\n",
+"* Also, the `SVC` class uses an RBF kernel by default, so we need to set `kernel=\"linear\"` to get similar results as the other two models.\n",
+"* Lastly, the `SGDClassifier` class does not have a `C` hyperparameter, but it has another regularization hyperparameter called `alpha`, so we can tweak it to get similar results as the other two models."
+]
+},
 {
 "cell_type": "code",
-"execution_count": 42,
+"execution_count": 37,
 "metadata": {},
 "outputs": [],
 "source": [
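The three defaults that the new markdown cell mentions can be verified directly; this is a hedged illustration, and the printed values are scikit-learn's documented defaults rather than notebook output:

```python
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier

print(LinearSVC().loss)       # 'squared_hinge' – hence loss="hinge" to match
print(SVC().kernel)           # 'rbf' – hence kernel="linear" to match
print(SGDClassifier().alpha)  # 0.0001 – the regularization knob in place of C
```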
@@ -1334,24 +1273,14 @@
 "from sklearn.preprocessing import StandardScaler\n",
 "\n",
 "C = 5\n",
-"alpha = 1 / (C * len(X))\n",
-"\n",
-"lin_clf = LinearSVC(loss=\"hinge\", C=C, random_state=42)\n",
-"svm_clf = SVC(kernel=\"linear\", C=C)\n",
-"sgd_clf = SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001,\n",
-"                        alpha=alpha, max_iter=1000, tol=1e-3, random_state=42)\n",
+"alpha = 0.05\n",
 "\n",
 "scaler = StandardScaler()\n",
 "X_scaled = scaler.fit_transform(X)\n",
 "\n",
-"lin_clf.fit(X_scaled, y)\n",
-"svm_clf.fit(X_scaled, y)\n",
-"sgd_clf.fit(X_scaled, y)\n",
-"\n",
-"print(\"LinearSVC: \", lin_clf.intercept_, lin_clf.coef_)\n",
-"print(\"SVC: \", svm_clf.intercept_, svm_clf.coef_)\n",
-"print(f\"SGDClassifier(alpha={sgd_clf.alpha:.1e}):\",\n",
-"      sgd_clf.intercept_, sgd_clf.coef_)"
+"lin_clf = LinearSVC(loss=\"hinge\", C=C, random_state=42).fit(X_scaled, y)\n",
+"svc_clf = SVC(kernel=\"linear\", C=C).fit(X_scaled, y)\n",
+"sgd_clf = SGDClassifier(alpha=alpha, random_state=42).fit(X_scaled, y)"
 ]
 },
 {
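The old cell tied `SGDClassifier`'s `alpha` to `C` via `alpha = 1 / (C * len(X))`, while the new cell simply picks `alpha = 0.05`. Both express the same trade-off: a larger C (weaker regularization) corresponds to a smaller alpha. A quick sketch of the old relationship:

```python
C = 5
m = 100  # the setosa-or-versicolor subset of the iris dataset has 100 samples
alpha = 1 / (C * m)
print(alpha)  # 0.002
```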
@@ -1363,28 +1292,24 @@
 },
 {
 "cell_type": "code",
-"execution_count": 43,
+"execution_count": 38,
 "metadata": {},
 "outputs": [],
 "source": [
-"# Compute the slope and bias of each decision boundary\n",
-"w1 = -lin_clf.coef_[0, 0] / lin_clf.coef_[0, 1]\n",
-"b1 = -lin_clf.intercept_[0] / lin_clf.coef_[0, 1]\n",
-"w2 = -svm_clf.coef_[0, 0] / svm_clf.coef_[0, 1]\n",
-"b2 = -svm_clf.intercept_[0] / svm_clf.coef_[0, 1]\n",
-"w3 = -sgd_clf.coef_[0, 0] / sgd_clf.coef_[0, 1]\n",
-"b3 = -sgd_clf.intercept_[0] / sgd_clf.coef_[0, 1]\n",
+"def compute_decision_boundary(model):\n",
+"    w = -model.coef_[0, 0] / model.coef_[0, 1]\n",
+"    b = -model.intercept_[0] / model.coef_[0, 1]\n",
+"    return scaler.inverse_transform([[-10, -10 * w + b], [10, 10 * w + b]])\n",
 "\n",
-"# Transform the decision boundary lines back to the original scale\n",
-"line1 = scaler.inverse_transform([[-10, -10 * w1 + b1], [10, 10 * w1 + b1]])\n",
-"line2 = scaler.inverse_transform([[-10, -10 * w2 + b2], [10, 10 * w2 + b2]])\n",
-"line3 = scaler.inverse_transform([[-10, -10 * w3 + b3], [10, 10 * w3 + b3]])\n",
+"lin_line = compute_decision_boundary(lin_clf)\n",
+"svc_line = compute_decision_boundary(svc_clf)\n",
+"sgd_line = compute_decision_boundary(sgd_clf)\n",
 "\n",
 "# Plot all three decision boundaries\n",
 "plt.figure(figsize=(11, 4))\n",
-"plt.plot(line1[:, 0], line1[:, 1], \"k:\", label=\"LinearSVC\")\n",
-"plt.plot(line2[:, 0], line2[:, 1], \"b--\", linewidth=2, label=\"SVC\")\n",
-"plt.plot(line3[:, 0], line3[:, 1], \"r-\", label=\"SGDClassifier\")\n",
+"plt.plot(lin_line[:, 0], lin_line[:, 1], \"k:\", label=\"LinearSVC\")\n",
+"plt.plot(svc_line[:, 0], svc_line[:, 1], \"b--\", linewidth=2, label=\"SVC\")\n",
+"plt.plot(sgd_line[:, 0], sgd_line[:, 1], \"r-\", label=\"SGDClassifier\")\n",
 "plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"bs\")  # label=\"Iris versicolor\"\n",
 "plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"yo\")  # label=\"Iris setosa\"\n",
 "plt.xlabel(\"Petal length\")\n",
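Where the helper's slope/intercept formulas come from: on the decision boundary, coef_[0,0]·x0 + coef_[0,1]·x1 + intercept_ = 0, so solving for x1 gives x1 = −(coef_[0,0]/coef_[0,1])·x0 − intercept_/coef_[0,1]. A small self-contained check with made-up coefficients (`boundary_slope_intercept` is a hypothetical helper, not from the notebook):

```python
import numpy as np

def boundary_slope_intercept(coef, intercept):
    w0, w1 = coef[0, 0], coef[0, 1]
    slope = -w0 / w1          # slope of the boundary in the (x0, x1) plane
    bias = -intercept[0] / w1 # its intercept on the x1 axis
    return slope, bias

print(boundary_slope_intercept(np.array([[2.0, 4.0]]), np.array([1.0])))
# (-0.5, -0.25): x1 = -0.5*x0 - 0.25 is the line where 2*x0 + 4*x1 + 1 = 0
```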
@@ -1407,14 +1332,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# 9."
+"# 10."
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"_Exercise: train an SVM classifier on the Wine dataset, which you can load using `sklearn.datasets.load_wine()`. This dataset contains the chemical analysis of 178 wine samples produced by 3 different cultivators: the goal is to train a classification model capable of predicting the cultivator based on the wine's chemical analysis. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 3 classes. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?_"
+"_Exercise: Train an SVM classifier on the Wine dataset, which you can load using `sklearn.datasets.load_wine()`. This dataset contains the chemical analysis of 178 wine samples produced by 3 different cultivators: the goal is to train a classification model capable of predicting the cultivator based on the wine's chemical analysis. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 3 classes. What accuracy can you reach?_"
 ]
 },
 {
@@ -1426,7 +1351,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 44,
+"execution_count": 39,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1437,7 +1362,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 45,
+"execution_count": 40,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1446,7 +1371,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 46,
+"execution_count": 41,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1458,7 +1383,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 47,
+"execution_count": 42,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1467,7 +1392,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 48,
+"execution_count": 43,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1483,7 +1408,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 49,
+"execution_count": 44,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1500,7 +1425,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 50,
+"execution_count": 45,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1519,7 +1444,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 51,
+"execution_count": 46,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1539,7 +1464,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 52,
+"execution_count": 47,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1557,7 +1482,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 53,
+"execution_count": 48,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1582,7 +1507,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 54,
+"execution_count": 49,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1599,7 +1524,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 55,
+"execution_count": 50,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1618,7 +1543,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 56,
+"execution_count": 51,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1634,7 +1559,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 57,
+"execution_count": 52,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1652,14 +1577,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## 10."
+"## 11."
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"_Exercise: Train and fine-tune an SVM regressor on the California housing dataset. You can use the original dataset rather than the tweaked version we used in Chapter 2. The original dataset can be fetched using `sklearn.datasets.fetch_california_housing()`. Since there are over 20,000 instances, SVMs can be slow, so for hyperparameter tuning you should use far fewer instances (e.g., 2,000) to test many more hyperparameter combinations._"
+"_Exercise: Train and fine-tune an SVM regressor on the California housing dataset. You can use the original dataset rather than the tweaked version we used in Chapter 2. The original dataset can be fetched using `sklearn.datasets.fetch_california_housing()`. The labels represent hundreds of thousands of dollars. Since there are over 20,000 instances, SVMs can be slow, so for hyperparameter tuning you should use far fewer instances (e.g., 2,000) to test many more hyperparameter combinations. What is your best model's RMSE?_"
 ]
 },
@@ -1671,7 +1596,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 58,
+"execution_count": 53,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1691,13 +1616,14 @@
 },
 {
 "cell_type": "code",
-"execution_count": 59,
+"execution_count": 54,
 "metadata": {},
 "outputs": [],
 "source": [
 "from sklearn.model_selection import train_test_split\n",
 "\n",
-"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
+"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
+"                                                    random_state=42)"
 ]
 },
 {
@@ -1716,7 +1642,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 60,
+"execution_count": 55,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1735,7 +1661,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 61,
+"execution_count": 56,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1753,7 +1679,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 62,
+"execution_count": 57,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1773,7 +1699,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 63,
+"execution_count": 58,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1789,7 +1715,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 64,
+"execution_count": 59,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1810,7 +1736,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 65,
+"execution_count": 60,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1819,7 +1745,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 66,
+"execution_count": 61,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1836,7 +1762,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 67,
+"execution_count": 62,
 "metadata": {},
 "outputs": [],
 "source": [