diff --git a/04_training_linear_models.ipynb b/04_training_linear_models.ipynb index 9ce3b58..cc5cde3 100644 --- a/04_training_linear_models.ipynb +++ b/04_training_linear_models.ipynb @@ -30,7 +30,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "# Setup" ] @@ -39,7 +41,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." + "This project requires Python 3.8 or above:" ] }, { @@ -48,30 +50,64 @@ "metadata": {}, "outputs": [], "source": [ - "# Python ≥3.8 is required\n", "import sys\n", - "assert sys.version_info >= (3, 8)\n", "\n", - "# Scikit-Learn ≥1.0 is required\n", + "assert sys.version_info >= (3, 8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It also requires Scikit-Learn ≥ 1.0.1:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ "import sklearn\n", - "assert sklearn.__version__ >= \"1.0\"\n", "\n", - "# Common imports\n", - "import numpy as np\n", + "assert sklearn.__version__ >= \"1.0.1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we did in previous chapters, let's define the default font sizes to make the figures prettier:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib as mpl\n", + "\n", + "mpl.rc('font', size=12)\n", + "mpl.rc('axes', labelsize=14, titlesize=14)\n", + "mpl.rc('legend', fontsize=14)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And let's create the `images/training_linear_models` folder (if it doesn't already exist), and define the `save_fig()` function which is used through this notebook to save the figures in high-res for the book:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ "from pathlib import Path\n", "\n", - "# to make this notebook's output stable across runs\n", - "np.random.seed(42)\n", - "\n", - "# To plot pretty figures\n", - "%matplotlib inline\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "mpl.rc('axes', labelsize=14)\n", - "mpl.rc('xtick', labelsize=12)\n", - "mpl.rc('ytick', labelsize=12)\n", - "\n", - "# Where to save the figures\n", "IMAGES_PATH = Path() / \"images\" / \"training_linear_models\"\n", "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -98,47 +134,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", - "X = 2 * np.random.rand(100, 1)\n", - "y = 4 + 3 * X + np.random.randn(100, 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "plt.plot(X, y, \"b.\")\n", - "plt.xlabel(\"$x_1$\", fontsize=18)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.axis([0, 2, 0, 15])\n", - "save_fig(\"generated_data_plot\")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "X_b = np.c_[np.ones((100, 1)), X] # add x0 = 1 to each instance\n", - "theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "theta_best" + "np.random.seed(42) # to make this code example 
reproducible\n", + "m = 100 # number of instances\n", + "X = 2 * np.random.rand(m, 1) # column vector\n", + "y = 4 + 3 * X + np.random.randn(m, 1) # column vector" ] }, { @@ -147,10 +152,18 @@ "metadata": {}, "outputs": [], "source": [ - "X_new = np.array([[0], [2]])\n", - "X_new_b = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance\n", - "y_predict = X_new_b.dot(theta_best)\n", - "y_predict" + "# not in the book\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "plt.plot(X, y, \"b.\")\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$\", rotation=0)\n", + "plt.axis([0, 2, 0, 15])\n", + "plt.grid()\n", + "save_fig(\"generated_data_plot\")\n", + "plt.show()" ] }, { @@ -159,17 +172,10 @@ "metadata": {}, "outputs": [], "source": [ - "plt.plot(X_new, y_predict, \"r-\")\n", - "plt.plot(X, y, \"b.\")\n", - "plt.axis([0, 2, 0, 15])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The figure in the book actually corresponds to the following code, with a legend and axis labels:" + "from sklearn.preprocessing import add_dummy_feature\n", + "\n", + "X_b = add_dummy_feature(X) # add x0 = 1 to each instance\n", + "theta_best = np.linalg.inv(X_b.T @ X_b) @ X_b.T @ y" ] }, { @@ -178,14 +184,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.plot(X_new, y_predict, \"r-\", linewidth=2, label=\"Predictions\")\n", - "plt.plot(X, y, \"b.\")\n", - "plt.xlabel(\"$x_1$\", fontsize=18)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.legend(loc=\"upper left\", fontsize=14)\n", - "plt.axis([0, 2, 0, 15])\n", - "save_fig(\"linear_model_predictions_plot\")\n", - "plt.show()" + "theta_best" ] }, { @@ -193,6 +192,39 @@ "execution_count": 9, "metadata": {}, "outputs": [], + "source": [ + "X_new = np.array([[0], [2]])\n", + "X_new_b = add_dummy_feature(X_new) # add x0 = 1 to each instance\n", + "y_predict = X_new_b @ theta_best\n", + "y_predict" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(6, 4)) # not in the book\n", + "plt.plot(X_new, y_predict, \"r-\", label=\"Predictions\")\n", + "plt.plot(X, y, \"b.\")\n", + "\n", + "# not in the book\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$\", rotation=0)\n", + "plt.axis([0, 2, 0, 15])\n", + "plt.grid()\n", + "plt.legend(loc=\"upper left\")\n", + "save_fig(\"linear_model_predictions_plot\")\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "\n", @@ -203,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -219,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -236,11 +268,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "np.linalg.pinv(X_b).dot(y)" + "np.linalg.pinv(X_b) @ y" ] }, { @@ -251,39 +283,29 @@ "## Batch Gradient Descent" ] }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "eta = 0.1 # learning rate\n", - "n_iterations = 1000\n", - "m = 100\n", - "\n", - "theta = np.random.randn(2,1) # random initialization\n", - "\n", - "for iteration in range(n_iterations):\n", - " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", - " 
theta = theta - eta * gradients" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "theta" - ] - }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "X_new_b.dot(theta)" + "eta = 0.1 # learning rate\n", + "n_epochs = 1000\n", + "m = len(X_b) # number of instances\n", + "\n", + "np.random.seed(42)\n", + "theta = np.random.randn(2, 1) # randomly initialized model parameters\n", + "\n", + "for epoch in range(n_epochs):\n", + " gradients = 2 / m * X_b.T @ (X_b @ theta - y)\n", + " theta = theta - eta * gradients" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The trained model parameters:" ] }, { @@ -292,24 +314,7 @@ "metadata": {}, "outputs": [], "source": [ - "theta_path_bgd = []\n", - "\n", - "def plot_gradient_descent(theta, eta, theta_path=None):\n", - " m = len(X_b)\n", - " plt.plot(X, y, \"b.\")\n", - " n_iterations = 1000\n", - " for iteration in range(n_iterations):\n", - " if iteration < 10:\n", - " y_predict = X_new_b.dot(theta)\n", - " style = \"b-\" if iteration > 0 else \"r--\"\n", - " plt.plot(X_new, y_predict, style)\n", - " gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)\n", - " theta = theta - eta * gradients\n", - " if theta_path is not None:\n", - " theta_path.append(theta)\n", - " plt.xlabel(\"$x_1$\", fontsize=18)\n", - " plt.axis([0, 2, 0, 15])\n", - " plt.title(r\"$\\eta = {}$\".format(eta), fontsize=16)" + "theta" ] }, { @@ -318,15 +323,52 @@ "metadata": {}, "outputs": [], "source": [ + "# not in the book\n", + "\n", + "import matplotlib as mpl\n", + "\n", + "def plot_gradient_descent(theta, eta):\n", + " m = len(X_b)\n", + " plt.plot(X, y, \"b.\")\n", + " n_epochs = 1000\n", + " n_shown = 20\n", + " theta_path = []\n", + " for epoch in range(n_epochs):\n", + " if epoch < n_shown:\n", + " y_predict = X_new_b @ theta\n", + " color = mpl.colors.rgb2hex(plt.cm.OrRd(epoch / n_shown + 0.15))\n", + " plt.plot(X_new, y_predict, linestyle=\"solid\", color=color)\n", + " gradients = 2 / m * X_b.T @ (X_b @ theta - y)\n", + " theta = theta - eta * gradients\n", + " theta_path.append(theta)\n", + " plt.xlabel(\"$x_1$\")\n", + " plt.axis([0, 2, 0, 15])\n", + " plt.grid()\n", + " plt.title(r\"$\\eta = {}$\".format(eta))\n", + " return theta_path" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# not in the book\n", + "\n", "np.random.seed(42)\n", "theta = np.random.randn(2,1) # random initialization\n", "\n", "plt.figure(figsize=(10,4))\n", - "plt.subplot(131); plot_gradient_descent(theta, eta=0.02)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)\n", - "plt.subplot(133); plot_gradient_descent(theta, eta=0.5)\n", - "\n", + "plt.subplot(131)\n", + "plot_gradient_descent(theta, eta=0.02)\n", + "plt.ylabel(\"$y$\", rotation=0)\n", + "plt.subplot(132)\n", + "theta_path_bgd = plot_gradient_descent(theta, eta=0.1)\n", + "plt.gca().axes.yaxis.set_ticklabels([])\n", + "plt.subplot(133)\n", + "plt.gca().axes.yaxis.set_ticklabels([])\n", + "plot_gradient_descent(theta, eta=0.5)\n", "save_fig(\"gradient_descent_plot\")\n", "plt.show()" ] @@ -340,18 +382,20 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "theta_path_sgd = []\n", - "m = len(X_b)\n", - "np.random.seed(42)" + "# not in the book\n", + "\n", + "# we need to store the path 
of theta in the parameter space to plot the next\n", + "# figure\n", + "theta_path_sgd = []" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -361,33 +405,43 @@ "def learning_schedule(t):\n", " return t0 / (t + t1)\n", "\n", - "theta = np.random.randn(2,1) # random initialization\n", + "np.random.seed(42)\n", + "theta = np.random.randn(2, 1) # random initialization\n", + "\n", + "# not in the book, this is for the next figure\n", + "n_shown = 20\n", + "plt.figure(figsize=(6, 4))\n", "\n", "for epoch in range(n_epochs):\n", - " for i in range(m):\n", - " if epoch == 0 and i < 20: # not shown in the book\n", - " y_predict = X_new_b.dot(theta) # not shown\n", - " style = \"b-\" if i > 0 else \"r--\" # not shown\n", - " plt.plot(X_new, y_predict, style) # not shown\n", - " random_index = np.random.randint(m)\n", - " xi = X_b[random_index:random_index+1]\n", - " yi = y[random_index:random_index+1]\n", - " gradients = 2 * xi.T.dot(xi.dot(theta) - yi)\n", - " eta = learning_schedule(epoch * m + i)\n", - " theta = theta - eta * gradients\n", - " theta_path_sgd.append(theta) # not shown\n", + " for iteration in range(m):\n", "\n", - "plt.plot(X, y, \"b.\") # not shown\n", - "plt.xlabel(\"$x_1$\", fontsize=18) # not shown\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18) # not shown\n", - "plt.axis([0, 2, 0, 15]) # not shown\n", - "save_fig(\"sgd_plot\") # not shown\n", - "plt.show() # not shown" + " # not in the book\n", + " if epoch == 0 and iteration < n_shown:\n", + " y_predict = X_new_b @ theta\n", + " color = mpl.colors.rgb2hex(plt.cm.OrRd(iteration / n_shown + 0.15))\n", + " plt.plot(X_new, y_predict, color=color)\n", + "\n", + " random_index = np.random.randint(m)\n", + " xi = X_b[random_index : random_index + 1]\n", + " yi = y[random_index : random_index + 1]\n", + " gradients = 2 / 1 * xi.T @ (xi @ theta - yi)\n", + " eta = learning_schedule(epoch * m + iteration)\n", + " theta = theta - eta * gradients\n", + " theta_path_sgd.append(theta) # not in the book\n", + "\n", + "# not in the book\n", + "plt.plot(X, y, \"b.\")\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$\", rotation=0)\n", + "plt.axis([0, 2, 0, 15])\n", + "plt.grid()\n", + "save_fig(\"sgd_plot\")\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "scrolled": true }, @@ -398,19 +452,20 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDRegressor\n", "\n", - "sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1, random_state=42)\n", - "sgd_reg.fit(X, y.ravel())" + "sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1,\n", + " random_state=42)\n", + "sgd_reg.fit(X, y.ravel()) # y.ravel() because fit() expects 1D targets" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -425,36 +480,10 @@ ] }, { - "cell_type": "code", - "execution_count": 23, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "theta_path_mgd = []\n", - "\n", - "n_iterations = 50\n", - "minibatch_size = 20\n", - "\n", - "np.random.seed(42)\n", - "theta = np.random.randn(2,1) # random initialization\n", - "\n", - "t0, t1 = 200, 1000\n", - "def learning_schedule(t):\n", - " return t0 / (t + t1)\n", - "\n", - "t = 0\n", - "for epoch in range(n_iterations):\n", - " shuffled_indices = 
np.random.permutation(m)\n", - " X_b_shuffled = X_b[shuffled_indices]\n", - " y_shuffled = y[shuffled_indices]\n", - " for i in range(0, m, minibatch_size):\n", - " t += 1\n", - " xi = X_b_shuffled[i:i+minibatch_size]\n", - " yi = y_shuffled[i:i+minibatch_size]\n", - " gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)\n", - " eta = learning_schedule(t)\n", - " theta = theta - eta * gradients\n", - " theta_path_mgd.append(theta)" + "The code in this section is used to generate the next figure, it is not in the book." ] }, { @@ -463,7 +492,33 @@ "metadata": {}, "outputs": [], "source": [ - "theta" + "from math import ceil\n", + "\n", + "n_epochs = 50\n", + "minibatch_size = 20\n", + "n_batches_per_epoch = ceil(m / minibatch_size)\n", + "\n", + "np.random.seed(42)\n", + "theta = np.random.randn(2, 1) # random initialization\n", + "\n", + "t0, t1 = 200, 1000 # learning schedule hyperparameters\n", + "\n", + "def learning_schedule(t):\n", + " return t0 / (t + t1)\n", + "\n", + "theta_path_mgd = []\n", + "for epoch in range(n_epochs):\n", + " shuffled_indices = np.random.permutation(m)\n", + " X_b_shuffled = X_b[shuffled_indices]\n", + " y_shuffled = y[shuffled_indices]\n", + " for iteration in range(0, n_batches_per_epoch):\n", + " idx = iteration * minibatch_size\n", + " xi = X_b_shuffled[idx : idx + minibatch_size]\n", + " yi = y_shuffled[idx : idx + minibatch_size]\n", + " gradients = 2 / minibatch_size * xi.T @ (xi @ theta - yi)\n", + " eta = learning_schedule(iteration)\n", + " theta = theta - eta * gradients\n", + " theta_path_mgd.append(theta)" ] }, { @@ -471,6 +526,15 @@ "execution_count": 25, "metadata": {}, "outputs": [], + "source": [ + "theta" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], "source": [ "theta_path_bgd = np.array(theta_path_bgd)\n", "theta_path_sgd = np.array(theta_path_sgd)\n", @@ -479,18 +543,22 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(7,4))\n", - "plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], \"r-s\", linewidth=1, label=\"Stochastic\")\n", - "plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], \"g-+\", linewidth=2, label=\"Mini-batch\")\n", - "plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], \"b-o\", linewidth=3, label=\"Batch\")\n", - "plt.legend(loc=\"upper left\", fontsize=16)\n", - "plt.xlabel(r\"$\\theta_0$\", fontsize=20)\n", - "plt.ylabel(r\"$\\theta_1$ \", fontsize=20, rotation=0)\n", - "plt.axis([2.5, 4.5, 2.3, 3.9])\n", + "plt.figure(figsize=(7, 4))\n", + "plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], \"r-s\", linewidth=1,\n", + " label=\"Stochastic\")\n", + "plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], \"g-+\", linewidth=2,\n", + " label=\"Mini-batch\")\n", + "plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], \"b-o\", linewidth=3,\n", + " label=\"Batch\")\n", + "plt.legend(loc=\"upper left\")\n", + "plt.xlabel(r\"$\\theta_0$\")\n", + "plt.ylabel(r\"$\\theta_1$ \", rotation=0)\n", + "plt.axis([2.6, 4.6, 2.3, 3.4])\n", + "plt.grid()\n", "save_fig(\"gradient_descent_paths_plot\")\n", "plt.show()" ] @@ -502,27 +570,16 @@ "# Polynomial Regression" ] }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import numpy.random as rnd\n", - "\n", - "np.random.seed(42)" - ] - }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ + "np.random.seed(42)\n", "m = 
100\n", "X = 6 * np.random.rand(m, 1) - 3\n", - "y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)" + "y = 0.5 * X ** 2 + X + 2 + np.random.randn(m, 1)" ] }, { @@ -531,10 +588,13 @@ "metadata": {}, "outputs": [], "source": [ + "# not in the book\n", + "plt.figure(figsize=(6, 4))\n", "plt.plot(X, y, \"b.\")\n", - "plt.xlabel(\"$x_1$\", fontsize=18)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$\", rotation=0)\n", "plt.axis([-3, 3, 0, 10])\n", + "plt.grid()\n", "save_fig(\"quadratic_data_plot\")\n", "plt.show()" ] @@ -546,6 +606,7 @@ "outputs": [], "source": [ "from sklearn.preprocessing import PolynomialFeatures\n", + "\n", "poly_features = PolynomialFeatures(degree=2, include_bias=False)\n", "X_poly = poly_features.fit_transform(X)\n", "X[0]" @@ -577,15 +638,20 @@ "metadata": {}, "outputs": [], "source": [ - "X_new=np.linspace(-3, 3, 100).reshape(100, 1)\n", + "# not in the book\n", + "\n", + "X_new = np.linspace(-3, 3, 100).reshape(100, 1)\n", "X_new_poly = poly_features.transform(X_new)\n", "y_new = lin_reg.predict(X_new_poly)\n", + "\n", + "plt.figure(figsize=(6, 4))\n", "plt.plot(X, y, \"b.\")\n", "plt.plot(X_new, y_new, \"r-\", linewidth=2, label=\"Predictions\")\n", - "plt.xlabel(\"$x_1$\", fontsize=18)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.legend(loc=\"upper left\", fontsize=14)\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$\", rotation=0)\n", + "plt.legend(loc=\"upper left\")\n", "plt.axis([-3, 3, 0, 10])\n", + "plt.grid()\n", "save_fig(\"quadratic_predictions_plot\")\n", "plt.show()" ] @@ -596,27 +662,29 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.pipeline import Pipeline\n", + "# not in the book\n", "\n", - "for style, width, degree in ((\"g-\", 1, 300), (\"b--\", 2, 2), (\"r-+\", 2, 1)):\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "plt.figure(figsize=(6, 4))\n", + "\n", + "for style, width, degree in ((\"r-+\", 2, 1), (\"b--\", 2, 2), (\"g-\", 1, 300)):\n", " polybig_features = PolynomialFeatures(degree=degree, include_bias=False)\n", " std_scaler = StandardScaler()\n", " lin_reg = LinearRegression()\n", - " polynomial_regression = Pipeline([\n", - " (\"poly_features\", polybig_features),\n", - " (\"std_scaler\", std_scaler),\n", - " (\"lin_reg\", lin_reg),\n", - " ])\n", + " polynomial_regression = make_pipeline(polybig_features, std_scaler, lin_reg)\n", " polynomial_regression.fit(X, y)\n", " y_newbig = polynomial_regression.predict(X_new)\n", - " plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)\n", + " label = f\"{degree} degree{'s' if degree > 1 else ''}\"\n", + " plt.plot(X_new, y_newbig, style, label=label, linewidth=width)\n", "\n", "plt.plot(X, y, \"b.\", linewidth=3)\n", "plt.legend(loc=\"upper left\")\n", - "plt.xlabel(\"$x_1$\", fontsize=18)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$\", rotation=0)\n", "plt.axis([-3, 3, 0, 10])\n", + "plt.grid()\n", "save_fig(\"high_degree_polynomials_plot\")\n", "plt.show()" ] @@ -634,24 +702,27 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.metrics import mean_squared_error\n", - "from sklearn.model_selection import train_test_split\n", + "from sklearn.model_selection import learning_curve\n", "\n", - "def plot_learning_curves(model, X, y):\n", - " X_train, X_val, y_train, y_val = train_test_split(X, y, 
test_size=0.2, random_state=10)\n", - " train_errors, val_errors = [], []\n", - " for m in range(1, len(X_train) + 1):\n", - " model.fit(X_train[:m], y_train[:m])\n", - " y_train_predict = model.predict(X_train[:m])\n", - " y_val_predict = model.predict(X_val)\n", - " train_errors.append(mean_squared_error(y_train[:m], y_train_predict))\n", - " val_errors.append(mean_squared_error(y_val, y_val_predict))\n", + "train_sizes, train_scores, valid_scores = learning_curve(\n", + " LinearRegression(), X, y, train_sizes=np.linspace(0.01, 1.0, 40), cv=5,\n", + " scoring=\"neg_root_mean_squared_error\")\n", + "train_errors = -train_scores.mean(axis=1)\n", + "valid_errors = -valid_scores.mean(axis=1)\n", "\n", - " plt.plot(np.sqrt(train_errors), \"r-+\", linewidth=2, label=\"train\")\n", - " plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"val\")\n", - " plt.legend(loc=\"upper right\", fontsize=14) # not shown in the book\n", - " plt.xlabel(\"Training set size\", fontsize=14) # not shown\n", - " plt.ylabel(\"RMSE\", fontsize=14) # not shown" + "plt.figure(figsize=(6, 4)) # not in the book\n", + "plt.plot(train_sizes, train_errors, \"r-+\", linewidth=2, label=\"train\")\n", + "plt.plot(train_sizes, valid_errors, \"b-\", linewidth=3, label=\"valid\")\n", + "\n", + "# not in the book: beautifies and saves the figure\n", + "plt.xlabel(\"Training set size\")\n", + "plt.ylabel(\"RMSE\")\n", + "plt.grid()\n", + "plt.legend(loc=\"upper right\")\n", + "plt.axis([0, 80, 0, 2.5])\n", + "save_fig(\"underfitting_learning_curves_plot\")\n", + "\n", + "plt.show()" ] }, { @@ -660,30 +731,29 @@ "metadata": {}, "outputs": [], "source": [ - "lin_reg = LinearRegression()\n", - "plot_learning_curves(lin_reg, X, y)\n", - "plt.axis([0, 80, 0, 3]) # not shown in the book\n", - "save_fig(\"underfitting_learning_curves_plot\") # not shown\n", - "plt.show() # not shown" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.pipeline import Pipeline\n", + "from sklearn.pipeline import make_pipeline\n", "\n", - "polynomial_regression = Pipeline([\n", - " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", - " (\"lin_reg\", LinearRegression()),\n", - " ])\n", + "polynomial_regression = make_pipeline(\n", + " PolynomialFeatures(degree=10, include_bias=False),\n", + " LinearRegression())\n", "\n", - "plot_learning_curves(polynomial_regression, X, y)\n", - "plt.axis([0, 80, 0, 3]) # not shown\n", - "save_fig(\"learning_curves_plot\") # not shown\n", - "plt.show() # not shown" + "train_sizes, train_scores, valid_scores = learning_curve(\n", + " polynomial_regression, X, y, train_sizes=np.linspace(0.01, 1.0, 40), cv=5,\n", + " scoring=\"neg_root_mean_squared_error\")\n", + "\n", + "# not in the book (same as earlier)\n", + "train_errors = -train_scores.mean(axis=1)\n", + "valid_errors = -valid_scores.mean(axis=1)\n", + "plt.figure(figsize=(6, 4))\n", + "plt.plot(train_sizes, train_errors, \"r-+\", linewidth=2, label=\"train\")\n", + "plt.plot(train_sizes, valid_errors, \"b-\", linewidth=3, label=\"valid\")\n", + "plt.legend(loc=\"upper right\")\n", + "plt.xlabel(\"Training set size\")\n", + "plt.ylabel(\"RMSE\")\n", + "plt.grid()\n", + "plt.axis([0, 80, 0, 2.5])\n", + "save_fig(\"learning_curves_plot\")\n", + "plt.show()" ] }, { @@ -700,12 +770,20 @@ "## Ridge Regression" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's generate a very small and noisy linear dataset:" + ] + }, { "cell_type": "code", - 
"execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ + "# not in the book\n", "np.random.seed(42)\n", "m = 20\n", "X = 3 * np.random.rand(m, 1)\n", @@ -713,6 +791,22 @@ "X_new = np.linspace(0, 3, 100).reshape(100, 1)" ] }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# not in the book\n", + "plt.figure(figsize=(6, 4))\n", + "plt.plot(X, y, \".\")\n", + "plt.xlabel(\"$x_1$\")\n", + "plt.ylabel(\"$y$ \", rotation=0)\n", + "plt.axis([0, 3, 0, 3.5])\n", + "plt.grid()\n", + "plt.show()" + ] + }, { "cell_type": "code", "execution_count": 39, @@ -720,7 +814,8 @@ "outputs": [], "source": [ "from sklearn.linear_model import Ridge\n", - "ridge_reg = Ridge(alpha=1, solver=\"cholesky\", random_state=42)\n", + "\n", + "ridge_reg = Ridge(alpha=1, solver=\"cholesky\")\n", "ridge_reg.fit(X, y)\n", "ridge_reg.predict([[1.5]])" ] @@ -731,9 +826,38 @@ "metadata": {}, "outputs": [], "source": [ - "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", - "ridge_reg.fit(X, y)\n", - "ridge_reg.predict([[1.5]])" + "# not in the book\n", + "\n", + "def plot_model(model_class, polynomial, alphas, **model_kargs):\n", + " plt.plot(X, y, \"b.\", linewidth=3)\n", + " for alpha, style in zip(alphas, (\"b:\", \"g--\", \"r-\")):\n", + " if alpha > 0:\n", + " model = model_class(alpha, **model_kargs)\n", + " else:\n", + " model = LinearRegression()\n", + " if polynomial:\n", + " model = make_pipeline(\n", + " PolynomialFeatures(degree=10, include_bias=False),\n", + " StandardScaler(),\n", + " model)\n", + " model.fit(X, y)\n", + " y_new_regul = model.predict(X_new)\n", + " plt.plot(X_new, y_new_regul, style, linewidth=2,\n", + " label=r\"$\\alpha = {}$\".format(alpha))\n", + " plt.legend(loc=\"upper left\")\n", + " plt.xlabel(\"$x_1$\")\n", + " plt.axis([0, 3, 0, 3.5])\n", + " plt.grid()\n", + "\n", + "plt.figure(figsize=(9, 3.5))\n", + "plt.subplot(121)\n", + "plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)\n", + "plt.ylabel(\"$y$ \", rotation=0)\n", + "plt.subplot(122)\n", + "plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)\n", + "plt.gca().axes.yaxis.set_ticklabels([])\n", + "save_fig(\"ridge_regression_plot\")\n", + "plt.show()" ] }, { @@ -742,42 +866,9 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import Ridge\n", - "\n", - "def plot_model(model_class, polynomial, alphas, **model_kargs):\n", - " for alpha, style in zip(alphas, (\"b-\", \"g--\", \"r:\")):\n", - " model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()\n", - " if polynomial:\n", - " model = Pipeline([\n", - " (\"poly_features\", PolynomialFeatures(degree=10, include_bias=False)),\n", - " (\"std_scaler\", StandardScaler()),\n", - " (\"regul_reg\", model),\n", - " ])\n", - " model.fit(X, y)\n", - " y_new_regul = model.predict(X_new)\n", - " lw = 2 if alpha > 0 else 1\n", - " plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r\"$\\alpha = {}$\".format(alpha))\n", - " plt.plot(X, y, \"b.\", linewidth=3)\n", - " plt.legend(loc=\"upper left\", fontsize=15)\n", - " plt.xlabel(\"$x_1$\", fontsize=18)\n", - " plt.axis([0, 3, 0, 4])\n", - "\n", - "plt.figure(figsize=(8,4))\n", - "plt.subplot(121)\n", - "plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.subplot(122)\n", - "plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)\n", - "\n", - 
"save_fig(\"ridge_regression_plot\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: to be future-proof, we set `max_iter=1000` and `tol=1e-3` because these will be the default values in Scikit-Learn 0.21." + "sgd_reg = SGDRegressor(penalty=\"l2\", random_state=42)\n", + "sgd_reg.fit(X, y.ravel())\n", + "sgd_reg.predict([[1.5]])" ] }, { @@ -786,9 +877,10 @@ "metadata": {}, "outputs": [], "source": [ - "sgd_reg = SGDRegressor(penalty=\"l2\", max_iter=1000, tol=1e-3, random_state=42)\n", - "sgd_reg.fit(X, y.ravel())\n", - "sgd_reg.predict([[1.5]])" + "# not in the book\n", + "ridge_reg = Ridge(alpha=1, solver=\"sag\", random_state=42)\n", + "ridge_reg.fit(X, y)\n", + "ridge_reg.predict([[1.5]])" ] }, { @@ -806,15 +898,9 @@ "source": [ "from sklearn.linear_model import Lasso\n", "\n", - "plt.figure(figsize=(8,4))\n", - "plt.subplot(121)\n", - "plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)\n", - "plt.ylabel(\"$y$\", rotation=0, fontsize=18)\n", - "plt.subplot(122)\n", - "plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), random_state=42)\n", - "\n", - "save_fig(\"lasso_regression_plot\")\n", - "plt.show()" + "lasso_reg = Lasso(alpha=0.1)\n", + "lasso_reg.fit(X, y)\n", + "lasso_reg.predict([[1.5]])" ] }, { @@ -823,156 +909,21 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import Lasso\n", - "lasso_reg = Lasso(alpha=0.1)\n", - "lasso_reg.fit(X, y)\n", - "lasso_reg.predict([[1.5]])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Elastic Net" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.linear_model import ElasticNet\n", - "elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)\n", - "elastic_net.fit(X, y)\n", - "elastic_net.predict([[1.5]])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Early Stopping" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "m = 100\n", - "X = 6 * np.random.rand(m, 1) - 3\n", - "y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)\n", - "\n", - "X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "from copy import deepcopy\n", - "\n", - "poly_scaler = Pipeline([\n", - " (\"poly_features\", PolynomialFeatures(degree=90, include_bias=False)),\n", - " (\"std_scaler\", StandardScaler())\n", - " ])\n", - "\n", - "X_train_poly_scaled = poly_scaler.fit_transform(X_train)\n", - "X_val_poly_scaled = poly_scaler.transform(X_val)\n", - "\n", - "sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,\n", - " penalty=None, learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", - "\n", - "minimum_val_error = float(\"inf\")\n", - "best_epoch = None\n", - "best_model = None\n", - "for epoch in range(1000):\n", - " sgd_reg.fit(X_train_poly_scaled, y_train) # continues where it left off\n", - " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " val_error = mean_squared_error(y_val, y_val_predict)\n", - " if val_error < minimum_val_error:\n", - " minimum_val_error = val_error\n", - " best_epoch = epoch\n", - " best_model = deepcopy(sgd_reg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the 
graph:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "sgd_reg = SGDRegressor(max_iter=1, tol=-np.infty, warm_start=True,\n", - " penalty=None, learning_rate=\"constant\", eta0=0.0005, random_state=42)\n", - "\n", - "n_epochs = 500\n", - "train_errors, val_errors = [], []\n", - "for epoch in range(n_epochs):\n", - " sgd_reg.fit(X_train_poly_scaled, y_train)\n", - " y_train_predict = sgd_reg.predict(X_train_poly_scaled)\n", - " y_val_predict = sgd_reg.predict(X_val_poly_scaled)\n", - " train_errors.append(mean_squared_error(y_train, y_train_predict))\n", - " val_errors.append(mean_squared_error(y_val, y_val_predict))\n", - "\n", - "best_epoch = np.argmin(val_errors)\n", - "best_val_rmse = np.sqrt(val_errors[best_epoch])\n", - "\n", - "plt.annotate('Best model',\n", - " xy=(best_epoch, best_val_rmse),\n", - " xytext=(best_epoch, best_val_rmse + 1),\n", - " ha=\"center\",\n", - " arrowprops=dict(facecolor='black', shrink=0.05),\n", - " fontsize=16,\n", - " )\n", - "\n", - "best_val_rmse -= 0.03 # just to make the graph look better\n", - "plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], \"k:\", linewidth=2)\n", - "plt.plot(np.sqrt(val_errors), \"b-\", linewidth=3, label=\"Validation set\")\n", - "plt.plot(np.sqrt(train_errors), \"r--\", linewidth=2, label=\"Training set\")\n", - "plt.legend(loc=\"upper right\", fontsize=14)\n", - "plt.xlabel(\"Epoch\", fontsize=14)\n", - "plt.ylabel(\"RMSE\", fontsize=14)\n", - "save_fig(\"early_stopping_plot\")\n", + "# not in the book\n", + "plt.figure(figsize=(9, 3.5))\n", + "plt.subplot(121)\n", + "plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)\n", + "plt.ylabel(\"$y$\", rotation=0)\n", + "plt.subplot(122)\n", + "plot_model(Lasso, polynomial=True, alphas=(0, 1e-2, 1), random_state=42)\n", + "plt.gca().axes.yaxis.set_ticklabels([])\n", + "save_fig(\"lasso_regression_plot\")\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "best_epoch, best_model" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 51, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -985,12 +936,12 @@ "Xr = np.array([[1, 1], [1, -1], [1, 0.5]])\n", "yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]\n", "\n", - "J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)\n", + "J = (1 / len(Xr) * ((T @ Xr.T - yr.T) ** 2).sum(axis=1)).reshape(t1.shape)\n", "\n", "N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)\n", "N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)\n", "\n", - "t_min_idx = np.unravel_index(np.argmin(J), J.shape)\n", + "t_min_idx = np.unravel_index(J.argmin(), J.shape)\n", "t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]\n", "\n", "t_init = np.array([[0.25], [-1]])" @@ -998,66 +949,172 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ - "def bgd_path(theta, X, y, l1, l2, core = 1, eta = 0.05, n_iterations = 200):\n", + "def bgd_path(theta, X, y, l1, l2, core=1, eta=0.05, n_iterations=200):\n", " path = [theta]\n", " for iteration in range(n_iterations):\n", - " gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + l2 * theta\n", + " gradients = (core * 2 / len(X) * X.T @ (X @ 
theta - y)\n", + " + l1 * np.sign(theta) + l2 * theta)\n", " theta = theta - eta * gradients\n", " path.append(theta)\n", " return np.array(path)\n", "\n", "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10.1, 8))\n", - "for i, N, l1, l2, title in ((0, N1, 2., 0, \"Lasso\"), (1, N2, 0, 2., \"Ridge\")):\n", - " JR = J + l1 * N1 + l2 * 0.5 * N2**2\n", - " \n", - " tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)\n", + "\n", + "for i, N, l1, l2, title in ((0, N1, 2.0, 0, \"Lasso\"), (1, N2, 0, 2.0, \"Ridge\")):\n", + " JR = J + l1 * N1 + l2 * 0.5 * N2 ** 2\n", + "\n", + " tr_min_idx = np.unravel_index(JR.argmin(), JR.shape)\n", " t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]\n", "\n", - " levelsJ=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)\n", - " levelsJR=(np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)\n", - " levelsN=np.linspace(0, np.max(N), 10)\n", - " \n", + " levels = np.exp(np.linspace(0, 1, 20)) - 1\n", + " levelsJ = levels * (J.max() - J.min()) + J.min()\n", + " levelsJR = levels * (JR.max() - JR.min()) + JR.min()\n", + " levelsN = np.linspace(0, N.max(), 10)\n", + "\n", " path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)\n", " path_JR = bgd_path(t_init, Xr, yr, l1, l2)\n", - " path_N = bgd_path(np.array([[2.0], [0.5]]), Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)\n", - "\n", + " path_N = bgd_path(theta=np.array([[2.0], [0.5]]), X=Xr, y=yr,\n", + " l1=np.sign(l1) / 3, l2=np.sign(l2), core=0)\n", " ax = axes[i, 0]\n", - " ax.grid(True)\n", - " ax.axhline(y=0, color='k')\n", - " ax.axvline(x=0, color='k')\n", - " ax.contourf(t1, t2, N / 2., levels=levelsN)\n", + " ax.grid()\n", + " ax.axhline(y=0, color=\"k\")\n", + " ax.axvline(x=0, color=\"k\")\n", + " ax.contourf(t1, t2, N / 2.0, levels=levelsN)\n", " ax.plot(path_N[:, 0], path_N[:, 1], \"y--\")\n", " ax.plot(0, 0, \"ys\")\n", " ax.plot(t1_min, t2_min, \"ys\")\n", - " ax.set_title(r\"$\\ell_{}$ penalty\".format(i + 1), fontsize=16)\n", + " ax.set_title(r\"$\\ell_{}$ penalty\".format(i + 1))\n", " ax.axis([t1a, t1b, t2a, t2b])\n", " if i == 1:\n", - " ax.set_xlabel(r\"$\\theta_1$\", fontsize=16)\n", - " ax.set_ylabel(r\"$\\theta_2$\", fontsize=16, rotation=0)\n", + " ax.set_xlabel(r\"$\\theta_1$\")\n", + " ax.set_ylabel(r\"$\\theta_2$\", rotation=0)\n", "\n", " ax = axes[i, 1]\n", - " ax.grid(True)\n", - " ax.axhline(y=0, color='k')\n", - " ax.axvline(x=0, color='k')\n", + " ax.grid()\n", + " ax.axhline(y=0, color=\"k\")\n", + " ax.axvline(x=0, color=\"k\")\n", " ax.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)\n", " ax.plot(path_JR[:, 0], path_JR[:, 1], \"w-o\")\n", " ax.plot(path_N[:, 0], path_N[:, 1], \"y--\")\n", " ax.plot(0, 0, \"ys\")\n", " ax.plot(t1_min, t2_min, \"ys\")\n", " ax.plot(t1r_min, t2r_min, \"rs\")\n", - " ax.set_title(title, fontsize=16)\n", + " ax.set_title(title)\n", " ax.axis([t1a, t1b, t2a, t2b])\n", " if i == 1:\n", - " ax.set_xlabel(r\"$\\theta_1$\", fontsize=16)\n", + " ax.set_xlabel(r\"$\\theta_1$\")\n", "\n", "save_fig(\"lasso_vs_ridge_plot\")\n", "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Elastic Net" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import ElasticNet\n", + "\n", + "elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)\n", + "elastic_net.fit(X, y)\n", + "elastic_net.predict([[1.5]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Early Stopping" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's go back to the quadratic dataset we used earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(42)\n", + "m = 100\n", + "X = 6 * np.random.rand(m, 1) - 3\n", + "y = 0.5 * X ** 2 + X + 2 + np.random.randn(m, 1)\n", + "# X_train, X_val, y_train, y_val = train_test_split(\n", + "# X[:50], y[:50].ravel(), test_size=0.5, random_state=10)\n", + "X_train, y_train = X[:50], y[:50, 0]\n", + "X_valid, y_valid = X[50:], y[50:, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "preprocessing = make_pipeline(PolynomialFeatures(degree=90, include_bias=False),\n", + " StandardScaler())\n", + "X_train_prep = preprocessing.fit_transform(X_train)\n", + "X_valid_prep = preprocessing.transform(X_valid)\n", + "sgd_reg = SGDRegressor(penalty=None, eta0=0.002, random_state=42)\n", + "n_epochs = 500\n", + "best_valid_rmse = float('inf')\n", + "train_errors, val_errors = [], [] # not in the book, it's for the figure below\n", + "\n", + "for epoch in range(n_epochs):\n", + " sgd_reg.partial_fit(X_train_prep, y_train)\n", + " y_valid_predict = sgd_reg.predict(X_valid_prep)\n", + " val_error = mean_squared_error(y_valid, y_valid_predict, squared=False)\n", + " if val_error < best_valid_rmse:\n", + " best_valid_rmse = val_error\n", + " best_model = deepcopy(sgd_reg)\n", + "\n", + " # not in the book, we evaluate the train error and save it for the figure\n", + " y_train_predict = sgd_reg.predict(X_train_prep)\n", + " train_error = mean_squared_error(y_train, y_train_predict, squared=False)\n", + " val_errors.append(val_error)\n", + " train_errors.append(train_error)\n", + "\n", + "# not in the book, this code just creates the figure below\n", + "best_epoch = np.argmin(val_errors)\n", + "plt.figure(figsize=(6, 4))\n", + "plt.annotate('Best model',\n", + " xy=(best_epoch, best_valid_rmse),\n", + " xytext=(best_epoch, best_valid_rmse + 0.5),\n", + " ha=\"center\", fontsize=14,\n", + " arrowprops=dict(facecolor='black', shrink=0.05))\n", + "plt.plot([0, n_epochs], [best_valid_rmse, best_valid_rmse], \"k:\", linewidth=2)\n", + "plt.plot(val_errors, \"b-\", linewidth=3, label=\"Validation set\")\n", + "plt.plot(best_epoch, best_valid_rmse, \"bo\")\n", + "plt.plot(train_errors, \"r--\", linewidth=2, label=\"Training set\")\n", + "plt.legend(loc=\"upper right\")\n", + "plt.xlabel(\"Epoch\")\n", + "plt.ylabel(\"RMSE\")\n", + "plt.axis([0, n_epochs, 0, 3.5])\n", + "plt.grid()\n", + "save_fig(\"early_stopping_plot\")\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1065,6 +1122,40 @@ "# Logistic Regression" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Estimating Probabilities" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "# not in the book\n", + "\n", + "lim = 6\n", + "t = np.linspace(-lim, lim, 100)\n", + "sig = 1 / (1 + np.exp(-t))\n", + "\n", + "plt.figure(figsize=(8, 3))\n", + "plt.plot([-lim, lim], [0, 0], \"k-\")\n", + "plt.plot([-lim, lim], [0.5, 0.5], \"k:\")\n", + "plt.plot([-lim, lim], [1, 1], \"k:\")\n", + "plt.plot([0, 0], [-1.1, 1.1], \"k-\")\n", + "plt.plot(t, sig, \"b-\", linewidth=2, label=r\"$\\sigma(t) = \\dfrac{1}{1 + e^{-t}}$\")\n", + "plt.xlabel(\"t\")\n", + 
"plt.legend(loc=\"upper left\")\n", + "plt.axis([-lim, lim, -0.1, 1.1])\n", + "plt.gca().set_yticks([0, 0.25, 0.5, 0.75, 1])\n", + "plt.grid()\n", + "save_fig(\"logistic_function_plot\")\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1072,25 +1163,36 @@ "## Decision Boundaries" ] }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "\n", + "iris = load_iris(as_frame=True)\n", + "list(iris)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# not in the book\n", + "\n", + "print(iris.DESCR)" + ] + }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ - "t = np.linspace(-10, 10, 100)\n", - "sig = 1 / (1 + np.exp(-t))\n", - "plt.figure(figsize=(9, 3))\n", - "plt.plot([-10, 10], [0, 0], \"k-\")\n", - "plt.plot([-10, 10], [0.5, 0.5], \"k:\")\n", - "plt.plot([-10, 10], [1, 1], \"k:\")\n", - "plt.plot([0, 0], [-1.1, 1.1], \"k-\")\n", - "plt.plot(t, sig, \"b-\", linewidth=2, label=r\"$\\sigma(t) = \\frac{1}{1 + e^{-t}}$\")\n", - "plt.xlabel(\"t\")\n", - "plt.legend(loc=\"upper left\", fontsize=20)\n", - "plt.axis([-10, 10, -0.1, 1.1])\n", - "save_fig(\"logistic_function_plot\")\n", - "plt.show()" + "print(iris.data.head())" ] }, { @@ -1099,9 +1201,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn import datasets\n", - "iris = datasets.load_iris()\n", - "list(iris.keys())" + "iris.target.head() # note that the instances are not shuffled" ] }, { @@ -1110,7 +1210,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(iris.DESCR)" + "iris.target_names" ] }, { @@ -1119,15 +1219,15 @@ "metadata": {}, "outputs": [], "source": [ - "X = iris[\"data\"][:, 3:] # petal width\n", - "y = (iris[\"target\"] == 2).astype(np.int) # 1 if Iris virginica, else 0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: To be future-proof we set `solver=\"lbfgs\"` since this will be the default value in Scikit-Learn 0.22." 
+ "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = iris.data[[\"petal width (cm)\"]].values\n", + "y = iris.target_names[iris.target] == 'virginica'\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", + "\n", + "log_reg = LogisticRegression(random_state=42)\n", + "log_reg.fit(X_train, y_train)" ] }, { @@ -1136,9 +1236,32 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression\n", - "log_reg = LogisticRegression(solver=\"lbfgs\", random_state=42)\n", - "log_reg.fit(X, y)" + "X_new = np.linspace(0, 3, 1000).reshape(-1, 1) # reshape to get a column vector\n", + "y_proba = log_reg.predict_proba(X_new)\n", + "decision_boundary = X_new[y_proba[:, 1] >= 0.5][0, 0]\n", + "\n", + "plt.figure(figsize=(8, 3)) # not in the book\n", + "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2,\n", + " label=\"Not Iris virginica proba\")\n", + "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris virginica proba\")\n", + "plt.plot([decision_boundary, decision_boundary], [0, 1], \"k:\", linewidth=2,\n", + " label=\"Decision boundary\")\n", + "\n", + "# not in the book: beautifies the figure\n", + "plt.arrow(x=decision_boundary, y=0.08, dx=-0.3, dy=0,\n", + " head_width=0.05, head_length=0.1, fc=\"b\", ec=\"b\")\n", + "plt.arrow(x=decision_boundary, y=0.92, dx=0.3, dy=0,\n", + " head_width=0.05, head_length=0.1, fc=\"g\", ec=\"g\")\n", + "plt.plot(X_train[y_train == 0], y_train[y_train == 0], \"bs\")\n", + "plt.plot(X_train[y_train == 1], y_train[y_train == 1], \"g^\")\n", + "plt.xlabel(\"Petal width (cm)\")\n", + "plt.ylabel(\"Probability\")\n", + "plt.legend(loc=\"center left\")\n", + "plt.axis([0, 3, -0.02, 1.02])\n", + "plt.grid()\n", + "save_fig(\"logistic_regression_plot\")\n", + "\n", + "plt.show()" ] }, { @@ -1147,18 +1270,7 @@ "metadata": {}, "outputs": [], "source": [ - "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", - "y_proba = log_reg.predict_proba(X_new)\n", - "\n", - "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris virginica\")\n", - "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris virginica\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The figure in the book actually is actually a bit fancier:" + "decision_boundary" ] }, { @@ -1167,25 +1279,7 @@ "metadata": {}, "outputs": [], "source": [ - "X_new = np.linspace(0, 3, 1000).reshape(-1, 1)\n", - "y_proba = log_reg.predict_proba(X_new)\n", - "decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]\n", - "\n", - "plt.figure(figsize=(8, 3))\n", - "plt.plot(X[y==0], y[y==0], \"bs\")\n", - "plt.plot(X[y==1], y[y==1], \"g^\")\n", - "plt.plot([decision_boundary, decision_boundary], [-1, 2], \"k:\", linewidth=2)\n", - "plt.plot(X_new, y_proba[:, 1], \"g-\", linewidth=2, label=\"Iris virginica\")\n", - "plt.plot(X_new, y_proba[:, 0], \"b--\", linewidth=2, label=\"Not Iris virginica\")\n", - "plt.text(decision_boundary+0.02, 0.15, \"Decision boundary\", fontsize=14, color=\"k\", ha=\"center\")\n", - "plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')\n", - "plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')\n", - "plt.xlabel(\"Petal width (cm)\", fontsize=14)\n", - "plt.ylabel(\"Probability\", fontsize=14)\n", - "plt.legend(loc=\"center left\", fontsize=14)\n", - "plt.axis([0, 3, -0.02, 1.02])\n", - 
"save_fig(\"logistic_regression_plot\")\n", - "plt.show()" + "log_reg.predict([[1.7], [1.5]])" ] }, { @@ -1194,16 +1288,41 @@ "metadata": {}, "outputs": [], "source": [ - "decision_boundary" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [], - "source": [ - "log_reg.predict([[1.7], [1.5]])" + "# not in the book\n", + "\n", + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = iris.target_names[iris.target] == 'virginica'\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", + "\n", + "log_reg = LogisticRegression(C=2, random_state=42)\n", + "log_reg.fit(X_train, y_train)\n", + "\n", + "# for the contour plot\n", + "x0, x1 = np.meshgrid(np.linspace(2.9, 7, 500).reshape(-1, 1),\n", + " np.linspace(0.8, 2.7, 200).reshape(-1, 1))\n", + "X_new = np.c_[x0.ravel(), x1.ravel()] # one instance per point on the figure\n", + "y_proba = log_reg.predict_proba(X_new)\n", + "zz = y_proba[:, 1].reshape(x0.shape)\n", + "\n", + "# for the decision boundary\n", + "left_right = np.array([2.9, 7])\n", + "boundary = -((log_reg.coef_[0, 0] * left_right + log_reg.intercept_[0])\n", + " / log_reg.coef_[0, 1])\n", + "\n", + "plt.figure(figsize=(10, 4))\n", + "plt.plot(X_train[y_train == 0, 0], X_train[y_train == 0, 1], \"bs\")\n", + "plt.plot(X_train[y_train == 1, 0], X_train[y_train == 1, 1], \"g^\")\n", + "contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)\n", + "plt.clabel(contour, inline=1)\n", + "plt.plot(left_right, boundary, \"k--\", linewidth=3)\n", + "plt.text(3.5, 1.27, \"Not Iris virginica\", color=\"b\", ha=\"center\", fontsize=14)\n", + "plt.text(6.5, 2.3, \"Iris virginica\", color=\"g\", ha=\"center\", fontsize=14)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.axis([2.9, 7, 0.8, 2.7])\n", + "plt.grid()\n", + "save_fig(\"logistic_regression_contour_plot\")\n", + "plt.show()" ] }, { @@ -1215,59 +1334,38 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression\n", + "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n", + "y = iris[\"target\"]\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", "\n", - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = (iris[\"target\"] == 2).astype(np.int)\n", - "\n", - "log_reg = LogisticRegression(solver=\"lbfgs\", C=10**10, random_state=42)\n", - "log_reg.fit(X, y)\n", - "\n", - "x0, x1 = np.meshgrid(\n", - " np.linspace(2.9, 7, 500).reshape(-1, 1),\n", - " np.linspace(0.8, 2.7, 200).reshape(-1, 1),\n", - " )\n", - "X_new = np.c_[x0.ravel(), x1.ravel()]\n", - "\n", - "y_proba = log_reg.predict_proba(X_new)\n", - "\n", - "plt.figure(figsize=(10, 4))\n", - "plt.plot(X[y==0, 0], X[y==0, 1], \"bs\")\n", - "plt.plot(X[y==1, 0], X[y==1, 1], \"g^\")\n", - "\n", - "zz = y_proba[:, 1].reshape(x0.shape)\n", - "contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)\n", - "\n", - "\n", - "left_right = np.array([2.9, 7])\n", - "boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]\n", - "\n", - "plt.clabel(contour, inline=1, fontsize=12)\n", - "plt.plot(left_right, boundary, \"k--\", linewidth=3)\n", - "plt.text(3.5, 1.5, \"Not Iris virginica\", fontsize=14, color=\"b\", ha=\"center\")\n", - "plt.text(6.5, 2.3, \"Iris virginica\", fontsize=14, color=\"g\", ha=\"center\")\n", - "plt.xlabel(\"Petal length\", 
fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.axis([2.9, 7, 0.8, 2.7])\n", - "save_fig(\"logistic_regression_contour_plot\")\n", - "plt.show()" + "softmax_reg = LogisticRegression(C=30, random_state=42)\n", + "softmax_reg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "softmax_reg.predict([[5, 2]])" ] }, { "cell_type": "code", "execution_count": 63, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", - "y = iris[\"target\"]\n", - "\n", - "softmax_reg = LogisticRegression(multi_class=\"multinomial\",solver=\"lbfgs\", C=10, random_state=42)\n", - "softmax_reg.fit(X, y)" + "softmax_reg.predict_proba([[5, 2]]).round(2)" ] }, { @@ -1276,12 +1374,13 @@ "metadata": {}, "outputs": [], "source": [ - "x0, x1 = np.meshgrid(\n", - " np.linspace(0, 8, 500).reshape(-1, 1),\n", - " np.linspace(0, 3.5, 200).reshape(-1, 1),\n", - " )\n", - "X_new = np.c_[x0.ravel(), x1.ravel()]\n", + "from matplotlib.colors import ListedColormap\n", "\n", + "custom_cmap = ListedColormap([\"#fafab0\", \"#9898ff\", \"#a0faa0\"])\n", + "\n", + "x0, x1 = np.meshgrid(np.linspace(0, 8, 500).reshape(-1, 1),\n", + " np.linspace(0, 3.5, 200).reshape(-1, 1))\n", + "X_new = np.c_[x0.ravel(), x1.ravel()]\n", "\n", "y_proba = softmax_reg.predict_proba(X_new)\n", "y_predict = softmax_reg.predict(X_new)\n", @@ -1290,42 +1389,22 @@ "zz = y_predict.reshape(x0.shape)\n", "\n", "plt.figure(figsize=(10, 4))\n", - "plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris virginica\")\n", - "plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris versicolor\")\n", - "plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", label=\"Iris setosa\")\n", - "\n", - "from matplotlib.colors import ListedColormap\n", - "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", + "plt.plot(X[y == 2, 0], X[y == 2, 1], \"g^\", label=\"Iris virginica\")\n", + "plt.plot(X[y == 1, 0], X[y == 1, 1], \"bs\", label=\"Iris versicolor\")\n", + "plt.plot(X[y == 0, 0], X[y == 0, 1], \"yo\", label=\"Iris setosa\")\n", "\n", "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n", - "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", - "plt.clabel(contour, inline=1, fontsize=12)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.legend(loc=\"center left\", fontsize=14)\n", - "plt.axis([0, 7, 0, 3.5])\n", + "contour = plt.contour(x0, x1, zz1, cmap=\"hot\")\n", + "plt.clabel(contour, inline=1)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.legend(loc=\"center left\")\n", + "plt.axis([0.5, 7, 0, 3.5])\n", + "plt.grid()\n", "save_fig(\"softmax_regression_contour_plot\")\n", "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "softmax_reg.predict([[5, 2]])" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "softmax_reg.predict_proba([[5, 2]])" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1352,7 +1431,7 @@ "metadata": {}, "source": [ "## 12. Batch Gradient Descent with early stopping for Softmax Regression\n", - "(without using Scikit-Learn)" + "Exercise: _Implement Batch Gradient Descent with early stopping for Softmax Regression without using Scikit-Learn, only NumPy._" ] }, { @@ -1362,60 +1441,44 @@ "Let's start by loading the data. 
We will just reuse the Iris dataset we loaded earlier."
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X = iris.data[[\"petal length (cm)\", \"petal width (cm)\"]].values\n",
+ "y = iris[\"target\"].values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We need to add the bias term for every instance ($x_0 = 1$). The easiest option to do this would be to use Scikit-Learn's `add_dummy_feature()` function, but the point of this exercise is to get a better understanding of the algorithms by implementing them manually. So here is one possible implementation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_with_bias = np.c_[np.ones(len(X)), X]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The easiest option to split the dataset into a training set, a validation set and a test set would be to use Scikit-Learn's `train_test_split()` function, but again, we want to do this manually:"
+ ]
+ },
+ {
 "cell_type": "code",
 "execution_count": 67,
 "metadata": {},
 "outputs": [],
- "source": [
- "X = iris[\"data\"][:, (2, 3)]  # petal length, petal width\n",
- "y = iris[\"target\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We need to add the bias term for every instance ($x_0 = 1$):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 68,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_with_bias = np.c_[np.ones([len(X), 1]), X]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And let's set the random seed so the output of this exercise solution is reproducible:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.random.seed(2042)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The easiest option to split the dataset into a training set, a validation set and a test set would be to use Scikit-Learn's `train_test_split()` function, but the point of this exercise is to try understand the algorithms by implementing them manually. So here is one possible implementation:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [],
 "source": [
 "test_ratio = 0.2\n",
 "validation_ratio = 0.2\n",
@@ -1425,6 +1488,7 @@
 "validation_size = int(total_size * validation_ratio)\n",
 "train_size = total_size - test_size - validation_size\n",
 "\n",
+ "np.random.seed(42)\n",
 "rnd_indices = np.random.permutation(total_size)\n",
 "\n",
 "X_train = X_with_bias[rnd_indices[:train_size]]\n",
@@ -1439,21 +1503,17 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "The targets are currently class indices (0, 1 or 2), but we need target class probabilities to train the Softmax Regression model. Each instance will have target class probabilities equal to 0.0 for all classes except for the target class which will have a probability of 1.0 (in other words, the vector of class probabilities for ay given instance is a one-hot vector). Let's write a small function to convert the vector of class indices into a matrix containing a one-hot vector for each instance:"
+ "The targets are currently class indices (0, 1 or 2), but we need target class probabilities to train the Softmax Regression model. Each instance will have target class probabilities equal to 0.0 for all classes except for the target class which will have a probability of 1.0 (in other words, the vector of class probabilities for any given instance is a one-hot vector). Let's write a small function to convert the vector of class indices into a matrix containing a one-hot vector for each instance. To understand this code, you need to know that `np.diag(np.ones(n))` creates an n×n matrix full of 0s except for 1s on the main diagonal. Moreover, if `a` is a NumPy array, then `a[[1,3,2]]` returns an array with 3 rows equal to `a[1]`, `a[3]` and `a[2]` (this is [advanced NumPy indexing](https://numpy.org/doc/stable/reference/arrays.indexing.html#advanced-indexing))."
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 71,
+ "execution_count": 68,
 "metadata": {},
 "outputs": [],
 "source": [
 "def to_one_hot(y):\n",
- "    n_classes = y.max() + 1\n",
- "    m = len(y)\n",
- "    Y_one_hot = np.zeros((m, n_classes))\n",
- "    Y_one_hot[np.arange(m), y] = 1\n",
- "    return Y_one_hot"
+ "    return np.diag(np.ones(y.max() + 1))[y]"
 ]
 },
@@ -1465,7 +1525,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 72,
+ "execution_count": 69,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1474,7 +1534,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 73,
+ "execution_count": 70,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1490,7 +1550,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 74,
+ "execution_count": 71,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1499,6 +1559,26 @@
 "Y_test_one_hot = to_one_hot(y_test)"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's scale the inputs. We compute the mean and standard deviation of each feature on the training set (except for the bias feature), then we center and scale each feature in the training set, the validation set, and the test set:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mean = X_train[:, 1:].mean(axis=0)\n",
+ "std = X_train[:, 1:].std(axis=0)\n",
+ "X_train[:, 1:] = (X_train[:, 1:] - mean) / std\n",
+ "X_valid[:, 1:] = (X_valid[:, 1:] - mean) / std\n",
+ "X_test[:, 1:] = (X_test[:, 1:] - mean) / std"
+ ]
+ },
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -1510,13 +1590,13 @@
 },
 {
 "cell_type": "code",
- "execution_count": 75,
+ "execution_count": 73,
 "metadata": {},
 "outputs": [],
 "source": [
 "def softmax(logits):\n",
 "    exps = np.exp(logits)\n",
- "    exp_sums = np.sum(exps, axis=1, keepdims=True)\n",
+ "    exp_sums = exps.sum(axis=1, keepdims=True)\n",
 "    return exps / exp_sums"
 ]
 },
@@ -1529,12 +1609,12 @@
 },
 {
 "cell_type": "code",
- "execution_count": 76,
+ "execution_count": 74,
 "metadata": {},
 "outputs": [],
 "source": [
- "n_inputs = X_train.shape[1]  # == 3 (2 features plus the bias term)\n",
- "n_outputs = len(np.unique(y_train))  # == 3 (3 iris classes)"
+ "n_inputs = X_train.shape[1]  # == 3 (2 features plus the bias term)\n",
+ "n_outputs = len(np.unique(y_train))  # == 3 (there are 3 iris classes)"
 ]
 },
@@ -1557,25 +1637,27 @@
 },
 {
 "cell_type": "code",
- "execution_count": 77,
+ "execution_count": 75,
 "metadata": {},
 "outputs": [],
 "source": [
- "eta = 0.01\n",
- "n_iterations = 5001\n",
+ "eta = 0.5\n",
+ "n_epochs = 5001\n",
 "m = len(X_train)\n",
- "epsilon = 1e-7\n",
+ "epsilon = 1e-5\n",
 "\n",
+ "np.random.seed(42)\n",
 "Theta = np.random.randn(n_inputs, n_outputs)\n",
 "\n",
- "for iteration in range(n_iterations):\n",
- "    logits = X_train.dot(Theta)\n",
+ "for epoch in range(n_epochs):\n",
+ "    logits = X_train @ Theta\n",
 "    Y_proba = softmax(logits)\n",
- "    if iteration % 500 == 0:\n",
- "        loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))\n",
- "        print(iteration, loss)\n",
+ "    if epoch % 1000 == 0:\n",
+ "        Y_proba_valid = softmax(X_valid @ Theta)\n",
+ "        xentropy_losses = -(Y_valid_one_hot * np.log(Y_proba_valid + epsilon))\n",
+ "        print(epoch, xentropy_losses.sum(axis=1).mean())\n",
 "    error = Y_proba - Y_train_one_hot\n",
- "    gradients = 1/m * X_train.T.dot(error)\n",
+ "    gradients = 1 / m * X_train.T @ error\n",
 "    Theta = Theta - eta * gradients"
 ]
 },
@@ -1588,7 +1670,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 78,
+ "execution_count": 76,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1604,15 +1686,15 @@
 },
 {
 "cell_type": "code",
- "execution_count": 79,
+ "execution_count": 77,
 "metadata": {},
 "outputs": [],
 "source": [
- "logits = X_valid.dot(Theta)\n",
+ "logits = X_valid @ Theta\n",
 "Y_proba = softmax(logits)\n",
- "y_predict = np.argmax(Y_proba, axis=1)\n",
+ "y_predict = Y_proba.argmax(axis=1)\n",
 "\n",
- "accuracy_score = np.mean(y_predict == y_valid)\n",
+ "accuracy_score = (y_predict == y_valid).mean()\n",
 "accuracy_score"
 ]
 },
@@ -1620,33 +1702,36 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Well, this model looks pretty good. For the sake of the exercise, let's add a bit of $\ell_2$ regularization. The following training code is similar to the one above, but the loss now has an additional $\ell_2$ penalty, and the gradients have the proper additional term (note that we don't regularize the first element of `Theta` since this corresponds to the bias term). Also, let's try increasing the learning rate `eta`."
+ "Well, this model looks pretty ok. For the sake of the exercise, let's add a bit of $\ell_2$ regularization. The following training code is similar to the one above, but the loss now has an additional $\ell_2$ penalty, and the gradients have the proper additional term (note that we don't regularize the first element of `Theta` since this corresponds to the bias term)."
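For clarity (this is not in the book), the cost that the next cell minimizes is the usual softmax cross entropy plus a ridge-style penalty, where $\hat{p}_k^{(i)}$ is the estimated probability that instance $i$ belongs to class $k$, $y_k^{(i)}$ is 1 if the target class of instance $i$ is $k$ and 0 otherwise, and the sum over $j$ starts at 1 so the bias row of $\mathbf{\Theta}$ is not penalized:

$$J(\mathbf{\Theta}) = -\dfrac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K} y_k^{(i)} \log\left(\hat{p}_k^{(i)}\right) + \alpha \dfrac{1}{2}\sum_{k=1}^{K}\sum_{j \geq 1} \left(\theta_{j,k}\right)^2$$

Its gradient with respect to the parameter vector $\boldsymbol{\theta}^{(k)}$ of class $k$ is

$$\nabla_{\boldsymbol{\theta}^{(k)}} J(\mathbf{\Theta}) = \dfrac{1}{m}\sum_{i=1}^{m}\left(\hat{p}_k^{(i)} - y_k^{(i)}\right)\mathbf{x}^{(i)} + \alpha\begin{bmatrix}0\\ \boldsymbol{\theta}_{1:}^{(k)}\end{bmatrix}$$

which is what the code computes with `gradients = 1 / m * X_train.T @ error` followed by the `np.r_[...]` term. Note that the cell prints the loss on the validation set while the gradients are computed on the training set, and the small `epsilon` is only added inside the log when printing the loss, to avoid taking the log of zero.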
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 80,
+ "execution_count": 78,
 "metadata": {},
 "outputs": [],
 "source": [
- "eta = 0.1\n",
- "n_iterations = 5001\n",
+ "eta = 0.5\n",
+ "n_epochs = 5001\n",
 "m = len(X_train)\n",
- "epsilon = 1e-7\n",
- "alpha = 0.1  # regularization hyperparameter\n",
+ "epsilon = 1e-5\n",
+ "alpha = 0.01  # regularization hyperparameter\n",
 "\n",
+ "np.random.seed(42)\n",
 "Theta = np.random.randn(n_inputs, n_outputs)\n",
 "\n",
- "for iteration in range(n_iterations):\n",
- "    logits = X_train.dot(Theta)\n",
+ "for epoch in range(n_epochs):\n",
+ "    logits = X_train @ Theta\n",
 "    Y_proba = softmax(logits)\n",
- "    if iteration % 500 == 0:\n",
- "        xentropy_loss = -np.mean(np.sum(Y_train_one_hot * np.log(Y_proba + epsilon), axis=1))\n",
- "        l2_loss = 1/2 * np.sum(np.square(Theta[1:]))\n",
- "        loss = xentropy_loss + alpha * l2_loss\n",
- "        print(iteration, loss)\n",
+ "    if epoch % 1000 == 0:\n",
+ "        Y_proba_valid = softmax(X_valid @ Theta)\n",
+ "        xentropy_losses = -(Y_valid_one_hot * np.log(Y_proba_valid + epsilon))\n",
+ "        l2_loss = 1 / 2 * (Theta[1:] ** 2).sum()\n",
+ "        total_loss = xentropy_losses.sum(axis=1).mean() + alpha * l2_loss\n",
+ "        print(epoch, total_loss.round(4))\n",
 "    error = Y_proba - Y_train_one_hot\n",
- "    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n",
+ "    gradients = 1 / m * X_train.T @ error\n",
+ "    gradients += np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n",
 "    Theta = Theta - eta * gradients"
 ]
 },
@@ -1659,15 +1744,15 @@
 },
 {
 "cell_type": "code",
- "execution_count": 81,
+ "execution_count": 79,
 "metadata": {},
 "outputs": [],
 "source": [
- "logits = X_valid.dot(Theta)\n",
+ "logits = X_valid @ Theta\n",
 "Y_proba = softmax(logits)\n",
- "y_predict = np.argmax(Y_proba, axis=1)\n",
+ "y_predict = Y_proba.argmax(axis=1)\n",
 "\n",
- "accuracy_score = np.mean(y_predict == y_valid)\n",
+ "accuracy_score = (y_predict == y_valid).mean()\n",
 "accuracy_score"
 ]
 },
@@ -1675,7 +1760,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Cool, perfect accuracy! We probably just got lucky with this validation set, but still, it's pleasant."
+ "In this case, the $\ell_2$ penalty did not change the validation accuracy. Perhaps try fine-tuning `alpha`?"
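One possible way to act on that suggestion (not in the book): re-train the model for a handful of `alpha` values and keep the one with the best validation accuracy. This is only a sketch; it reuses `X_train`, `Y_train_one_hot`, `X_valid`, `y_valid`, `softmax()`, `n_inputs` and `n_outputs` from the cells above, and the helper name `train_softmax()` as well as the candidate values are arbitrary:

# Sketch (not in the book): quick manual search over alpha on the validation set.
def train_softmax(alpha, eta=0.5, n_epochs=5001, seed=42):
    np.random.seed(seed)
    Theta = np.random.randn(n_inputs, n_outputs)
    m = len(X_train)
    for epoch in range(n_epochs):
        Y_proba = softmax(X_train @ Theta)
        error = Y_proba - Y_train_one_hot
        gradients = 1 / m * X_train.T @ error
        gradients += np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]
        Theta = Theta - eta * gradients
    return Theta

for alpha in (0.001, 0.01, 0.1, 1.0):
    Theta_alpha = train_softmax(alpha)
    y_valid_pred = softmax(X_valid @ Theta_alpha).argmax(axis=1)
    print(alpha, (y_valid_pred == y_valid).mean())

Larger values of `alpha` constrain the model more, so on such a tiny validation set the accuracy may simply plateau.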
 ]
 },
 {
@@ -1687,52 +1772,52 @@
 },
 {
 "cell_type": "code",
- "execution_count": 82,
+ "execution_count": 80,
 "metadata": {},
 "outputs": [],
 "source": [
- "eta = 0.1 \n",
- "n_iterations = 5001\n",
+ "eta = 0.5\n",
+ "n_epochs = 50_001\n",
 "m = len(X_train)\n",
- "epsilon = 1e-7\n",
- "alpha = 0.1  # regularization hyperparameter\n",
+ "epsilon = 1e-5\n",
+ "C = 100  # regularization hyperparameter\n",
 "best_loss = np.infty\n",
 "\n",
+ "np.random.seed(42)\n",
 "Theta = np.random.randn(n_inputs, n_outputs)\n",
 "\n",
- "for iteration in range(n_iterations):\n",
- "    logits = X_train.dot(Theta)\n",
+ "for epoch in range(n_epochs):\n",
+ "    logits = X_train @ Theta\n",
 "    Y_proba = softmax(logits)\n",
- "    error = Y_proba - Y_train_one_hot\n",
- "    gradients = 1/m * X_train.T.dot(error) + np.r_[np.zeros([1, n_outputs]), alpha * Theta[1:]]\n",
- "    Theta = Theta - eta * gradients\n",
- "\n",
- "    logits = X_valid.dot(Theta)\n",
- "    Y_proba = softmax(logits)\n",
- "    xentropy_loss = -np.mean(np.sum(Y_valid_one_hot * np.log(Y_proba + epsilon), axis=1))\n",
- "    l2_loss = 1/2 * np.sum(np.square(Theta[1:]))\n",
- "    loss = xentropy_loss + alpha * l2_loss\n",
- "    if iteration % 500 == 0:\n",
- "        print(iteration, loss)\n",
- "    if loss < best_loss:\n",
- "        best_loss = loss\n",
+ "    Y_proba_valid = softmax(X_valid @ Theta)\n",
+ "    xentropy_losses = -(Y_valid_one_hot * np.log(Y_proba_valid + epsilon))\n",
+ "    l2_loss = 1 / 2 * (Theta[1:] ** 2).sum()\n",
+ "    total_loss = xentropy_losses.sum(axis=1).mean() + 1 / C * l2_loss\n",
+ "    if epoch % 1000 == 0:\n",
+ "        print(epoch, total_loss.round(4))\n",
+ "    if total_loss < best_loss:\n",
+ "        best_loss = total_loss\n",
 "    else:\n",
- "        print(iteration - 1, best_loss)\n",
- "        print(iteration, loss, \"early stopping!\")\n",
- "        break"
+ "        print(epoch - 1, best_loss.round(4))\n",
+ "        print(epoch, total_loss.round(4), \"early stopping!\")\n",
+ "        break\n",
+ "    error = Y_proba - Y_train_one_hot\n",
+ "    gradients = 1 / m * X_train.T @ error\n",
+ "    gradients += np.r_[np.zeros([1, n_outputs]), 1 / C * Theta[1:]]\n",
+ "    Theta = Theta - eta * gradients"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 83,
+ "execution_count": 81,
 "metadata": {},
 "outputs": [],
 "source": [
- "logits = X_valid.dot(Theta)\n",
+ "logits = X_valid @ Theta\n",
 "Y_proba = softmax(logits)\n",
- "y_predict = np.argmax(Y_proba, axis=1)\n",
+ "y_predict = Y_proba.argmax(axis=1)\n",
 "\n",
- "accuracy_score = np.mean(y_predict == y_valid)\n",
+ "accuracy_score = (y_predict == y_valid).mean()\n",
 "accuracy_score"
 ]
 },
@@ -1740,51 +1825,50 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
- "Still perfect, but faster."
+ "Oh well, still no change in validation accuracy, but at least early stopping shortened training a bit."
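A common refinement of the early-stopping loop above (not in the book) is to keep the best parameters seen so far and only stop after the validation loss has failed to improve for a given number of epochs (the "patience"), rather than stopping at the first uptick. Here is a rough sketch under that assumption; it reuses `X_train`, `Y_train_one_hot`, `X_valid`, `Y_valid_one_hot`, `softmax()`, `n_inputs`, `n_outputs`, `eta`, `m` and `epsilon` from the cells above, trains a separate `Theta_es` so the model trained above is left untouched, drops the $\ell_2$ term for simplicity, and uses an arbitrary `patience` value:

# Sketch (not in the book): patience-based early stopping that restores the
# best parameters found on the validation set.
patience = 50
best_loss = np.infty
best_Theta = None
epochs_without_improvement = 0

np.random.seed(42)
Theta_es = np.random.randn(n_inputs, n_outputs)

for epoch in range(50_001):
    Y_proba_valid = softmax(X_valid @ Theta_es)
    valid_loss = -(Y_valid_one_hot * np.log(Y_proba_valid + epsilon)).sum(axis=1).mean()
    if valid_loss < best_loss:
        best_loss = valid_loss
        best_Theta = Theta_es.copy()
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= patience:
            break
    error = softmax(X_train @ Theta_es) - Y_train_one_hot
    Theta_es = Theta_es - eta * (1 / m * X_train.T @ error)

if best_Theta is not None:
    Theta_es = best_Theta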
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's plot the model's predictions on the whole dataset:" + "Now let's plot the model's predictions on the whole dataset (remember to scale all features fed to the model):" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "x0, x1 = np.meshgrid(\n", - " np.linspace(0, 8, 500).reshape(-1, 1),\n", - " np.linspace(0, 3.5, 200).reshape(-1, 1),\n", - " )\n", - "X_new = np.c_[x0.ravel(), x1.ravel()]\n", - "X_new_with_bias = np.c_[np.ones([len(X_new), 1]), X_new]\n", + "custom_cmap = mpl.colors.ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", "\n", - "logits = X_new_with_bias.dot(Theta)\n", + "x0, x1 = np.meshgrid(np.linspace(0, 8, 500).reshape(-1, 1),\n", + " np.linspace(0, 3.5, 200).reshape(-1, 1))\n", + "X_new = np.c_[x0.ravel(), x1.ravel()]\n", + "X_new = (X_new - mean) / std\n", + "X_new_with_bias = np.c_[np.ones(len(X_new)), X_new]\n", + "\n", + "logits = X_new_with_bias @ Theta\n", "Y_proba = softmax(logits)\n", - "y_predict = np.argmax(Y_proba, axis=1)\n", + "y_predict = Y_proba.argmax(axis=1)\n", "\n", "zz1 = Y_proba[:, 1].reshape(x0.shape)\n", "zz = y_predict.reshape(x0.shape)\n", "\n", "plt.figure(figsize=(10, 4))\n", - "plt.plot(X[y==2, 0], X[y==2, 1], \"g^\", label=\"Iris virginica\")\n", - "plt.plot(X[y==1, 0], X[y==1, 1], \"bs\", label=\"Iris versicolor\")\n", - "plt.plot(X[y==0, 0], X[y==0, 1], \"yo\", label=\"Iris setosa\")\n", - "\n", - "from matplotlib.colors import ListedColormap\n", - "custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n", + "plt.plot(X[y == 2, 0], X[y == 2, 1], \"g^\", label=\"Iris virginica\")\n", + "plt.plot(X[y == 1, 0], X[y == 1, 1], \"bs\", label=\"Iris versicolor\")\n", + "plt.plot(X[y == 0, 0], X[y == 0, 1], \"yo\", label=\"Iris setosa\")\n", "\n", "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n", - "contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)\n", - "plt.clabel(contour, inline=1, fontsize=12)\n", - "plt.xlabel(\"Petal length\", fontsize=14)\n", - "plt.ylabel(\"Petal width\", fontsize=14)\n", - "plt.legend(loc=\"upper left\", fontsize=14)\n", + "contour = plt.contour(x0, x1, zz1, cmap=\"hot\")\n", + "plt.clabel(contour, inline=1)\n", + "plt.xlabel(\"Petal length\")\n", + "plt.ylabel(\"Petal width\")\n", + "plt.legend(loc=\"upper left\")\n", "plt.axis([0, 7, 0, 3.5])\n", + "plt.grid()\n", "plt.show()" ] }, @@ -1797,15 +1881,15 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ - "logits = X_test.dot(Theta)\n", + "logits = X_test @ Theta\n", "Y_proba = softmax(logits)\n", - "y_predict = np.argmax(Y_proba, axis=1)\n", + "y_predict = Y_proba.argmax(axis=1)\n", "\n", - "accuracy_score = np.mean(y_predict == y_test)\n", + "accuracy_score = (y_predict == y_test).mean()\n", "accuracy_score" ] }, @@ -1813,7 +1897,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our perfect model turns out to have slight imperfections. This variability is likely due to the very small size of the dataset: depending on how you sample the training set, validation set and the test set, you can get quite different results. Try changing the random seed and running the code again a few times, you will see that the results will vary." + "Well we get even better performance on the test set. 
This variability is likely due to the very small size of the dataset: depending on how you sample the training set, the validation set and the test set, you can get quite different results. Try changing the random seed and running the code again a few times; you will see that the results vary."
 ]
 },
 {
@@ -1826,7 +1910,7 @@
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
 "language": "python",
 "name": "python3"
 },
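To make the "try changing the random seed" suggestion concrete (not in the book), here is a rough sketch that repeats the split/train/evaluate cycle for several seeds, letting Scikit-Learn do the work this time for brevity. It assumes `X` (petal length and width) and `y` are still defined as in the exercise above; the seed range and the `C` value are arbitrary:

# Sketch (not in the book): estimate how much the test accuracy varies with
# the random split on such a tiny dataset.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scores = []
for seed in range(10):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                               random_state=seed)
    model = make_pipeline(StandardScaler(),
                          LogisticRegression(C=100, random_state=seed))
    model.fit(X_tr, y_tr)
    scores.append(model.score(X_te, y_te))

print(np.mean(scores).round(3), np.std(scores).round(3))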