From b11721e1e542d02ef6eebd105a290f1933b993c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 26 Jun 2017 00:09:23 +0200 Subject: [PATCH 1/6] Add exercise solutions for chapter 08 --- 08_dimensionality_reduction.ipynb | 557 ++++++++++++++++++++++++++++-- 1 file changed, 533 insertions(+), 24 deletions(-) diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index e26025e..a82cb6c 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -77,7 +77,9 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "np.random.seed(4)\n", @@ -116,7 +118,9 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "m, n = X.shape\n", @@ -137,7 +141,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "W2 = V.T[:, :2]\n", @@ -172,7 +178,9 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", @@ -277,7 +285,9 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X3D_inv_using_svd = X2D_using_svd.dot(V[:2, :])" @@ -440,7 +450,9 @@ { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]\n", @@ -786,7 +798,9 @@ { "cell_type": "code", "execution_count": 33, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pca = PCA()\n", @@ -807,7 +821,9 @@ { "cell_type": "code", "execution_count": 35, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pca = PCA(n_components=0.95)\n", @@ -835,7 +851,9 @@ { "cell_type": "code", "execution_count": 38, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "pca = PCA(n_components = 154)\n", @@ -1004,7 +1022,9 @@ { "cell_type": "code", "execution_count": 48, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "filename = \"my_mnist.data\"\n", @@ -1024,7 +1044,9 @@ { "cell_type": "code", "execution_count": 49, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "del X_mm" @@ -1053,7 +1075,9 @@ { "cell_type": "code", "execution_count": 51, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n", @@ -1313,7 +1337,9 @@ { "cell_type": "code", "execution_count": 62, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433,\n", @@ -1354,7 +1380,9 @@ { "cell_type": "code", "execution_count": 65, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.manifold import LocallyLinearEmbedding\n", @@ -1390,7 +1418,9 @@ { "cell_type": "code", "execution_count": 67, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.manifold import MDS\n", @@ -1402,7 +1432,9 @@ { "cell_type": "code", "execution_count": 68, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ 
"from sklearn.manifold import Isomap\n", @@ -1741,7 +1773,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Nice! Reducing dimensionality led to a ×4 speedup. :) Let's the model's accuracy:" + "Nice! Reducing dimensionality led to a 4× speedup. :) Let's the model's accuracy:" ] }, { @@ -1758,7 +1790,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A very slight drop in performance, which might be a reasonable price to pay for a ×4 speedup, depending on the application." + "A very slight drop in performance, which might be a reasonable price to pay for a 4× speedup, depending on the application." ] }, { @@ -1779,7 +1811,484 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Coming soon." + "*Exercise: Use t-SNE to reduce the MNIST dataset down to two dimensions and plot the result using Matplotlib. You can use a scatterplot using 10 different colors to represent each image's target class.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start by loading the MNIST dataset (again):" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_mldata\n", + "\n", + "mnist = fetch_mldata('MNIST original')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dimensionality reduction on the full 60,000 images takes a very long time, so let's only do this on a random subset of 10,000 images:" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.seed(42)\n", + "\n", + "m = 10000\n", + "idx = np.random.permutation(60000)[:m]\n", + "\n", + "X = mnist['data'][idx]\n", + "y = mnist['target'][idx]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's use t-SNE to reduce dimensionality down to 2D so we can plot the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.manifold import TSNE\n", + "\n", + "tsne = TSNE(n_components=2, random_state=42)\n", + "X_reduced = tsne.fit_transform(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's use Matplotlib's `scatter()` function to plot a scatterplot, using a different color for each digit:" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(13,10))\n", + "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=\"jet\")\n", + "plt.axis('off')\n", + "plt.colorbar()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Isn't this just beautiful? :) This plot tells us which numbers are easily distinguishable from the others (e.g., 0s, 6s, and most 8s are rather well separated clusters), and it also tells us which numbers are often hard to distinguish (e.g., 4s and 9s, 5s and 3s, and so on)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's focus on digits 3 and 5, which seem to overlap a lot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(9,9))\n", + "cmap = matplotlib.cm.get_cmap(\"jet\")\n", + "for digit in (2, 3, 5):\n", + " plt.scatter(X_reduced[y == digit, 0], X_reduced[y == digit, 1], c=cmap(digit / 9))\n", + "plt.axis('off')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see if we can produce a nicer image by running t-SNE on these 3 digits:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "idx = (y == 2) | (y == 3) | (y == 5) \n", + "X_subset = X[idx]\n", + "y_subset = y[idx]\n", + "\n", + "tsne_subset = TSNE(n_components=2, random_state=42)\n", + "X_subset_reduced = tsne_subset.fit_transform(X_subset)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(9,9))\n", + "for digit in (2, 3, 5):\n", + " plt.scatter(X_subset_reduced[y_subset == digit, 0], X_subset_reduced[y_subset == digit, 1], c=cmap(digit / 9))\n", + "plt.axis('off')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Much better, now the clusters have far less overlap. But some 3s are all over the place. Plus, there are two distinct clusters of 2s, and also two distinct clusters of 5s. It would be nice if we could visualize a few digits from each cluster, to understand why this is the case. Let's do that now. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "*Exercise: Alternatively, you can write colored digits at the location of each instance, or even plot scaled-down versions of the digit images themselves (if you plot all digits, the visualization will be too cluttered, so you should either draw a random sample or plot an instance only if no other instance has already been plotted at a close distance). You should get a nice visualization with well-separated clusters of digits.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a `plot_digits()` function that will draw a scatterplot (similar to the above scatterplots) plus write colored digits, with a minimum distance guaranteed between these digits. If the digit images are provided, they are plotted instead. This implementation was inspired from one of Scikit-Learn's excellent examples ([plot_lle_digits](http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html), based on a different digit dataset)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler\n", + "from matplotlib.offsetbox import AnnotationBbox, OffsetImage\n", + "\n", + "def plot_digits(X, y, min_distance=0.05, images=None, figsize=(13, 10)):\n", + " # Let's scale the input features so that they range from 0 to 1\n", + " X_normalized = MinMaxScaler().fit_transform(X)\n", + " # Now we create the list of coordinates of the digits plotted so far.\n", + " # We pretend that one is already plotted far away at the start, to\n", + " # avoid `if` statements in the loop below\n", + " neighbors = np.array([[10., 10.]])\n", + " # The rest should be self-explanatory\n", + " plt.figure(figsize=figsize)\n", + " cmap = matplotlib.cm.get_cmap(\"jet\")\n", + " digits = np.unique(y)\n", + " for digit in digits:\n", + " plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1], c=cmap(digit / 9))\n", + " plt.axis(\"off\")\n", + " ax = plt.gcf().gca() # get current axes in current figure\n", + " for index, image_coord in enumerate(X_normalized):\n", + " closest_distance = np.linalg.norm(np.array(neighbors) - image_coord, axis=1).min()\n", + " if closest_distance > min_distance:\n", + " neighbors = np.r_[neighbors, [image_coord]]\n", + " if images is None:\n", + " plt.text(image_coord[0], image_coord[1], str(int(y[index])),\n", + " color=cmap(y[index] / 9), fontdict={\"weight\": \"bold\", \"size\": 16})\n", + " else:\n", + " image = images[index].reshape(28, 28)\n", + " imagebox = AnnotationBbox(OffsetImage(image, cmap=\"binary\"), image_coord)\n", + " ax.add_artist(imagebox)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try it! First let's just write colored digits:" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "plot_digits(X_reduced, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Well that's okay, but not that beautiful. Let's try with the digit images:" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "plot_digits(X_reduced, y, images=X, figsize=(35, 25))" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "plot_digits(X_subset_reduced, y_subset, images=X_subset, figsize=(22, 22))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Exercise: Try using other dimensionality reduction algorithms such as PCA, LLE, or MDS and compare the resulting visualizations.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start with PCA. We will also time how long it takes:" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "import time\n", + "\n", + "t0 = time.time()\n", + "X_pca_reduced = PCA(n_components=2, random_state=42).fit_transform(X)\n", + "t1 = time.time()\n", + "print(\"PCA took {:.1f}s.\".format(t1 - t0))\n", + "plot_digits(X_pca_reduced, y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wow, PCA is blazingly fast! But although we do see a few clusters, there's way too much overlap. 
Let's try LLE:" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.manifold import LocallyLinearEmbedding\n", + "\n", + "t0 = time.time()\n", + "X_lle_reduced = LocallyLinearEmbedding(n_components=2, random_state=42).fit_transform(X)\n", + "t1 = time.time()\n", + "print(\"LLE took {:.1f}s.\".format(t1 - t0))\n", + "plot_digits(X_lle_reduced, y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That took a while, and the result does not look too good. Let's see what happens if we apply PCA first, preserving 95% of the variance:" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "pca_lle = Pipeline([\n", + " (\"pca\", PCA(n_components=0.95, random_state=42)),\n", + " (\"lle\", LocallyLinearEmbedding(n_components=2, random_state=42)),\n", + "])\n", + "t0 = time.time()\n", + "X_pca_lle_reduced = pca_lle.fit_transform(X)\n", + "t1 = time.time()\n", + "print(\"PCA+LLE took {:.1f}s.\".format(t1 - t0))\n", + "plot_digits(X_pca_lle_reduced, y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result is more or less the same, but this time it was almost 4× faster." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try MDS. It's much too long if we run it on 10,000 instances, so let's just try 2,000 for now:" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.manifold import MDS\n", + "\n", + "m = 2000\n", + "t0 = time.time()\n", + "X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X[:m])\n", + "t1 = time.time()\n", + "print(\"MDS took {:.1f}s (on just 2,000 MNIST images instead of 10,000).\".format(t1 - t0))\n", + "plot_digits(X_mds_reduced, y[:m])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Meh. This does not look great, all clusters overlap too much. Let's try with PCA first, perhaps it will be faster?" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "\n", + "pca_mds = Pipeline([\n", + " (\"pca\", PCA(n_components=0.95, random_state=42)),\n", + " (\"mds\", MDS(n_components=2, random_state=42)),\n", + "])\n", + "t0 = time.time()\n", + "X_pca_mds_reduced = pca_mds.fit_transform(X[:2000])\n", + "t1 = time.time()\n", + "print(\"PCA+MDS took {:.1f}s (on 2,000 MNIST images).\".format(t1 - t0))\n", + "plot_digits(X_pca_mds_reduced, y[:2000])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Same result, and no speedup: PCA did not help (or hurt)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try LDA:" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", + "\n", + "t0 = time.time()\n", + "X_lda_reduced = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)\n", + "t1 = time.time()\n", + "print(\"LDA took {:.1f}s.\".format(t1 - t0))\n", + "plot_digits(X_lda_reduced, y, figsize=(12,12))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This one is very fast, and it looks nice at first, until you realize that several clusters overlap severely." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Well, it's pretty clear that t-SNE won this little competition, wouldn't you agree? We did not time it, so let's do that now:" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.manifold import TSNE\n", + "\n", + "t0 = time.time()\n", + "X_tsne_reduced = TSNE(n_components=2, random_state=42).fit_transform(X)\n", + "t1 = time.time()\n", + "print(\"t-SNE took {:.1f}s.\".format(t1 - t0))\n", + "plot_digits(X_tsne_reduced, y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's twice slower than LLE, but still much faster than MDS, and the result looks great. Let's see if a bit of PCA can speed it up:" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "pca_tsne = Pipeline([\n", + " (\"pca\", PCA(n_components=0.95, random_state=42)),\n", + " (\"tsne\", TSNE(n_components=2, random_state=42)),\n", + "])\n", + "t0 = time.time()\n", + "X_pca_tsne_reduced = pca_tsne.fit_transform(X)\n", + "t1 = time.time()\n", + "print(\"PCA+t-SNE took {:.1f}s.\".format(t1 - t0))\n", + "plot_digits(X_pca_tsne_reduced, y)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yes, PCA roughly gave us a 25% speedup, without damaging the result. We have a winner!" ] }, { @@ -1794,21 +2303,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.3" + "pygments_lexer": "ipython2", + "version": "2.7.12" }, "nav_menu": { "height": "352px", From 10670c60052034ddaf25d65f30497480a6dcb0c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 26 Jun 2017 12:14:57 +0200 Subject: [PATCH 2/6] Add list of equations in the book --- book_equations.ipynb | 1388 ++++++++++++++++++++++++++++++++++++++++++ index.ipynb | 30 +- 2 files changed, 1401 insertions(+), 17 deletions(-) create mode 100644 book_equations.ipynb diff --git a/book_equations.ipynb b/book_equations.ipynb new file mode 100644 index 0000000..7b1c7d9 --- /dev/null +++ b/book_equations.ipynb @@ -0,0 +1,1388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Equations**\n", + "\n", + "*This notebook lists all the equations in the book. If you decide to print them on a T-Shirt, I definitely want a copy! 
;-)*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 1\n", + "**Equation 1-1: A simple linear model**\n", + "\n", + "$\n", + "\\text{life_satisfaction} = \\theta_0 + \\theta_1 \\times \\text{GDP_per_capita}\n", + "$\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 2\n", + "**Equation 2-1: Root Mean Square Error (RMSE)**\n", + "\n", + "$\n", + "\\text{RMSE}(\\mathbf{X}, h) = \\sqrt{\\frac{1}{m}\\sum\\limits_{i=1}^{m}\\left(h(\\mathbf{x}^{(i)}) - y^{(i)}\\right)^2}\n", + "$\n", + "\n", + "\n", + "**Notations (page 38):**\n", + "\n", + "$\n", + " \\mathbf{x}^{(1)} = \\begin{pmatrix}\n", + " -118.29 \\\\\n", + " 33.91 \\\\\n", + " 1,416 \\\\\n", + " 38,372\n", + " \\end{pmatrix}\n", + "$\n", + "\n", + "\n", + "$\n", + " y^{(1)}=156,400\n", + "$\n", + "\n", + "\n", + "$\n", + " \\mathbf{X} = \\begin{pmatrix}\n", + " (\\mathbf{x}^{(1)})^T \\\\\n", + " (\\mathbf{x}^{(2)})^T\\\\\n", + " \\vdots \\\\\n", + " (\\mathbf{x}^{(1999)})^T \\\\\n", + " (\\mathbf{x}^{(2000)})^T\n", + " \\end{pmatrix} = \\begin{pmatrix}\n", + " -118.29 & 33.91 & 1,416 & 38,372 \\\\\n", + " \\vdots & \\vdots & \\vdots & \\vdots \\\\\n", + " \\end{pmatrix}\n", + "$\n", + "\n", + "\n", + "**Equation 2-2: Mean Absolute Error**\n", + "\n", + "$\n", + "\\text{MAE}(\\mathbf{X}, h) = \\frac{1}{m}\\sum\\limits_{i=1}^{m}\\left| h(\\mathbf{x}^{(i)}) - y^{(i)} \\right|\n", + "$\n", + "\n", + "**$\\ell_k$ norms (page 39):**\n", + "\n", + "$ \\left\\| \\mathbf{v} \\right\\| _k = (\\left| v_0 \\right|^k + \\left| v_1 \\right|^k + \\dots + \\left| v_n \\right|^k)^{\\frac{1}{k}} $\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 3\n", + "**Equation 3-1: Precision**\n", + "\n", + "$\n", + "\\text{precision} = \\cfrac{TP}{TP + FP}\n", + "$\n", + "\n", + "\n", + "**Equation 3-2: Recall**\n", + "\n", + "$\n", + "\\text{recall} = \\cfrac{TP}{TP + FN}\n", + "$\n", + "\n", + "\n", + "**Equation 3-3: $F_1$ score**\n", + "\n", + "$\n", + "F_1 = \\cfrac{2}{\\cfrac{1}{\\text{precision}} + \\cfrac{1}{\\text{recall}}} = 2 \\times \\cfrac{\\text{precision}\\, \\times \\, \\text{recall}}{\\text{precision}\\, + \\, \\text{recall}} = \\cfrac{TP}{TP + \\cfrac{FN + FP}{2}}\n", + "$\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 4\n", + "**Equation 4-1: Linear Regression model prediction**\n", + "\n", + "$\n", + "\\hat{y} = \\theta_0 + \\theta_1 x_1 + \\theta_2 x_2 + \\dots + \\theta_n x_n\n", + "$\n", + "\n", + "\n", + "**Equation 4-2: Linear Regression model prediction (vectorized form)**\n", + "\n", + "$\n", + "\\hat{y} = h_{\\mathbf{\\theta}}(\\mathbf{x}) = \\mathbf{\\theta}^T \\cdot \\mathbf{x}\n", + "$\n", + "\n", + "\n", + "**Equation 4-3: MSE cost function for a Linear Regression model**\n", + "\n", + "$\n", + "\\text{MSE}(\\mathbf{X}, h_{\\mathbf{\\theta}}) = \\dfrac{1}{m} \\sum\\limits_{i=1}^{m}{(\\mathbf{\\theta}^T \\cdot \\mathbf{x}^{(i)} - y^{(i)})^2}\n", + "$\n", + "\n", + "\n", + "**Equation 4-4: Normal Equation**\n", + "\n", + "$\n", + "\\hat{\\mathbf{\\theta}} = (\\mathbf{X}^T \\cdot \\mathbf{X})^{-1} \\cdot \\mathbf{X}^T \\cdot \\mathbf{y}\n", + "$\n", + "\n", + "\n", + "** Partial derivatives notation (page 114):**\n", + "\n", + "$\\frac{\\partial}{\\partial \\theta_j} \\text{MSE}(\\mathbf{\\theta})$\n", + "\n", + "\n", + "**Equation 4-5: Partial derivatives of the cost function**\n", + "\n", + "$\n", + "\\dfrac{\\partial}{\\partial \\theta_j} 
\\text{MSE}(\\mathbf{\\theta}) = \\dfrac{2}{m}\\sum\\limits_{i=1}^{m}(\\mathbf{\\theta}^T \\cdot \\mathbf{x}^{(i)} - y^{(i)})\\, x_j^{(i)}\n", + "$\n", + "\n", + "\n", + "**Equation 4-6: Gradient vector of the cost function**\n", + "\n", + "$\n", + "\\nabla_{\\mathbf{\\theta}}\\, \\text{MSE}(\\mathbf{\\theta}) =\n", + "\\begin{pmatrix}\n", + " \\frac{\\partial}{\\partial \\theta_0} \\text{MSE}(\\mathbf{\\theta}) \\\\\n", + " \\frac{\\partial}{\\partial \\theta_1} \\text{MSE}(\\mathbf{\\theta}) \\\\\n", + " \\vdots \\\\\n", + " \\frac{\\partial}{\\partial \\theta_n} \\text{MSE}(\\mathbf{\\theta})\n", + "\\end{pmatrix}\n", + " = \\dfrac{2}{m} \\mathbf{X}^T \\cdot (\\mathbf{X} \\cdot \\mathbf{\\theta} - \\mathbf{y})\n", + "$\n", + "\n", + "\n", + "**Equation 4-7: Gradient Descent step**\n", + "\n", + "$\n", + "\\mathbf{\\theta}^{(\\text{next step})} = \\mathbf{\\theta} - \\eta \\nabla_{\\mathbf{\\theta}}\\, \\text{MSE}(\\mathbf{\\theta})\n", + "$\n", + "\n", + "\n", + "$ O(\\frac{1}{\\text{iterations}}) $\n", + "\n", + "\n", + "$ \\hat{y} = 0.56 x_1^2 + 0.93 x_1 + 1.78 $\n", + "\n", + "\n", + "$ y = 0.5 x_1^2 + 1.0 x_1 + 2.0 + \\text{Gaussian noise} $\n", + "\n", + "\n", + "$ \\dfrac{(n+d)!}{d!\\,n!} $\n", + "\n", + "\n", + "$ \\alpha \\sum_{i=1}^{n}{\\theta_i^2}$\n", + "\n", + "\n", + "**Equation 4-8: Ridge Regression cost function**\n", + "\n", + "$\n", + "J(\\mathbf{\\theta}) = \\text{MSE}(\\mathbf{\\theta}) + \\alpha \\dfrac{1}{2}\\sum\\limits_{i=1}^{n}\\theta_i^2\n", + "$\n", + "\n", + "\n", + "**Equation 4-9: Ridge Regression closed-form solution**\n", + "\n", + "$\n", + "\\hat{\\mathbf{\\theta}} = (\\mathbf{X}^T \\cdot \\mathbf{X} + \\alpha \\mathbf{A})^{-1} \\cdot \\mathbf{X}^T \\cdot \\mathbf{y}\n", + "$\n", + "\n", + "\n", + "**Equation 4-10: Lasso Regression cost function**\n", + "\n", + "$\n", + "J(\\mathbf{\\theta}) = \\text{MSE}(\\mathbf{\\theta}) + \\alpha \\sum\\limits_{i=1}^{n}\\left| \\theta_i \\right|\n", + "$\n", + "\n", + "\n", + "**Equation 4-11: Lasso Regression subgradient vector**\n", + "\n", + "$\n", + "g(\\mathbf{\\theta}, J) = \\nabla_{\\mathbf{\\theta}}\\, \\text{MSE}(\\mathbf{\\theta}) + \\alpha\n", + "\\begin{pmatrix}\n", + " \\operatorname{sign}(\\theta_1) \\\\\n", + " \\operatorname{sign}(\\theta_2) \\\\\n", + " \\vdots \\\\\n", + " \\operatorname{sign}(\\theta_n) \\\\\n", + "\\end{pmatrix} \\quad \\text{where } \\operatorname{sign}(\\theta_i) =\n", + "\\begin{cases}\n", + "-1 & \\text{if } \\theta_i < 0 \\\\\n", + "0 & \\text{if } \\theta_i = 0 \\\\\n", + "+1 & \\text{if } \\theta_i > 0\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n", + "**Equation 4-12: Elastic Net cost function**\n", + "\n", + "$\n", + "J(\\mathbf{\\theta}) = \\text{MSE}(\\mathbf{\\theta}) + r \\alpha \\sum\\limits_{i=1}^{n}\\left| \\theta_i \\right| + \\dfrac{1 - r}{2} \\alpha \\sum\\limits_{i=1}^{n}{\\theta_i^2}\n", + "$\n", + "\n", + "\n", + "**Equation 4-13: Logistic Regression model estimated probability (vectorized form)**\n", + "\n", + "$\n", + "\\hat{p} = h_{\\mathbf{\\theta}}(\\mathbf{x}) = \\sigma(\\mathbf{\\theta}^T \\cdot \\mathbf{x})\n", + "$\n", + "\n", + "\n", + "**Equation 4-14: Logistic function**\n", + "\n", + "$\n", + "\\sigma(t) = \\dfrac{1}{1 + \\exp(-t)}\n", + "$\n", + "\n", + "\n", + "**Equation 4-15: Logistic Regression model prediction**\n", + "\n", + "$\n", + "\\hat{y} =\n", + "\\begin{cases}\n", + " 0 & \\text{if } \\hat{p} < 0.5, \\\\\n", + " 1 & \\text{if } \\hat{p} \\geq 0.5.\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n", + "**Equation 4-16: Cost function of 
a single training instance**\n", + "\n", + "$\n", + "c(\\mathbf{\\theta}) =\n", + "\\begin{cases}\n", + " -\\log(\\hat{p}) & \\text{if } y = 1, \\\\\n", + " -\\log(1 - \\hat{p}) & \\text{if } y = 0.\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n", + "**Equation 4-17: Logistic Regression cost function (log loss)**\n", + "\n", + "$\n", + "J(\\mathbf{\\theta}) = -\\dfrac{1}{m} \\sum\\limits_{i=1}^{m}{\\left[ y^{(i)} log\\left(\\hat{p}^{(i)}\\right) + (1 - y^{(i)}) log\\left(1 - \\hat{p}^{(i)}\\right)\\right]}\n", + "$\n", + "\n", + "\n", + "**Equation 4-18: Logistic cost function partial derivatives**\n", + "\n", + "$\n", + "\\dfrac{\\partial}{\\partial \\theta_j} \\text{J}(\\mathbf{\\theta}) = \\dfrac{1}{m}\\sum\\limits_{i=1}^{m}\\left(\\mathbf{\\sigma(\\theta}^T \\cdot \\mathbf{x}^{(i)}) - y^{(i)}\\right)\\, x_j^{(i)}\n", + "$\n", + "\n", + "\n", + "**Equation 4-19: Softmax score for class k**\n", + "\n", + "$\n", + "s_k(\\mathbf{x}) = ({\\mathbf{\\theta}^{(k)}})^T \\cdot \\mathbf{x}\n", + "$\n", + "\n", + "\n", + "**Equation 4-20: Softmax function**\n", + "\n", + "$\n", + "\\hat{p}_k = \\sigma\\left(\\mathbf{s}(\\mathbf{x})\\right)_k = \\dfrac{\\exp\\left(s_k(\\mathbf{x})\\right)}{\\sum\\limits_{j=1}^{K}{\\exp\\left(s_j(\\mathbf{x})\\right)}}\n", + "$\n", + "\n", + "\n", + "**Equation 4-21: Softmax Regression classifier prediction**\n", + "\n", + "$\n", + "\\hat{y} = \\underset{k}{\\operatorname{argmax}} \\, \\sigma\\left(\\mathbf{s}(\\mathbf{x})\\right)_k = \\underset{k}{\\operatorname{argmax}} \\, s_k(\\mathbf{x}) = \\underset{k}{\\operatorname{argmax}} \\, \\left( ({\\mathbf{\\theta}^{(k)}})^T \\cdot \\mathbf{x} \\right)\n", + "$\n", + "\n", + "\n", + "**Equation 4-22: Cross entropy cost function**\n", + "\n", + "$\n", + "J(\\mathbf{\\Theta}) =\n", + "- \\dfrac{1}{m}\\sum\\limits_{i=1}^{m}\\sum\\limits_{k=1}^{K}{y_k^{(i)}\\log\\left(\\hat{p}_k^{(i)}\\right)}\n", + "$\n", + "\n", + "**Cross entropy between two discrete probability distributions $p$ and $q$ (page 141):**\n", + "$ H(p, q) = -\\sum\\limits_{x}p(x) \\log q(x) $\n", + "\n", + "\n", + "**Equation 4-23: Cross entropy gradient vector for class k**\n", + "\n", + "$\n", + "\\nabla_{\\mathbf{\\theta}^{(k)}} \\, J(\\mathbf{\\Theta}) = \\dfrac{1}{m} \\sum\\limits_{i=1}^{m}{ \\left ( \\hat{p}^{(i)}_k - y_k^{(i)} \\right ) \\mathbf{x}^{(i)}}\n", + "$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 5\n", + "**Equation 5-1: Gaussian RBF**\n", + "\n", + "$\n", + "{\\displaystyle \\phi_{\\gamma}(\\mathbf{x}, \\mathbf{\\ell})} = {\\displaystyle \\exp({\\displaystyle -\\gamma \\left\\| \\mathbf{x} - \\mathbf{\\ell} \\right\\|^2})}\n", + "$\n", + "\n", + "\n", + "**Equation 5-2: Linear SVM classifier prediction**\n", + "\n", + "$\n", + "\\hat{y} = \\begin{cases}\n", + " 0 & \\text{if } \\mathbf{w}^T \\cdot \\mathbf{x} + b < 0, \\\\\n", + " 1 & \\text{if } \\mathbf{w}^T \\cdot \\mathbf{x} + b \\geq 0\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n", + "**Equation 5-3: Hard margin linear SVM classifier objective**\n", + "\n", + "$\n", + "\\begin{split}\n", + "&\\underset{\\mathbf{w}, b}{\\operatorname{minimize}}\\quad{\\frac{1}{2}\\mathbf{w}^T \\cdot \\mathbf{w}} \\\\\n", + "&\\text{subject to} \\quad t^{(i)}(\\mathbf{w}^T \\cdot \\mathbf{x}^{(i)} + b) \\ge 1 \\quad \\text{for } i = 1, 2, \\dots, m\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-4: Soft margin linear SVM classifier objective**\n", + "\n", + "$\n", + "\\begin{split}\n", + "&\\underset{\\mathbf{w}, b, 
\\mathbf{\\zeta}}{\\operatorname{minimize}}\\quad{\\dfrac{1}{2}\\mathbf{w}^T \\cdot \\mathbf{w} + C \\sum\\limits_{i=1}^m{\\zeta^{(i)}}}\\\\\n", + "&\\text{subject to} \\quad t^{(i)}(\\mathbf{w}^T \\cdot \\mathbf{x}^{(i)} + b) \\ge 1 - \\zeta^{(i)} \\quad \\text{and} \\quad \\zeta^{(i)} \\ge 0 \\quad \\text{for } i = 1, 2, \\dots, m\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-5: Quadratic Programming problem**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\underset{\\mathbf{p}}{\\text{Minimize}} \\quad & \\dfrac{1}{2} \\mathbf{p}^T \\cdot \\mathbf{H} \\cdot \\mathbf{p} \\quad + \\quad \\mathbf{f}^T \\cdot \\mathbf{p} \\\\\n", + "\\text{subject to} \\quad & \\mathbf{A} \\cdot \\mathbf{p} \\le \\mathbf{b} \\\\\n", + "\\text{where } &\n", + "\\begin{cases}\n", + " \\mathbf{p} & \\text{ is an }n_p\\text{-dimensional vector (} n_p = \\text{number of parameters),}\\\\\n", + " \\mathbf{H} & \\text{ is an }n_p \\times n_p \\text{ matrix,}\\\\\n", + " \\mathbf{f} & \\text{ is an }n_p\\text{-dimensional vector,}\\\\\n", + " \\mathbf{A} & \\text{ is an } n_c \\times n_p \\text{ matrix (}n_c = \\text{number of constraints),}\\\\\n", + " \\mathbf{b} & \\text{ is an }n_c\\text{-dimensional vector.}\n", + "\\end{cases}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-6: Dual form of the linear SVM objective**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\underset{\\mathbf{\\alpha}}{\\operatorname{minimize}}\n", + "\\dfrac{1}{2}\\sum\\limits_{i=1}^{m}{\n", + " \\sum\\limits_{j=1}^{m}{\n", + " \\alpha^{(i)} \\alpha^{(j)} t^{(i)} t^{(j)} {\\mathbf{x}^{(i)}}^T \\cdot \\mathbf{x}^{(j)}\n", + " }\n", + "} \\quad - \\quad \\sum\\limits_{i=1}^{m}{\\alpha^{(i)}}\\\\\n", + "\\text{subject to}\\quad \\alpha^{(i)} \\ge 0 \\quad \\text{for }i = 1, 2, \\dots, m\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-7: From the dual solution to the primal solution**\n", + "\n", + "$\n", + "\\begin{split}\n", + "&\\hat{\\mathbf{w}} = \\sum_{i=1}^{m}{\\hat{\\alpha}}^{(i)}t^{(i)}\\mathbf{x}^{(i)}\\\\\n", + "&\\hat{b} = \\dfrac{1}{n_s}\\sum\\limits_{\\scriptstyle i=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(i)} > 0}}^{m}{\\left(1 - t^{(i)}({\\hat{\\mathbf{w}}}^T \\cdot \\mathbf{x}^{(i)})\\right)}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-8: Second-degree polynomial mapping**\n", + "\n", + "$\n", + "\\phi\\left(\\mathbf{x}\\right) = \\phi\\left( \\begin{pmatrix}\n", + " x_1 \\\\\n", + " x_2\n", + "\\end{pmatrix} \\right) = \\begin{pmatrix}\n", + " {x_1}^2 \\\\\n", + " \\sqrt{2} \\, x_1 x_2 \\\\\n", + " {x_2}^2\n", + "\\end{pmatrix}\n", + "$\n", + "\n", + "\n", + "**Equation 5-9: Kernel trick for a 2^nd^-degree polynomial mapping**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\phi(\\mathbf{a})^T \\cdot \\phi(\\mathbf{b}) & \\quad = \\begin{pmatrix}\n", + " {a_1}^2 \\\\\n", + " \\sqrt{2} \\, a_1 a_2 \\\\\n", + " {a_2}^2\n", + " \\end{pmatrix}^T \\cdot \\begin{pmatrix}\n", + " {b_1}^2 \\\\\n", + " \\sqrt{2} \\, b_1 b_2 \\\\\n", + " {b_2}^2\n", + "\\end{pmatrix} = {a_1}^2 {b_1}^2 + 2 a_1 b_1 a_2 b_2 + {a_2}^2 {b_2}^2 \\\\\n", + " & \\quad = \\left( a_1 b_1 + a_2 b_2 \\right)^2 = \\left( \\begin{pmatrix}\n", + " a_1 \\\\\n", + " a_2\n", + "\\end{pmatrix}^T \\cdot \\begin{pmatrix}\n", + " b_1 \\\\\n", + " b_2\n", + " \\end{pmatrix} \\right)^2 = (\\mathbf{a}^T \\cdot \\mathbf{b})^2\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text about the kernel trick (page 162):**\n", + "[...], then you can replace this dot product of transformed vectors simply 
by $ ({\\mathbf{x}^{(i)}}^T \\cdot \\mathbf{x}^{(j)})^2 $\n", + "\n", + "\n", + "**Equation 5-10: Common kernels**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\text{Linear:} & \\quad K(\\mathbf{a}, \\mathbf{b}) = \\mathbf{a}^T \\cdot \\mathbf{b} \\\\\n", + "\\text{Polynomial:} & \\quad K(\\mathbf{a}, \\mathbf{b}) = \\left(\\gamma \\mathbf{a}^T \\cdot \\mathbf{b} + r \\right)^d \\\\\n", + "\\text{Gaussian RBF:} & \\quad K(\\mathbf{a}, \\mathbf{b}) = \\exp({\\displaystyle -\\gamma \\left\\| \\mathbf{a} - \\mathbf{b} \\right\\|^2}) \\\\\n", + "\\text{Sigmoid:} & \\quad K(\\mathbf{a}, \\mathbf{b}) = \\tanh\\left(\\gamma \\mathbf{a}^T \\cdot \\mathbf{b} + r\\right)\n", + "\\end{split}\n", + "$\n", + "\n", + "**Equation 5-11: Making predictions with a kernelized SVM**\n", + "\n", + "$\n", + "\\begin{split}\n", + "h_{\\hat{\\mathbf{w}}, \\hat{b}}\\left(\\phi(\\mathbf{x}^{(n)})\\right) & = \\,\\hat{\\mathbf{w}}^T \\cdot \\phi(\\mathbf{x}^{(n)}) + \\hat{b} = \\left(\\sum_{i=1}^{m}{\\hat{\\alpha}}^{(i)}t^{(i)}\\phi(\\mathbf{x}^{(i)})\\right)^T \\cdot \\phi(\\mathbf{x}^{(n)}) + \\hat{b}\\\\\n", + " & = \\, \\sum_{i=1}^{m}{\\hat{\\alpha}}^{(i)}t^{(i)}\\left(\\phi(\\mathbf{x}^{(i)})^T \\cdot \\phi(\\mathbf{x}^{(n)})\\right) + \\hat{b}\\\\\n", + " & = \\sum\\limits_{\\scriptstyle i=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(i)} > 0}}^{m}{\\hat{\\alpha}}^{(i)}t^{(i)} K(\\mathbf{x}^{(i)}, \\mathbf{x}^{(n)}) + \\hat{b}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-12: Computing the bias term using the kernel trick**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\hat{b} & = \\dfrac{1}{n_s}\\sum\\limits_{\\scriptstyle i=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(i)} > 0}}^{m}{\\left(1 - t^{(i)}{\\hat{\\mathbf{w}}}^T \\cdot \\phi(\\mathbf{x}^{(i)})\\right)} = \\dfrac{1}{n_s}\\sum\\limits_{\\scriptstyle i=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(i)} > 0}}^{m}{\\left(1 - t^{(i)}{\n", + " \\left(\\sum_{j=1}^{m}{\\hat{\\alpha}}^{(j)}t^{(j)}\\phi(\\mathbf{x}^{(j)})\\right)\n", + " }^T \\cdot \\phi(\\mathbf{x}^{(i)})\\right)}\\\\\n", + " & = \\dfrac{1}{n_s}\\sum\\limits_{\\scriptstyle i=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(i)} > 0}}^{m}{\\left(1 - t^{(i)}\n", + "\\sum\\limits_{\\scriptstyle j=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(j)} > 0}}^{m}{\n", + " {\\hat{\\alpha}}^{(j)} t^{(j)} K(\\mathbf{x}^{(i)},\\mathbf{x}^{(j)})\n", + "}\n", + "\\right)}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 5-13: Linear SVM classifier cost function**\n", + "\n", + "$\n", + "J(\\mathbf{w}, b) = \\dfrac{1}{2} \\mathbf{w}^T \\cdot \\mathbf{w} \\quad + \\quad C {\\displaystyle \\sum\\limits_{i=1}^{m}max\\left(0, 1 - t^{(i)}(\\mathbf{w}^T \\cdot \\mathbf{x}^{(i)} + b) \\right)}\n", + "$\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 6\n", + "**Equation 6-1: Gini impurity**\n", + "\n", + "$\n", + "G_i = 1 - \\sum\\limits_{k=1}^{n}{{p_{i,k}}^2}\n", + "$\n", + "\n", + "\n", + "**Equation 6-2: CART cost function for classification**\n", + "\n", + "$\n", + "\\begin{split}\n", + "&J(k, t_k) = \\dfrac{m_{\\text{left}}}{m}G_\\text{left} + \\dfrac{m_{\\text{right}}}{m}G_{\\text{right}}\\\\\n", + "&\\text{where }\\begin{cases}\n", + "G_\\text{left/right} \\text{ measures the impurity of the left/right subset,}\\\\\n", + "m_\\text{left/right} \\text{ is the number of instances in the left/right subset.}\n", + "\\end{cases}\n", + "\\end{split}\n", + "$\n", + "\n", + "**Entropy computation example (page 173):**\n", + "\n", + "$ 
-\\frac{49}{54}\\log(\\frac{49}{54}) - \\frac{5}{54}\\log(\\frac{5}{54}) $\n", + "\n", + "\n", + "**Equation 6-3: Entropy**\n", + "\n", + "$\n", + "H_i = -\\sum\\limits_{k=1 \\atop p_{i,k} \\ne 0}^{n}{{p_{i,k}}\\log(p_{i,k})}\n", + "$\n", + "\n", + "\n", + "**Equation 6-4: CART cost function for regression**\n", + "\n", + "$\n", + "J(k, t_k) = \\dfrac{m_{\\text{left}}}{m}\\text{MSE}_\\text{left} + \\dfrac{m_{\\text{right}}}{m}\\text{MSE}_{\\text{right}} \\quad\n", + "\\text{where }\n", + "\\begin{cases}\n", + "\\text{MSE}_{\\text{node}} = \\sum\\limits_{\\scriptstyle i \\in \\text{node}}(\\hat{y}_{\\text{node}} - y^{(i)})^2\\\\\n", + "\\hat{y}_\\text{node} = \\dfrac{1}{m_{\\text{node}}}\\sum\\limits_{\\scriptstyle i \\in \\text{node}}y^{(i)}\n", + "\\end{cases}\n", + "$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 7\n", + "\n", + "**Equation 7-1: Weighted error rate of the j^th^ predictor**\n", + "\n", + "$\n", + "r_j = \\dfrac{\\displaystyle \\sum\\limits_{\\textstyle {i=1 \\atop \\hat{y}_j^{(i)} \\ne y^{(i)}}}^{m}{w^{(i)}}}{\\displaystyle \\sum\\limits_{i=1}^{m}{w^{(i)}}} \\quad\n", + "\\text{where }\\hat{y}_j^{(i)}\\text{ is the }j^{\\text{th}}\\text{ predictor's prediction for the }i^{\\text{th}}\\text{ instance.}\n", + "$\n", + "\n", + "**Equation 7-2: Predictor weight**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\alpha_j = \\eta \\log{\\dfrac{1 - r_j}{r_j}}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 7-3: Weight update rule**\n", + "\n", + "$\n", + "\\begin{split}\n", + "& \\text{ for } i = 1, 2, \\dots, m \\\\\n", + "& w^{(i)} \\leftarrow\n", + "\\begin{cases}\n", + "w^{(i)} & \\text{if }\\hat{y_j}^{(i)} = y^{(i)}\\\\\n", + "w^{(i)} \\exp(\\alpha_j) & \\text{if }\\hat{y_j}^{(i)} \\ne y^{(i)}\n", + "\\end{cases}\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 194:**\n", + "\n", + "Then all the instance weights are normalized (i.e., divided by $ \\sum_{i=1}^{m}{w^{(i)}} $).\n", + "\n", + "\n", + "**Equation 7-4: AdaBoost predictions**\n", + "\n", + "$\n", + "\\hat{y}(\\mathbf{x}) = \\underset{k}{\\operatorname{argmax}}{\\sum\\limits_{\\scriptstyle j=1 \\atop \\scriptstyle \\hat{y}_j(\\mathbf{x}) = k}^{N}{\\alpha_j}} \\quad \\text{where }N\\text{ is the number of predictors.}\n", + "$\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 8\n", + "\n", + "**Equation 8-1: Principal components matrix**\n", + "\n", + "$\n", + "\\mathbf{V}^T =\n", + "\\begin{pmatrix}\n", + " \\mid & \\mid & & \\mid \\\\\n", + " \\mathbf{c_1} & \\mathbf{c_2} & \\cdots & \\mathbf{c_n} \\\\\n", + " \\mid & \\mid & & \\mid\n", + "\\end{pmatrix}\n", + "$\n", + "\n", + "\n", + "**Equation 8-2: Projecting the training set down to _d_ dimensions**\n", + "\n", + "$\n", + "\\mathbf{X}_{d\\text{-proj}} = \\mathbf{X} \\cdot \\mathbf{W}_d\n", + "$\n", + "\n", + "\n", + "**Equation 8-3: PCA inverse transformation, back to the original number of dimensions**\n", + "\n", + "$\n", + "\\mathbf{X}_{\\text{recovered}} = \\mathbf{X}_{d\\text{-proj}} \\cdot {\\mathbf{W}_d}^T\n", + "$\n", + "\n", + "\n", + "$ \\sum_{j=1}^{m}{w_{i,j}\\mathbf{x}^{(j)}} $\n", + "\n", + "\n", + "**Equation 8-4: LLE step 1: linearly modeling local relationships**\n", + "\n", + "$\n", + "\\begin{split}\n", + "& \\hat{\\mathbf{W}} = \\underset{\\mathbf{W}}{\\operatorname{argmin}}{\\displaystyle \\sum\\limits_{i=1}^{m}} \\left\\|\\mathbf{x}^{(i)} - \\sum\\limits_{j=1}^{m}{w_{i,j}}\\mathbf{x}^{(j)}\\right\\|^2\\\\\n", + "& 
\\text{subject to }\n", + "\\begin{cases}\n", + " w_{i,j}=0 & \\text{if }\\mathbf{x}^{(j)} \\text{ is not one of the }k\\text{ c.n. of }\\mathbf{x}^{(i)}\\\\\n", + " \\sum\\limits_{j=1}^{m}w_{i,j} = 1 & \\text{for }i=1, 2, \\dots, m\n", + "\\end{cases}\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 223:**\n", + "\n", + "[...] then we want the squared distance between $\\mathbf{z}^{(i)}$ and $ \\sum_{j=1}^{m}{\\hat{w}_{i,j}\\mathbf{z}^{(j)}} $ to be as small as possible.\n", + "\n", + "\n", + "**Equation 8-5: LLE step 2: reducing dimensionality while preserving relationships**\n", + "\n", + "$\n", + "\\hat{\\mathbf{Z}} = \\underset{\\mathbf{Z}}{\\operatorname{argmin}}{\\displaystyle \\sum\\limits_{i=1}^{m}} \\left\\|\\mathbf{z}^{(i)} - \\sum\\limits_{j=1}^{m}{\\hat{w}_{i,j}}\\mathbf{z}^{(j)}\\right\\|^2\n", + "$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 9\n", + "\n", + "**Equation 9-1: Rectified linear unit**\n", + "\n", + "$\n", + "h_{\\mathbf{w}, b}(\\mathbf{X}) = \\max(\\mathbf{X} \\cdot \\mathbf{w} + b, 0)\n", + "$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 10\n", + "\n", + "**Equation 10-1: Common step functions used in Perceptrons**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\operatorname{heaviside}(z) =\n", + "\\begin{cases}\n", + "0 & \\text{if }z < 0\\\\\n", + "1 & \\text{if }z \\ge 0\n", + "\\end{cases} & \\quad\\quad\n", + "\\operatorname{sgn}(z) =\n", + "\\begin{cases}\n", + "-1 & \\text{if }z < 0\\\\\n", + "0 & \\text{if }z = 0\\\\\n", + "+1 & \\text{if }z > 0\n", + "\\end{cases}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 10-2: Perceptron learning rule (weight update)**\n", + "\n", + "$\n", + "{w_{i,j}}^{(\\text{next step})} = w_{i,j} + \\eta (y_j - \\hat{y}_j) x_i\n", + "$\n", + "\n", + "\n", + "**In the text page 266:**\n", + "\n", + "It will be initialized randomly, using a truncated normal (Gaussian) distribution with a standard deviation of $ 2 / \\sqrt{\\text{n}_\\text{inputs}} $.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 11\n", + "**Equation 11-1: Xavier initialization (when using the logistic activation function)**\n", + "\n", + "$\n", + "\\begin{split}\n", + "& \\text{Normal distribution with mean 0 and standard deviation }\n", + "\\sigma = \\sqrt{\\dfrac{2}{n_\\text{inputs} + n_\\text{outputs}}}\\\\\n", + "& \\text{Or a uniform distribution between -r and +r, with }\n", + "r = \\sqrt{\\dfrac{6}{n_\\text{inputs} + n_\\text{outputs}}}\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 278:**\n", + "\n", + "When the number of input connections is roughly equal to the number of output\n", + "connections, you get simpler equations (e.g., $ \\sigma = 1 / \\sqrt{n_\\text{inputs}} $ or $ r = \\sqrt{3} / \\sqrt{n_\\text{inputs}} $).\n", + "\n", + "**Table 11-1: Initialization parameters for each type of activation function**\n", + "\n", + "* Logistic uniform: $ r = \\sqrt{\\dfrac{6}{n_\\text{inputs} + n_\\text{outputs}}} $\n", + "* Logistic normal: $ \\sigma = \\sqrt{\\dfrac{2}{n_\\text{inputs} + n_\\text{outputs}}} $\n", + "* Hyperbolic tangent uniform: $ r = 4 \\sqrt{\\dfrac{6}{n_\\text{inputs} + n_\\text{outputs}}} $\n", + "* Hyperbolic tangent normal: $ \\sigma = 4 \\sqrt{\\dfrac{2}{n_\\text{inputs} + n_\\text{outputs}}} $\n", + "* ReLU (and its variants) uniform: $ r = \\sqrt{2} \\sqrt{\\dfrac{6}{n_\\text{inputs} + n_\\text{outputs}}} $\n", + "* ReLU (and its variants) normal: $ \\sigma = 
\\sqrt{2} \\sqrt{\\dfrac{2}{n_\\text{inputs} + n_\\text{outputs}}} $\n", + "\n", + "**Equation 11-2: ELU activation function**\n", + "\n", + "$\n", + "\\operatorname{ELU}_\\alpha(z) =\n", + "\\begin{cases}\n", + "\\alpha(\\exp(z) - 1) & \\text{if } z < 0\\\\\n", + "z & if z \\ge 0\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n", + "**Equation 11-3: Batch Normalization algorithm**\n", + "\n", + "$\n", + "\\begin{split}\n", + "1.\\quad & \\mathbf{\\mu}_B = \\dfrac{1}{m_B}\\sum\\limits_{i=1}^{m_B}{\\mathbf{x}^{(i)}}\\\\\n", + "2.\\quad & {\\mathbf{\\sigma}_B}^2 = \\dfrac{1}{m_B}\\sum\\limits_{i=1}^{m_B}{(\\mathbf{x}^{(i)} - \\mathbf{\\mu}_B)^2}\\\\\n", + "3.\\quad & \\hat{\\mathbf{x}}^{(i)} = \\dfrac{\\mathbf{x}^{(i)} - \\mathbf{\\mu}_B}{\\sqrt{{\\mathbf{\\sigma}_B}^2 + \\epsilon}}\\\\\n", + "4.\\quad & \\mathbf{z}^{(i)} = \\gamma \\hat{\\mathbf{x}}^{(i)} + \\beta\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 285:**\n", + "\n", + "[...] given a new value $v$, the running average $v$ is updated through the equation:\n", + "\n", + "$ \\hat{v} \\gets \\hat{v} \\times \\text{momentum} + v \\times (1 - \\text{momentum}) $\n", + "\n", + "**Equation 11-4: Momentum algorithm**\n", + "\n", + "$\n", + "\\begin{split}\n", + "1. \\quad & \\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\\\\n", + "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 296:**\n", + "\n", + "You can easily verify that if the gradient remains constant, the terminal velocity (i.e., the maximum size of the weight updates) is equal to that gradient multiplied by the learning rate η multiplied by $ \\frac{1}{1 - \\beta} $.\n", + "\n", + "\n", + "**Equation 11-5: Nesterov Accelerated Gradient algorithm**\n", + "\n", + "$\n", + "\\begin{split}\n", + "1. \\quad & \\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta} + \\beta \\mathbf{m}) \\\\\n", + "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 11-6: AdaGrad algorithm**\n", + "\n", + "$\n", + "\\begin{split}\n", + "1. \\quad & \\mathbf{s} \\gets \\mathbf{s} + \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", + "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 298-299:**\n", + "\n", + "This vectorized form is equivalent to computing $s_i \\gets s_i + \\left( \\dfrac{\\partial J(\\mathbf{\\theta})}{\\partial \\theta_i} \\right)^2$ for each element $s_i$ of the vector $\\mathbf{s}$.\n", + "\n", + "**In the text page 299:**\n", + "\n", + "This vectorized form is equivalent to computing $ \\theta_i \\gets \\theta_i - \\eta \\, \\dfrac{\\partial J(\\mathbf{\\theta})}{\\partial \\theta_i} \\dfrac{1}{\\sqrt{s_i + \\epsilon}} $ for all parameters $\\theta_i$ (simultaneously).\n", + "\n", + "\n", + "**Equation 11-7: RMSProp algorithm**\n", + "\n", + "$\n", + "\\begin{split}\n", + "1. \\quad & \\mathbf{s} \\gets \\beta \\mathbf{s} + (1 - \\beta ) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", + "2. 
\\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 11-8: Adam algorithm**\n", + "\n", + "$\n", + "\\begin{split}\n", + "1. \\quad & \\mathbf{m} \\gets \\beta_1 \\mathbf{m} - (1 - \\beta_1) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", + "2. \\quad & \\mathbf{s} \\gets \\beta_2 \\mathbf{s} + (1 - \\beta_2) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", + "3. \\quad & \\mathbf{m} \\gets \\dfrac{\\mathbf{m}}{1 - {\\beta_1}^T}\\\\\n", + "4. \\quad & \\mathbf{s} \\gets \\dfrac{\\mathbf{s}}{1 - {\\beta_2}^T}\\\\\n", + "5. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\eta \\, \\mathbf{m} \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 309:**\n", + "\n", + "We typically implement this constraint by computing $\\left\\| \\mathbf{w} \\right\\|_2$ after each training step\n", + "and clipping $\\mathbf{w}$ if needed ($ \\mathbf{w} \\gets \\mathbf{w} \\dfrac{r}{\\left\\| \\mathbf{w} \\right\\|_2} $).\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 13\n", + "\n", + "**Equation 13-1: Computing the output of a neuron in a convolutional layer**\n", + "\n", + "$\n", + "z_{i,j,k} = b_k + \\sum\\limits_{u = 1}^{f_h} \\, \\, \\sum\\limits_{v = 1}^{f_w} \\, \\, \\sum\\limits_{k' = 1}^{f_{n'}} \\, \\, x_{i', j', k'} . w_{u,v,k', k}\n", + "\\quad \\text{with }\n", + "\\begin{cases}\n", + "i' = u.s_h+f_h-1 \\\\\n", + "j' = v.s_w+f_w-1\n", + "\\end{cases}\n", + "$\n", + "\n", + "**Equation 13-2: Local response normalization**\n", + "\n", + "$\n", + "b_i = a_i \\left(k + \\alpha \\sum\\limits_{j=j_\\text{low}}^{j_\\text{high}}{{a_j}^2} \\right)^{-\\beta} \\quad \\text{with }\n", + "\\begin{cases}\n", + " j_\\text{high} = \\min\\left(i + \\dfrac{r}{2}, f_n-1\\right) \\\\\n", + " j_\\text{low} = \\max\\left(0, i - \\dfrac{r}{2}\\right)\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 14\n", + "\n", + "**Equation 14-1: Output of a single recurrent neuron for a single instance**\n", + "\n", + "$\n", + "\\mathbf{y}_{(t)} = \\phi\\left({{\\mathbf{x}_{(t)}}^T \\cdot \\mathbf{w}_x} + {\\mathbf{y}_{(t-1)}}^T \\cdot {\\mathbf{w}_y} + b \\right)\n", + "$\n", + "\n", + "\n", + "**Equation 14-2: Outputs of a layer of recurrent neurons for all instances in a mini-batch**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\mathbf{Y}_{(t)} & = \\phi\\left(\\mathbf{X}_{(t)} \\cdot \\mathbf{W}_{x} + \\mathbf{Y}_{(t-1)}\\cdot \\mathbf{W}_{y} + \\mathbf{b} \\right) \\\\\n", + "& = \\phi\\left(\n", + "\\left[\\mathbf{X}_{(t)} \\quad \\mathbf{Y}_{(t-1)} \\right]\n", + " \\cdot \\mathbf{W} + \\mathbf{b} \\right) \\text{ with } \\mathbf{W}=\n", + "\\left[ \\begin{matrix}\n", + " \\mathbf{W}_x\\\\\n", + " \\mathbf{W}_y\n", + "\\end{matrix} \\right]\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text page 391:**\n", + "\n", + "Just like in regular backpropagation, there is a first forward pass through the unrolled network (represented by the dashed arrows); then the output sequence is evaluated using a cost function $ C(\\mathbf{Y}_{(t_\\text{min})}, \\mathbf{Y}_{(t_\\text{min}+1)}, \\dots, \\mathbf{Y}_{(t_\\text{max})}) $ (where $t_\\text{min}$ and $t_\\text{max}$ are the first and last output time 
steps, not counting the ignored outputs)[...]\n", + "\n", + "\n", + "**Equation 14-3: LSTM computations**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\mathbf{i}_{(t)}&=\\sigma({\\mathbf{W}_{xi}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{hi}}^T \\cdot \\mathbf{h}_{(t-1)} + \\mathbf{b}_i)\\\\\n", + "\\mathbf{f}_{(t)}&=\\sigma({\\mathbf{W}_{xf}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{hf}}^T \\cdot \\mathbf{h}_{(t-1)} + \\mathbf{b}_f)\\\\\n", + "\\mathbf{o}_{(t)}&=\\sigma({\\mathbf{W}_{xo}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{ho}}^T \\cdot \\mathbf{h}_{(t-1)} + \\mathbf{b}_o)\\\\\n", + "\\mathbf{g}_{(t)}&=\\operatorname{tanh}({\\mathbf{W}_{xg}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{hg}}^T \\cdot \\mathbf{h}_{(t-1)} + \\mathbf{b}_g)\\\\\n", + "\\mathbf{c}_{(t)}&=\\mathbf{f}_{(t)} \\otimes \\mathbf{c}_{(t-1)} \\, + \\, \\mathbf{i}_{(t)} \\otimes \\mathbf{g}_{(t)}\\\\\n", + "\\mathbf{y}_{(t)}&=\\mathbf{h}_{(t)} = \\mathbf{o}_{(t)} \\otimes \\operatorname{tanh}(\\mathbf{c}_{(t)})\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation 14-4: GRU computations**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\mathbf{z}_{(t)}&=\\sigma({\\mathbf{W}_{xz}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{hz}}^T \\cdot \\mathbf{h}_{(t-1)}) \\\\\n", + "\\mathbf{r}_{(t)}&=\\sigma({\\mathbf{W}_{xr}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{hr}}^T \\cdot \\mathbf{h}_{(t-1)}) \\\\\n", + "\\mathbf{g}_{(t)}&=\\operatorname{tanh}\\left({\\mathbf{W}_{xg}}^T \\cdot \\mathbf{x}_{(t)} + {\\mathbf{W}_{hg}}^T \\cdot (\\mathbf{r}_{(t)} \\otimes \\mathbf{h}_{(t-1)})\\right) \\\\\n", + "\\mathbf{h}_{(t)}&=(1-\\mathbf{z}_{(t)}) \\otimes \\mathbf{h}_{(t-1)} + \\mathbf{z}_{(t)} \\otimes \\mathbf{g}_{(t)}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 15\n", + "\n", + "**Equation 15-1: Kullback–Leibler divergence**\n", + "\n", + "$\n", + "D_{\\mathrm{KL}}(P\\|Q) = \\sum\\limits_{i} P(i) \\log \\dfrac{P(i)}{Q(i)}\n", + "$\n", + "\n", + "\n", + "**Equation: KL divergence between the target sparsity _p_ and the actual sparsity _q_**\n", + "\n", + "$\n", + "D_{\\mathrm{KL}}(p\\|q) = p \\, \\log \\dfrac{p}{q} + (1-p) \\log \\dfrac{1-p}{1-q}\n", + "$\n", + "\n", + "**In the text page 433:**\n", + "\n", + "One common variant is to train the encoder to output $\\gamma = \\log\\left(\\sigma^2\\right)$ rather than $\\sigma$.\n", + "Wherever we need $\\sigma$ we can just compute $ \\sigma = \\exp\\left(\\dfrac{\\gamma}{2}\\right) $.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 16\n", + "\n", + "**Equation 16-1: Bellman Optimality Equation**\n", + "\n", + "$\n", + "V^*(s) = \\underset{a}{\\max}\\sum\\limits_{s'}{T(s, a, s') [R(s, a, s') + \\gamma . V^*(s')]} \\quad \\text{for all }s\n", + "$\n", + "\n", + "**Equation 16-2: Value Iteration algorithm**\n", + "\n", + "$\n", + " V_{k+1}(s) \\gets \\underset{a}{\\max}\\sum\\limits_{s'}{T(s, a, s') [R(s, a, s') + \\gamma . V_k(s')]} \\quad \\text{for all }s\n", + "$\n", + "\n", + "\n", + "**Equation 16-3: Q-Value Iteration algorithm**\n", + "\n", + "$\n", + " Q_{k+1}(s, a) \\gets \\sum\\limits_{s'}{T(s, a, s') [R(s, a, s') + \\gamma . 
\\underset{a'}{\\max}\\,{Q_k(s',a')}]} \\quad \\text{for all } (s,a)\n", + "$\n", + "\n", + "**In the text page 458:**\n", + "\n", + "Once you have the optimal Q-Values, defining the optimal policy, noted $\\pi^{*}(s)$, is trivial: when the agent is in state $s$, it should choose the action with the highest Q-Value for that state: $ \\pi^{*}(s) = \\underset{a}{\\operatorname{argmax}} \\, Q^*(s, a) $.\n", + "\n", + "\n", + "**Equation 16-4: TD Learning algorithm**\n", + "\n", + "$\n", + "V_{k+1}(s) \\gets (1-\\alpha)V_k(s) + \\alpha\\left(r + \\gamma . V_k(s')\\right)\n", + "$\n", + "\n", + "\n", + "**Equation 16-5: Q-Learning algorithm**\n", + "\n", + "$\n", + "Q_{k+1}(s, a) \\gets (1-\\alpha)Q_k(s,a) + \\alpha\\left(r + \\gamma . \\underset{a'}{\\max} \\, Q_k(s', a')\\right)\n", + "$\n", + "\n", + "\n", + "**Equation 16-6: Q-Learning using an exploration function**\n", + "\n", + "$\n", + " Q(s, a) \\gets (1-\\alpha)Q(s,a) + \\alpha\\left(r + \\gamma . \\underset{\\alpha'}{\\max}f(Q(s', a'), N(s', a'))\\right)\n", + "$\n", + "\n", + "\n", + "**Equation 16-7: Deep Q-Learning cost function**\n", + "\n", + "$\n", + "\\begin{split}\n", + "& J(\\mathbf{\\theta}_\\text{critic}) = \\dfrac{1}{m}\\sum\\limits_{i=1}^m\\left(y^{(i)} - Q(s^{(i)},a^{(i)},\\mathbf{\\theta}_\\text{critic})\\right)^2 \\\\\n", + "& \\text{with } y^{(i)} = r^{(i)} + \\gamma . \\underset{a'}{\\max}Q(s'^{(i)},a',\\mathbf{\\theta}_\\text{actor})\n", + "\\end{split}\n", + "$\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Appendix A\n", + "\n", + "Equations that appear in the text:\n", + "\n", + "$\n", + "\\mathbf{H} =\n", + "\\begin{pmatrix}\n", + "\\mathbf{H'} & 0 & \\cdots\\\\\n", + "0 & 0 & \\\\\n", + "\\vdots & & \\ddots\n", + "\\end{pmatrix}\n", + "$\n", + "\n", + "\n", + "$\n", + "\\mathbf{A} =\n", + "\\begin{pmatrix}\n", + "\\mathbf{A'} & \\mathbf{I}_m \\\\\n", + "\\mathbf{0} & -\\mathbf{I}_m\n", + "\\end{pmatrix}\n", + "$\n", + "\n", + "\n", + "$ 1 - \\frac{1}{5}^2 - \\frac{4}{5}^2 $\n", + "\n", + "\n", + "$ 1 - \\frac{1}{2}^2 - \\frac{1}{2}^2 $\n", + "\n", + "\n", + "$ \\frac{2}{5} \\times $\n", + "\n", + "\n", + "$ \\frac{3}{5} \\times 0 $" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Appendix C" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Equations that appear in the text:\n", + "\n", + "$ (\\hat{x}, \\hat{y}) $\n", + "\n", + "\n", + "$ \\hat{\\alpha} $\n", + "\n", + "\n", + "$ (\\hat{x}, \\hat{y}, \\hat{\\alpha}) $\n", + "\n", + "\n", + "$\n", + "\\begin{cases}\n", + "\\frac{\\partial}{\\partial x}g(x, y, \\alpha) = 2x - 3\\alpha\\\\\n", + "\\frac{\\partial}{\\partial y}g(x, y, \\alpha) = 2 - 2\\alpha\\\\\n", + "\\frac{\\partial}{\\partial \\alpha}g(x, y, \\alpha) = -3x - 2y - 1\\\\\n", + "\\end{cases}\n", + "$\n", + "\n", + "\n", + "$ 2\\hat{x} - 3\\hat{\\alpha} = 2 - 2\\hat{\\alpha} = -3\\hat{x} - 2\\hat{y} - 1 = 0 $\n", + "\n", + "\n", + "$ \\hat{x} = \\frac{3}{2} $\n", + "\n", + "\n", + "$ \\hat{y} = -\\frac{11}{4} $\n", + "\n", + "\n", + "$ \\hat{\\alpha} = 1 $\n", + "\n", + "\n", + "**Equation C-1: Generalized Lagrangian for the hard margin problem**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\mathcal{L}(\\mathbf{w}, b, \\mathbf{\\alpha}) = \\frac{1}{2}\\mathbf{w}^T \\cdot \\mathbf{w} - \\sum\\limits_{i=1}^{m}{\\alpha^{(i)} \\left(t^{(i)}(\\mathbf{w}^T \\cdot \\mathbf{x}^{(i)} + b) - 1\\right)} \\\\\n", + "\\text{with}\\quad \\alpha^{(i)} \\ge 0 \\quad \\text{for }i = 1, 2, \\dots, m\n", + "\\end{split}\n", + 
"$\n", + "\n", + "**More equations in the text:**\n", + "\n", + "$ (\\hat{\\mathbf{w}}, \\hat{b}, \\hat{\\mathbf{\\alpha}}) $\n", + "\n", + "\n", + "$ t^{(i)}((\\hat{\\mathbf{w}})^T \\cdot \\mathbf{x}^{(i)} + \\hat{b}) \\ge 1 \\quad \\text{for } i = 1, 2, \\dots, m $\n", + "\n", + "\n", + "$ {\\hat{\\alpha}}^{(i)} \\ge 0 \\quad \\text{for } i = 1, 2, \\dots, m $\n", + "\n", + "\n", + "$ {\\hat{\\alpha}}^{(i)} = 0 $\n", + "\n", + "\n", + "$ t^{(i)}((\\hat{\\mathbf{w}})^T \\cdot \\mathbf{x}^{(i)} + \\hat{b}) = 1 $\n", + "\n", + "\n", + "$ {\\hat{\\alpha}}^{(i)} = 0 $\n", + "\n", + "\n", + "**Equation C-2: Partial derivatives of the generalized Lagrangian**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\nabla_{\\mathbf{w}}\\mathcal{L}(\\mathbf{w}, b, \\mathbf{\\alpha}) = \\mathbf{w} - \\sum\\limits_{i=1}^{m}\\alpha^{(i)}t^{(i)}\\mathbf{x}^{(i)}\\\\\n", + "\\dfrac{\\partial}{\\partial b}\\mathcal{L}(\\mathbf{w}, b, \\mathbf{\\alpha}) = -\\sum\\limits_{i=1}^{m}\\alpha^{(i)}t^{(i)}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation C-3: Properties of the stationary points**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\hat{\\mathbf{w}} = \\sum_{i=1}^{m}{\\hat{\\alpha}}^{(i)}t^{(i)}\\mathbf{x}^{(i)}\\\\\n", + "\\sum_{i=1}^{m}{\\hat{\\alpha}}^{(i)}t^{(i)} = 0\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation C-4: Dual form of the SVM problem**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\mathcal{L}(\\hat{\\mathbf{w}}, \\hat{b}, \\mathbf{\\alpha}) = \\dfrac{1}{2}\\sum\\limits_{i=1}^{m}{\n", + " \\sum\\limits_{j=1}^{m}{\n", + " \\alpha^{(i)} \\alpha^{(j)} t^{(i)} t^{(j)} {\\mathbf{x}^{(i)}}^T \\cdot \\mathbf{x}^{(j)}\n", + " }\n", + "} \\quad - \\quad \\sum\\limits_{i=1}^{m}{\\alpha^{(i)}}\\\\\n", + "\\text{with}\\quad \\alpha^{(i)} \\ge 0 \\quad \\text{for }i = 1, 2, \\dots, m\n", + "\\end{split}\n", + "$\n", + "\n", + "**Some more equations in the text:**\n", + "\n", + "$ \\hat{\\mathbf{\\alpha}} $\n", + "\n", + "\n", + "$ {\\hat{\\alpha}}^{(i)} \\ge 0 $\n", + "\n", + "\n", + "$ \\hat{\\mathbf{\\alpha}} $\n", + "\n", + "\n", + "$ \\hat{\\mathbf{w}} $\n", + "\n", + "\n", + "$ \\hat{b} $\n", + "\n", + "\n", + "$ \\hat{b} = 1 - t^{(k)}({\\hat{\\mathbf{w}}}^T \\cdot \\mathbf{x}^{(k)}) $\n", + "\n", + "\n", + "**Equation C-5: Bias term estimation using the dual form**\n", + "\n", + "$\n", + "\\hat{b} = \\dfrac{1}{n_s}\\sum\\limits_{\\scriptstyle i=1 \\atop {\\scriptstyle {\\hat{\\alpha}}^{(i)} > 0}}^{m}{\\left[1 - t^{(i)}({\\hat{\\mathbf{w}}}^T \\cdot \\mathbf{x}^{(i)})\\right]}\n", + "$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Appendix D" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Equation D-1: Partial derivatives of $f(x,y)$**\n", + "\n", + "$\n", + "\\begin{split}\n", + "\\dfrac{\\partial f}{\\partial x} & = \\dfrac{\\partial(x^2y)}{\\partial x} + \\dfrac{\\partial y}{\\partial x} + \\dfrac{\\partial 2}{\\partial x} = y \\dfrac{\\partial(x^2)}{\\partial x} + 0 + 0 = 2xy \\\\\n", + "\\dfrac{\\partial f}{\\partial y} & = \\dfrac{\\partial(x^2y)}{\\partial y} + \\dfrac{\\partial y}{\\partial y} + \\dfrac{\\partial 2}{\\partial y} = x^2 + 1 + 0 = x^2 + 1 \\\\\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text:**\n", + "\n", + "$ \\frac{\\partial g}{\\partial x} = 0 + (0 \\times x + y \\times 1) = y $\n", + "\n", + "\n", + "$ \\frac{\\partial x}{\\partial x} = 1 $\n", + "\n", + "\n", + "$ \\frac{\\partial y}{\\partial x} = 0 $\n", + "\n", + "\n", + "$ \\frac{\\partial (u \\times v)}{\\partial x} = 
\\frac{\\partial v}{\\partial x} \\times u + \\frac{\\partial u}{\\partial x} \\times u $\n", + "\n", + "\n", + "$ \\frac{\\partial g}{\\partial x} = 0 + (0 \\times x + y \\times 1) $\n", + "\n", + "\n", + "$ \\frac{\\partial g}{\\partial x} = y $\n", + "\n", + "\n", + "**Equation D-2: Derivative of a function _h_(_x_) at point _x_~0~**\n", + "\n", + "$\n", + "\\begin{split}\n", + "h'(x) & = \\underset{\\textstyle x \\to x_0}{\\lim}\\dfrac{h(x) - h(x_0)}{x - x_0}\\\\\n", + " & = \\underset{\\textstyle \\epsilon \\to 0}{\\lim}\\dfrac{h(x_0 + \\epsilon) - h(x_0)}{\\epsilon}\n", + "\\end{split}\n", + "$\n", + "\n", + "\n", + "**Equation D-3: A few operations with dual numbers**\n", + "\n", + "$\n", + "\\begin{split}\n", + "&\\lambda(a + b\\epsilon) = \\lambda a + \\lambda b \\epsilon\\\\\n", + "&(a + b\\epsilon) + (c + d\\epsilon) = (a + c) + (b + d)\\epsilon \\\\\n", + "&(a + b\\epsilon) \\times (c + d\\epsilon) = ac + (ad + bc)\\epsilon + (bd)\\epsilon^2 = ac + (ad + bc)\\epsilon\\\\\n", + "\\end{split}\n", + "$\n", + "\n", + "**In the text:**\n", + "\n", + "$ \\frac{\\partial f}{\\partial x}(3, 4) $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial y}(3, 4) $\n", + "\n", + "\n", + "**Equation D-4: Chain rule**\n", + "\n", + "$\n", + "\\dfrac{\\partial f}{\\partial x} = \\dfrac{\\partial f}{\\partial n_i} \\times \\dfrac{\\partial n_i}{\\partial x}\n", + "$\n", + "\n", + "**In the text:**\n", + "\n", + "$ \\frac{\\partial f}{\\partial n_7} = 1 $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial n_5} = \\frac{\\partial f}{\\partial n_7} \\times \\frac{\\partial n_7}{\\partial n_5} $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial n_7} = 1 $\n", + "\n", + "\n", + "$ \\frac{\\partial n_7}{\\partial n_5} $\n", + "\n", + "\n", + "$ \\frac{\\partial n_7}{\\partial n_5} = 1 $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial n_5} = 1 \\times 1 = 1 $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial n_4} = \\frac{\\partial f}{\\partial n_5} \\times \\frac{\\partial n_5}{\\partial n_4} $\n", + "\n", + "\n", + "$ \\frac{\\partial n_5}{\\partial n_4} = n_2 $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial n_4} = 1 \\times n_2 = 4 $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial x} = 24 $\n", + "\n", + "\n", + "$ \\frac{\\partial f}{\\partial y} = 10 $" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Appendix E" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Equation E-1: Probability that the i^th^ neuron will output 1**\n", + "\n", + "$\n", + "p\\left(s_i^{(\\text{next step})} = 1\\right) \\, = \\, \\sigma\\left(\\frac{\\textstyle \\sum\\limits_{j = 1}^N{w_{i,j}s_j + b_i}}{\\textstyle T}\\right)\n", + "$\n", + "\n", + "**In the text:**\n", + "\n", + "$ \\dot{\\mathbf{x}} $\n", + "\n", + "\n", + "$ \\dot{\\mathbf{h}} $\n", + "\n", + "\n", + "**Equation E-2: Contrastive divergence weight update**\n", + "\n", + "$\n", + "w_{i,j}^{(\\text{next step})} = w_{i,j} + \\eta(\\mathbf{x}\\mathbf{h}^T - \\dot{\\mathbf{x}} \\dot {\\mathbf{h}}^T)\n", + "$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Glossary\n", + "\n", + "In the text:\n", + "\n", + "$\\ell _1$\n", + "\n", + "\n", + "$\\ell _2$\n", + "\n", + "\n", + "$\\ell _k$\n", + "\n", + "\n", + "$ \\chi^2 $\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just in case your eyes hurt after all these equations, let's finish with the single most beautiful equation in the world. 
No, it's not $E = mc²$, it's obviously Euler's identity:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$e^{i\\pi}+1=0$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/index.ipynb b/index.ipynb index ca256ca..4c5c11a 100644 --- a/index.ipynb +++ b/index.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Machine Learning Notebooks\n", "\n", @@ -38,15 +35,16 @@ "\n", "## Math tutorials\n", "* [Linear Algebra](math_linear_algebra.ipynb)\n", - "* Calculus (coming soon)" + "* Calculus (coming soon)\n", + "\n", + "## Misc.\n", + "* [Equations](book_equations.ipynb) (list of equations in the book)" ] }, { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "## Prerequisites\n", @@ -68,9 +66,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -78,21 +74,21 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.3" + "pygments_lexer": "ipython2", + "version": "2.7.12" }, "nav_menu": {}, "toc": { @@ -106,5 +102,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } From 62dc13bbfb6b744d771e94eb6644da3820d97d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 26 Jun 2017 12:18:36 +0200 Subject: [PATCH 3/6] Add pointer to Jupyter's viewer which displays equations properly --- book_equations.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/book_equations.ipynb b/book_equations.ipynb index 7b1c7d9..285288c 100644 --- a/book_equations.ipynb +++ b/book_equations.ipynb @@ -6,7 +6,9 @@ "source": [ "**Equations**\n", "\n", - "*This notebook lists all the equations in the book. If you decide to print them on a T-Shirt, I definitely want a copy! ;-)*" + "*This notebook lists all the equations in the book. If you decide to print them on a T-Shirt, I definitely want a copy! ;-)*\n", + "\n", + "**Warning**: GitHub's notebook viewer does not render equations properly. You should either view this notebook within Jupyter itself or use [Jupyter's online viewer](http://nbviewer.jupyter.org/github/ageron/handson-ml/blob/master/book_equations.ipynb)." 
] }, { From 1e1fa7e2ff612f37176ca8e695751f22618bd58c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 26 Jun 2017 12:32:11 +0200 Subject: [PATCH 4/6] Ensure that the equations display properly using Jupyter.org's viewer --- book_equations.ipynb | 52 +++++++++++++------------------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/book_equations.ipynb b/book_equations.ipynb index 285288c..f2ffaee 100644 --- a/book_equations.ipynb +++ b/book_equations.ipynb @@ -306,8 +306,7 @@ "**Equation 4-22: Cross entropy cost function**\n", "\n", "$\n", - "J(\\mathbf{\\Theta}) =\n", - "- \\dfrac{1}{m}\\sum\\limits_{i=1}^{m}\\sum\\limits_{k=1}^{K}{y_k^{(i)}\\log\\left(\\hat{p}_k^{(i)}\\right)}\n", + "J(\\mathbf{\\Theta}) = - \\dfrac{1}{m}\\sum\\limits_{i=1}^{m}\\sum\\limits_{k=1}^{K}{y_k^{(i)}\\log\\left(\\hat{p}_k^{(i)}\\right)}\n", "$\n", "\n", "**Cross entropy between two discrete probability distributions $p$ and $q$ (page 141):**\n", @@ -761,12 +760,8 @@ "\n", "**Equation 11-4: Momentum algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}$\n", "\n", "**In the text page 296:**\n", "\n", @@ -775,22 +770,13 @@ "\n", "**Equation 11-5: Nesterov Accelerated Gradient algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta} + \\beta \\mathbf{m}) \\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}\n", - "\\end{split}\n", - "$\n", - "\n", + "1. $\\mathbf{m} \\gets \\beta \\mathbf{m} - \\eta \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta} + \\beta \\mathbf{m})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\mathbf{m}$\n", "\n", "**Equation 11-6: AdaGrad algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{s} \\gets \\mathbf{s} + \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{s} \\gets \\mathbf{s} + \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}$\n", "\n", "**In the text page 298-299:**\n", "\n", @@ -803,30 +789,22 @@ "\n", "**Equation 11-7: RMSProp algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{s} \\gets \\beta \\mathbf{s} + (1 - \\beta ) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "2. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{s} \\gets \\beta \\mathbf{s} + (1 - \\beta ) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. 
$\\mathbf{\\theta} \\gets \\mathbf{\\theta} - \\eta \\, \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}$\n", "\n", "\n", "**Equation 11-8: Adam algorithm**\n", "\n", - "$\n", - "\\begin{split}\n", - "1. \\quad & \\mathbf{m} \\gets \\beta_1 \\mathbf{m} - (1 - \\beta_1) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "2. \\quad & \\mathbf{s} \\gets \\beta_2 \\mathbf{s} + (1 - \\beta_2) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})\\\\\n", - "3. \\quad & \\mathbf{m} \\gets \\dfrac{\\mathbf{m}}{1 - {\\beta_1}^T}\\\\\n", - "4. \\quad & \\mathbf{s} \\gets \\dfrac{\\mathbf{s}}{1 - {\\beta_2}^T}\\\\\n", - "5. \\quad & \\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\eta \\, \\mathbf{m} \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}\n", - "\\end{split}\n", - "$\n", + "1. $\\mathbf{m} \\gets \\beta_1 \\mathbf{m} - (1 - \\beta_1) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "2. $\\mathbf{s} \\gets \\beta_2 \\mathbf{s} + (1 - \\beta_2) \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta}) \\otimes \\nabla_\\mathbf{\\theta}J(\\mathbf{\\theta})$\n", + "3. $\\mathbf{m} \\gets \\left(\\dfrac{\\mathbf{m}}{1 - {\\beta_1}^T}\\right)$\n", + "4. $\\mathbf{s} \\gets \\left(\\dfrac{\\mathbf{s}}{1 - {\\beta_2}^T}\\right)$\n", + "5. $\\mathbf{\\theta} \\gets \\mathbf{\\theta} + \\eta \\, \\mathbf{m} \\oslash {\\sqrt{\\mathbf{s} + \\epsilon}}$\n", "\n", "**In the text page 309:**\n", "\n", "We typically implement this constraint by computing $\\left\\| \\mathbf{w} \\right\\|_2$ after each training step\n", - "and clipping $\\mathbf{w}$ if needed ($ \\mathbf{w} \\gets \\mathbf{w} \\dfrac{r}{\\left\\| \\mathbf{w} \\right\\|_2} $).\n", + "and clipping $\\mathbf{w}$ if needed $ \\left( \\mathbf{w} \\gets \\mathbf{w} \\dfrac{r}{\\left\\| \\mathbf{w} \\right\\|_2} \\right) $.\n", "\n", "\n" ] From a7d692cbf9fe3c291633dee75216eff43d3164bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 26 Jun 2017 16:06:40 +0200 Subject: [PATCH 5/6] Fix error in Equation 13-1 --- book_equations.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/book_equations.ipynb b/book_equations.ipynb index f2ffaee..5f92380 100644 --- a/book_equations.ipynb +++ b/book_equations.ipynb @@ -818,11 +818,11 @@ "**Equation 13-1: Computing the output of a neuron in a convolutional layer**\n", "\n", "$\n", - "z_{i,j,k} = b_k + \\sum\\limits_{u = 1}^{f_h} \\, \\, \\sum\\limits_{v = 1}^{f_w} \\, \\, \\sum\\limits_{k' = 1}^{f_{n'}} \\, \\, x_{i', j', k'} . w_{u,v,k', k}\n", + "z_{i,j,k} = b_k + \\sum\\limits_{u = 0}^{f_h - 1} \\, \\, \\sum\\limits_{v = 0}^{f_w - 1} \\, \\, \\sum\\limits_{k' = 0}^{f_{n'} - 1} \\, \\, x_{i', j', k'} . 
w_{u, v, k', k}\n", "\\quad \\text{with }\n", "\\begin{cases}\n", - "i' = u.s_h+f_h-1 \\\\\n", - "j' = v.s_w+f_w-1\n", + "i' = i \\times s_h + u \\\\\n", + "j' = j \\times s_w + v\n", "\\end{cases}\n", "$\n", "\n", From 4e016769a6761776e8c1e2b873bdab138c17155b Mon Sep 17 00:00:00 2001 From: vivek-v-rao Date: Tue, 27 Jun 2017 21:30:57 -0400 Subject: [PATCH 6/6] removed quotes around "True" the inplace argument should be boolean, not string --- 01_the_machine_learning_landscape.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/01_the_machine_learning_landscape.ipynb b/01_the_machine_learning_landscape.ipynb index 560bc53..6a080af 100644 --- a/01_the_machine_learning_landscape.ipynb +++ b/01_the_machine_learning_landscape.ipynb @@ -159,7 +159,7 @@ "outputs": [], "source": [ "full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita, left_index=True, right_index=True)\n", - "full_country_stats.sort_values(by=\"GDP per capita\", inplace=\"True\")\n", + "full_country_stats.sort_values(by=\"GDP per capita\", inplace=True)\n", "full_country_stats" ] },
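To make the point of this final fix concrete, here is a minimal sketch (the tiny DataFrame, its values, and its index labels below are invented for illustration; only `sort_values` and its `inplace` parameter come from the patch): `inplace` expects a real boolean, and depending on the pandas version a string such as `"True"` is either rejected with an error or silently treated as truthy, so the boolean literal is the only reliable spelling.

```python
import pandas as pd

# Hypothetical stand-in for the notebook's merged OECD/GDP table.
full_country_stats = pd.DataFrame(
    {"GDP per capita": [50000.0, 12000.0, 37000.0]},
    index=["A", "B", "C"])

# Correct usage: inplace takes a boolean, so the DataFrame is sorted
# in place and the call returns None.
full_country_stats.sort_values(by="GDP per capita", inplace=True)
print(full_country_stats)
```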