From a65ab3b154021db1c9e9aeedadbf56976cebb574 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Thu, 13 Jul 2017 11:13:37 +0200
Subject: [PATCH] Fixes #56, bug in DNNClassifier for batch normalization

---
 11_deep_learning.ipynb | 731 +++++++++++++++++++++++------------------
 1 file changed, 405 insertions(+), 326 deletions(-)

diff --git a/11_deep_learning.ipynb b/11_deep_learning.ipynb
index 999c4c2..4a80148 100644
--- a/11_deep_learning.ipynb
+++ b/11_deep_learning.ipynb
@@ -145,7 +145,9 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -159,7 +161,9 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "he_init = tf.contrib.layers.variance_scaling_initializer()\n",
@@ -235,7 +239,9 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def leaky_relu(z, name=None):\n",
@@ -282,7 +288,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "with tf.name_scope(\"dnn\"):\n",
@@ -396,7 +404,9 @@
   {
    "cell_type": "code",
    "execution_count": 20,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "def elu(z, alpha=1):\n",
@@ -444,7 +454,9 @@
   {
    "cell_type": "code",
    "execution_count": 23,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=\"hidden1\")"
@@ -655,7 +667,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 30,
    "metadata": {
     "collapsed": true
    },
@@ -689,7 +701,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 31,
    "metadata": {
     "collapsed": true
    },
@@ -710,8 +722,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
+   "execution_count": 32,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from functools import partial\n",
@@ -738,8 +752,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
+   "execution_count": 33,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -794,7 +810,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 34,
    "metadata": {
     "collapsed": true
    },
@@ -806,7 +822,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -863,7 +879,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -872,7 +888,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -895,7 +911,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 38,
    "metadata": {
     "collapsed": true
    },
@@ -929,7 +945,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 39,
    "metadata": {
     "collapsed": true
    },
@@ -947,8 +963,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "threshold = 1.0\n",
@@ -969,7 +987,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 41,
    "metadata": {
     "collapsed": true
    },
@@ -982,7 +1000,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 42,
    "metadata": {
     "collapsed": true
    },
@@ -994,7 +1012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 43,
    "metadata": {
     "collapsed": true
    },
@@ -1006,7 +1024,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1046,7 +1064,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 45,
    "metadata": {
     "collapsed": true
    },
@@ -1057,7 +1075,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 46,
    "metadata": {
     "collapsed": true
    },
@@ -1075,7 +1093,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1092,7 +1110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 48,
    "metadata": {
     "collapsed": true
    },
@@ -1138,7 +1156,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 49,
    "metadata": {
     "scrolled": true
    },
@@ -1156,8 +1174,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {},
+   "execution_count": 50,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X = tf.get_default_graph().get_tensor_by_name(\"X:0\")\n",
@@ -1177,8 +1197,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
-   "metadata": {},
+   "execution_count": 51,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "for op in (X, y, accuracy, training_op):\n",
@@ -1194,8 +1216,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {},
+   "execution_count": 52,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X, y, accuracy, training_op = tf.get_collection(\"my_important_ops\")"
@@ -1210,7 +1234,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 53,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1228,7 +1252,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 54,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1255,7 +1279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 55,
    "metadata": {
     "collapsed": true
    },
@@ -1311,7 +1335,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 56,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1338,8 +1362,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
-   "metadata": {},
+   "execution_count": 57,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -1382,7 +1408,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1410,7 +1436,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 59,
    "metadata": {
     "collapsed": true
    },
@@ -1457,7 +1483,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1500,7 +1526,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 61,
    "metadata": {
     "collapsed": true
    },
@@ -1514,7 +1540,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1556,7 +1582,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 63,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1602,7 +1628,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 64,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1618,7 +1644,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 65,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1627,7 +1653,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 66,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1643,8 +1669,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
-   "metadata": {},
+   "execution_count": 67,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -1677,7 +1705,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 68,
    "metadata": {
     "collapsed": true
    },
@@ -1692,7 +1720,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 69,
    "metadata": {
     "collapsed": true
    },
@@ -1704,7 +1732,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 70,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1733,7 +1761,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 71,
    "metadata": {
     "collapsed": true
    },
@@ -1754,8 +1782,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "metadata": {},
+   "execution_count": 72,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "with tf.name_scope(\"dnn\"):\n",
@@ -1773,7 +1803,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 73,
    "metadata": {
     "collapsed": true
    },
@@ -1801,7 +1831,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 74,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1837,7 +1867,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 75,
    "metadata": {
     "collapsed": true
    },
@@ -1882,7 +1912,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 76,
    "metadata": {
     "collapsed": true
    },
@@ -1899,7 +1929,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 77,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1944,7 +1974,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 78,
    "metadata": {
     "collapsed": true
    },
@@ -1963,7 +1993,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 79,
    "metadata": {
     "collapsed": true
    },
@@ -1982,7 +2012,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 80,
    "metadata": {
     "collapsed": true
    },
@@ -2000,7 +2030,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 81,
    "metadata": {
     "collapsed": true
    },
@@ -2019,7 +2049,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 82,
    "metadata": {
     "collapsed": true
    },
@@ -2037,7 +2067,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 83,
    "metadata": {
     "collapsed": true
    },
@@ -2069,7 +2099,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 84,
    "metadata": {
     "collapsed": true
    },
@@ -2088,8 +2118,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
+   "execution_count": 85,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "init = tf.global_variables_initializer()\n",
@@ -2098,7 +2130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 86,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2141,7 +2173,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 87,
    "metadata": {
     "collapsed": true
    },
@@ -2170,8 +2202,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
+   "execution_count": 88,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "W1 = tf.get_default_graph().get_tensor_by_name(\"hidden1/kernel:0\")\n",
@@ -2194,145 +2228,12 @@
     "The rest is just as usual:"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 83,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with tf.name_scope(\"eval\"):\n",
-    "    correct = tf.nn.in_top_k(logits, y, 1)\n",
-    "    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n",
-    "\n",
-    "learning_rate = 0.01\n",
-    "\n",
-    "with tf.name_scope(\"train\"):\n",
-    "    optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
-    "    training_op = optimizer.minimize(loss)\n",
-    "\n",
-    "init = tf.global_variables_initializer()\n",
-    "saver = tf.train.Saver()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 84,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "n_epochs = 20\n",
-    "batch_size = 200\n",
-    "\n",
-    "with tf.Session() as sess:\n",
-    "    init.run()\n",
-    "    for epoch in range(n_epochs):\n",
-    "        for iteration in range(mnist.train.num_examples // batch_size):\n",
-    "            X_batch, y_batch = mnist.train.next_batch(batch_size)\n",
-    "            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n",
-    "        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n",
-    "                                                y: mnist.test.labels})\n",
-    "        print(epoch, \"Test accuracy:\", accuracy_val)\n",
-    "\n",
-    "    save_path = saver.save(sess, \"./my_model_final.ckpt\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Alternatively, we can pass a regularization function to the `tf.layers.dense()` function, which will use it to create operations that will compute the regularization loss, and it adds these operations to the collection of regularization losses. The beginning is the same as above:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "reset_graph()\n",
-    "\n",
-    "n_inputs = 28 * 28  # MNIST\n",
-    "n_hidden1 = 300\n",
-    "n_hidden2 = 50\n",
-    "n_outputs = 10\n",
-    "\n",
-    "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n",
-    "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Next, we will use Python's `partial()` function to avoid repeating the same arguments over and over again. Note that we set the `kernel_regularizer` argument:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "scale = 0.001"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "my_dense_layer = partial(\n",
-    "    tf.layers.dense, activation=tf.nn.relu,\n",
-    "    kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))\n",
-    "\n",
-    "with tf.name_scope(\"dnn\"):\n",
-    "    hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n",
-    "    hidden2 = my_dense_layer(hidden1, n_hidden2, name=\"hidden2\")\n",
-    "    logits = my_dense_layer(hidden2, n_outputs, activation=None,\n",
-    "                            name=\"outputs\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Next we must add the regularization losses to the base loss:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 88,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "with tf.name_scope(\"loss\"):                                     # not shown in the book\n",
-    "    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(  # not shown\n",
-    "        labels=y, logits=logits)                                # not shown\n",
-    "    base_loss = tf.reduce_mean(xentropy, name=\"avg_xentropy\")   # not shown\n",
-    "    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)\n",
-    "    loss = tf.add_n([base_loss] + reg_losses, name=\"loss\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "And the rest is the same as usual:"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 89,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "with tf.name_scope(\"eval\"):\n",
@@ -2373,6 +2274,145 @@
     "    save_path = saver.save(sess, \"./my_model_final.ckpt\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, we can pass a regularization function to the `tf.layers.dense()` function, which will use it to create operations that compute the regularization loss and will add these operations to the collection of regularization losses. The beginning is the same as above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "reset_graph()\n",
+    "\n",
+    "n_inputs = 28 * 28  # MNIST\n",
+    "n_hidden1 = 300\n",
+    "n_hidden2 = 50\n",
+    "n_outputs = 10\n",
+    "\n",
+    "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n",
+    "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we will use Python's `partial()` function to avoid repeating the same arguments over and over again. Note that we set the `kernel_regularizer` argument:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "scale = 0.001"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "my_dense_layer = partial(\n",
+    "    tf.layers.dense, activation=tf.nn.relu,\n",
+    "    kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))\n",
+    "\n",
+    "with tf.name_scope(\"dnn\"):\n",
+    "    hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n",
+    "    hidden2 = my_dense_layer(hidden1, n_hidden2, name=\"hidden2\")\n",
+    "    logits = my_dense_layer(hidden2, n_outputs, activation=None,\n",
+    "                            name=\"outputs\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next we must add the regularization losses to the base loss:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "with tf.name_scope(\"loss\"):                                     # not shown in the book\n",
+    "    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(  # not shown\n",
+    "        labels=y, logits=logits)                                # not shown\n",
+    "    base_loss = tf.reduce_mean(xentropy, name=\"avg_xentropy\")   # not shown\n",
+    "    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)\n",
+    "    loss = tf.add_n([base_loss] + reg_losses, name=\"loss\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And the rest is the same as usual:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "with tf.name_scope(\"eval\"):\n",
+    "    correct = tf.nn.in_top_k(logits, y, 1)\n",
+    "    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n",
+    "\n",
+    "learning_rate = 0.01\n",
+    "\n",
+    "with tf.name_scope(\"train\"):\n",
+    "    optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
+    "    training_op = optimizer.minimize(loss)\n",
+    "\n",
+    "init = tf.global_variables_initializer()\n",
+    "saver = tf.train.Saver()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "n_epochs = 20\n",
+    "batch_size = 200\n",
+    "\n",
+    "with tf.Session() as sess:\n",
+    "    init.run()\n",
+    "    for epoch in range(n_epochs):\n",
+    "        for iteration in range(mnist.train.num_examples // batch_size):\n",
+    "            X_batch, y_batch = mnist.train.next_batch(batch_size)\n",
+    "            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n",
+    "        accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n",
+    "                                                y: mnist.test.labels})\n",
+    "        print(epoch, \"Test accuracy:\", accuracy_val)\n",
+    "\n",
+    "    save_path = saver.save(sess, \"./my_model_final.ckpt\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -2391,7 +2431,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 97,
    "metadata": {
     "collapsed": true
    },
@@ -2405,8 +2445,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
-   "metadata": {},
+   "execution_count": 98,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "training = tf.placeholder_with_default(False, shape=(), name='training')\n",
@@ -2426,8 +2468,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 93,
-   "metadata": {},
+   "execution_count": 99,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "with tf.name_scope(\"loss\"):\n",
@@ -2448,7 +2492,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 94,
+   "execution_count": 100,
    "metadata": {
     "scrolled": true
    },
@@ -2485,8 +2529,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 95,
-   "metadata": {},
+   "execution_count": 101,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -2529,8 +2575,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 96,
-   "metadata": {},
+   "execution_count": 102,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "threshold = 1.0\n",
@@ -2548,7 +2596,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 97,
+   "execution_count": 103,
    "metadata": {
     "collapsed": true
    },
@@ -2568,7 +2616,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 98,
+   "execution_count": 104,
    "metadata": {
     "collapsed": true
    },
@@ -2587,7 +2635,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 99,
+   "execution_count": 105,
    "metadata": {
     "collapsed": true
    },
@@ -2599,7 +2647,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 100,
+   "execution_count": 106,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2627,7 +2675,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 101,
+   "execution_count": 107,
    "metadata": {
     "collapsed": true
    },
@@ -2652,8 +2700,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 102,
-   "metadata": {},
+   "execution_count": 108,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -2672,7 +2722,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 109,
    "metadata": {
     "collapsed": true
    },
@@ -2690,7 +2740,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 110,
    "metadata": {
     "collapsed": true
    },
@@ -2721,7 +2771,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 111,
    "metadata": {
     "collapsed": true
    },
@@ -2733,7 +2783,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 112,
    "metadata": {
     "scrolled": false
    },
@@ -2808,7 +2858,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 113,
    "metadata": {
     "collapsed": true
    },
@@ -2828,7 +2878,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 114,
    "metadata": {
     "collapsed": true
    },
@@ -2871,7 +2921,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 115,
    "metadata": {
     "collapsed": true
    },
@@ -2901,7 +2951,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 116,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2918,7 +2968,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 117,
    "metadata": {
     "collapsed": true
    },
@@ -2934,7 +2984,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 118,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3010,8 +3060,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
-   "metadata": {},
+   "execution_count": 119,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.base import BaseEstimator, ClassifierMixin\n",
@@ -3127,15 +3179,14 @@
     "        self._graph = tf.Graph()\n",
     "        with self._graph.as_default():\n",
     "            self._build_graph(n_inputs, n_outputs)\n",
+    "            # extra ops for batch normalization (collected while self._graph is the default graph)\n",
+    "            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n",
     "\n",
     "        # needed in case of early stopping\n",
     "        max_checks_without_progress = 20\n",
     "        checks_without_progress = 0\n",
     "        best_loss = np.infty\n",
     "        best_params = None\n",
-    "\n",
-    "        # extra ops for batch normalization\n",
-    "        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n",
     "        \n",
     "        # Now train the model!\n",
     "        self._session = tf.Session(graph=self._graph)\n",
@@ -3201,7 +3252,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 120,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3218,7 +3269,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 121,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3237,7 +3288,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 122,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3266,7 +3317,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 123,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3275,7 +3326,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 124,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3299,7 +3350,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 125,
    "metadata": {
     "collapsed": true
    },
@@ -3331,7 +3382,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 126,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3356,7 +3407,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 127,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3373,7 +3424,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 128,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3387,12 +3438,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The best params are reached during epoch 2, that's much faster than earlier. Let's check the accuracy:"
+    "The best params are reached during epoch 48, which is actually slower convergence than earlier. Let's check the accuracy:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 129,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3404,12 +3455,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Well, batch normalization did not improve accuracy, quite the contrary. Let's see if we can find a good set of hyperparameters that will work well with batch normalization:"
+    "Well, batch normalization did not improve accuracy. Let's see if we can find a good set of hyperparameters that will work well with batch normalization:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 130,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3434,7 +3485,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 131,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3443,7 +3494,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3455,7 +3506,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Oh well! Batch normalization did not help in this case. Let's see if dropout can do better."
+    "Slightly better than earlier: 99.4% vs 99.3%. Let's see if dropout can do better."
    ]
   },
   {
@@ -3476,12 +3527,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Since batch normalization did not help, let's go back to the best model we trained earlier and see how it performs on the training set:"
+    "Let's go back to the best model we trained earlier and see how it performs on the training set:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 133,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3498,7 +3549,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 134,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3524,7 +3575,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 135,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3541,7 +3592,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 136,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3566,7 +3617,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 137,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3575,7 +3626,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 138,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3587,14 +3638,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Oh well, neither batch normalization nor dropout improved the model. Better luck next time! :)"
+    "Oh well, dropout did not improve the model. Better luck next time! :)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "But that's okay, we have ourselves a nice DNN that achieves 99.32% accuracy on the test set. Now, let's see if some of its expertise on digits 0 to 4 can be transferred to the task of classifying digits 5 to 9."
+    "But that's okay, we have ourselves a nice DNN that achieves 99.40% accuracy on the test set using batch normalization (BN), or 99.32% without BN. Let's see if some of this expertise on digits 0 to 4 can be transferred to the task of classifying digits 5 to 9. For the sake of simplicity, we will reuse the DNN without BN, since it is almost as good."
    ]
   },
   {
@@ -3629,8 +3680,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 133,
-   "metadata": {},
+   "execution_count": 139,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -3654,8 +3707,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
-   "metadata": {},
+   "execution_count": 140,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "learning_rate = 0.01\n",
@@ -3667,7 +3722,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 141,
    "metadata": {
     "collapsed": true
    },
@@ -3703,7 +3758,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 142,
    "metadata": {
     "collapsed": true
    },
@@ -3726,7 +3781,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 143,
    "metadata": {
     "collapsed": true
    },
@@ -3745,8 +3800,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
-   "metadata": {},
+   "execution_count": 144,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)\n",
@@ -3762,7 +3819,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 145,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3840,7 +3897,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 140,
+   "execution_count": 146,
    "metadata": {
     "collapsed": true
    },
@@ -3858,7 +3915,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 141,
+   "execution_count": 147,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3932,8 +3989,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 142,
-   "metadata": {},
+   "execution_count": 148,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -3963,8 +4022,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 143,
-   "metadata": {},
+   "execution_count": 149,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "learning_rate = 0.01\n",
@@ -3986,7 +4047,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
+   "execution_count": 150,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4048,7 +4109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
+   "execution_count": 151,
    "metadata": {
     "collapsed": true
    },
@@ -4066,7 +4127,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": 152,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4114,7 +4175,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 147,
+   "execution_count": 153,
    "metadata": {
     "collapsed": true
    },
@@ -4131,7 +4192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 148,
+   "execution_count": 154,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4179,7 +4240,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 149,
+   "execution_count": 155,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4189,7 +4250,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 150,
+   "execution_count": 156,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4242,8 +4303,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
-   "metadata": {},
+   "execution_count": 157,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "n_inputs = 28 * 28 # MNIST\n",
@@ -4263,7 +4326,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 152,
+   "execution_count": 158,
    "metadata": {
     "collapsed": true
    },
@@ -4281,8 +4344,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
-   "metadata": {},
+   "execution_count": 159,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "dnn1 = dnn(X1, name=\"DNN_A\")\n",
@@ -4298,7 +4363,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 154,
+   "execution_count": 160,
    "metadata": {
     "collapsed": true
    },
@@ -4316,7 +4381,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 155,
+   "execution_count": 161,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4325,7 +4390,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 156,
+   "execution_count": 162,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4341,7 +4406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 157,
+   "execution_count": 163,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4357,7 +4422,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 158,
+   "execution_count": 164,
    "metadata": {
     "collapsed": true
    },
@@ -4377,7 +4442,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 159,
+   "execution_count": 165,
    "metadata": {
     "collapsed": true
    },
@@ -4395,8 +4460,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 160,
-   "metadata": {},
+   "execution_count": 166,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "y_as_float = tf.cast(y, tf.float32)\n",
@@ -4413,8 +4480,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 161,
-   "metadata": {},
+   "execution_count": 167,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "learning_rate = 0.01\n",
@@ -4433,8 +4502,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 162,
-   "metadata": {},
+   "execution_count": 168,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "y_pred_correct = tf.equal(y_pred, y)\n",
@@ -4450,7 +4521,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 163,
+   "execution_count": 169,
    "metadata": {
     "collapsed": true
    },
@@ -4477,7 +4548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 164,
+   "execution_count": 170,
    "metadata": {
     "collapsed": true
    },
@@ -4502,7 +4573,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
+   "execution_count": 171,
    "metadata": {
     "collapsed": true
    },
@@ -4538,8 +4609,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 166,
-   "metadata": {},
+   "execution_count": 172,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "batch_size = 5\n",
@@ -4555,7 +4628,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 167,
+   "execution_count": 173,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4571,7 +4644,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 168,
+   "execution_count": 174,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4594,7 +4667,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 169,
+   "execution_count": 175,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4625,8 +4698,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
-   "metadata": {},
+   "execution_count": 176,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "X_test1, y_test1 = generate_batch(X_test, y_test, batch_size=len(X_test))"
@@ -4641,7 +4716,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 171,
+   "execution_count": 177,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4688,8 +4763,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 172,
-   "metadata": {},
+   "execution_count": 178,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -4731,7 +4808,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 173,
+   "execution_count": 179,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4763,8 +4840,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 174,
-   "metadata": {},
+   "execution_count": 180,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "reset_graph()\n",
@@ -4798,7 +4877,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 175,
+   "execution_count": 181,
    "metadata": {},
    "outputs": [],
    "source": [